mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 10:47:25 +01:00
Reindent code to use just two spaces
This commit is contained in:
parent
a1e6147466
commit
c04b6337e9
17 changed files with 1583 additions and 1582 deletions
88
R/analyze.R
88
R/analyze.R
|
|
@ -18,54 +18,54 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
analyze <- function(preset, progress = NULL, include_results = TRUE) {
|
analyze <- function(preset, progress = NULL, include_results = TRUE) {
|
||||||
if (!inherits(preset, "geposan_preset")) {
|
if (!inherits(preset, "geposan_preset")) {
|
||||||
stop("Preset is invalid. Use geposan::preset() to create one.")
|
stop("Preset is invalid. Use geposan::preset() to create one.")
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is.null(progress)) {
|
if (is.null(progress)) {
|
||||||
progress_bar <- progress::progress_bar$new()
|
progress_bar <- progress::progress_bar$new()
|
||||||
progress_bar$update(0.0)
|
progress_bar$update(0.0)
|
||||||
|
|
||||||
progress <- function(progress_value) {
|
progress <- function(progress_value) {
|
||||||
if (!progress_bar$finished) {
|
if (!progress_bar$finished) {
|
||||||
progress_bar$update(progress_value)
|
progress_bar$update(progress_value)
|
||||||
if (progress_value >= 1.0) {
|
if (progress_value >= 1.0) {
|
||||||
progress_bar$terminate()
|
progress_bar$terminate()
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
progress_buffer <- 0.0
|
progress_buffer <- 0.0
|
||||||
method_count <- length(preset$methods)
|
method_count <- length(preset$methods)
|
||||||
|
|
||||||
method_progress <- function(progress_value) {
|
method_progress <- function(progress_value) {
|
||||||
progress(progress_buffer + progress_value / method_count)
|
progress(progress_buffer + progress_value / method_count)
|
||||||
}
|
}
|
||||||
|
|
||||||
scores <- data.table(gene = preset$gene_id)
|
scores <- data.table(gene = preset$gene_id)
|
||||||
results <- list()
|
results <- list()
|
||||||
|
|
||||||
for (method in preset$methods) {
|
for (method in preset$methods) {
|
||||||
method_results <- method$func(preset, method_progress)
|
method_results <- method$func(preset, method_progress)
|
||||||
|
|
||||||
scores <- merge(scores, method_results$scores, by = "gene")
|
scores <- merge(scores, method_results$scores, by = "gene")
|
||||||
setnames(scores, "score", method$id)
|
setnames(scores, "score", method$id)
|
||||||
|
|
||||||
results <- c(results, list(method_results))
|
results <- c(results, list(method_results))
|
||||||
|
|
||||||
progress_buffer <- progress_buffer + 1 / method_count
|
progress_buffer <- progress_buffer + 1 / method_count
|
||||||
progress(progress_buffer)
|
progress(progress_buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
structure(
|
structure(
|
||||||
list(
|
list(
|
||||||
preset = preset,
|
preset = preset,
|
||||||
scores = scores,
|
scores = scores,
|
||||||
results = if (include_results) results else NULL
|
results = if (include_results) results else NULL
|
||||||
),
|
),
|
||||||
class = "geposan_analysis"
|
class = "geposan_analysis"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Print an analysis object.
|
#' Print an analysis object.
|
||||||
|
|
@ -77,14 +77,14 @@ analyze <- function(preset, progress = NULL, include_results = TRUE) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
print.geposan_analysis <- function(x, ...) {
|
print.geposan_analysis <- function(x, ...) {
|
||||||
cat("geposan analysis:\n\n")
|
cat("geposan analysis:\n\n")
|
||||||
print(x$preset)
|
print(x$preset)
|
||||||
|
cat("\n")
|
||||||
|
|
||||||
|
for (result in x$results) {
|
||||||
|
print(result)
|
||||||
cat("\n")
|
cat("\n")
|
||||||
|
}
|
||||||
|
|
||||||
for (result in x$results) {
|
invisible(x)
|
||||||
print(result)
|
|
||||||
cat("\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
invisible(x)
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
104
R/comparison.R
104
R/comparison.R
|
|
@ -18,38 +18,38 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
compare <- function(ranking, comparison_gene_ids) {
|
compare <- function(ranking, comparison_gene_ids) {
|
||||||
if (!inherits(ranking, "geposan_ranking")) {
|
if (!inherits(ranking, "geposan_ranking")) {
|
||||||
stop("Invalid ranking. Use geposan::ranking().")
|
stop("Invalid ranking. Use geposan::ranking().")
|
||||||
}
|
}
|
||||||
|
|
||||||
comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
|
comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
|
||||||
|
|
||||||
quantiles <- data.table(
|
quantiles <- data.table(
|
||||||
quantile = c("0%", "25%", "50%", "75%", "100%"),
|
quantile = c("0%", "25%", "50%", "75%", "100%"),
|
||||||
score = stats::quantile(comparison_ranking[, score]),
|
score = stats::quantile(comparison_ranking[, score]),
|
||||||
rank = stats::quantile(
|
rank = stats::quantile(
|
||||||
comparison_ranking[, rank],
|
comparison_ranking[, rank],
|
||||||
probs = seq(1, 0, -0.25)
|
probs = seq(1, 0, -0.25)
|
||||||
),
|
),
|
||||||
percentile = stats::quantile(comparison_ranking[, percentile])
|
percentile = stats::quantile(comparison_ranking[, percentile])
|
||||||
)
|
)
|
||||||
|
|
||||||
p_value <- stats::wilcox.test(
|
p_value <- stats::wilcox.test(
|
||||||
x = comparison_ranking[, score],
|
x = comparison_ranking[, score],
|
||||||
y = ranking[!gene %chin% comparison_gene_ids, score],
|
y = ranking[!gene %chin% comparison_gene_ids, score],
|
||||||
alternative = "greater"
|
alternative = "greater"
|
||||||
)$p.value
|
)$p.value
|
||||||
|
|
||||||
structure(
|
structure(
|
||||||
list(
|
list(
|
||||||
quantiles = quantiles,
|
quantiles = quantiles,
|
||||||
mean_score = comparison_ranking[, mean(score)],
|
mean_score = comparison_ranking[, mean(score)],
|
||||||
mean_rank = comparison_ranking[, mean(rank)],
|
mean_rank = comparison_ranking[, mean(rank)],
|
||||||
mean_percentile = comparison_ranking[, mean(percentile)],
|
mean_percentile = comparison_ranking[, mean(percentile)],
|
||||||
p_value = p_value
|
p_value = p_value
|
||||||
),
|
),
|
||||||
class = "geposan_comparison"
|
class = "geposan_comparison"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' S3 method to print a comparison object.
|
#' S3 method to print a comparison object.
|
||||||
|
|
@ -61,32 +61,32 @@ compare <- function(ranking, comparison_gene_ids) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
print.geposan_comparison <- function(x, ...) {
|
print.geposan_comparison <- function(x, ...) {
|
||||||
cat("geposan comparison:\n\n")
|
cat("geposan comparison:\n\n")
|
||||||
|
|
||||||
quantiles_formatted <- x$quantiles[, .(
|
quantiles_formatted <- x$quantiles[, .(
|
||||||
"Quantile" = quantile,
|
"Quantile" = quantile,
|
||||||
"Score" = round(score, 3),
|
"Score" = round(score, 3),
|
||||||
"Rank" = rank,
|
"Rank" = rank,
|
||||||
"Percentile" = paste0(
|
"Percentile" = paste0(
|
||||||
format(round(percentile * 100, 1), nsmall = 1),
|
format(round(percentile * 100, 1), nsmall = 1),
|
||||||
"%"
|
"%"
|
||||||
)
|
)
|
||||||
)]
|
)]
|
||||||
|
|
||||||
print(quantiles_formatted, row.names = FALSE)
|
print(quantiles_formatted, row.names = FALSE)
|
||||||
|
|
||||||
cat(sprintf(
|
cat(sprintf(
|
||||||
paste0(
|
paste0(
|
||||||
"\n Mean score: %.3f",
|
"\n Mean score: %.3f",
|
||||||
"\n Mean rank: %.1f",
|
"\n Mean rank: %.1f",
|
||||||
"\n Mean percentile: %.1f%%",
|
"\n Mean percentile: %.1f%%",
|
||||||
"\n p-value for better scores: %.4f\n"
|
"\n p-value for better scores: %.4f\n"
|
||||||
),
|
),
|
||||||
x$mean_score,
|
x$mean_score,
|
||||||
x$mean_rank,
|
x$mean_rank,
|
||||||
x$mean_percentile * 100,
|
x$mean_percentile * 100,
|
||||||
x$p_value
|
x$p_value
|
||||||
))
|
))
|
||||||
|
|
||||||
invisible(x)
|
invisible(x)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
68
R/method.R
68
R/method.R
|
|
@ -12,34 +12,34 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
method <- function(id, name, description, func) {
|
method <- function(id, name, description, func) {
|
||||||
stopifnot(is.character(id) & length(id) == 1)
|
stopifnot(is.character(id) & length(id) == 1)
|
||||||
stopifnot(is.character(name) & length(name) == 1)
|
stopifnot(is.character(name) & length(name) == 1)
|
||||||
stopifnot(is.character(description) & length(description) == 1)
|
stopifnot(is.character(description) & length(description) == 1)
|
||||||
stopifnot(is.function(func))
|
stopifnot(is.function(func))
|
||||||
|
|
||||||
structure(
|
structure(
|
||||||
list(
|
list(
|
||||||
id = id,
|
id = id,
|
||||||
name = name,
|
name = name,
|
||||||
description = description,
|
description = description,
|
||||||
func = func
|
func = func
|
||||||
),
|
),
|
||||||
class = "geposan_method"
|
class = "geposan_method"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Get a list of all available methods.
|
#' Get a list of all available methods.
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
all_methods <- function() {
|
all_methods <- function() {
|
||||||
list(
|
list(
|
||||||
clustering(),
|
clustering(),
|
||||||
correlation(),
|
correlation(),
|
||||||
neural(),
|
neural(),
|
||||||
adjacency(),
|
adjacency(),
|
||||||
species_adjacency(),
|
species_adjacency(),
|
||||||
proximity()
|
proximity()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Print a method object.
|
#' Print a method object.
|
||||||
|
|
@ -51,18 +51,18 @@ all_methods <- function() {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
print.geposan_method <- function(x, ...) {
|
print.geposan_method <- function(x, ...) {
|
||||||
cat(sprintf(
|
cat(sprintf(
|
||||||
paste0(
|
paste0(
|
||||||
"geposan method:",
|
"geposan method:",
|
||||||
"\n Method ID: %s",
|
"\n Method ID: %s",
|
||||||
"\n Name: %s",
|
"\n Name: %s",
|
||||||
"\n Description: %s",
|
"\n Description: %s",
|
||||||
"\n"
|
"\n"
|
||||||
),
|
),
|
||||||
x$id,
|
x$id,
|
||||||
x$name,
|
x$name,
|
||||||
x$description
|
x$description
|
||||||
))
|
))
|
||||||
|
|
||||||
invisible(x)
|
invisible(x)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -12,14 +12,14 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
densest <- function(data) {
|
densest <- function(data) {
|
||||||
as.numeric(if (length(data) <= 0) {
|
as.numeric(if (length(data) <= 0) {
|
||||||
NULL
|
NULL
|
||||||
} else if (length(data) == 1) {
|
} else if (length(data) == 1) {
|
||||||
data
|
data
|
||||||
} else {
|
} else {
|
||||||
density <- stats::density(data)
|
density <- stats::density(data)
|
||||||
mean(density$x[density$y == max(density$y)])
|
mean(density$x[density$y == max(density$y)])
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Score genes based on their proximity to the reference genes.
|
#' Score genes based on their proximity to the reference genes.
|
||||||
|
|
@ -40,80 +40,80 @@ densest <- function(data) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
adjacency <- function(distance_estimate = densest, summarize = stats::median) {
|
adjacency <- function(distance_estimate = densest, summarize = stats::median) {
|
||||||
method(
|
method(
|
||||||
id = "adjacency",
|
id = "adjacency",
|
||||||
name = "Adjacency",
|
name = "Adjacency",
|
||||||
description = "Adjacency to reference genes",
|
description = "Adjacency to reference genes",
|
||||||
function(preset, progress) {
|
function(preset, progress) {
|
||||||
species_ids <- preset$species_ids
|
species_ids <- preset$species_ids
|
||||||
gene_ids <- preset$gene_ids
|
gene_ids <- preset$gene_ids
|
||||||
reference_gene_ids <- preset$reference_gene_ids
|
reference_gene_ids <- preset$reference_gene_ids
|
||||||
|
|
||||||
cached(
|
cached(
|
||||||
"adjacency",
|
"adjacency",
|
||||||
c(
|
c(
|
||||||
species_ids,
|
species_ids,
|
||||||
gene_ids,
|
gene_ids,
|
||||||
reference_gene_ids,
|
reference_gene_ids,
|
||||||
distance_estimate,
|
distance_estimate,
|
||||||
summarize
|
summarize
|
||||||
),
|
),
|
||||||
{ # nolint
|
{ # nolint
|
||||||
# Filter distances by species and gene and summarize each
|
# Filter distances by species and gene and summarize each
|
||||||
# gene's distance values using the estimation function.
|
# gene's distance values using the estimation function.
|
||||||
data <- geposan::distances[
|
data <- geposan::distances[
|
||||||
species %chin% species_ids & gene %chin% gene_ids,
|
species %chin% species_ids & gene %chin% gene_ids,
|
||||||
.(distance = as.numeric(distance_estimate(distance))),
|
.(distance = as.numeric(distance_estimate(distance))),
|
||||||
by = gene
|
by = gene
|
||||||
]
|
]
|
||||||
|
|
||||||
# Compute the absolute value of the difference between the
|
# Compute the absolute value of the difference between the
|
||||||
# estimated distances of each gene to the reference genes.
|
# estimated distances of each gene to the reference genes.
|
||||||
compute_difference <- function(distance_value,
|
compute_difference <- function(distance_value,
|
||||||
comparison_ids) {
|
comparison_ids) {
|
||||||
differences <- data[
|
differences <- data[
|
||||||
gene %chin% comparison_ids,
|
gene %chin% comparison_ids,
|
||||||
.(difference = abs(distance_value - distance))
|
.(difference = abs(distance_value - distance))
|
||||||
]
|
]
|
||||||
|
|
||||||
summarize(differences$difference)
|
summarize(differences$difference)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Compute the differences to the reference genes.
|
# Compute the differences to the reference genes.
|
||||||
data[
|
data[
|
||||||
!gene %chin% reference_gene_ids,
|
!gene %chin% reference_gene_ids,
|
||||||
difference := compute_difference(
|
difference := compute_difference(
|
||||||
distance,
|
distance,
|
||||||
reference_gene_ids
|
reference_gene_ids
|
||||||
),
|
),
|
||||||
by = gene
|
by = gene
|
||||||
]
|
]
|
||||||
|
|
||||||
progress(0.5)
|
progress(0.5)
|
||||||
|
|
||||||
# Exclude the reference gene itself when computing its
|
# Exclude the reference gene itself when computing its
|
||||||
# difference.
|
# difference.
|
||||||
data[
|
data[
|
||||||
gene %chin% reference_gene_ids,
|
gene %chin% reference_gene_ids,
|
||||||
difference := compute_difference(
|
difference := compute_difference(
|
||||||
distance,
|
distance,
|
||||||
reference_gene_ids[reference_gene_ids != gene]
|
reference_gene_ids[reference_gene_ids != gene]
|
||||||
),
|
),
|
||||||
by = gene
|
by = gene
|
||||||
]
|
]
|
||||||
|
|
||||||
# Compute the final score by normalizing the difference.
|
# Compute the final score by normalizing the difference.
|
||||||
data[, score := 1 - difference / max(difference)]
|
data[, score := 1 - difference / max(difference)]
|
||||||
|
|
||||||
progress(1.0)
|
progress(1.0)
|
||||||
|
|
||||||
result(
|
result(
|
||||||
method = "adjacency",
|
method = "adjacency",
|
||||||
scores = data[, .(gene, score)],
|
scores = data[, .(gene, score)],
|
||||||
details = list(data = data)
|
details = list(data = data)
|
||||||
)
|
)
|
||||||
}
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -13,33 +13,33 @@
|
||||||
#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
|
#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
|
||||||
#' etc.
|
#' etc.
|
||||||
clusteriness <- function(data, span = 100000, weight = 0.7) {
|
clusteriness <- function(data, span = 100000, weight = 0.7) {
|
||||||
n <- length(data)
|
n <- length(data)
|
||||||
|
|
||||||
# Return a score of 0.0 if there is just one or no value at all.
|
# Return a score of 0.0 if there is just one or no value at all.
|
||||||
if (n < 2) {
|
if (n < 2) {
|
||||||
return(0.0)
|
return(0.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Cluster the data and compute the cluster sizes.
|
||||||
|
|
||||||
|
tree <- stats::hclust(stats::dist(data))
|
||||||
|
clusters <- stats::cutree(tree, h = span)
|
||||||
|
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
|
||||||
|
|
||||||
|
# Compute the "clusteriness" score.
|
||||||
|
|
||||||
|
score <- 0.0
|
||||||
|
|
||||||
|
for (i in seq_along(cluster_sizes)) {
|
||||||
|
cluster_size <- cluster_sizes[i]
|
||||||
|
|
||||||
|
if (cluster_size >= 2) {
|
||||||
|
cluster_score <- cluster_size / n
|
||||||
|
score <- score + weight^(i - 1) * cluster_score
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
# Cluster the data and compute the cluster sizes.
|
score
|
||||||
|
|
||||||
tree <- stats::hclust(stats::dist(data))
|
|
||||||
clusters <- stats::cutree(tree, h = span)
|
|
||||||
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
|
|
||||||
|
|
||||||
# Compute the "clusteriness" score.
|
|
||||||
|
|
||||||
score <- 0.0
|
|
||||||
|
|
||||||
for (i in seq_along(cluster_sizes)) {
|
|
||||||
cluster_size <- cluster_sizes[i]
|
|
||||||
|
|
||||||
if (cluster_size >= 2) {
|
|
||||||
cluster_score <- cluster_size / n
|
|
||||||
score <- score + weight^(i - 1) * cluster_score
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
score
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Process genes clustering their distance to telomeres.
|
#' Process genes clustering their distance to telomeres.
|
||||||
|
|
@ -53,41 +53,41 @@ clusteriness <- function(data, span = 100000, weight = 0.7) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
clustering <- function() {
|
clustering <- function() {
|
||||||
method(
|
method(
|
||||||
id = "clustering",
|
id = "clustering",
|
||||||
name = "Clustering",
|
name = "Clustering",
|
||||||
description = "Clustering of genes",
|
description = "Clustering of genes",
|
||||||
function(preset, progress) {
|
function(preset, progress) {
|
||||||
species_ids <- preset$species_ids
|
species_ids <- preset$species_ids
|
||||||
gene_ids <- preset$gene_ids
|
gene_ids <- preset$gene_ids
|
||||||
|
|
||||||
cached("clustering", c(species_ids, gene_ids), {
|
cached("clustering", c(species_ids, gene_ids), {
|
||||||
scores <- data.table(gene = gene_ids)
|
scores <- data.table(gene = gene_ids)
|
||||||
|
|
||||||
# Prefilter the input data by species.
|
# Prefilter the input data by species.
|
||||||
distances <- geposan::distances[species %chin% species_ids]
|
distances <- geposan::distances[species %chin% species_ids]
|
||||||
|
|
||||||
genes_done <- 0
|
genes_done <- 0
|
||||||
genes_total <- length(gene_ids)
|
genes_total <- length(gene_ids)
|
||||||
|
|
||||||
# Perform the cluster analysis for one gene.
|
# Perform the cluster analysis for one gene.
|
||||||
compute <- function(gene_id) {
|
compute <- function(gene_id) {
|
||||||
data <- distances[gene == gene_id, distance]
|
data <- distances[gene == gene_id, distance]
|
||||||
score <- clusteriness(data)
|
score <- clusteriness(data)
|
||||||
|
|
||||||
genes_done <<- genes_done + 1
|
genes_done <<- genes_done + 1
|
||||||
progress(genes_done / genes_total)
|
progress(genes_done / genes_total)
|
||||||
|
|
||||||
score
|
score
|
||||||
}
|
|
||||||
|
|
||||||
scores[, score := compute(gene), by = gene]
|
|
||||||
|
|
||||||
result(
|
|
||||||
method = "clustering",
|
|
||||||
scores = scores
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
)
|
|
||||||
|
scores[, score := compute(gene), by = gene]
|
||||||
|
|
||||||
|
result(
|
||||||
|
method = "clustering",
|
||||||
|
scores = scores
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,96 +8,97 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
correlation <- function(summarize = stats::median) {
|
correlation <- function(summarize = stats::median) {
|
||||||
method(
|
method(
|
||||||
id = "correlation",
|
id = "correlation",
|
||||||
name = "Correlation",
|
name = "Correlation",
|
||||||
description = "Correlation with reference genes",
|
description = "Correlation with reference genes",
|
||||||
function(preset, progress) {
|
function(preset, progress) {
|
||||||
species_ids <- preset$species_ids
|
species_ids <- preset$species_ids
|
||||||
gene_ids <- preset$gene_ids
|
gene_ids <- preset$gene_ids
|
||||||
reference_gene_ids <- preset$reference_gene_ids
|
reference_gene_ids <- preset$reference_gene_ids
|
||||||
|
|
||||||
cached(
|
cached(
|
||||||
"correlation",
|
"correlation",
|
||||||
c(species_ids, gene_ids, reference_gene_ids, summarize),
|
c(species_ids, gene_ids, reference_gene_ids, summarize),
|
||||||
{ # nolint
|
{ # nolint
|
||||||
# Prefilter distances by species.
|
# Prefilter distances by species.
|
||||||
distances <- geposan::distances[species %chin% species_ids]
|
distances <- geposan::distances[species %chin% species_ids]
|
||||||
|
|
||||||
# Tranform data to get species as rows and genes as columns.
|
# Tranform data to get species as rows and genes as columns.
|
||||||
# We construct columns per species, because it requires
|
# We construct columns per species, because it requires
|
||||||
# fewer iterations, and transpose the table afterwards.
|
# fewer iterations, and transpose the table afterwards.
|
||||||
|
|
||||||
data <- data.table(gene = gene_ids)
|
data <- data.table(gene = gene_ids)
|
||||||
|
|
||||||
# Make a column containing distance data for each species.
|
# Make a column containing distance data for each species.
|
||||||
for (species_id in species_ids) {
|
for (species_id in species_ids) {
|
||||||
species_data <- distances[
|
species_data <- distances[
|
||||||
species == species_id,
|
species == species_id,
|
||||||
.(gene, distance)
|
.(gene, distance)
|
||||||
]
|
]
|
||||||
|
|
||||||
data <- merge(data, species_data, all.x = TRUE)
|
data <- merge(data, species_data, all.x = TRUE)
|
||||||
setnames(data, "distance", species_id)
|
setnames(data, "distance", species_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Transpose to the desired format.
|
# Transpose to the desired format.
|
||||||
data <- transpose(data, make.names = "gene")
|
data <- transpose(data, make.names = "gene")
|
||||||
|
|
||||||
progress(0.33)
|
progress(0.33)
|
||||||
|
|
||||||
# Take the reference data.
|
# Take the reference data.
|
||||||
reference_data <- data[, ..reference_gene_ids]
|
reference_data <- data[, ..reference_gene_ids]
|
||||||
|
|
||||||
# Perform the correlation between all possible pairs.
|
# Perform the correlation between all possible pairs.
|
||||||
results <- stats::cor(
|
results <- stats::cor(
|
||||||
data[, ..gene_ids],
|
data[, ..gene_ids],
|
||||||
reference_data,
|
reference_data,
|
||||||
use = "pairwise.complete.obs",
|
use = "pairwise.complete.obs",
|
||||||
method = "spearman"
|
method = "spearman"
|
||||||
)
|
)
|
||||||
|
|
||||||
results <- data.table(results, keep.rownames = TRUE)
|
results <- data.table(results, keep.rownames = TRUE)
|
||||||
setnames(results, "rn", "gene")
|
setnames(results, "rn", "gene")
|
||||||
|
|
||||||
# Remove correlations between the reference genes
|
# Remove correlations between the reference genes
|
||||||
# themselves.
|
# themselves.
|
||||||
for (reference_gene_id in reference_gene_ids) {
|
for (reference_gene_id in reference_gene_ids) {
|
||||||
column <- quote(reference_gene_id)
|
column <- quote(reference_gene_id)
|
||||||
results[gene == reference_gene_id, eval(column) := NA]
|
results[gene == reference_gene_id, eval(column) := NA]
|
||||||
}
|
}
|
||||||
|
|
||||||
progress(0.66)
|
progress(0.66)
|
||||||
|
|
||||||
# Combine the correlation coefficients.
|
# Combine the correlation coefficients.
|
||||||
results[,
|
results[,
|
||||||
max_correlation := as.double(summarize(stats::na.omit(
|
max_correlation := as.double(summarize(stats::na.omit(
|
||||||
# Convert the data.table subset into a
|
# Convert the data.table subset into a
|
||||||
# vector to get the correct na.omit
|
# vector to get the correct na.omit
|
||||||
# behavior.
|
# behavior.
|
||||||
as.matrix(.SD)[1, ]
|
as.matrix(.SD)[1, ]
|
||||||
))),
|
))),
|
||||||
.SDcols = reference_gene_ids,
|
.SDcols = reference_gene_ids,
|
||||||
by = gene
|
by = gene
|
||||||
]
|
]
|
||||||
|
|
||||||
# Normalize scores.
|
# Normalize scores.
|
||||||
results[,
|
results[
|
||||||
score := (max_correlation - min(max_correlation)) /
|
,
|
||||||
(max(max_correlation) - min(max_correlation))
|
score := (max_correlation - min(max_correlation)) /
|
||||||
]
|
(max(max_correlation) - min(max_correlation))
|
||||||
|
]
|
||||||
|
|
||||||
# Normalize scores.
|
# Normalize scores.
|
||||||
|
|
||||||
results[, .(gene, score)]
|
results[, .(gene, score)]
|
||||||
|
|
||||||
result(
|
result(
|
||||||
method = "correlation",
|
method = "correlation",
|
||||||
scores = results[, .(gene, score)],
|
scores = results[, .(gene, score)],
|
||||||
details = list(all_correlations = results)
|
details = list(all_correlations = results)
|
||||||
)
|
)
|
||||||
}
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -12,244 +12,244 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
neural <- function(seed = 180199, n_models = 5) {
|
neural <- function(seed = 180199, n_models = 5) {
|
||||||
method(
|
method(
|
||||||
id = "neural",
|
id = "neural",
|
||||||
name = "Neural",
|
name = "Neural",
|
||||||
description = "Assessment by neural network",
|
description = "Assessment by neural network",
|
||||||
function(preset, progress) {
|
function(preset, progress) {
|
||||||
species_ids <- preset$species_ids
|
species_ids <- preset$species_ids
|
||||||
gene_ids <- preset$gene_ids
|
gene_ids <- preset$gene_ids
|
||||||
reference_gene_ids <- preset$reference_gene_ids
|
reference_gene_ids <- preset$reference_gene_ids
|
||||||
|
|
||||||
cached(
|
cached(
|
||||||
"neural",
|
"neural",
|
||||||
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
|
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
|
||||||
{ # nolint
|
{ # nolint
|
||||||
reference_count <- length(reference_gene_ids)
|
reference_count <- length(reference_gene_ids)
|
||||||
stopifnot(n_models %in% 2:reference_count)
|
stopifnot(n_models %in% 2:reference_count)
|
||||||
|
|
||||||
# Make results reproducible.
|
# Make results reproducible.
|
||||||
tensorflow::set_random_seed(seed)
|
tensorflow::set_random_seed(seed)
|
||||||
|
|
||||||
# Step 1: Prepare input data.
|
# Step 1: Prepare input data.
|
||||||
# ---------------------------
|
# ---------------------------
|
||||||
|
|
||||||
# Prefilter distances by species.
|
# Prefilter distances by species.
|
||||||
distances <- geposan::distances[species %chin% species_ids]
|
distances <- geposan::distances[species %chin% species_ids]
|
||||||
|
|
||||||
# Input data for the network. This contains the gene ID as
|
# Input data for the network. This contains the gene ID as
|
||||||
# an identifier as well as the per-species gene distances as
|
# an identifier as well as the per-species gene distances as
|
||||||
# input variables.
|
# input variables.
|
||||||
data <- data.table(gene = gene_ids)
|
data <- data.table(gene = gene_ids)
|
||||||
|
|
||||||
# Buffer to keep track of the names of the input variables.
|
# Buffer to keep track of the names of the input variables.
|
||||||
input_vars <- NULL
|
input_vars <- NULL
|
||||||
|
|
||||||
# Make a columns containing positions and distances for each
|
# Make a columns containing positions and distances for each
|
||||||
# species.
|
# species.
|
||||||
for (species_id in species_ids) {
|
for (species_id in species_ids) {
|
||||||
species_data <- distances[
|
species_data <- distances[
|
||||||
species == species_id,
|
species == species_id,
|
||||||
.(gene, distance)
|
.(gene, distance)
|
||||||
]
|
]
|
||||||
|
|
||||||
# Only include species with at least 25% known values.
|
# Only include species with at least 25% known values.
|
||||||
# As positions and distances always coexist, we don't
|
# As positions and distances always coexist, we don't
|
||||||
# loose any data here.
|
# loose any data here.
|
||||||
|
|
||||||
species_data <- stats::na.omit(species_data)
|
species_data <- stats::na.omit(species_data)
|
||||||
|
|
||||||
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
|
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
|
||||||
data <- merge(data, species_data, all.x = TRUE)
|
data <- merge(data, species_data, all.x = TRUE)
|
||||||
|
|
||||||
# Replace missing data with mean values. The neural
|
# Replace missing data with mean values. The neural
|
||||||
# network can't handle NAs in a meaningful way.
|
# network can't handle NAs in a meaningful way.
|
||||||
# Choosing extreme values here would result in
|
# Choosing extreme values here would result in
|
||||||
# heavily biased results. Therefore, the mean value
|
# heavily biased results. Therefore, the mean value
|
||||||
# is chosen as a compromise. However, this will of
|
# is chosen as a compromise. However, this will of
|
||||||
# course lessen the significance of the results.
|
# course lessen the significance of the results.
|
||||||
|
|
||||||
mean_distance <- round(
|
mean_distance <- round(
|
||||||
species_data[, mean(distance)]
|
species_data[, mean(distance)]
|
||||||
)
|
)
|
||||||
|
|
||||||
data[is.na(distance), distance := mean_distance]
|
data[is.na(distance), distance := mean_distance]
|
||||||
|
|
||||||
# Name the new column after the species.
|
# Name the new column after the species.
|
||||||
setnames(data, "distance", species_id)
|
setnames(data, "distance", species_id)
|
||||||
|
|
||||||
# Add the input variable to the buffer.
|
# Add the input variable to the buffer.
|
||||||
input_vars <- c(input_vars, species_id)
|
input_vars <- c(input_vars, species_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
progress(0.1)
|
progress(0.1)
|
||||||
|
|
||||||
# Step 2: Prepare training data.
|
# Step 2: Prepare training data.
|
||||||
# ------------------------------
|
# ------------------------------
|
||||||
|
|
||||||
# Take out the reference data.
|
# Take out the reference data.
|
||||||
|
|
||||||
reference_data <- data[gene %chin% reference_gene_ids]
|
reference_data <- data[gene %chin% reference_gene_ids]
|
||||||
reference_data[, score := 1.0]
|
reference_data[, score := 1.0]
|
||||||
|
|
||||||
# Take out random samples from the remaining genes. This is
|
# Take out random samples from the remaining genes. This is
|
||||||
# another compromise with a negative impact on
|
# another compromise with a negative impact on
|
||||||
# significance. We assume that a random gene is not likely
|
# significance. We assume that a random gene is not likely
|
||||||
# to match the reference genes.
|
# to match the reference genes.
|
||||||
|
|
||||||
without_reference_data <- data[
|
without_reference_data <- data[
|
||||||
!gene %chin% reference_gene_ids
|
!gene %chin% reference_gene_ids
|
||||||
]
|
]
|
||||||
|
|
||||||
control_data <- without_reference_data[
|
control_data <- without_reference_data[
|
||||||
sample(
|
sample(
|
||||||
nrow(without_reference_data),
|
nrow(without_reference_data),
|
||||||
reference_count
|
reference_count
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
control_data[, score := 0.0]
|
|
||||||
|
|
||||||
# Split the training data into random sets to have
|
|
||||||
# validation data for each model.
|
|
||||||
|
|
||||||
# Scramble the source tables.
|
|
||||||
reference_data <- reference_data[sample(reference_count)]
|
|
||||||
control_data <- control_data[sample(reference_count)]
|
|
||||||
|
|
||||||
networks <- list()
|
|
||||||
|
|
||||||
indices <- seq_len(reference_count)
|
|
||||||
indices_split <- split(indices, indices %% n_models)
|
|
||||||
|
|
||||||
for (i in seq_len(n_models)) {
|
|
||||||
training_data <- rbindlist(list(
|
|
||||||
reference_data[!indices_split[[i]]],
|
|
||||||
control_data[!indices_split[[i]]]
|
|
||||||
))
|
|
||||||
|
|
||||||
validation_data <- rbindlist(list(
|
|
||||||
reference_data[indices_split[[i]]],
|
|
||||||
control_data[indices_split[[i]]]
|
|
||||||
))
|
|
||||||
|
|
||||||
networks[[i]] <- list(
|
|
||||||
training_data = training_data,
|
|
||||||
validation_data = validation_data
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Step 3: Create, train and apply neural network.
|
|
||||||
# -----------------------------------------------
|
|
||||||
|
|
||||||
# Layers for the neural network.
|
|
||||||
input_layer <- length(input_vars)
|
|
||||||
layer1 <- input_layer
|
|
||||||
layer2 <- 0.5 * input_layer
|
|
||||||
layer3 <- 0.5 * layer2
|
|
||||||
|
|
||||||
# Convert data to matrix and normalize it.
|
|
||||||
to_matrix <- function(data) {
|
|
||||||
data_matrix <- as.matrix(data[, ..input_vars])
|
|
||||||
colnames(data_matrix) <- NULL
|
|
||||||
keras::normalize(data_matrix)
|
|
||||||
}
|
|
||||||
|
|
||||||
data_matrix <- to_matrix(data)
|
|
||||||
output_vars <- NULL
|
|
||||||
|
|
||||||
for (i in seq_along(networks)) {
|
|
||||||
# Create a new model for each training session, because
|
|
||||||
# the model would keep its state across training
|
|
||||||
# sessions otherwise.
|
|
||||||
model <- keras::keras_model_sequential() |>
|
|
||||||
keras::layer_dense(
|
|
||||||
units = layer1,
|
|
||||||
activation = "relu",
|
|
||||||
input_shape = input_layer,
|
|
||||||
) |>
|
|
||||||
keras::layer_dense(
|
|
||||||
units = layer2,
|
|
||||||
activation = "relu",
|
|
||||||
kernel_regularizer = keras::regularizer_l2()
|
|
||||||
) |>
|
|
||||||
keras::layer_dense(
|
|
||||||
units = layer3,
|
|
||||||
activation = "relu",
|
|
||||||
kernel_regularizer = keras::regularizer_l2()
|
|
||||||
) |>
|
|
||||||
keras::layer_dense(
|
|
||||||
units = 1,
|
|
||||||
activation = "sigmoid"
|
|
||||||
) |>
|
|
||||||
keras::compile(
|
|
||||||
loss = keras::loss_mean_absolute_error(),
|
|
||||||
optimizer = keras::optimizer_adam()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Train the model.
|
|
||||||
|
|
||||||
network <- networks[[i]]
|
|
||||||
|
|
||||||
training_data <- network$training_data
|
|
||||||
training_matrix <- to_matrix(training_data)
|
|
||||||
validation_data <- network$validation_data
|
|
||||||
validation_matrix <- to_matrix(validation_data)
|
|
||||||
|
|
||||||
fit <- keras::fit(
|
|
||||||
model,
|
|
||||||
x = training_matrix,
|
|
||||||
y = training_data$score,
|
|
||||||
validation_data = list(
|
|
||||||
x_val = validation_matrix,
|
|
||||||
y_val = validation_data$score
|
|
||||||
),
|
|
||||||
epochs = 500,
|
|
||||||
verbose = FALSE
|
|
||||||
)
|
|
||||||
|
|
||||||
# Apply the model.
|
|
||||||
|
|
||||||
data[, new_score := stats::predict(model, data_matrix)]
|
|
||||||
|
|
||||||
# Remove the values of the training data itself.
|
|
||||||
data[gene %chin% training_data$gene, new_score := NA]
|
|
||||||
|
|
||||||
output_var <- sprintf("score%i", i)
|
|
||||||
setnames(data, "new_score", output_var)
|
|
||||||
output_vars <- c(output_vars, output_var)
|
|
||||||
|
|
||||||
|
|
||||||
# Store the details.
|
|
||||||
|
|
||||||
networks[[i]]$model <- keras::serialize_model(model)
|
|
||||||
networks[[i]]$fit <- fit
|
|
||||||
|
|
||||||
progress(0.1 + i * (0.9 / n_models))
|
|
||||||
}
|
|
||||||
|
|
||||||
# Compute the final score as the mean score.
|
|
||||||
data[,
|
|
||||||
score := mean(as.numeric(.SD), na.rm = TRUE),
|
|
||||||
.SDcols = output_vars,
|
|
||||||
by = gene
|
|
||||||
]
|
|
||||||
|
|
||||||
progress(1.0)
|
|
||||||
|
|
||||||
result(
|
|
||||||
method = "neural",
|
|
||||||
scores = data[, .(gene, score)],
|
|
||||||
details = list(
|
|
||||||
seed = seed,
|
|
||||||
n_models = n_models,
|
|
||||||
all_results = data[, !..input_vars],
|
|
||||||
networks = networks
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
control_data[, score := 0.0]
|
||||||
|
|
||||||
|
# Split the training data into random sets to have
|
||||||
|
# validation data for each model.
|
||||||
|
|
||||||
|
# Scramble the source tables.
|
||||||
|
reference_data <- reference_data[sample(reference_count)]
|
||||||
|
control_data <- control_data[sample(reference_count)]
|
||||||
|
|
||||||
|
networks <- list()
|
||||||
|
|
||||||
|
indices <- seq_len(reference_count)
|
||||||
|
indices_split <- split(indices, indices %% n_models)
|
||||||
|
|
||||||
|
for (i in seq_len(n_models)) {
|
||||||
|
training_data <- rbindlist(list(
|
||||||
|
reference_data[!indices_split[[i]]],
|
||||||
|
control_data[!indices_split[[i]]]
|
||||||
|
))
|
||||||
|
|
||||||
|
validation_data <- rbindlist(list(
|
||||||
|
reference_data[indices_split[[i]]],
|
||||||
|
control_data[indices_split[[i]]]
|
||||||
|
))
|
||||||
|
|
||||||
|
networks[[i]] <- list(
|
||||||
|
training_data = training_data,
|
||||||
|
validation_data = validation_data
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Step 3: Create, train and apply neural network.
|
||||||
|
# -----------------------------------------------
|
||||||
|
|
||||||
|
# Layers for the neural network.
|
||||||
|
input_layer <- length(input_vars)
|
||||||
|
layer1 <- input_layer
|
||||||
|
layer2 <- 0.5 * input_layer
|
||||||
|
layer3 <- 0.5 * layer2
|
||||||
|
|
||||||
|
# Convert data to matrix and normalize it.
|
||||||
|
to_matrix <- function(data) {
|
||||||
|
data_matrix <- as.matrix(data[, ..input_vars])
|
||||||
|
colnames(data_matrix) <- NULL
|
||||||
|
keras::normalize(data_matrix)
|
||||||
|
}
|
||||||
|
|
||||||
|
data_matrix <- to_matrix(data)
|
||||||
|
output_vars <- NULL
|
||||||
|
|
||||||
|
for (i in seq_along(networks)) {
|
||||||
|
# Create a new model for each training session, because
|
||||||
|
# the model would keep its state across training
|
||||||
|
# sessions otherwise.
|
||||||
|
model <- keras::keras_model_sequential() |>
|
||||||
|
keras::layer_dense(
|
||||||
|
units = layer1,
|
||||||
|
activation = "relu",
|
||||||
|
input_shape = input_layer,
|
||||||
|
) |>
|
||||||
|
keras::layer_dense(
|
||||||
|
units = layer2,
|
||||||
|
activation = "relu",
|
||||||
|
kernel_regularizer = keras::regularizer_l2()
|
||||||
|
) |>
|
||||||
|
keras::layer_dense(
|
||||||
|
units = layer3,
|
||||||
|
activation = "relu",
|
||||||
|
kernel_regularizer = keras::regularizer_l2()
|
||||||
|
) |>
|
||||||
|
keras::layer_dense(
|
||||||
|
units = 1,
|
||||||
|
activation = "sigmoid"
|
||||||
|
) |>
|
||||||
|
keras::compile(
|
||||||
|
loss = keras::loss_mean_absolute_error(),
|
||||||
|
optimizer = keras::optimizer_adam()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Train the model.
|
||||||
|
|
||||||
|
network <- networks[[i]]
|
||||||
|
|
||||||
|
training_data <- network$training_data
|
||||||
|
training_matrix <- to_matrix(training_data)
|
||||||
|
validation_data <- network$validation_data
|
||||||
|
validation_matrix <- to_matrix(validation_data)
|
||||||
|
|
||||||
|
fit <- keras::fit(
|
||||||
|
model,
|
||||||
|
x = training_matrix,
|
||||||
|
y = training_data$score,
|
||||||
|
validation_data = list(
|
||||||
|
x_val = validation_matrix,
|
||||||
|
y_val = validation_data$score
|
||||||
|
),
|
||||||
|
epochs = 500,
|
||||||
|
verbose = FALSE
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apply the model.
|
||||||
|
|
||||||
|
data[, new_score := stats::predict(model, data_matrix)]
|
||||||
|
|
||||||
|
# Remove the values of the training data itself.
|
||||||
|
data[gene %chin% training_data$gene, new_score := NA]
|
||||||
|
|
||||||
|
output_var <- sprintf("score%i", i)
|
||||||
|
setnames(data, "new_score", output_var)
|
||||||
|
output_vars <- c(output_vars, output_var)
|
||||||
|
|
||||||
|
|
||||||
|
# Store the details.
|
||||||
|
|
||||||
|
networks[[i]]$model <- keras::serialize_model(model)
|
||||||
|
networks[[i]]$fit <- fit
|
||||||
|
|
||||||
|
progress(0.1 + i * (0.9 / n_models))
|
||||||
|
}
|
||||||
|
|
||||||
|
# Compute the final score as the mean score.
|
||||||
|
data[,
|
||||||
|
score := mean(as.numeric(.SD), na.rm = TRUE),
|
||||||
|
.SDcols = output_vars,
|
||||||
|
by = gene
|
||||||
|
]
|
||||||
|
|
||||||
|
progress(1.0)
|
||||||
|
|
||||||
|
result(
|
||||||
|
method = "neural",
|
||||||
|
scores = data[, .(gene, score)],
|
||||||
|
details = list(
|
||||||
|
seed = seed,
|
||||||
|
n_models = n_models,
|
||||||
|
all_results = data[, !..input_vars],
|
||||||
|
networks = networks
|
||||||
|
)
|
||||||
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -11,38 +11,38 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
proximity <- function(summarize = stats::median) {
|
proximity <- function(summarize = stats::median) {
|
||||||
method(
|
method(
|
||||||
id = "proximity",
|
id = "proximity",
|
||||||
name = "Proximity",
|
name = "Proximity",
|
||||||
description = "Proximity to telomeres",
|
description = "Proximity to telomeres",
|
||||||
function(preset, progress) {
|
function(preset, progress) {
|
||||||
species_ids <- preset$species_ids
|
species_ids <- preset$species_ids
|
||||||
gene_ids <- preset$gene_ids
|
gene_ids <- preset$gene_ids
|
||||||
|
|
||||||
cached("proximity", c(species_ids, gene_ids), {
|
cached("proximity", c(species_ids, gene_ids), {
|
||||||
# Prefilter distances by species and gene.
|
# Prefilter distances by species and gene.
|
||||||
data <- geposan::distances[
|
data <- geposan::distances[
|
||||||
species %chin% preset$species_ids &
|
species %chin% preset$species_ids &
|
||||||
gene %chin% preset$gene_ids
|
gene %chin% preset$gene_ids
|
||||||
]
|
]
|
||||||
|
|
||||||
# Compute the score as described above.
|
# Compute the score as described above.
|
||||||
data <- data[,
|
data <- data[,
|
||||||
.(combined_distance = as.double(summarize(distance))),
|
.(combined_distance = as.double(summarize(distance))),
|
||||||
by = "gene"
|
by = "gene"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Normalize scores.
|
# Normalize scores.
|
||||||
data[, score := 1 - combined_distance / max(combined_distance)]
|
data[, score := 1 - combined_distance / max(combined_distance)]
|
||||||
|
|
||||||
progress(1.0)
|
progress(1.0)
|
||||||
|
|
||||||
result(
|
result(
|
||||||
method = "proximity",
|
method = "proximity",
|
||||||
scores = data[, .(gene, score)],
|
scores = data[, .(gene, score)],
|
||||||
details = list(data = data)
|
details = list(data = data)
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -15,136 +15,136 @@
|
||||||
#' @export
|
#' @export
|
||||||
species_adjacency <- function(distance_estimate = stats::median,
|
species_adjacency <- function(distance_estimate = stats::median,
|
||||||
summarize = stats::median) {
|
summarize = stats::median) {
|
||||||
method(
|
method(
|
||||||
id = "species_adjacency",
|
id = "species_adjacency",
|
||||||
name = "Species adj.",
|
name = "Species adj.",
|
||||||
description = "Species adjacency",
|
description = "Species adjacency",
|
||||||
function(preset, progress) {
|
function(preset, progress) {
|
||||||
species_ids <- preset$species_ids
|
species_ids <- preset$species_ids
|
||||||
gene_ids <- preset$gene_ids
|
gene_ids <- preset$gene_ids
|
||||||
reference_gene_ids <- preset$reference_gene_ids
|
reference_gene_ids <- preset$reference_gene_ids
|
||||||
|
|
||||||
cached(
|
cached(
|
||||||
"species_adjacency",
|
"species_adjacency",
|
||||||
c(
|
c(
|
||||||
species_ids,
|
species_ids,
|
||||||
gene_ids,
|
gene_ids,
|
||||||
reference_gene_ids,
|
reference_gene_ids,
|
||||||
distance_estimate,
|
distance_estimate,
|
||||||
summarize
|
summarize
|
||||||
),
|
),
|
||||||
{ # nolint
|
{ # nolint
|
||||||
# Prefilter distances.
|
# Prefilter distances.
|
||||||
data <- geposan::distances[
|
data <- geposan::distances[
|
||||||
species %chin% species_ids & gene %chin% gene_ids
|
species %chin% species_ids & gene %chin% gene_ids
|
||||||
]
|
]
|
||||||
|
|
||||||
progress_state <- 0.0
|
progress_state <- 0.0
|
||||||
progress_step <- 0.9 / length(species_ids)
|
progress_step <- 0.9 / length(species_ids)
|
||||||
|
|
||||||
# Iterate through all species and find the distance
|
# Iterate through all species and find the distance
|
||||||
# estimates within that species.
|
# estimates within that species.
|
||||||
for (species_id in species_ids) {
|
for (species_id in species_ids) {
|
||||||
# For all genes, compute the distance to one reference
|
# For all genes, compute the distance to one reference
|
||||||
# gene at a time in one go.
|
# gene at a time in one go.
|
||||||
for (reference_gene_id in reference_gene_ids) {
|
for (reference_gene_id in reference_gene_ids) {
|
||||||
comparison_distance <- data[
|
comparison_distance <- data[
|
||||||
species == species_id &
|
species == species_id &
|
||||||
gene == reference_gene_id,
|
gene == reference_gene_id,
|
||||||
distance
|
distance
|
||||||
]
|
]
|
||||||
|
|
||||||
column <- quote(reference_gene_id)
|
column <- quote(reference_gene_id)
|
||||||
|
|
||||||
if (length(comparison_distance) != 1) {
|
if (length(comparison_distance) != 1) {
|
||||||
# If we don't have a comparison distance, we
|
# If we don't have a comparison distance, we
|
||||||
# can't compute a difference. This happens, if
|
# can't compute a difference. This happens, if
|
||||||
# the species doesn't have the reference gene.
|
# the species doesn't have the reference gene.
|
||||||
data[
|
data[
|
||||||
species == species_id &
|
species == species_id &
|
||||||
gene %chin% gene_ids,
|
gene %chin% gene_ids,
|
||||||
eval(column) := NA_integer_
|
eval(column) := NA_integer_
|
||||||
]
|
]
|
||||||
} else {
|
} else {
|
||||||
data[
|
data[
|
||||||
species == species_id &
|
species == species_id &
|
||||||
gene %chin% gene_ids,
|
gene %chin% gene_ids,
|
||||||
eval(column) :=
|
eval(column) :=
|
||||||
abs(distance - comparison_distance)
|
abs(distance - comparison_distance)
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Combine the distances to the different reference genes
|
# Combine the distances to the different reference genes
|
||||||
# into one value using the provided function.
|
# into one value using the provided function.
|
||||||
data[
|
data[
|
||||||
species == species_id &
|
species == species_id &
|
||||||
gene %chin% gene_ids,
|
gene %chin% gene_ids,
|
||||||
combined_distance := as.numeric(
|
combined_distance := as.numeric(
|
||||||
distance_estimate(stats::na.omit(
|
distance_estimate(stats::na.omit(
|
||||||
# Convert the data.table subset into a
|
# Convert the data.table subset into a
|
||||||
# vector to get the correct na.omit
|
# vector to get the correct na.omit
|
||||||
# behavior.
|
# behavior.
|
||||||
as.matrix(.SD)[1, ]
|
as.matrix(.SD)[1, ]
|
||||||
))
|
))
|
||||||
),
|
),
|
||||||
.SDcols = reference_gene_ids,
|
.SDcols = reference_gene_ids,
|
||||||
by = gene
|
by = gene
|
||||||
]
|
]
|
||||||
|
|
||||||
progress_state <- progress_state + progress_step
|
progress_state <- progress_state + progress_step
|
||||||
progress(progress_state)
|
progress(progress_state)
|
||||||
}
|
}
|
||||||
|
|
||||||
progress(0.9)
|
progress(0.9)
|
||||||
|
|
||||||
# Remove the distances between the reference genes.
|
# Remove the distances between the reference genes.
|
||||||
for (reference_gene_id in reference_gene_ids) {
|
for (reference_gene_id in reference_gene_ids) {
|
||||||
column <- quote(reference_gene_id)
|
column <- quote(reference_gene_id)
|
||||||
data[gene == reference_gene_id, eval(column) := NA]
|
data[gene == reference_gene_id, eval(column) := NA]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Recompute the combined distance for the reference genes.
|
# Recompute the combined distance for the reference genes.
|
||||||
data[
|
data[
|
||||||
gene %chin% reference_gene_ids,
|
gene %chin% reference_gene_ids,
|
||||||
combined_distance := as.numeric(
|
combined_distance := as.numeric(
|
||||||
distance_estimate(stats::na.omit(
|
distance_estimate(stats::na.omit(
|
||||||
as.matrix(.SD)[1, ])
|
as.matrix(.SD)[1, ]
|
||||||
)
|
))
|
||||||
),
|
),
|
||||||
.SDcols = reference_gene_ids,
|
.SDcols = reference_gene_ids,
|
||||||
by = list(species, gene)
|
by = list(species, gene)
|
||||||
]
|
]
|
||||||
|
|
||||||
# Combine the distances into one value.
|
# Combine the distances into one value.
|
||||||
results <- data[,
|
results <- data[,
|
||||||
.(
|
.(
|
||||||
summarized_distances = as.numeric(
|
summarized_distances = as.numeric(
|
||||||
summarize(stats::na.omit(combined_distance))
|
summarize(stats::na.omit(combined_distance))
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
by = gene
|
by = gene
|
||||||
]
|
]
|
||||||
|
|
||||||
# Compute the final score by normalizing the difference.
|
# Compute the final score by normalizing the difference.
|
||||||
results[
|
results[
|
||||||
,
|
,
|
||||||
score := 1 - summarized_distances /
|
score := 1 - summarized_distances /
|
||||||
max(summarized_distances)
|
max(summarized_distances)
|
||||||
]
|
]
|
||||||
|
|
||||||
progress(1.0)
|
progress(1.0)
|
||||||
|
|
||||||
result(
|
result(
|
||||||
method = "species_adjacency",
|
method = "species_adjacency",
|
||||||
scores = results[, .(gene, score)],
|
scores = results[, .(gene, score)],
|
||||||
details = list(
|
details = list(
|
||||||
data = data,
|
data = data,
|
||||||
results = results
|
results = results
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
674
R/plots.R
674
R/plots.R
|
|
@ -9,7 +9,7 @@ base_color_transparent <- function() "#1964bf80"
|
||||||
#' Color palette for gene sets.
|
#' Color palette for gene sets.
|
||||||
#' @noRd
|
#' @noRd
|
||||||
gene_set_color <- function(index) {
|
gene_set_color <- function(index) {
|
||||||
c("#FF7F00", "#4DAF4A", "#984EA3")[index]
|
c("#FF7F00", "#4DAF4A", "#984EA3")[index]
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Plot gene positions.
|
#' Plot gene positions.
|
||||||
|
|
@ -22,74 +22,74 @@ gene_set_color <- function(index) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
plot_positions <- function(species_ids, gene_sets) {
|
plot_positions <- function(species_ids, gene_sets) {
|
||||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||||
stop("Please install \"plotly\" to use this function.")
|
stop("Please install \"plotly\" to use this function.")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Prefilter data by species.
|
# Prefilter data by species.
|
||||||
data <- geposan::distances[species %chin% species_ids]
|
data <- geposan::distances[species %chin% species_ids]
|
||||||
|
|
||||||
species_max_distance <- data[,
|
species_max_distance <- data[,
|
||||||
.(max_distance = max(distance)),
|
.(max_distance = max(distance)),
|
||||||
by = species
|
by = species
|
||||||
]
|
]
|
||||||
|
|
||||||
# Prefilter species.
|
# Prefilter species.
|
||||||
species <- geposan::species[id %chin% species_ids]
|
species <- geposan::species[id %chin% species_ids]
|
||||||
|
|
||||||
plot <- plotly::plot_ly() |>
|
plot <- plotly::plot_ly() |>
|
||||||
plotly::layout(
|
plotly::layout(
|
||||||
xaxis = list(
|
xaxis = list(
|
||||||
title = "Species",
|
title = "Species",
|
||||||
tickvals = species$id,
|
tickvals = species$id,
|
||||||
ticktext = species$name
|
ticktext = species$name
|
||||||
),
|
),
|
||||||
yaxis = list(title = "Distance to telomeres [Bp]"),
|
yaxis = list(title = "Distance to telomeres [Bp]"),
|
||||||
bargap = 0.9
|
bargap = 0.9
|
||||||
) |>
|
) |>
|
||||||
plotly::add_bars(
|
plotly::add_bars(
|
||||||
data = species_max_distance,
|
data = species_max_distance,
|
||||||
x = ~species,
|
x = ~species,
|
||||||
y = ~max_distance,
|
y = ~max_distance,
|
||||||
name = "All genes",
|
name = "All genes",
|
||||||
marker = list(color = base_color())
|
marker = list(color = base_color())
|
||||||
|
)
|
||||||
|
|
||||||
|
if (length(gene_sets) > 0) {
|
||||||
|
# Include gene information which will be used for labeling
|
||||||
|
gene_set_data <- merge(
|
||||||
|
data[gene %chin% unlist(gene_sets)],
|
||||||
|
geposan::genes,
|
||||||
|
by.x = "gene",
|
||||||
|
by.y = "id"
|
||||||
|
)
|
||||||
|
|
||||||
|
index <- 1
|
||||||
|
|
||||||
|
for (gene_set_name in names(gene_sets)) {
|
||||||
|
gene_set <- gene_sets[[gene_set_name]]
|
||||||
|
|
||||||
|
plot <- plot |> plotly::add_markers(
|
||||||
|
data = gene_set_data[gene %chin% gene_set],
|
||||||
|
x = ~species,
|
||||||
|
y = ~distance,
|
||||||
|
name = gene_set_name,
|
||||||
|
text = ~ glue::glue(
|
||||||
|
"<b>{name}</b><br>",
|
||||||
|
"{round(distance / 1000000, digits = 2)} MBp"
|
||||||
|
),
|
||||||
|
hoverinfo = "text",
|
||||||
|
marker = list(
|
||||||
|
size = 10,
|
||||||
|
color = gene_set_color(index)
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if (length(gene_sets) > 0) {
|
index <- index + 1
|
||||||
# Include gene information which will be used for labeling
|
|
||||||
gene_set_data <- merge(
|
|
||||||
data[gene %chin% unlist(gene_sets)],
|
|
||||||
geposan::genes,
|
|
||||||
by.x = "gene",
|
|
||||||
by.y = "id"
|
|
||||||
)
|
|
||||||
|
|
||||||
index <- 1
|
|
||||||
|
|
||||||
for (gene_set_name in names(gene_sets)) {
|
|
||||||
gene_set <- gene_sets[[gene_set_name]]
|
|
||||||
|
|
||||||
plot <- plot |> plotly::add_markers(
|
|
||||||
data = gene_set_data[gene %chin% gene_set],
|
|
||||||
x = ~species,
|
|
||||||
y = ~distance,
|
|
||||||
name = gene_set_name,
|
|
||||||
text = ~ glue::glue(
|
|
||||||
"<b>{name}</b><br>",
|
|
||||||
"{round(distance / 1000000, digits = 2)} MBp"
|
|
||||||
),
|
|
||||||
hoverinfo = "text",
|
|
||||||
marker = list(
|
|
||||||
size = 10,
|
|
||||||
color = gene_set_color(index)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
index <- index + 1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
plot
|
plot
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -108,75 +108,75 @@ plot_positions <- function(species_ids, gene_sets) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
plot_rankings <- function(rankings, gene_sets) {
|
plot_rankings <- function(rankings, gene_sets) {
|
||||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||||
stop("Please install \"plotly\" to use this function.")
|
stop("Please install \"plotly\" to use this function.")
|
||||||
}
|
}
|
||||||
|
|
||||||
plot <- plotly::plot_ly() |>
|
plot <- plotly::plot_ly() |>
|
||||||
plotly::layout(
|
plotly::layout(
|
||||||
xaxis = list(tickvals = names(rankings)),
|
xaxis = list(tickvals = names(rankings)),
|
||||||
yaxis = list(title = "Score")
|
yaxis = list(title = "Score")
|
||||||
|
)
|
||||||
|
|
||||||
|
is_first <- TRUE
|
||||||
|
|
||||||
|
for (ranking_name in names(rankings)) {
|
||||||
|
ranking <- rankings[[ranking_name]]
|
||||||
|
|
||||||
|
plot <- plot |> plotly::add_trace(
|
||||||
|
data = ranking,
|
||||||
|
x = ranking_name,
|
||||||
|
y = ~score,
|
||||||
|
name = "All genes",
|
||||||
|
type = "violin",
|
||||||
|
spanmode = "hard",
|
||||||
|
points = FALSE,
|
||||||
|
showlegend = is_first,
|
||||||
|
hoverinfo = "skip",
|
||||||
|
line = list(color = base_color()),
|
||||||
|
fillcolor = base_color_transparent()
|
||||||
|
)
|
||||||
|
|
||||||
|
if (length(gene_sets) > 0) {
|
||||||
|
gene_set_data <- merge(
|
||||||
|
ranking[gene %chin% unlist(gene_sets)],
|
||||||
|
geposan::genes,
|
||||||
|
by.x = "gene",
|
||||||
|
by.y = "id"
|
||||||
|
)
|
||||||
|
|
||||||
|
index <- 1
|
||||||
|
|
||||||
|
for (gene_set_name in names(gene_sets)) {
|
||||||
|
gene_set <- gene_sets[[gene_set_name]]
|
||||||
|
|
||||||
|
plot <- plot |> plotly::add_markers(
|
||||||
|
data = gene_set_data[gene %chin% gene_set],
|
||||||
|
x = ranking_name,
|
||||||
|
y = ~score,
|
||||||
|
name = gene_set_name,
|
||||||
|
text = ~ glue::glue(
|
||||||
|
"<b>{name}</b><br>",
|
||||||
|
"Score: {round(score, digits = 2)}<br>",
|
||||||
|
"Rank: {rank}<br>",
|
||||||
|
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||||
|
),
|
||||||
|
hoverinfo = "text",
|
||||||
|
showlegend = is_first,
|
||||||
|
marker = list(
|
||||||
|
size = 10,
|
||||||
|
color = gene_set_color(index)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
is_first <- TRUE
|
index <- index + 1
|
||||||
|
}
|
||||||
for (ranking_name in names(rankings)) {
|
|
||||||
ranking <- rankings[[ranking_name]]
|
|
||||||
|
|
||||||
plot <- plot |> plotly::add_trace(
|
|
||||||
data = ranking,
|
|
||||||
x = ranking_name,
|
|
||||||
y = ~score,
|
|
||||||
name = "All genes",
|
|
||||||
type = "violin",
|
|
||||||
spanmode = "hard",
|
|
||||||
points = FALSE,
|
|
||||||
showlegend = is_first,
|
|
||||||
hoverinfo = "skip",
|
|
||||||
line = list(color = base_color()),
|
|
||||||
fillcolor = base_color_transparent()
|
|
||||||
)
|
|
||||||
|
|
||||||
if (length(gene_sets) > 0) {
|
|
||||||
gene_set_data <- merge(
|
|
||||||
ranking[gene %chin% unlist(gene_sets)],
|
|
||||||
geposan::genes,
|
|
||||||
by.x = "gene",
|
|
||||||
by.y = "id"
|
|
||||||
)
|
|
||||||
|
|
||||||
index <- 1
|
|
||||||
|
|
||||||
for (gene_set_name in names(gene_sets)) {
|
|
||||||
gene_set <- gene_sets[[gene_set_name]]
|
|
||||||
|
|
||||||
plot <- plot |> plotly::add_markers(
|
|
||||||
data = gene_set_data[gene %chin% gene_set],
|
|
||||||
x = ranking_name,
|
|
||||||
y = ~score,
|
|
||||||
name = gene_set_name,
|
|
||||||
text = ~ glue::glue(
|
|
||||||
"<b>{name}</b><br>",
|
|
||||||
"Score: {round(score, digits = 2)}<br>",
|
|
||||||
"Rank: {rank}<br>",
|
|
||||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
|
||||||
),
|
|
||||||
hoverinfo = "text",
|
|
||||||
showlegend = is_first,
|
|
||||||
marker = list(
|
|
||||||
size = 10,
|
|
||||||
color = gene_set_color(index)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
index <- index + 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
is_first <- FALSE
|
|
||||||
}
|
}
|
||||||
|
|
||||||
plot
|
is_first <- FALSE
|
||||||
|
}
|
||||||
|
|
||||||
|
plot
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -194,88 +194,88 @@ plot_rankings <- function(rankings, gene_sets) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
|
plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
|
||||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||||
stop("Please install \"plotly\" to use this function.")
|
stop("Please install \"plotly\" to use this function.")
|
||||||
}
|
}
|
||||||
|
|
||||||
# To speed up rendering, don't show every single gene.
|
# To speed up rendering, don't show every single gene.
|
||||||
n_ranks <- nrow(ranking)
|
n_ranks <- nrow(ranking)
|
||||||
sample_ranking <- ranking[seq(1, n_ranks, 5)]
|
sample_ranking <- ranking[seq(1, n_ranks, 5)]
|
||||||
|
|
||||||
plot <- plotly::plot_ly() |>
|
plot <- plotly::plot_ly() |>
|
||||||
plotly::add_lines(
|
plotly::add_lines(
|
||||||
data = sample_ranking,
|
data = sample_ranking,
|
||||||
x = ~percentile,
|
x = ~percentile,
|
||||||
y = ~score,
|
y = ~score,
|
||||||
name = "All genes",
|
name = "All genes",
|
||||||
hoverinfo = "skip",
|
hoverinfo = "skip",
|
||||||
line = list(width = 4, color = base_color())
|
line = list(width = 4, color = base_color())
|
||||||
) |>
|
) |>
|
||||||
plotly::layout(
|
plotly::layout(
|
||||||
xaxis = list(
|
xaxis = list(
|
||||||
title = "Percentile",
|
title = "Percentile",
|
||||||
tickformat = ".0%"
|
tickformat = ".0%"
|
||||||
),
|
),
|
||||||
yaxis = list(title = "Score")
|
yaxis = list(title = "Score")
|
||||||
|
)
|
||||||
|
|
||||||
|
if (length(gene_sets) > 0) {
|
||||||
|
# Include gene information which will be used for labeling
|
||||||
|
gene_set_data <- merge(
|
||||||
|
ranking[gene %chin% unlist(gene_sets)],
|
||||||
|
geposan::genes,
|
||||||
|
by.x = "gene",
|
||||||
|
by.y = "id"
|
||||||
|
)
|
||||||
|
|
||||||
|
index <- 1
|
||||||
|
|
||||||
|
for (gene_set_name in names(gene_sets)) {
|
||||||
|
gene_set <- gene_sets[[gene_set_name]]
|
||||||
|
|
||||||
|
plot <- plot |> plotly::add_markers(
|
||||||
|
data = gene_set_data[gene %chin% gene_set],
|
||||||
|
x = ~percentile,
|
||||||
|
y = ~score,
|
||||||
|
name = gene_set_name,
|
||||||
|
text = ~ glue::glue(
|
||||||
|
"<b>{name}</b><br>",
|
||||||
|
"Score: {round(score, digits = 2)}<br>",
|
||||||
|
"Rank: {rank}<br>",
|
||||||
|
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||||
|
),
|
||||||
|
hoverinfo = "text",
|
||||||
|
marker = list(
|
||||||
|
size = 10,
|
||||||
|
color = gene_set_color(index)
|
||||||
)
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if (length(gene_sets) > 0) {
|
index <- index + 1
|
||||||
# Include gene information which will be used for labeling
|
}
|
||||||
gene_set_data <- merge(
|
}
|
||||||
ranking[gene %chin% unlist(gene_sets)],
|
|
||||||
geposan::genes,
|
|
||||||
by.x = "gene",
|
if (!is.null(max_rank)) {
|
||||||
by.y = "id"
|
first_not_included_rank <- max_rank + 1
|
||||||
|
last_rank <- ranking[, .N]
|
||||||
|
|
||||||
|
if (first_not_included_rank <= last_rank) {
|
||||||
|
plot <- plot |> plotly::layout(
|
||||||
|
shapes = list(
|
||||||
|
type = "rect",
|
||||||
|
fillcolor = "black",
|
||||||
|
opacity = 0.1,
|
||||||
|
x0 = 1 - first_not_included_rank / n_ranks,
|
||||||
|
x1 = 1 - last_rank / n_ranks,
|
||||||
|
y0 = 0.0,
|
||||||
|
y1 = 1.0
|
||||||
)
|
)
|
||||||
|
)
|
||||||
index <- 1
|
|
||||||
|
|
||||||
for (gene_set_name in names(gene_sets)) {
|
|
||||||
gene_set <- gene_sets[[gene_set_name]]
|
|
||||||
|
|
||||||
plot <- plot |> plotly::add_markers(
|
|
||||||
data = gene_set_data[gene %chin% gene_set],
|
|
||||||
x = ~percentile,
|
|
||||||
y = ~score,
|
|
||||||
name = gene_set_name,
|
|
||||||
text = ~ glue::glue(
|
|
||||||
"<b>{name}</b><br>",
|
|
||||||
"Score: {round(score, digits = 2)}<br>",
|
|
||||||
"Rank: {rank}<br>",
|
|
||||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
|
||||||
),
|
|
||||||
hoverinfo = "text",
|
|
||||||
marker = list(
|
|
||||||
size = 10,
|
|
||||||
color = gene_set_color(index)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
index <- index + 1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
plot
|
||||||
if (!is.null(max_rank)) {
|
|
||||||
first_not_included_rank <- max_rank + 1
|
|
||||||
last_rank <- ranking[, .N]
|
|
||||||
|
|
||||||
if (first_not_included_rank <= last_rank) {
|
|
||||||
plot <- plot |> plotly::layout(
|
|
||||||
shapes = list(
|
|
||||||
type = "rect",
|
|
||||||
fillcolor = "black",
|
|
||||||
opacity = 0.1,
|
|
||||||
x0 = 1 - first_not_included_rank / n_ranks,
|
|
||||||
x1 = 1 - last_rank / n_ranks,
|
|
||||||
y0 = 0.0,
|
|
||||||
y1 = 1.0
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
plot
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Visualize a ranking by comparing gene sets in a boxplot.
|
#' Visualize a ranking by comparing gene sets in a boxplot.
|
||||||
|
|
@ -290,44 +290,44 @@ plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
plot_boxplot <- function(ranking, gene_sets = NULL) {
|
plot_boxplot <- function(ranking, gene_sets = NULL) {
|
||||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||||
stop("Please install \"plotly\" to use this function.")
|
stop("Please install \"plotly\" to use this function.")
|
||||||
|
}
|
||||||
|
|
||||||
|
plot <- plotly::plot_ly() |>
|
||||||
|
plotly::add_boxplot(
|
||||||
|
data = ranking,
|
||||||
|
x = "All genes",
|
||||||
|
y = ~score,
|
||||||
|
name = "All genes",
|
||||||
|
showlegend = FALSE,
|
||||||
|
line = list(color = base_color())
|
||||||
|
) |>
|
||||||
|
plotly::layout(
|
||||||
|
xaxis = list(tickvals = c("All genes", names(gene_sets))),
|
||||||
|
yaxis = list(title = "Score")
|
||||||
|
)
|
||||||
|
|
||||||
|
if (length(gene_sets) > 0) {
|
||||||
|
index <- 1
|
||||||
|
|
||||||
|
for (gene_set_name in names(gene_sets)) {
|
||||||
|
gene_set <- gene_sets[[gene_set_name]]
|
||||||
|
|
||||||
|
plot <- plot |> plotly::add_boxplot(
|
||||||
|
data = ranking[gene %chin% gene_set],
|
||||||
|
x = gene_set_name,
|
||||||
|
y = ~score,
|
||||||
|
name = gene_set_name,
|
||||||
|
showlegend = FALSE,
|
||||||
|
line = list(color = gene_set_color(index))
|
||||||
|
)
|
||||||
|
|
||||||
|
index <- index + 1
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
plot <- plotly::plot_ly() |>
|
plot
|
||||||
plotly::add_boxplot(
|
|
||||||
data = ranking,
|
|
||||||
x = "All genes",
|
|
||||||
y = ~score,
|
|
||||||
name = "All genes",
|
|
||||||
showlegend = FALSE,
|
|
||||||
line = list(color = base_color())
|
|
||||||
) |>
|
|
||||||
plotly::layout(
|
|
||||||
xaxis = list(tickvals = c("All genes", names(gene_sets))),
|
|
||||||
yaxis = list(title = "Score")
|
|
||||||
)
|
|
||||||
|
|
||||||
if (length(gene_sets) > 0) {
|
|
||||||
index <- 1
|
|
||||||
|
|
||||||
for (gene_set_name in names(gene_sets)) {
|
|
||||||
gene_set <- gene_sets[[gene_set_name]]
|
|
||||||
|
|
||||||
plot <- plot |> plotly::add_boxplot(
|
|
||||||
data = ranking[gene %chin% gene_set],
|
|
||||||
x = gene_set_name,
|
|
||||||
y = ~score,
|
|
||||||
name = gene_set_name,
|
|
||||||
showlegend = FALSE,
|
|
||||||
line = list(color = gene_set_color(index))
|
|
||||||
)
|
|
||||||
|
|
||||||
index <- index + 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
plot
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Show the distribution of scores across chromosomes.
|
#' Show the distribution of scores across chromosomes.
|
||||||
|
|
@ -340,46 +340,46 @@ plot_boxplot <- function(ranking, gene_sets = NULL) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
plot_chromosomes <- function(ranking) {
|
plot_chromosomes <- function(ranking) {
|
||||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||||
stop("Please install \"plotly\" to use this function.")
|
stop("Please install \"plotly\" to use this function.")
|
||||||
}
|
}
|
||||||
|
|
||||||
data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
|
data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
|
||||||
data <- data[, .(score = mean(score)), by = "chromosome"]
|
data <- data[, .(score = mean(score)), by = "chromosome"]
|
||||||
|
|
||||||
# Get an orderable integer from a chromosome name.
|
# Get an orderable integer from a chromosome name.
|
||||||
chromosome_index <- function(chromosome) {
|
chromosome_index <- function(chromosome) {
|
||||||
index <- suppressWarnings(as.integer(chromosome))
|
index <- suppressWarnings(as.integer(chromosome))
|
||||||
|
|
||||||
ifelse(
|
ifelse(
|
||||||
!is.na(index),
|
!is.na(index),
|
||||||
index,
|
index,
|
||||||
ifelse(
|
ifelse(
|
||||||
chromosome == "X",
|
chromosome == "X",
|
||||||
998,
|
998,
|
||||||
999
|
999
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
data[, index := chromosome_index(chromosome)]
|
data[, index := chromosome_index(chromosome)]
|
||||||
setorder(data, "index")
|
setorder(data, "index")
|
||||||
|
|
||||||
plotly::plot_ly(
|
plotly::plot_ly(
|
||||||
data = data,
|
data = data,
|
||||||
x = ~chromosome,
|
x = ~chromosome,
|
||||||
y = ~score,
|
y = ~score,
|
||||||
type = "bar",
|
type = "bar",
|
||||||
marker = list(color = base_color())
|
marker = list(color = base_color())
|
||||||
) |>
|
) |>
|
||||||
plotly::layout(
|
plotly::layout(
|
||||||
xaxis = list(
|
xaxis = list(
|
||||||
title = "Chromosome",
|
title = "Chromosome",
|
||||||
categoryorder = "array",
|
categoryorder = "array",
|
||||||
categoryarray = ~chromosome
|
categoryarray = ~chromosome
|
||||||
),
|
),
|
||||||
yaxis = list(title = "Mean score")
|
yaxis = list(title = "Mean score")
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Plot scores in relation to chromosomal position of genes.
|
#' Plot scores in relation to chromosomal position of genes.
|
||||||
|
|
@ -398,74 +398,74 @@ plot_chromosomes <- function(ranking) {
|
||||||
plot_scores_by_position <- function(ranking,
|
plot_scores_by_position <- function(ranking,
|
||||||
chromosome_name = NULL,
|
chromosome_name = NULL,
|
||||||
gene_sets = NULL) {
|
gene_sets = NULL) {
|
||||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||||
stop("Please install \"plotly\" to use this function.")
|
stop("Please install \"plotly\" to use this function.")
|
||||||
}
|
}
|
||||||
|
|
||||||
distance_data <- if (!is.null(chromosome_name)) {
|
distance_data <- if (!is.null(chromosome_name)) {
|
||||||
chromosome_name_ <- chromosome_name
|
chromosome_name_ <- chromosome_name
|
||||||
geposan::distances[
|
geposan::distances[
|
||||||
species == "hsapiens" &
|
species == "hsapiens" &
|
||||||
chromosome_name == chromosome_name_
|
chromosome_name == chromosome_name_
|
||||||
]
|
]
|
||||||
} else {
|
} else {
|
||||||
geposan::distances[species == "hsapiens"]
|
geposan::distances[species == "hsapiens"]
|
||||||
}
|
}
|
||||||
|
|
||||||
data <- merge(ranking, distance_data, by = "gene")
|
data <- merge(ranking, distance_data, by = "gene")
|
||||||
|
|
||||||
data <- merge(
|
data <- merge(
|
||||||
data,
|
data,
|
||||||
geposan::genes,
|
geposan::genes,
|
||||||
by.x = "gene",
|
by.x = "gene",
|
||||||
by.y = "id"
|
by.y = "id"
|
||||||
|
)
|
||||||
|
|
||||||
|
data[, `:=`(gene_set = "All genes", color = base_color())]
|
||||||
|
|
||||||
|
index <- 1
|
||||||
|
for (gene_set_name in names(gene_sets)) {
|
||||||
|
gene_set_genes <- gene_sets[[gene_set_name]]
|
||||||
|
data[
|
||||||
|
gene %chin% gene_set_genes,
|
||||||
|
`:=`(
|
||||||
|
gene_set = gene_set_name,
|
||||||
|
color = gene_set_color(index)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
index <- index + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# Use distances instead of positions in case all chromosomes are included.
|
||||||
|
if (is.null(chromosome_name)) {
|
||||||
|
data[, x := distance]
|
||||||
|
} else {
|
||||||
|
data[, x := start_position]
|
||||||
|
}
|
||||||
|
|
||||||
|
plotly::plot_ly() |>
|
||||||
|
plotly::add_markers(
|
||||||
|
data = data,
|
||||||
|
x = ~x,
|
||||||
|
y = ~score,
|
||||||
|
name = ~gene_set,
|
||||||
|
text = ~ glue::glue(
|
||||||
|
"<b>{name}</b><br>",
|
||||||
|
if (is.null(chromosome_name)) "Distance: " else "Position: ",
|
||||||
|
"{round(x / 1000000, digits = 2)} MBp<br>",
|
||||||
|
"Score: {round(score, digits = 2)}<br>",
|
||||||
|
"Rank: {rank}<br>",
|
||||||
|
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||||
|
),
|
||||||
|
hoverinfo = "text",
|
||||||
|
) |>
|
||||||
|
plotly::layout(
|
||||||
|
xaxis = list(title = if (is.null(chromosome_name)) {
|
||||||
|
"Distance (Bp)"
|
||||||
|
} else {
|
||||||
|
"Position (Bp)"
|
||||||
|
}),
|
||||||
|
yaxis = list(title = "Score")
|
||||||
)
|
)
|
||||||
|
|
||||||
data[, `:=`(gene_set = "All genes", color = base_color())]
|
|
||||||
|
|
||||||
index <- 1
|
|
||||||
for (gene_set_name in names(gene_sets)) {
|
|
||||||
gene_set_genes <- gene_sets[[gene_set_name]]
|
|
||||||
data[
|
|
||||||
gene %chin% gene_set_genes,
|
|
||||||
`:=`(
|
|
||||||
gene_set = gene_set_name,
|
|
||||||
color = gene_set_color(index)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
index <- index + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
# Use distances instead of positions in case all chromosomes are included.
|
|
||||||
if (is.null(chromosome_name)) {
|
|
||||||
data[, x := distance]
|
|
||||||
} else {
|
|
||||||
data[, x := start_position]
|
|
||||||
}
|
|
||||||
|
|
||||||
plotly::plot_ly() |>
|
|
||||||
plotly::add_markers(
|
|
||||||
data = data,
|
|
||||||
x = ~x,
|
|
||||||
y = ~score,
|
|
||||||
name = ~gene_set,
|
|
||||||
text = ~ glue::glue(
|
|
||||||
"<b>{name}</b><br>",
|
|
||||||
if (is.null(chromosome_name)) "Distance: " else "Position: ",
|
|
||||||
"{round(x / 1000000, digits = 2)} MBp<br>",
|
|
||||||
"Score: {round(score, digits = 2)}<br>",
|
|
||||||
"Rank: {rank}<br>",
|
|
||||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
|
||||||
),
|
|
||||||
hoverinfo = "text",
|
|
||||||
) |>
|
|
||||||
plotly::layout(
|
|
||||||
xaxis = list(title = if (is.null(chromosome_name)) {
|
|
||||||
"Distance (Bp)"
|
|
||||||
} else {
|
|
||||||
"Position (Bp)"
|
|
||||||
}),
|
|
||||||
yaxis = list(title = "Score")
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
116
R/preset.R
116
R/preset.R
|
|
@ -21,55 +21,55 @@ preset <- function(reference_gene_ids,
|
||||||
methods = all_methods(),
|
methods = all_methods(),
|
||||||
species_ids = geposan::species$id,
|
species_ids = geposan::species$id,
|
||||||
gene_ids = geposan::genes$id) {
|
gene_ids = geposan::genes$id) {
|
||||||
# Count included species per gene.
|
# Count included species per gene.
|
||||||
genes_n_species <- geposan::distances[
|
genes_n_species <- geposan::distances[
|
||||||
species %chin% species_ids,
|
species %chin% species_ids,
|
||||||
.(n_species = .N),
|
.(n_species = .N),
|
||||||
by = "gene"
|
by = "gene"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Filter out genes with less than 25% existing orthologs.
|
# Filter out genes with less than 25% existing orthologs.
|
||||||
gene_ids_filtered <- genes_n_species[
|
gene_ids_filtered <- genes_n_species[
|
||||||
gene %chin% gene_ids &
|
gene %chin% gene_ids &
|
||||||
n_species >= 0.25 * length(species_ids),
|
n_species >= 0.25 * length(species_ids),
|
||||||
gene
|
gene
|
||||||
]
|
]
|
||||||
|
|
||||||
reference_gene_ids_excluded <- reference_gene_ids[
|
reference_gene_ids_excluded <- reference_gene_ids[
|
||||||
!reference_gene_ids %chin% gene_ids_filtered
|
!reference_gene_ids %chin% gene_ids_filtered
|
||||||
]
|
]
|
||||||
|
|
||||||
if (length(reference_gene_ids_excluded > 0)) {
|
if (length(reference_gene_ids_excluded > 0)) {
|
||||||
warning(paste0(
|
warning(paste0(
|
||||||
"The following reference gene IDs are excluded from the preset ",
|
"The following reference gene IDs are excluded from the preset ",
|
||||||
"because they don't have enough data: ",
|
"because they don't have enough data: ",
|
||||||
paste(reference_gene_ids_excluded, collapse = ", ")
|
paste(reference_gene_ids_excluded, collapse = ", ")
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
reference_gene_ids_included <- reference_gene_ids[
|
reference_gene_ids_included <- reference_gene_ids[
|
||||||
reference_gene_ids %chin% gene_ids_filtered
|
reference_gene_ids %chin% gene_ids_filtered
|
||||||
]
|
]
|
||||||
|
|
||||||
if (length(reference_gene_ids_included) < 1) {
|
if (length(reference_gene_ids_included) < 1) {
|
||||||
stop(paste0(
|
stop(paste0(
|
||||||
"There has to be at least one reference gene for the preset to be ",
|
"There has to be at least one reference gene for the preset to be ",
|
||||||
"valid. Please note that some methods may require more reference ",
|
"valid. Please note that some methods may require more reference ",
|
||||||
"genes."
|
"genes."
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
# The included data gets sorted to be able to produce predictable hashes
|
# The included data gets sorted to be able to produce predictable hashes
|
||||||
# for the object later.
|
# for the object later.
|
||||||
structure(
|
structure(
|
||||||
list(
|
list(
|
||||||
reference_gene_ids = sort(reference_gene_ids_included),
|
reference_gene_ids = sort(reference_gene_ids_included),
|
||||||
methods = methods,
|
methods = methods,
|
||||||
species_ids = sort(species_ids),
|
species_ids = sort(species_ids),
|
||||||
gene_ids = sort(gene_ids_filtered)
|
gene_ids = sort(gene_ids_filtered)
|
||||||
),
|
),
|
||||||
class = "geposan_preset"
|
class = "geposan_preset"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' S3 method to print a preset object.
|
#' S3 method to print a preset object.
|
||||||
|
|
@ -81,20 +81,20 @@ preset <- function(reference_gene_ids,
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
print.geposan_preset <- function(x, ...) {
|
print.geposan_preset <- function(x, ...) {
|
||||||
cat(sprintf(
|
cat(sprintf(
|
||||||
paste0(
|
paste0(
|
||||||
"geposan preset:",
|
"geposan preset:",
|
||||||
"\n Reference genes: %i",
|
"\n Reference genes: %i",
|
||||||
"\n Included methods: %s",
|
"\n Included methods: %s",
|
||||||
"\n Number of species: %i",
|
"\n Number of species: %i",
|
||||||
"\n Number of genes: %i",
|
"\n Number of genes: %i",
|
||||||
"\n"
|
"\n"
|
||||||
),
|
),
|
||||||
length(x$reference_gene_ids),
|
length(x$reference_gene_ids),
|
||||||
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
|
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
|
||||||
length(x$species_ids),
|
length(x$species_ids),
|
||||||
length(x$gene_ids)
|
length(x$gene_ids)
|
||||||
))
|
))
|
||||||
|
|
||||||
invisible(x)
|
invisible(x)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
112
R/ranking.R
112
R/ranking.R
|
|
@ -13,35 +13,35 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
ranking <- function(analysis, weights) {
|
ranking <- function(analysis, weights) {
|
||||||
ranking <- if (inherits(analysis, "geposan_analysis")) {
|
ranking <- if (inherits(analysis, "geposan_analysis")) {
|
||||||
copy(analysis$scores)
|
copy(analysis$scores)
|
||||||
} else if (inherits(analysis, "geposan_ranking")) {
|
} else if (inherits(analysis, "geposan_ranking")) {
|
||||||
copy(analysis)
|
copy(analysis)
|
||||||
} else {
|
} else {
|
||||||
stop("Invalid analyis. Use geposan::analyze().")
|
stop("Invalid analyis. Use geposan::analyze().")
|
||||||
}
|
}
|
||||||
|
|
||||||
ranking[, score := 0.0]
|
ranking[, score := 0.0]
|
||||||
|
|
||||||
for (method in names(weights)) {
|
for (method in names(weights)) {
|
||||||
weighted <- weights[[method]] * ranking[, ..method]
|
weighted <- weights[[method]] * ranking[, ..method]
|
||||||
ranking[, score := score + weighted]
|
ranking[, score := score + weighted]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Normalize scores to be between 0.0 and 1.0.
|
# Normalize scores to be between 0.0 and 1.0.
|
||||||
min_score <- ranking[, min(score)]
|
min_score <- ranking[, min(score)]
|
||||||
max_score <- ranking[, max(score)]
|
max_score <- ranking[, max(score)]
|
||||||
score_range <- max_score - min_score
|
score_range <- max_score - min_score
|
||||||
ranking[, score := (score - min_score) / score_range]
|
ranking[, score := (score - min_score) / score_range]
|
||||||
|
|
||||||
setorder(ranking, -score)
|
setorder(ranking, -score)
|
||||||
ranking[, rank := .I]
|
ranking[, rank := .I]
|
||||||
ranking[, percentile := 1 - rank / nrow(ranking)]
|
ranking[, percentile := 1 - rank / nrow(ranking)]
|
||||||
|
|
||||||
structure(
|
structure(
|
||||||
ranking,
|
ranking,
|
||||||
class = c("geposan_ranking", class(ranking))
|
class = c("geposan_ranking", class(ranking))
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Find the best weights to rank the results.
|
#' Find the best weights to rank the results.
|
||||||
|
|
@ -61,40 +61,40 @@ ranking <- function(analysis, weights) {
|
||||||
#' @export
|
#' @export
|
||||||
optimal_weights <- function(analysis, methods, reference_gene_ids,
|
optimal_weights <- function(analysis, methods, reference_gene_ids,
|
||||||
target = "mean") {
|
target = "mean") {
|
||||||
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
|
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
|
||||||
stop("Invalid analyis. Use geposan::analyze().")
|
stop("Invalid analyis. Use geposan::analyze().")
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Compute the target rank of the reference genes when applying the
|
||||||
|
# weights.
|
||||||
|
target_rank <- function(factors) {
|
||||||
|
data <- ranking(analysis, as.list(factors))
|
||||||
|
|
||||||
|
result <- data[
|
||||||
|
gene %chin% reference_gene_ids,
|
||||||
|
if (target == "min") {
|
||||||
|
min(rank)
|
||||||
|
} else if (target == "max") {
|
||||||
|
max(rank)
|
||||||
|
} else if (target == "mean") {
|
||||||
|
mean(rank)
|
||||||
|
} else {
|
||||||
|
stats::median(rank)
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
if (result > 0) {
|
||||||
|
result
|
||||||
|
} else {
|
||||||
|
Inf
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
initial_factors <- rep(1.0, length(methods))
|
||||||
|
names(initial_factors) <- methods
|
||||||
|
|
||||||
# Compute the target rank of the reference genes when applying the
|
optimal_factors <- stats::optim(initial_factors, target_rank)$par
|
||||||
# weights.
|
|
||||||
target_rank <- function(factors) {
|
|
||||||
data <- ranking(analysis, as.list(factors))
|
|
||||||
|
|
||||||
result <- data[
|
as.list(optimal_factors / max(abs(optimal_factors)))
|
||||||
gene %chin% reference_gene_ids,
|
|
||||||
if (target == "min") {
|
|
||||||
min(rank)
|
|
||||||
} else if (target == "max") {
|
|
||||||
max(rank)
|
|
||||||
} else if (target == "mean") {
|
|
||||||
mean(rank)
|
|
||||||
} else {
|
|
||||||
stats::median(rank)
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
if (result > 0) {
|
|
||||||
result
|
|
||||||
} else {
|
|
||||||
Inf
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
initial_factors <- rep(1.0, length(methods))
|
|
||||||
names(initial_factors) <- methods
|
|
||||||
|
|
||||||
optimal_factors <- stats::optim(initial_factors, target_rank)$par
|
|
||||||
|
|
||||||
as.list(optimal_factors / max(abs(optimal_factors)))
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
48
R/result.R
48
R/result.R
|
|
@ -10,18 +10,18 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
result <- function(method_id, scores, details = list()) {
|
result <- function(method_id, scores, details = list()) {
|
||||||
stopifnot(is.data.frame(scores) &
|
stopifnot(is.data.frame(scores) &
|
||||||
c("gene", "score") %chin% colnames(scores))
|
c("gene", "score") %chin% colnames(scores))
|
||||||
stopifnot(is.list(details))
|
stopifnot(is.list(details))
|
||||||
|
|
||||||
structure(
|
structure(
|
||||||
list(
|
list(
|
||||||
method_id = method_id,
|
method_id = method_id,
|
||||||
scores = scores,
|
scores = scores,
|
||||||
details = details
|
details = details
|
||||||
),
|
),
|
||||||
class = "geposan_result"
|
class = "geposan_result"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Print a result object.
|
#' Print a result object.
|
||||||
|
|
@ -33,18 +33,18 @@ result <- function(method_id, scores, details = list()) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
print.geposan_result <- function(x, ...) {
|
print.geposan_result <- function(x, ...) {
|
||||||
cat(sprintf(
|
cat(sprintf(
|
||||||
paste0(
|
paste0(
|
||||||
"geposan result:",
|
"geposan result:",
|
||||||
"\n Method: %s",
|
"\n Method: %s",
|
||||||
"\n Number of genes: %i",
|
"\n Number of genes: %i",
|
||||||
"\n Available details: %s",
|
"\n Available details: %s",
|
||||||
"\n"
|
"\n"
|
||||||
),
|
),
|
||||||
x$method_id,
|
x$method_id,
|
||||||
nrow(x$scores),
|
nrow(x$scores),
|
||||||
paste(names(x$details), collapse = ", ")
|
paste(names(x$details), collapse = ", ")
|
||||||
))
|
))
|
||||||
|
|
||||||
invisible(x)
|
invisible(x)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
30
R/utils.R
30
R/utils.R
|
|
@ -8,25 +8,25 @@
|
||||||
# @param objects A vector of objects that this expression depends on. The hash
|
# @param objects A vector of objects that this expression depends on. The hash
|
||||||
# of those objects will be used for identifying the cache file.
|
# of those objects will be used for identifying the cache file.
|
||||||
cached <- function(name, objects, expr) {
|
cached <- function(name, objects, expr) {
|
||||||
if (!dir.exists("cache")) {
|
if (!dir.exists("cache")) {
|
||||||
dir.create("cache")
|
dir.create("cache")
|
||||||
}
|
}
|
||||||
|
|
||||||
id <- rlang::hash(objects)
|
id <- rlang::hash(objects)
|
||||||
cache_file <- sprintf("cache/%s_%s.rda", name, id)
|
cache_file <- sprintf("cache/%s_%s.rda", name, id)
|
||||||
|
|
||||||
if (file.exists(cache_file)) {
|
if (file.exists(cache_file)) {
|
||||||
# If the cache file exists, we restore the data from it.
|
# If the cache file exists, we restore the data from it.
|
||||||
load(cache_file)
|
load(cache_file)
|
||||||
} else {
|
} else {
|
||||||
# If the cache file doesn't exist, we have to do the computation.
|
# If the cache file doesn't exist, we have to do the computation.
|
||||||
data <- expr
|
data <- expr
|
||||||
|
|
||||||
# The results are cached for the next run.
|
# The results are cached for the next run.
|
||||||
save(data, file = cache_file, compress = "xz")
|
save(data, file = cache_file, compress = "xz")
|
||||||
}
|
}
|
||||||
|
|
||||||
data
|
data
|
||||||
}
|
}
|
||||||
|
|
||||||
# This is needed to make data.table's symbols available within the package.
|
# This is needed to make data.table's symbols available within the package.
|
||||||
|
|
|
||||||
132
R/validate.R
132
R/validate.R
|
|
@ -27,68 +27,68 @@
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
|
validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
|
||||||
if (!inherits(ranking, "geposan_ranking")) {
|
if (!inherits(ranking, "geposan_ranking")) {
|
||||||
stop("Ranking is invalid. Use geposan::ranking().")
|
stop("Ranking is invalid. Use geposan::ranking().")
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is.null(progress)) {
|
if (is.null(progress)) {
|
||||||
progress_bar <- progress::progress_bar$new()
|
progress_bar <- progress::progress_bar$new()
|
||||||
progress_bar$update(0.0)
|
progress_bar$update(0.0)
|
||||||
|
|
||||||
progress <- function(progress_value) {
|
progress <- function(progress_value) {
|
||||||
if (!progress_bar$finished) {
|
if (!progress_bar$finished) {
|
||||||
progress_bar$update(progress_value)
|
progress_bar$update(progress_value)
|
||||||
if (progress_value >= 1.0) {
|
if (progress_value >= 1.0) {
|
||||||
progress_bar$terminate()
|
progress_bar$terminate()
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
progress_state <- 0.0
|
progress_state <- 0.0
|
||||||
progress_step <- 1.0 / length(reference_gene_ids)
|
progress_step <- 1.0 / length(reference_gene_ids)
|
||||||
|
|
||||||
results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
|
results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
|
||||||
|
|
||||||
for (gene_id in reference_gene_ids) {
|
for (gene_id in reference_gene_ids) {
|
||||||
included_gene_ids <- reference_gene_ids[
|
included_gene_ids <- reference_gene_ids[
|
||||||
reference_gene_ids != gene_id
|
reference_gene_ids != gene_id
|
||||||
]
|
]
|
||||||
|
|
||||||
weights <- optimal_weights(
|
weights <- optimal_weights(
|
||||||
ranking,
|
ranking,
|
||||||
method_ids,
|
method_ids,
|
||||||
included_gene_ids
|
included_gene_ids
|
||||||
)
|
|
||||||
|
|
||||||
ranking_validation <- ranking(ranking, weights)
|
|
||||||
|
|
||||||
results[
|
|
||||||
gene == gene_id,
|
|
||||||
percentile_validation := ranking_validation[
|
|
||||||
gene == gene_id,
|
|
||||||
percentile
|
|
||||||
]
|
|
||||||
]
|
|
||||||
|
|
||||||
if (!is.null(progress)) {
|
|
||||||
progress_state <- progress_state + progress_step
|
|
||||||
progress(progress_state)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
results[, error := percentile - percentile_validation]
|
|
||||||
setorder(results, error)
|
|
||||||
|
|
||||||
structure(
|
|
||||||
list(
|
|
||||||
validation = results,
|
|
||||||
mean_percentile_original = results[, mean(percentile)],
|
|
||||||
mean_percentile_validation = results[, mean(percentile_validation)],
|
|
||||||
mean_error = results[, mean(error)]
|
|
||||||
),
|
|
||||||
class = "geposan_validation"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
ranking_validation <- ranking(ranking, weights)
|
||||||
|
|
||||||
|
results[
|
||||||
|
gene == gene_id,
|
||||||
|
percentile_validation := ranking_validation[
|
||||||
|
gene == gene_id,
|
||||||
|
percentile
|
||||||
|
]
|
||||||
|
]
|
||||||
|
|
||||||
|
if (!is.null(progress)) {
|
||||||
|
progress_state <- progress_state + progress_step
|
||||||
|
progress(progress_state)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
results[, error := percentile - percentile_validation]
|
||||||
|
setorder(results, error)
|
||||||
|
|
||||||
|
structure(
|
||||||
|
list(
|
||||||
|
validation = results,
|
||||||
|
mean_percentile_original = results[, mean(percentile)],
|
||||||
|
mean_percentile_validation = results[, mean(percentile_validation)],
|
||||||
|
mean_error = results[, mean(error)]
|
||||||
|
),
|
||||||
|
class = "geposan_validation"
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' S3 method to print a validation object.
|
#' S3 method to print a validation object.
|
||||||
|
|
@ -100,18 +100,18 @@ validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
print.geposan_validation <- function(x, ...) {
|
print.geposan_validation <- function(x, ...) {
|
||||||
cat(sprintf(
|
cat(sprintf(
|
||||||
paste0(
|
paste0(
|
||||||
"geposan validation:",
|
"geposan validation:",
|
||||||
"\n Mean percentile original: %.1f%%",
|
"\n Mean percentile original: %.1f%%",
|
||||||
"\n Mean percentile validation: %.1f%%",
|
"\n Mean percentile validation: %.1f%%",
|
||||||
"\n Mean error: %.1f percent points",
|
"\n Mean error: %.1f percent points",
|
||||||
"\n"
|
"\n"
|
||||||
),
|
),
|
||||||
x$mean_percentile_original * 100,
|
x$mean_percentile_original * 100,
|
||||||
x$mean_percentile_validation * 100,
|
x$mean_percentile_validation * 100,
|
||||||
x$mean_error * 100
|
x$mean_error * 100
|
||||||
))
|
))
|
||||||
|
|
||||||
invisible(x)
|
invisible(x)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,23 +5,23 @@ ensembl_api_url <- "https://rest.ensembl.org"
|
||||||
|
|
||||||
#' Perform a request to the Ensembl REST API.
|
#' Perform a request to the Ensembl REST API.
|
||||||
ensembl_request <- function(api_path) {
|
ensembl_request <- function(api_path) {
|
||||||
content(stop_for_status(GET(
|
content(stop_for_status(GET(
|
||||||
paste0(ensembl_api_url, api_path),
|
paste0(ensembl_api_url, api_path),
|
||||||
content_type_json()
|
content_type_json()
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Get IDs of all available vertebrates.
|
#' Get IDs of all available vertebrates.
|
||||||
get_species_ids <- function() {
|
get_species_ids <- function() {
|
||||||
species <- ensembl_request("/info/species")$species
|
species <- ensembl_request("/info/species")$species
|
||||||
sapply(species, function(species) species$name)
|
sapply(species, function(species) species$name)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Get all chromosomes names for a species.
|
#' Get all chromosomes names for a species.
|
||||||
get_species_chromosomes <- function(species_id) {
|
get_species_chromosomes <- function(species_id) {
|
||||||
chromosomes <- unlist(ensembl_request(
|
chromosomes <- unlist(ensembl_request(
|
||||||
paste0("/info/assembly/", species_id)
|
paste0("/info/assembly/", species_id)
|
||||||
)$karyotype)
|
)$karyotype)
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Get a vector of all available unqiue chromosome names.
|
#' Get a vector of all available unqiue chromosome names.
|
||||||
|
|
@ -29,6 +29,6 @@ get_species_chromosomes <- function(species_id) {
|
||||||
#' There are multiple names for mitochondrial DNA which have to be removed
|
#' There are multiple names for mitochondrial DNA which have to be removed
|
||||||
#' manually, unfortunately.
|
#' manually, unfortunately.
|
||||||
get_all_chromosomes <- function() {
|
get_all_chromosomes <- function() {
|
||||||
chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
|
chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
|
||||||
unique(unlist(chromosomes))
|
unique(unlist(chromosomes))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -12,8 +12,8 @@ ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
|
||||||
|
|
||||||
# Filter out species ID and name from the result.
|
# Filter out species ID and name from the result.
|
||||||
species <- ensembl_datasets[, .(
|
species <- ensembl_datasets[, .(
|
||||||
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
|
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
|
||||||
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
|
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
|
||||||
)]
|
)]
|
||||||
|
|
||||||
# List of assemblies that the Ensembl Rest API advertises as chromosomes.
|
# List of assemblies that the Ensembl Rest API advertises as chromosomes.
|
||||||
|
|
@ -23,232 +23,232 @@ species <- ensembl_datasets[, .(
|
||||||
#
|
#
|
||||||
# See get_all_chromosomes()
|
# See get_all_chromosomes()
|
||||||
valid_chromosome_names <- c(
|
valid_chromosome_names <- c(
|
||||||
"1",
|
"1",
|
||||||
"2",
|
"2",
|
||||||
"3",
|
"3",
|
||||||
"4",
|
"4",
|
||||||
"5",
|
"5",
|
||||||
"6",
|
"6",
|
||||||
"7",
|
"7",
|
||||||
"8",
|
"8",
|
||||||
"9",
|
"9",
|
||||||
"10",
|
"10",
|
||||||
"11",
|
"11",
|
||||||
"12",
|
"12",
|
||||||
"13",
|
"13",
|
||||||
"14",
|
"14",
|
||||||
"15",
|
"15",
|
||||||
"16",
|
"16",
|
||||||
"17",
|
"17",
|
||||||
"18",
|
"18",
|
||||||
"19",
|
"19",
|
||||||
"20",
|
"20",
|
||||||
"X",
|
"X",
|
||||||
"Y",
|
"Y",
|
||||||
"21",
|
"21",
|
||||||
"4A",
|
"4A",
|
||||||
"1A",
|
"1A",
|
||||||
"22",
|
"22",
|
||||||
"23",
|
"23",
|
||||||
"24",
|
"24",
|
||||||
"25LG1",
|
"25LG1",
|
||||||
"25LG2",
|
"25LG2",
|
||||||
"26",
|
"26",
|
||||||
"27",
|
"27",
|
||||||
"28",
|
"28",
|
||||||
"LGE22",
|
"LGE22",
|
||||||
"Z",
|
"Z",
|
||||||
"25",
|
"25",
|
||||||
"29",
|
"29",
|
||||||
"A1",
|
"A1",
|
||||||
"A2",
|
"A2",
|
||||||
"A3",
|
"A3",
|
||||||
"B1",
|
"B1",
|
||||||
"B2",
|
"B2",
|
||||||
"B3",
|
"B3",
|
||||||
"B4",
|
"B4",
|
||||||
"C1",
|
"C1",
|
||||||
"C2",
|
"C2",
|
||||||
"D1",
|
"D1",
|
||||||
"D2",
|
"D2",
|
||||||
"D3",
|
"D3",
|
||||||
"D4",
|
"D4",
|
||||||
"E1",
|
"E1",
|
||||||
"E2",
|
"E2",
|
||||||
"E3",
|
"E3",
|
||||||
"F1",
|
"F1",
|
||||||
"F2",
|
"F2",
|
||||||
"2A",
|
"2A",
|
||||||
"2B",
|
"2B",
|
||||||
"LG01",
|
"LG01",
|
||||||
"LG02",
|
"LG02",
|
||||||
"LG03",
|
"LG03",
|
||||||
"LG04",
|
"LG04",
|
||||||
"LG05",
|
"LG05",
|
||||||
"LG06",
|
"LG06",
|
||||||
"LG07",
|
"LG07",
|
||||||
"LG08",
|
"LG08",
|
||||||
"LG09",
|
"LG09",
|
||||||
"LG10",
|
"LG10",
|
||||||
"LG11",
|
"LG11",
|
||||||
"LG12",
|
"LG12",
|
||||||
"LG13",
|
"LG13",
|
||||||
"LG14",
|
"LG14",
|
||||||
"LG15",
|
"LG15",
|
||||||
"LG16",
|
"LG16",
|
||||||
"LG17",
|
"LG17",
|
||||||
"LG18",
|
"LG18",
|
||||||
"LG19",
|
"LG19",
|
||||||
"LG20",
|
"LG20",
|
||||||
"LG21",
|
"LG21",
|
||||||
"LG22",
|
"LG22",
|
||||||
"LG23",
|
"LG23",
|
||||||
"LG24",
|
"LG24",
|
||||||
"LG25",
|
"LG25",
|
||||||
"I",
|
"I",
|
||||||
"II",
|
"II",
|
||||||
"III",
|
"III",
|
||||||
"IV",
|
"IV",
|
||||||
"V",
|
"V",
|
||||||
"VI",
|
"VI",
|
||||||
"VII",
|
"VII",
|
||||||
"VIII",
|
"VIII",
|
||||||
"IX",
|
"IX",
|
||||||
"XI",
|
"XI",
|
||||||
"XII",
|
"XII",
|
||||||
"XIII",
|
"XIII",
|
||||||
"XIV",
|
"XIV",
|
||||||
"XV",
|
"XV",
|
||||||
"XVI",
|
"XVI",
|
||||||
"XVII",
|
"XVII",
|
||||||
"XVIII",
|
"XVIII",
|
||||||
"XIX",
|
"XIX",
|
||||||
"XX",
|
"XX",
|
||||||
"XXI",
|
"XXI",
|
||||||
"XXII",
|
"XXII",
|
||||||
"XXIII",
|
"XXIII",
|
||||||
"XXIV",
|
"XXIV",
|
||||||
"W",
|
"W",
|
||||||
"30",
|
"30",
|
||||||
"31",
|
"31",
|
||||||
"32",
|
"32",
|
||||||
"33",
|
"33",
|
||||||
"34",
|
"34",
|
||||||
"35",
|
"35",
|
||||||
"36",
|
"36",
|
||||||
"37",
|
"37",
|
||||||
"38",
|
"38",
|
||||||
"39",
|
"39",
|
||||||
"40",
|
"40",
|
||||||
"2L",
|
"2L",
|
||||||
"2R",
|
"2R",
|
||||||
"3L",
|
"3L",
|
||||||
"3R",
|
"3R",
|
||||||
"LG1",
|
"LG1",
|
||||||
"LG2",
|
"LG2",
|
||||||
"LG3",
|
"LG3",
|
||||||
"LG4",
|
"LG4",
|
||||||
"LG5",
|
"LG5",
|
||||||
"LG6",
|
"LG6",
|
||||||
"LG7",
|
"LG7",
|
||||||
"LG8",
|
"LG8",
|
||||||
"LG9",
|
"LG9",
|
||||||
"LG26",
|
"LG26",
|
||||||
"LG27",
|
"LG27",
|
||||||
"LG28",
|
"LG28",
|
||||||
"LG29",
|
"LG29",
|
||||||
"LG30",
|
"LG30",
|
||||||
"1a",
|
"1a",
|
||||||
"7b",
|
"7b",
|
||||||
"22a",
|
"22a",
|
||||||
"LGE22C19W28_E50C23",
|
"LGE22C19W28_E50C23",
|
||||||
"LGE64",
|
"LGE64",
|
||||||
"7a",
|
"7a",
|
||||||
"MIC_1",
|
"MIC_1",
|
||||||
"MIC_10",
|
"MIC_10",
|
||||||
"MIC_11",
|
"MIC_11",
|
||||||
"MIC_2",
|
"MIC_2",
|
||||||
"MIC_3",
|
"MIC_3",
|
||||||
"MIC_4",
|
"MIC_4",
|
||||||
"MIC_5",
|
"MIC_5",
|
||||||
"MIC_6",
|
"MIC_6",
|
||||||
"MIC_7",
|
"MIC_7",
|
||||||
"MIC_8",
|
"MIC_8",
|
||||||
"MIC_9",
|
"MIC_9",
|
||||||
"sgr01",
|
"sgr01",
|
||||||
"sgr02",
|
"sgr02",
|
||||||
"sgr03",
|
"sgr03",
|
||||||
"sgr04",
|
"sgr04",
|
||||||
"sgr05",
|
"sgr05",
|
||||||
"sgr06",
|
"sgr06",
|
||||||
"sgr07",
|
"sgr07",
|
||||||
"sgr08",
|
"sgr08",
|
||||||
"sgr09",
|
"sgr09",
|
||||||
"sgr10",
|
"sgr10",
|
||||||
"sgr11",
|
"sgr11",
|
||||||
"sgr12",
|
"sgr12",
|
||||||
"sgr13",
|
"sgr13",
|
||||||
"sgr14",
|
"sgr14",
|
||||||
"sgr15",
|
"sgr15",
|
||||||
"sgr16",
|
"sgr16",
|
||||||
"sgr17",
|
"sgr17",
|
||||||
"sgr18",
|
"sgr18",
|
||||||
"sgr19",
|
"sgr19",
|
||||||
"X1",
|
"X1",
|
||||||
"X2",
|
"X2",
|
||||||
"X3",
|
"X3",
|
||||||
"X4",
|
"X4",
|
||||||
"X5",
|
"X5",
|
||||||
"a",
|
"a",
|
||||||
"b",
|
"b",
|
||||||
"c",
|
"c",
|
||||||
"d",
|
"d",
|
||||||
"f",
|
"f",
|
||||||
"g",
|
"g",
|
||||||
"h",
|
"h",
|
||||||
"41",
|
"41",
|
||||||
"42",
|
"42",
|
||||||
"43",
|
"43",
|
||||||
"44",
|
"44",
|
||||||
"45",
|
"45",
|
||||||
"46",
|
"46",
|
||||||
"47",
|
"47",
|
||||||
"48",
|
"48",
|
||||||
"49",
|
"49",
|
||||||
"50",
|
"50",
|
||||||
"LG28B",
|
"LG28B",
|
||||||
"LG30F",
|
"LG30F",
|
||||||
"LG36F",
|
"LG36F",
|
||||||
"LG37M",
|
"LG37M",
|
||||||
"LG42F",
|
"LG42F",
|
||||||
"LG44F",
|
"LG44F",
|
||||||
"LG45M",
|
"LG45M",
|
||||||
"LG48F",
|
"LG48F",
|
||||||
"LG49B",
|
"LG49B",
|
||||||
"LG34",
|
"LG34",
|
||||||
"LG35",
|
"LG35",
|
||||||
"LG7_11",
|
"LG7_11",
|
||||||
"groupI",
|
"groupI",
|
||||||
"groupII",
|
"groupII",
|
||||||
"groupIII",
|
"groupIII",
|
||||||
"groupIV",
|
"groupIV",
|
||||||
"groupV",
|
"groupV",
|
||||||
"groupVI",
|
"groupVI",
|
||||||
"groupVII",
|
"groupVII",
|
||||||
"groupVIII",
|
"groupVIII",
|
||||||
"groupIX",
|
"groupIX",
|
||||||
"groupX",
|
"groupX",
|
||||||
"groupXI",
|
"groupXI",
|
||||||
"groupXII",
|
"groupXII",
|
||||||
"groupXIII",
|
"groupXIII",
|
||||||
"groupXIV",
|
"groupXIV",
|
||||||
"groupXV",
|
"groupXV",
|
||||||
"groupXVI",
|
"groupXVI",
|
||||||
"groupXVII",
|
"groupXVII",
|
||||||
"groupXVIII",
|
"groupXVIII",
|
||||||
"groupXIX",
|
"groupXIX",
|
||||||
"groupXX",
|
"groupXX",
|
||||||
"groupXXI"
|
"groupXXI"
|
||||||
)
|
)
|
||||||
|
|
||||||
#' Get all chromosome names for an Ensembl dataset.
|
#' Get all chromosome names for an Ensembl dataset.
|
||||||
|
|
@ -256,8 +256,8 @@ valid_chromosome_names <- c(
|
||||||
#' The function tries to filter out valid chromosome names from the available
|
#' The function tries to filter out valid chromosome names from the available
|
||||||
#' assemblies in the dataset.
|
#' assemblies in the dataset.
|
||||||
get_chromosome_names <- function(dataset) {
|
get_chromosome_names <- function(dataset) {
|
||||||
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
|
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
|
||||||
chromosome_names[chromosome_names %chin% valid_chromosome_names]
|
chromosome_names[chromosome_names %chin% valid_chromosome_names]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Retrieve information on human genes. This will only include genes on
|
# Retrieve information on human genes. This will only include genes on
|
||||||
|
|
@ -267,16 +267,16 @@ rlog::log_info("Retrieving information on human genes")
|
||||||
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
||||||
|
|
||||||
human_data <- data.table(biomaRt::getBM(
|
human_data <- data.table(biomaRt::getBM(
|
||||||
attributes = c(
|
attributes = c(
|
||||||
"ensembl_gene_id",
|
"ensembl_gene_id",
|
||||||
"hgnc_symbol",
|
"hgnc_symbol",
|
||||||
"chromosome_name",
|
"chromosome_name",
|
||||||
"start_position",
|
"start_position",
|
||||||
"end_position"
|
"end_position"
|
||||||
),
|
),
|
||||||
filters = "chromosome_name",
|
filters = "chromosome_name",
|
||||||
values = get_chromosome_names(dataset),
|
values = get_chromosome_names(dataset),
|
||||||
mart = dataset
|
mart = dataset
|
||||||
))
|
))
|
||||||
|
|
||||||
# Remove duplicated gene IDs (at the time of writing, there are a handful).
|
# Remove duplicated gene IDs (at the time of writing, there are a handful).
|
||||||
|
|
@ -284,9 +284,9 @@ human_data <- unique(human_data, by = "ensembl_gene_id")
|
||||||
|
|
||||||
# Only keep relevant information on genes.
|
# Only keep relevant information on genes.
|
||||||
genes <- human_data[, .(
|
genes <- human_data[, .(
|
||||||
id = ensembl_gene_id,
|
id = ensembl_gene_id,
|
||||||
name = hgnc_symbol,
|
name = hgnc_symbol,
|
||||||
chromosome = chromosome_name
|
chromosome = chromosome_name
|
||||||
)]
|
)]
|
||||||
|
|
||||||
# Retrieve gene distance data across species.
|
# Retrieve gene distance data across species.
|
||||||
|
|
@ -296,39 +296,39 @@ distances <- data.table()
|
||||||
|
|
||||||
#' Handle data for one species.
|
#' Handle data for one species.
|
||||||
handle_species <- function(species_id, species_data) {
|
handle_species <- function(species_id, species_data) {
|
||||||
chromosomes <- species_data[,
|
chromosomes <- species_data[,
|
||||||
.(chromosome_length = max(end_position)),
|
.(chromosome_length = max(end_position)),
|
||||||
by = chromosome_name
|
by = chromosome_name
|
||||||
]
|
]
|
||||||
|
|
||||||
# Store the number of chromosomes in the species table.
|
# Store the number of chromosomes in the species table.
|
||||||
species[id == species_id, n_chromosomes := nrow(chromosomes)]
|
species[id == species_id, n_chromosomes := nrow(chromosomes)]
|
||||||
|
|
||||||
# Store the median chromosome length in the species table.
|
# Store the median chromosome length in the species table.
|
||||||
species[
|
species[
|
||||||
id == species_id,
|
id == species_id,
|
||||||
median_chromosome_length := chromosomes[, median(chromosome_length)]
|
median_chromosome_length := chromosomes[, median(chromosome_length)]
|
||||||
]
|
]
|
||||||
|
|
||||||
# Precompute the genes' distance to the nearest telomere.
|
# Precompute the genes' distance to the nearest telomere.
|
||||||
species_distances <- species_data[
|
species_distances <- species_data[
|
||||||
chromosomes,
|
chromosomes,
|
||||||
.(
|
.(
|
||||||
species = species_id,
|
species = species_id,
|
||||||
gene = ensembl_gene_id,
|
gene = ensembl_gene_id,
|
||||||
chromosome_name = chromosome_name,
|
chromosome_name = chromosome_name,
|
||||||
start_position = start_position,
|
start_position = start_position,
|
||||||
end_position = end_position,
|
end_position = end_position,
|
||||||
distance = pmin(
|
distance = pmin(
|
||||||
start_position,
|
start_position,
|
||||||
chromosome_length - end_position
|
chromosome_length - end_position
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
on = "chromosome_name"
|
on = "chromosome_name"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Add species distances to the distances table.
|
# Add species distances to the distances table.
|
||||||
distances <<- rbindlist(list(distances, species_distances))
|
distances <<- rbindlist(list(distances, species_distances))
|
||||||
}
|
}
|
||||||
|
|
||||||
# Handle the human first, as we already retrieved the data and don't need to
|
# Handle the human first, as we already retrieved the data and don't need to
|
||||||
|
|
@ -337,67 +337,67 @@ handle_species("hsapiens", human_data)
|
||||||
|
|
||||||
# Iterate through all other species and retrieve their distance data.
|
# Iterate through all other species and retrieve their distance data.
|
||||||
for (species_id in species[id != "hsapiens", id]) {
|
for (species_id in species[id != "hsapiens", id]) {
|
||||||
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
|
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
|
||||||
|
|
||||||
dataset <- biomaRt::useDataset(
|
dataset <- biomaRt::useDataset(
|
||||||
sprintf("%s_gene_ensembl", species_id),
|
sprintf("%s_gene_ensembl", species_id),
|
||||||
mart = ensembl
|
mart = ensembl
|
||||||
)
|
)
|
||||||
|
|
||||||
# Besides the attributes that are always present, we need to check for
|
# Besides the attributes that are always present, we need to check for
|
||||||
# human orthologs. Some species don't have that information and will be
|
# human orthologs. Some species don't have that information and will be
|
||||||
# skipped.
|
# skipped.
|
||||||
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
||||||
biomaRt::listAttributes(dataset, what = "name")) {
|
biomaRt::listAttributes(dataset, what = "name")) {
|
||||||
rlog::log_info("No data on human orthologs")
|
rlog::log_info("No data on human orthologs")
|
||||||
species <- species[id != species_id]
|
species <- species[id != species_id]
|
||||||
|
|
||||||
next
|
next
|
||||||
}
|
}
|
||||||
|
|
||||||
chromosome_names <- get_chromosome_names(dataset)
|
chromosome_names <- get_chromosome_names(dataset)
|
||||||
|
|
||||||
# Skip the species, if there are no assembled chromosomes.
|
# Skip the species, if there are no assembled chromosomes.
|
||||||
if (length(chromosome_names) <= 0) {
|
if (length(chromosome_names) <= 0) {
|
||||||
rlog::log_info("No matching chromosome assemblies")
|
rlog::log_info("No matching chromosome assemblies")
|
||||||
species <- species[id != species_id]
|
species <- species[id != species_id]
|
||||||
|
|
||||||
next
|
next
|
||||||
}
|
}
|
||||||
|
|
||||||
# Retrieve information on all genes of the current species, that have
|
# Retrieve information on all genes of the current species, that have
|
||||||
# human orthologs. This is called "homolog" in the Ensembl schema.
|
# human orthologs. This is called "homolog" in the Ensembl schema.
|
||||||
species_distances <- data.table(biomaRt::getBM(
|
species_distances <- data.table(biomaRt::getBM(
|
||||||
attributes = c(
|
attributes = c(
|
||||||
"hsapiens_homolog_ensembl_gene",
|
"hsapiens_homolog_ensembl_gene",
|
||||||
"chromosome_name",
|
"chromosome_name",
|
||||||
"start_position",
|
"start_position",
|
||||||
"end_position"
|
"end_position"
|
||||||
),
|
),
|
||||||
filters = c("with_hsapiens_homolog", "chromosome_name"),
|
filters = c("with_hsapiens_homolog", "chromosome_name"),
|
||||||
values = list(TRUE, chromosome_names),
|
values = list(TRUE, chromosome_names),
|
||||||
mart = dataset
|
mart = dataset
|
||||||
))
|
))
|
||||||
|
|
||||||
# Only include human genes that we have information on.
|
# Only include human genes that we have information on.
|
||||||
species_distances <- species_distances[
|
species_distances <- species_distances[
|
||||||
hsapiens_homolog_ensembl_gene %chin% genes$id
|
hsapiens_homolog_ensembl_gene %chin% genes$id
|
||||||
]
|
]
|
||||||
|
|
||||||
# Only include one ortholog per human gene.
|
# Only include one ortholog per human gene.
|
||||||
species_distances <- unique(
|
species_distances <- unique(
|
||||||
species_distances,
|
species_distances,
|
||||||
by = "hsapiens_homolog_ensembl_gene"
|
by = "hsapiens_homolog_ensembl_gene"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Rename gene ID column to match the human data.
|
# Rename gene ID column to match the human data.
|
||||||
setnames(
|
setnames(
|
||||||
species_distances,
|
species_distances,
|
||||||
"hsapiens_homolog_ensembl_gene",
|
"hsapiens_homolog_ensembl_gene",
|
||||||
"ensembl_gene_id"
|
"ensembl_gene_id"
|
||||||
)
|
)
|
||||||
|
|
||||||
handle_species(species_id, species_distances)
|
handle_species(species_id, species_distances)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Save data in the appropriate place.
|
# Save data in the appropriate place.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue