mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-25 19:37:23 +02:00
Reindent code to use just two spaces
This commit is contained in:
parent
a1e6147466
commit
c04b6337e9
17 changed files with 1583 additions and 1582 deletions
88
R/analyze.R
88
R/analyze.R
|
|
@ -18,54 +18,54 @@
|
|||
#'
|
||||
#' @export
|
||||
analyze <- function(preset, progress = NULL, include_results = TRUE) {
|
||||
if (!inherits(preset, "geposan_preset")) {
|
||||
stop("Preset is invalid. Use geposan::preset() to create one.")
|
||||
}
|
||||
if (!inherits(preset, "geposan_preset")) {
|
||||
stop("Preset is invalid. Use geposan::preset() to create one.")
|
||||
}
|
||||
|
||||
if (is.null(progress)) {
|
||||
progress_bar <- progress::progress_bar$new()
|
||||
progress_bar$update(0.0)
|
||||
if (is.null(progress)) {
|
||||
progress_bar <- progress::progress_bar$new()
|
||||
progress_bar$update(0.0)
|
||||
|
||||
progress <- function(progress_value) {
|
||||
if (!progress_bar$finished) {
|
||||
progress_bar$update(progress_value)
|
||||
if (progress_value >= 1.0) {
|
||||
progress_bar$terminate()
|
||||
}
|
||||
}
|
||||
progress <- function(progress_value) {
|
||||
if (!progress_bar$finished) {
|
||||
progress_bar$update(progress_value)
|
||||
if (progress_value >= 1.0) {
|
||||
progress_bar$terminate()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
progress_buffer <- 0.0
|
||||
method_count <- length(preset$methods)
|
||||
progress_buffer <- 0.0
|
||||
method_count <- length(preset$methods)
|
||||
|
||||
method_progress <- function(progress_value) {
|
||||
progress(progress_buffer + progress_value / method_count)
|
||||
}
|
||||
method_progress <- function(progress_value) {
|
||||
progress(progress_buffer + progress_value / method_count)
|
||||
}
|
||||
|
||||
scores <- data.table(gene = preset$gene_id)
|
||||
results <- list()
|
||||
scores <- data.table(gene = preset$gene_id)
|
||||
results <- list()
|
||||
|
||||
for (method in preset$methods) {
|
||||
method_results <- method$func(preset, method_progress)
|
||||
for (method in preset$methods) {
|
||||
method_results <- method$func(preset, method_progress)
|
||||
|
||||
scores <- merge(scores, method_results$scores, by = "gene")
|
||||
setnames(scores, "score", method$id)
|
||||
scores <- merge(scores, method_results$scores, by = "gene")
|
||||
setnames(scores, "score", method$id)
|
||||
|
||||
results <- c(results, list(method_results))
|
||||
results <- c(results, list(method_results))
|
||||
|
||||
progress_buffer <- progress_buffer + 1 / method_count
|
||||
progress(progress_buffer)
|
||||
}
|
||||
progress_buffer <- progress_buffer + 1 / method_count
|
||||
progress(progress_buffer)
|
||||
}
|
||||
|
||||
structure(
|
||||
list(
|
||||
preset = preset,
|
||||
scores = scores,
|
||||
results = if (include_results) results else NULL
|
||||
),
|
||||
class = "geposan_analysis"
|
||||
)
|
||||
structure(
|
||||
list(
|
||||
preset = preset,
|
||||
scores = scores,
|
||||
results = if (include_results) results else NULL
|
||||
),
|
||||
class = "geposan_analysis"
|
||||
)
|
||||
}
|
||||
|
||||
#' Print an analysis object.
|
||||
|
|
@ -77,14 +77,14 @@ analyze <- function(preset, progress = NULL, include_results = TRUE) {
|
|||
#'
|
||||
#' @export
|
||||
print.geposan_analysis <- function(x, ...) {
|
||||
cat("geposan analysis:\n\n")
|
||||
print(x$preset)
|
||||
cat("geposan analysis:\n\n")
|
||||
print(x$preset)
|
||||
cat("\n")
|
||||
|
||||
for (result in x$results) {
|
||||
print(result)
|
||||
cat("\n")
|
||||
}
|
||||
|
||||
for (result in x$results) {
|
||||
print(result)
|
||||
cat("\n")
|
||||
}
|
||||
|
||||
invisible(x)
|
||||
invisible(x)
|
||||
}
|
||||
|
|
|
|||
104
R/comparison.R
104
R/comparison.R
|
|
@ -18,38 +18,38 @@
|
|||
#'
|
||||
#' @export
|
||||
compare <- function(ranking, comparison_gene_ids) {
|
||||
if (!inherits(ranking, "geposan_ranking")) {
|
||||
stop("Invalid ranking. Use geposan::ranking().")
|
||||
}
|
||||
if (!inherits(ranking, "geposan_ranking")) {
|
||||
stop("Invalid ranking. Use geposan::ranking().")
|
||||
}
|
||||
|
||||
comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
|
||||
comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
|
||||
|
||||
quantiles <- data.table(
|
||||
quantile = c("0%", "25%", "50%", "75%", "100%"),
|
||||
score = stats::quantile(comparison_ranking[, score]),
|
||||
rank = stats::quantile(
|
||||
comparison_ranking[, rank],
|
||||
probs = seq(1, 0, -0.25)
|
||||
),
|
||||
percentile = stats::quantile(comparison_ranking[, percentile])
|
||||
)
|
||||
quantiles <- data.table(
|
||||
quantile = c("0%", "25%", "50%", "75%", "100%"),
|
||||
score = stats::quantile(comparison_ranking[, score]),
|
||||
rank = stats::quantile(
|
||||
comparison_ranking[, rank],
|
||||
probs = seq(1, 0, -0.25)
|
||||
),
|
||||
percentile = stats::quantile(comparison_ranking[, percentile])
|
||||
)
|
||||
|
||||
p_value <- stats::wilcox.test(
|
||||
x = comparison_ranking[, score],
|
||||
y = ranking[!gene %chin% comparison_gene_ids, score],
|
||||
alternative = "greater"
|
||||
)$p.value
|
||||
p_value <- stats::wilcox.test(
|
||||
x = comparison_ranking[, score],
|
||||
y = ranking[!gene %chin% comparison_gene_ids, score],
|
||||
alternative = "greater"
|
||||
)$p.value
|
||||
|
||||
structure(
|
||||
list(
|
||||
quantiles = quantiles,
|
||||
mean_score = comparison_ranking[, mean(score)],
|
||||
mean_rank = comparison_ranking[, mean(rank)],
|
||||
mean_percentile = comparison_ranking[, mean(percentile)],
|
||||
p_value = p_value
|
||||
),
|
||||
class = "geposan_comparison"
|
||||
)
|
||||
structure(
|
||||
list(
|
||||
quantiles = quantiles,
|
||||
mean_score = comparison_ranking[, mean(score)],
|
||||
mean_rank = comparison_ranking[, mean(rank)],
|
||||
mean_percentile = comparison_ranking[, mean(percentile)],
|
||||
p_value = p_value
|
||||
),
|
||||
class = "geposan_comparison"
|
||||
)
|
||||
}
|
||||
|
||||
#' S3 method to print a comparison object.
|
||||
|
|
@ -61,32 +61,32 @@ compare <- function(ranking, comparison_gene_ids) {
|
|||
#'
|
||||
#' @export
|
||||
print.geposan_comparison <- function(x, ...) {
|
||||
cat("geposan comparison:\n\n")
|
||||
cat("geposan comparison:\n\n")
|
||||
|
||||
quantiles_formatted <- x$quantiles[, .(
|
||||
"Quantile" = quantile,
|
||||
"Score" = round(score, 3),
|
||||
"Rank" = rank,
|
||||
"Percentile" = paste0(
|
||||
format(round(percentile * 100, 1), nsmall = 1),
|
||||
"%"
|
||||
)
|
||||
)]
|
||||
quantiles_formatted <- x$quantiles[, .(
|
||||
"Quantile" = quantile,
|
||||
"Score" = round(score, 3),
|
||||
"Rank" = rank,
|
||||
"Percentile" = paste0(
|
||||
format(round(percentile * 100, 1), nsmall = 1),
|
||||
"%"
|
||||
)
|
||||
)]
|
||||
|
||||
print(quantiles_formatted, row.names = FALSE)
|
||||
print(quantiles_formatted, row.names = FALSE)
|
||||
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"\n Mean score: %.3f",
|
||||
"\n Mean rank: %.1f",
|
||||
"\n Mean percentile: %.1f%%",
|
||||
"\n p-value for better scores: %.4f\n"
|
||||
),
|
||||
x$mean_score,
|
||||
x$mean_rank,
|
||||
x$mean_percentile * 100,
|
||||
x$p_value
|
||||
))
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"\n Mean score: %.3f",
|
||||
"\n Mean rank: %.1f",
|
||||
"\n Mean percentile: %.1f%%",
|
||||
"\n p-value for better scores: %.4f\n"
|
||||
),
|
||||
x$mean_score,
|
||||
x$mean_rank,
|
||||
x$mean_percentile * 100,
|
||||
x$p_value
|
||||
))
|
||||
|
||||
invisible(x)
|
||||
invisible(x)
|
||||
}
|
||||
|
|
|
|||
68
R/method.R
68
R/method.R
|
|
@ -12,34 +12,34 @@
|
|||
#'
|
||||
#' @export
|
||||
method <- function(id, name, description, func) {
|
||||
stopifnot(is.character(id) & length(id) == 1)
|
||||
stopifnot(is.character(name) & length(name) == 1)
|
||||
stopifnot(is.character(description) & length(description) == 1)
|
||||
stopifnot(is.function(func))
|
||||
stopifnot(is.character(id) & length(id) == 1)
|
||||
stopifnot(is.character(name) & length(name) == 1)
|
||||
stopifnot(is.character(description) & length(description) == 1)
|
||||
stopifnot(is.function(func))
|
||||
|
||||
structure(
|
||||
list(
|
||||
id = id,
|
||||
name = name,
|
||||
description = description,
|
||||
func = func
|
||||
),
|
||||
class = "geposan_method"
|
||||
)
|
||||
structure(
|
||||
list(
|
||||
id = id,
|
||||
name = name,
|
||||
description = description,
|
||||
func = func
|
||||
),
|
||||
class = "geposan_method"
|
||||
)
|
||||
}
|
||||
|
||||
#' Get a list of all available methods.
|
||||
#'
|
||||
#' @export
|
||||
all_methods <- function() {
|
||||
list(
|
||||
clustering(),
|
||||
correlation(),
|
||||
neural(),
|
||||
adjacency(),
|
||||
species_adjacency(),
|
||||
proximity()
|
||||
)
|
||||
list(
|
||||
clustering(),
|
||||
correlation(),
|
||||
neural(),
|
||||
adjacency(),
|
||||
species_adjacency(),
|
||||
proximity()
|
||||
)
|
||||
}
|
||||
|
||||
#' Print a method object.
|
||||
|
|
@ -51,18 +51,18 @@ all_methods <- function() {
|
|||
#'
|
||||
#' @export
|
||||
print.geposan_method <- function(x, ...) {
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan method:",
|
||||
"\n Method ID: %s",
|
||||
"\n Name: %s",
|
||||
"\n Description: %s",
|
||||
"\n"
|
||||
),
|
||||
x$id,
|
||||
x$name,
|
||||
x$description
|
||||
))
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan method:",
|
||||
"\n Method ID: %s",
|
||||
"\n Name: %s",
|
||||
"\n Description: %s",
|
||||
"\n"
|
||||
),
|
||||
x$id,
|
||||
x$name,
|
||||
x$description
|
||||
))
|
||||
|
||||
invisible(x)
|
||||
invisible(x)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,14 +12,14 @@
|
|||
#'
|
||||
#' @export
|
||||
densest <- function(data) {
|
||||
as.numeric(if (length(data) <= 0) {
|
||||
NULL
|
||||
} else if (length(data) == 1) {
|
||||
data
|
||||
} else {
|
||||
density <- stats::density(data)
|
||||
mean(density$x[density$y == max(density$y)])
|
||||
})
|
||||
as.numeric(if (length(data) <= 0) {
|
||||
NULL
|
||||
} else if (length(data) == 1) {
|
||||
data
|
||||
} else {
|
||||
density <- stats::density(data)
|
||||
mean(density$x[density$y == max(density$y)])
|
||||
})
|
||||
}
|
||||
|
||||
#' Score genes based on their proximity to the reference genes.
|
||||
|
|
@ -40,80 +40,80 @@ densest <- function(data) {
|
|||
#'
|
||||
#' @export
|
||||
adjacency <- function(distance_estimate = densest, summarize = stats::median) {
|
||||
method(
|
||||
id = "adjacency",
|
||||
name = "Adjacency",
|
||||
description = "Adjacency to reference genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
method(
|
||||
id = "adjacency",
|
||||
name = "Adjacency",
|
||||
description = "Adjacency to reference genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
||||
cached(
|
||||
"adjacency",
|
||||
c(
|
||||
species_ids,
|
||||
gene_ids,
|
||||
reference_gene_ids,
|
||||
distance_estimate,
|
||||
summarize
|
||||
),
|
||||
{ # nolint
|
||||
# Filter distances by species and gene and summarize each
|
||||
# gene's distance values using the estimation function.
|
||||
data <- geposan::distances[
|
||||
species %chin% species_ids & gene %chin% gene_ids,
|
||||
.(distance = as.numeric(distance_estimate(distance))),
|
||||
by = gene
|
||||
]
|
||||
cached(
|
||||
"adjacency",
|
||||
c(
|
||||
species_ids,
|
||||
gene_ids,
|
||||
reference_gene_ids,
|
||||
distance_estimate,
|
||||
summarize
|
||||
),
|
||||
{ # nolint
|
||||
# Filter distances by species and gene and summarize each
|
||||
# gene's distance values using the estimation function.
|
||||
data <- geposan::distances[
|
||||
species %chin% species_ids & gene %chin% gene_ids,
|
||||
.(distance = as.numeric(distance_estimate(distance))),
|
||||
by = gene
|
||||
]
|
||||
|
||||
# Compute the absolute value of the difference between the
|
||||
# estimated distances of each gene to the reference genes.
|
||||
compute_difference <- function(distance_value,
|
||||
comparison_ids) {
|
||||
differences <- data[
|
||||
gene %chin% comparison_ids,
|
||||
.(difference = abs(distance_value - distance))
|
||||
]
|
||||
# Compute the absolute value of the difference between the
|
||||
# estimated distances of each gene to the reference genes.
|
||||
compute_difference <- function(distance_value,
|
||||
comparison_ids) {
|
||||
differences <- data[
|
||||
gene %chin% comparison_ids,
|
||||
.(difference = abs(distance_value - distance))
|
||||
]
|
||||
|
||||
summarize(differences$difference)
|
||||
}
|
||||
summarize(differences$difference)
|
||||
}
|
||||
|
||||
# Compute the differences to the reference genes.
|
||||
data[
|
||||
!gene %chin% reference_gene_ids,
|
||||
difference := compute_difference(
|
||||
distance,
|
||||
reference_gene_ids
|
||||
),
|
||||
by = gene
|
||||
]
|
||||
# Compute the differences to the reference genes.
|
||||
data[
|
||||
!gene %chin% reference_gene_ids,
|
||||
difference := compute_difference(
|
||||
distance,
|
||||
reference_gene_ids
|
||||
),
|
||||
by = gene
|
||||
]
|
||||
|
||||
progress(0.5)
|
||||
progress(0.5)
|
||||
|
||||
# Exclude the reference gene itself when computing its
|
||||
# difference.
|
||||
data[
|
||||
gene %chin% reference_gene_ids,
|
||||
difference := compute_difference(
|
||||
distance,
|
||||
reference_gene_ids[reference_gene_ids != gene]
|
||||
),
|
||||
by = gene
|
||||
]
|
||||
# Exclude the reference gene itself when computing its
|
||||
# difference.
|
||||
data[
|
||||
gene %chin% reference_gene_ids,
|
||||
difference := compute_difference(
|
||||
distance,
|
||||
reference_gene_ids[reference_gene_ids != gene]
|
||||
),
|
||||
by = gene
|
||||
]
|
||||
|
||||
# Compute the final score by normalizing the difference.
|
||||
data[, score := 1 - difference / max(difference)]
|
||||
# Compute the final score by normalizing the difference.
|
||||
data[, score := 1 - difference / max(difference)]
|
||||
|
||||
progress(1.0)
|
||||
progress(1.0)
|
||||
|
||||
result(
|
||||
method = "adjacency",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(data = data)
|
||||
)
|
||||
}
|
||||
)
|
||||
result(
|
||||
method = "adjacency",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(data = data)
|
||||
)
|
||||
}
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,33 +13,33 @@
|
|||
#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
|
||||
#' etc.
|
||||
clusteriness <- function(data, span = 100000, weight = 0.7) {
|
||||
n <- length(data)
|
||||
n <- length(data)
|
||||
|
||||
# Return a score of 0.0 if there is just one or no value at all.
|
||||
if (n < 2) {
|
||||
return(0.0)
|
||||
# Return a score of 0.0 if there is just one or no value at all.
|
||||
if (n < 2) {
|
||||
return(0.0)
|
||||
}
|
||||
|
||||
# Cluster the data and compute the cluster sizes.
|
||||
|
||||
tree <- stats::hclust(stats::dist(data))
|
||||
clusters <- stats::cutree(tree, h = span)
|
||||
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
|
||||
|
||||
# Compute the "clusteriness" score.
|
||||
|
||||
score <- 0.0
|
||||
|
||||
for (i in seq_along(cluster_sizes)) {
|
||||
cluster_size <- cluster_sizes[i]
|
||||
|
||||
if (cluster_size >= 2) {
|
||||
cluster_score <- cluster_size / n
|
||||
score <- score + weight^(i - 1) * cluster_score
|
||||
}
|
||||
}
|
||||
|
||||
# Cluster the data and compute the cluster sizes.
|
||||
|
||||
tree <- stats::hclust(stats::dist(data))
|
||||
clusters <- stats::cutree(tree, h = span)
|
||||
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
|
||||
|
||||
# Compute the "clusteriness" score.
|
||||
|
||||
score <- 0.0
|
||||
|
||||
for (i in seq_along(cluster_sizes)) {
|
||||
cluster_size <- cluster_sizes[i]
|
||||
|
||||
if (cluster_size >= 2) {
|
||||
cluster_score <- cluster_size / n
|
||||
score <- score + weight^(i - 1) * cluster_score
|
||||
}
|
||||
}
|
||||
|
||||
score
|
||||
score
|
||||
}
|
||||
|
||||
#' Process genes clustering their distance to telomeres.
|
||||
|
|
@ -53,41 +53,41 @@ clusteriness <- function(data, span = 100000, weight = 0.7) {
|
|||
#'
|
||||
#' @export
|
||||
clustering <- function() {
|
||||
method(
|
||||
id = "clustering",
|
||||
name = "Clustering",
|
||||
description = "Clustering of genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
method(
|
||||
id = "clustering",
|
||||
name = "Clustering",
|
||||
description = "Clustering of genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
|
||||
cached("clustering", c(species_ids, gene_ids), {
|
||||
scores <- data.table(gene = gene_ids)
|
||||
cached("clustering", c(species_ids, gene_ids), {
|
||||
scores <- data.table(gene = gene_ids)
|
||||
|
||||
# Prefilter the input data by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
# Prefilter the input data by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
|
||||
genes_done <- 0
|
||||
genes_total <- length(gene_ids)
|
||||
genes_done <- 0
|
||||
genes_total <- length(gene_ids)
|
||||
|
||||
# Perform the cluster analysis for one gene.
|
||||
compute <- function(gene_id) {
|
||||
data <- distances[gene == gene_id, distance]
|
||||
score <- clusteriness(data)
|
||||
# Perform the cluster analysis for one gene.
|
||||
compute <- function(gene_id) {
|
||||
data <- distances[gene == gene_id, distance]
|
||||
score <- clusteriness(data)
|
||||
|
||||
genes_done <<- genes_done + 1
|
||||
progress(genes_done / genes_total)
|
||||
genes_done <<- genes_done + 1
|
||||
progress(genes_done / genes_total)
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
scores[, score := compute(gene), by = gene]
|
||||
|
||||
result(
|
||||
method = "clustering",
|
||||
scores = scores
|
||||
)
|
||||
})
|
||||
score
|
||||
}
|
||||
)
|
||||
|
||||
scores[, score := compute(gene), by = gene]
|
||||
|
||||
result(
|
||||
method = "clustering",
|
||||
scores = scores
|
||||
)
|
||||
})
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,96 +8,97 @@
|
|||
#'
|
||||
#' @export
|
||||
correlation <- function(summarize = stats::median) {
|
||||
method(
|
||||
id = "correlation",
|
||||
name = "Correlation",
|
||||
description = "Correlation with reference genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
method(
|
||||
id = "correlation",
|
||||
name = "Correlation",
|
||||
description = "Correlation with reference genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
||||
cached(
|
||||
"correlation",
|
||||
c(species_ids, gene_ids, reference_gene_ids, summarize),
|
||||
{ # nolint
|
||||
# Prefilter distances by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
cached(
|
||||
"correlation",
|
||||
c(species_ids, gene_ids, reference_gene_ids, summarize),
|
||||
{ # nolint
|
||||
# Prefilter distances by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
|
||||
# Tranform data to get species as rows and genes as columns.
|
||||
# We construct columns per species, because it requires
|
||||
# fewer iterations, and transpose the table afterwards.
|
||||
# Tranform data to get species as rows and genes as columns.
|
||||
# We construct columns per species, because it requires
|
||||
# fewer iterations, and transpose the table afterwards.
|
||||
|
||||
data <- data.table(gene = gene_ids)
|
||||
data <- data.table(gene = gene_ids)
|
||||
|
||||
# Make a column containing distance data for each species.
|
||||
for (species_id in species_ids) {
|
||||
species_data <- distances[
|
||||
species == species_id,
|
||||
.(gene, distance)
|
||||
]
|
||||
# Make a column containing distance data for each species.
|
||||
for (species_id in species_ids) {
|
||||
species_data <- distances[
|
||||
species == species_id,
|
||||
.(gene, distance)
|
||||
]
|
||||
|
||||
data <- merge(data, species_data, all.x = TRUE)
|
||||
setnames(data, "distance", species_id)
|
||||
}
|
||||
data <- merge(data, species_data, all.x = TRUE)
|
||||
setnames(data, "distance", species_id)
|
||||
}
|
||||
|
||||
# Transpose to the desired format.
|
||||
data <- transpose(data, make.names = "gene")
|
||||
# Transpose to the desired format.
|
||||
data <- transpose(data, make.names = "gene")
|
||||
|
||||
progress(0.33)
|
||||
progress(0.33)
|
||||
|
||||
# Take the reference data.
|
||||
reference_data <- data[, ..reference_gene_ids]
|
||||
# Take the reference data.
|
||||
reference_data <- data[, ..reference_gene_ids]
|
||||
|
||||
# Perform the correlation between all possible pairs.
|
||||
results <- stats::cor(
|
||||
data[, ..gene_ids],
|
||||
reference_data,
|
||||
use = "pairwise.complete.obs",
|
||||
method = "spearman"
|
||||
)
|
||||
# Perform the correlation between all possible pairs.
|
||||
results <- stats::cor(
|
||||
data[, ..gene_ids],
|
||||
reference_data,
|
||||
use = "pairwise.complete.obs",
|
||||
method = "spearman"
|
||||
)
|
||||
|
||||
results <- data.table(results, keep.rownames = TRUE)
|
||||
setnames(results, "rn", "gene")
|
||||
results <- data.table(results, keep.rownames = TRUE)
|
||||
setnames(results, "rn", "gene")
|
||||
|
||||
# Remove correlations between the reference genes
|
||||
# themselves.
|
||||
for (reference_gene_id in reference_gene_ids) {
|
||||
column <- quote(reference_gene_id)
|
||||
results[gene == reference_gene_id, eval(column) := NA]
|
||||
}
|
||||
# Remove correlations between the reference genes
|
||||
# themselves.
|
||||
for (reference_gene_id in reference_gene_ids) {
|
||||
column <- quote(reference_gene_id)
|
||||
results[gene == reference_gene_id, eval(column) := NA]
|
||||
}
|
||||
|
||||
progress(0.66)
|
||||
progress(0.66)
|
||||
|
||||
# Combine the correlation coefficients.
|
||||
results[,
|
||||
max_correlation := as.double(summarize(stats::na.omit(
|
||||
# Convert the data.table subset into a
|
||||
# vector to get the correct na.omit
|
||||
# behavior.
|
||||
as.matrix(.SD)[1, ]
|
||||
))),
|
||||
.SDcols = reference_gene_ids,
|
||||
by = gene
|
||||
]
|
||||
# Combine the correlation coefficients.
|
||||
results[,
|
||||
max_correlation := as.double(summarize(stats::na.omit(
|
||||
# Convert the data.table subset into a
|
||||
# vector to get the correct na.omit
|
||||
# behavior.
|
||||
as.matrix(.SD)[1, ]
|
||||
))),
|
||||
.SDcols = reference_gene_ids,
|
||||
by = gene
|
||||
]
|
||||
|
||||
# Normalize scores.
|
||||
results[,
|
||||
score := (max_correlation - min(max_correlation)) /
|
||||
(max(max_correlation) - min(max_correlation))
|
||||
]
|
||||
# Normalize scores.
|
||||
results[
|
||||
,
|
||||
score := (max_correlation - min(max_correlation)) /
|
||||
(max(max_correlation) - min(max_correlation))
|
||||
]
|
||||
|
||||
# Normalize scores.
|
||||
# Normalize scores.
|
||||
|
||||
results[, .(gene, score)]
|
||||
results[, .(gene, score)]
|
||||
|
||||
result(
|
||||
method = "correlation",
|
||||
scores = results[, .(gene, score)],
|
||||
details = list(all_correlations = results)
|
||||
)
|
||||
}
|
||||
)
|
||||
result(
|
||||
method = "correlation",
|
||||
scores = results[, .(gene, score)],
|
||||
details = list(all_correlations = results)
|
||||
)
|
||||
}
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,244 +12,244 @@
|
|||
#'
|
||||
#' @export
|
||||
neural <- function(seed = 180199, n_models = 5) {
|
||||
method(
|
||||
id = "neural",
|
||||
name = "Neural",
|
||||
description = "Assessment by neural network",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
method(
|
||||
id = "neural",
|
||||
name = "Neural",
|
||||
description = "Assessment by neural network",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
||||
cached(
|
||||
"neural",
|
||||
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
|
||||
{ # nolint
|
||||
reference_count <- length(reference_gene_ids)
|
||||
stopifnot(n_models %in% 2:reference_count)
|
||||
cached(
|
||||
"neural",
|
||||
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
|
||||
{ # nolint
|
||||
reference_count <- length(reference_gene_ids)
|
||||
stopifnot(n_models %in% 2:reference_count)
|
||||
|
||||
# Make results reproducible.
|
||||
tensorflow::set_random_seed(seed)
|
||||
# Make results reproducible.
|
||||
tensorflow::set_random_seed(seed)
|
||||
|
||||
# Step 1: Prepare input data.
|
||||
# ---------------------------
|
||||
# Step 1: Prepare input data.
|
||||
# ---------------------------
|
||||
|
||||
# Prefilter distances by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
# Prefilter distances by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
|
||||
# Input data for the network. This contains the gene ID as
|
||||
# an identifier as well as the per-species gene distances as
|
||||
# input variables.
|
||||
data <- data.table(gene = gene_ids)
|
||||
# Input data for the network. This contains the gene ID as
|
||||
# an identifier as well as the per-species gene distances as
|
||||
# input variables.
|
||||
data <- data.table(gene = gene_ids)
|
||||
|
||||
# Buffer to keep track of the names of the input variables.
|
||||
input_vars <- NULL
|
||||
# Buffer to keep track of the names of the input variables.
|
||||
input_vars <- NULL
|
||||
|
||||
# Make a columns containing positions and distances for each
|
||||
# species.
|
||||
for (species_id in species_ids) {
|
||||
species_data <- distances[
|
||||
species == species_id,
|
||||
.(gene, distance)
|
||||
]
|
||||
# Make a columns containing positions and distances for each
|
||||
# species.
|
||||
for (species_id in species_ids) {
|
||||
species_data <- distances[
|
||||
species == species_id,
|
||||
.(gene, distance)
|
||||
]
|
||||
|
||||
# Only include species with at least 25% known values.
|
||||
# As positions and distances always coexist, we don't
|
||||
# loose any data here.
|
||||
# Only include species with at least 25% known values.
|
||||
# As positions and distances always coexist, we don't
|
||||
# loose any data here.
|
||||
|
||||
species_data <- stats::na.omit(species_data)
|
||||
species_data <- stats::na.omit(species_data)
|
||||
|
||||
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
|
||||
data <- merge(data, species_data, all.x = TRUE)
|
||||
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
|
||||
data <- merge(data, species_data, all.x = TRUE)
|
||||
|
||||
# Replace missing data with mean values. The neural
|
||||
# network can't handle NAs in a meaningful way.
|
||||
# Choosing extreme values here would result in
|
||||
# heavily biased results. Therefore, the mean value
|
||||
# is chosen as a compromise. However, this will of
|
||||
# course lessen the significance of the results.
|
||||
# Replace missing data with mean values. The neural
|
||||
# network can't handle NAs in a meaningful way.
|
||||
# Choosing extreme values here would result in
|
||||
# heavily biased results. Therefore, the mean value
|
||||
# is chosen as a compromise. However, this will of
|
||||
# course lessen the significance of the results.
|
||||
|
||||
mean_distance <- round(
|
||||
species_data[, mean(distance)]
|
||||
)
|
||||
mean_distance <- round(
|
||||
species_data[, mean(distance)]
|
||||
)
|
||||
|
||||
data[is.na(distance), distance := mean_distance]
|
||||
data[is.na(distance), distance := mean_distance]
|
||||
|
||||
# Name the new column after the species.
|
||||
setnames(data, "distance", species_id)
|
||||
# Name the new column after the species.
|
||||
setnames(data, "distance", species_id)
|
||||
|
||||
# Add the input variable to the buffer.
|
||||
input_vars <- c(input_vars, species_id)
|
||||
}
|
||||
}
|
||||
# Add the input variable to the buffer.
|
||||
input_vars <- c(input_vars, species_id)
|
||||
}
|
||||
}
|
||||
|
||||
progress(0.1)
|
||||
progress(0.1)
|
||||
|
||||
# Step 2: Prepare training data.
|
||||
# ------------------------------
|
||||
# Step 2: Prepare training data.
|
||||
# ------------------------------
|
||||
|
||||
# Take out the reference data.
|
||||
# Take out the reference data.
|
||||
|
||||
reference_data <- data[gene %chin% reference_gene_ids]
|
||||
reference_data[, score := 1.0]
|
||||
reference_data <- data[gene %chin% reference_gene_ids]
|
||||
reference_data[, score := 1.0]
|
||||
|
||||
# Take out random samples from the remaining genes. This is
|
||||
# another compromise with a negative impact on
|
||||
# significance. We assume that a random gene is not likely
|
||||
# to match the reference genes.
|
||||
# Take out random samples from the remaining genes. This is
|
||||
# another compromise with a negative impact on
|
||||
# significance. We assume that a random gene is not likely
|
||||
# to match the reference genes.
|
||||
|
||||
without_reference_data <- data[
|
||||
!gene %chin% reference_gene_ids
|
||||
]
|
||||
without_reference_data <- data[
|
||||
!gene %chin% reference_gene_ids
|
||||
]
|
||||
|
||||
control_data <- without_reference_data[
|
||||
sample(
|
||||
nrow(without_reference_data),
|
||||
reference_count
|
||||
)
|
||||
]
|
||||
|
||||
control_data[, score := 0.0]
|
||||
|
||||
# Split the training data into random sets to have
|
||||
# validation data for each model.
|
||||
|
||||
# Scramble the source tables.
|
||||
reference_data <- reference_data[sample(reference_count)]
|
||||
control_data <- control_data[sample(reference_count)]
|
||||
|
||||
networks <- list()
|
||||
|
||||
indices <- seq_len(reference_count)
|
||||
indices_split <- split(indices, indices %% n_models)
|
||||
|
||||
for (i in seq_len(n_models)) {
|
||||
training_data <- rbindlist(list(
|
||||
reference_data[!indices_split[[i]]],
|
||||
control_data[!indices_split[[i]]]
|
||||
))
|
||||
|
||||
validation_data <- rbindlist(list(
|
||||
reference_data[indices_split[[i]]],
|
||||
control_data[indices_split[[i]]]
|
||||
))
|
||||
|
||||
networks[[i]] <- list(
|
||||
training_data = training_data,
|
||||
validation_data = validation_data
|
||||
)
|
||||
}
|
||||
|
||||
# Step 3: Create, train and apply neural network.
|
||||
# -----------------------------------------------
|
||||
|
||||
# Layers for the neural network.
|
||||
input_layer <- length(input_vars)
|
||||
layer1 <- input_layer
|
||||
layer2 <- 0.5 * input_layer
|
||||
layer3 <- 0.5 * layer2
|
||||
|
||||
# Convert data to matrix and normalize it.
|
||||
to_matrix <- function(data) {
|
||||
data_matrix <- as.matrix(data[, ..input_vars])
|
||||
colnames(data_matrix) <- NULL
|
||||
keras::normalize(data_matrix)
|
||||
}
|
||||
|
||||
data_matrix <- to_matrix(data)
|
||||
output_vars <- NULL
|
||||
|
||||
for (i in seq_along(networks)) {
|
||||
# Create a new model for each training session, because
|
||||
# the model would keep its state across training
|
||||
# sessions otherwise.
|
||||
model <- keras::keras_model_sequential() |>
|
||||
keras::layer_dense(
|
||||
units = layer1,
|
||||
activation = "relu",
|
||||
input_shape = input_layer,
|
||||
) |>
|
||||
keras::layer_dense(
|
||||
units = layer2,
|
||||
activation = "relu",
|
||||
kernel_regularizer = keras::regularizer_l2()
|
||||
) |>
|
||||
keras::layer_dense(
|
||||
units = layer3,
|
||||
activation = "relu",
|
||||
kernel_regularizer = keras::regularizer_l2()
|
||||
) |>
|
||||
keras::layer_dense(
|
||||
units = 1,
|
||||
activation = "sigmoid"
|
||||
) |>
|
||||
keras::compile(
|
||||
loss = keras::loss_mean_absolute_error(),
|
||||
optimizer = keras::optimizer_adam()
|
||||
)
|
||||
|
||||
# Train the model.
|
||||
|
||||
network <- networks[[i]]
|
||||
|
||||
training_data <- network$training_data
|
||||
training_matrix <- to_matrix(training_data)
|
||||
validation_data <- network$validation_data
|
||||
validation_matrix <- to_matrix(validation_data)
|
||||
|
||||
fit <- keras::fit(
|
||||
model,
|
||||
x = training_matrix,
|
||||
y = training_data$score,
|
||||
validation_data = list(
|
||||
x_val = validation_matrix,
|
||||
y_val = validation_data$score
|
||||
),
|
||||
epochs = 500,
|
||||
verbose = FALSE
|
||||
)
|
||||
|
||||
# Apply the model.
|
||||
|
||||
data[, new_score := stats::predict(model, data_matrix)]
|
||||
|
||||
# Remove the values of the training data itself.
|
||||
data[gene %chin% training_data$gene, new_score := NA]
|
||||
|
||||
output_var <- sprintf("score%i", i)
|
||||
setnames(data, "new_score", output_var)
|
||||
output_vars <- c(output_vars, output_var)
|
||||
|
||||
|
||||
# Store the details.
|
||||
|
||||
networks[[i]]$model <- keras::serialize_model(model)
|
||||
networks[[i]]$fit <- fit
|
||||
|
||||
progress(0.1 + i * (0.9 / n_models))
|
||||
}
|
||||
|
||||
# Compute the final score as the mean score.
|
||||
data[,
|
||||
score := mean(as.numeric(.SD), na.rm = TRUE),
|
||||
.SDcols = output_vars,
|
||||
by = gene
|
||||
]
|
||||
|
||||
progress(1.0)
|
||||
|
||||
result(
|
||||
method = "neural",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(
|
||||
seed = seed,
|
||||
n_models = n_models,
|
||||
all_results = data[, !..input_vars],
|
||||
networks = networks
|
||||
)
|
||||
)
|
||||
}
|
||||
control_data <- without_reference_data[
|
||||
sample(
|
||||
nrow(without_reference_data),
|
||||
reference_count
|
||||
)
|
||||
]
|
||||
|
||||
control_data[, score := 0.0]
|
||||
|
||||
# Split the training data into random sets to have
|
||||
# validation data for each model.
|
||||
|
||||
# Scramble the source tables.
|
||||
reference_data <- reference_data[sample(reference_count)]
|
||||
control_data <- control_data[sample(reference_count)]
|
||||
|
||||
networks <- list()
|
||||
|
||||
indices <- seq_len(reference_count)
|
||||
indices_split <- split(indices, indices %% n_models)
|
||||
|
||||
for (i in seq_len(n_models)) {
|
||||
training_data <- rbindlist(list(
|
||||
reference_data[!indices_split[[i]]],
|
||||
control_data[!indices_split[[i]]]
|
||||
))
|
||||
|
||||
validation_data <- rbindlist(list(
|
||||
reference_data[indices_split[[i]]],
|
||||
control_data[indices_split[[i]]]
|
||||
))
|
||||
|
||||
networks[[i]] <- list(
|
||||
training_data = training_data,
|
||||
validation_data = validation_data
|
||||
)
|
||||
}
|
||||
|
||||
# Step 3: Create, train and apply neural network.
|
||||
# -----------------------------------------------
|
||||
|
||||
# Layers for the neural network.
|
||||
input_layer <- length(input_vars)
|
||||
layer1 <- input_layer
|
||||
layer2 <- 0.5 * input_layer
|
||||
layer3 <- 0.5 * layer2
|
||||
|
||||
# Convert data to matrix and normalize it.
|
||||
to_matrix <- function(data) {
|
||||
data_matrix <- as.matrix(data[, ..input_vars])
|
||||
colnames(data_matrix) <- NULL
|
||||
keras::normalize(data_matrix)
|
||||
}
|
||||
|
||||
data_matrix <- to_matrix(data)
|
||||
output_vars <- NULL
|
||||
|
||||
for (i in seq_along(networks)) {
|
||||
# Create a new model for each training session, because
|
||||
# the model would keep its state across training
|
||||
# sessions otherwise.
|
||||
model <- keras::keras_model_sequential() |>
|
||||
keras::layer_dense(
|
||||
units = layer1,
|
||||
activation = "relu",
|
||||
input_shape = input_layer,
|
||||
) |>
|
||||
keras::layer_dense(
|
||||
units = layer2,
|
||||
activation = "relu",
|
||||
kernel_regularizer = keras::regularizer_l2()
|
||||
) |>
|
||||
keras::layer_dense(
|
||||
units = layer3,
|
||||
activation = "relu",
|
||||
kernel_regularizer = keras::regularizer_l2()
|
||||
) |>
|
||||
keras::layer_dense(
|
||||
units = 1,
|
||||
activation = "sigmoid"
|
||||
) |>
|
||||
keras::compile(
|
||||
loss = keras::loss_mean_absolute_error(),
|
||||
optimizer = keras::optimizer_adam()
|
||||
)
|
||||
|
||||
# Train the model.
|
||||
|
||||
network <- networks[[i]]
|
||||
|
||||
training_data <- network$training_data
|
||||
training_matrix <- to_matrix(training_data)
|
||||
validation_data <- network$validation_data
|
||||
validation_matrix <- to_matrix(validation_data)
|
||||
|
||||
fit <- keras::fit(
|
||||
model,
|
||||
x = training_matrix,
|
||||
y = training_data$score,
|
||||
validation_data = list(
|
||||
x_val = validation_matrix,
|
||||
y_val = validation_data$score
|
||||
),
|
||||
epochs = 500,
|
||||
verbose = FALSE
|
||||
)
|
||||
|
||||
# Apply the model.
|
||||
|
||||
data[, new_score := stats::predict(model, data_matrix)]
|
||||
|
||||
# Remove the values of the training data itself.
|
||||
data[gene %chin% training_data$gene, new_score := NA]
|
||||
|
||||
output_var <- sprintf("score%i", i)
|
||||
setnames(data, "new_score", output_var)
|
||||
output_vars <- c(output_vars, output_var)
|
||||
|
||||
|
||||
# Store the details.
|
||||
|
||||
networks[[i]]$model <- keras::serialize_model(model)
|
||||
networks[[i]]$fit <- fit
|
||||
|
||||
progress(0.1 + i * (0.9 / n_models))
|
||||
}
|
||||
|
||||
# Compute the final score as the mean score.
|
||||
data[,
|
||||
score := mean(as.numeric(.SD), na.rm = TRUE),
|
||||
.SDcols = output_vars,
|
||||
by = gene
|
||||
]
|
||||
|
||||
progress(1.0)
|
||||
|
||||
result(
|
||||
method = "neural",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(
|
||||
seed = seed,
|
||||
n_models = n_models,
|
||||
all_results = data[, !..input_vars],
|
||||
networks = networks
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,38 +11,38 @@
|
|||
#'
|
||||
#' @export
|
||||
proximity <- function(summarize = stats::median) {
|
||||
method(
|
||||
id = "proximity",
|
||||
name = "Proximity",
|
||||
description = "Proximity to telomeres",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
method(
|
||||
id = "proximity",
|
||||
name = "Proximity",
|
||||
description = "Proximity to telomeres",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
|
||||
cached("proximity", c(species_ids, gene_ids), {
|
||||
# Prefilter distances by species and gene.
|
||||
data <- geposan::distances[
|
||||
species %chin% preset$species_ids &
|
||||
gene %chin% preset$gene_ids
|
||||
]
|
||||
cached("proximity", c(species_ids, gene_ids), {
|
||||
# Prefilter distances by species and gene.
|
||||
data <- geposan::distances[
|
||||
species %chin% preset$species_ids &
|
||||
gene %chin% preset$gene_ids
|
||||
]
|
||||
|
||||
# Compute the score as described above.
|
||||
data <- data[,
|
||||
.(combined_distance = as.double(summarize(distance))),
|
||||
by = "gene"
|
||||
]
|
||||
# Compute the score as described above.
|
||||
data <- data[,
|
||||
.(combined_distance = as.double(summarize(distance))),
|
||||
by = "gene"
|
||||
]
|
||||
|
||||
# Normalize scores.
|
||||
data[, score := 1 - combined_distance / max(combined_distance)]
|
||||
# Normalize scores.
|
||||
data[, score := 1 - combined_distance / max(combined_distance)]
|
||||
|
||||
progress(1.0)
|
||||
progress(1.0)
|
||||
|
||||
result(
|
||||
method = "proximity",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(data = data)
|
||||
)
|
||||
})
|
||||
}
|
||||
)
|
||||
result(
|
||||
method = "proximity",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(data = data)
|
||||
)
|
||||
})
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,136 +15,136 @@
|
|||
#' @export
|
||||
species_adjacency <- function(distance_estimate = stats::median,
|
||||
summarize = stats::median) {
|
||||
method(
|
||||
id = "species_adjacency",
|
||||
name = "Species adj.",
|
||||
description = "Species adjacency",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
method(
|
||||
id = "species_adjacency",
|
||||
name = "Species adj.",
|
||||
description = "Species adjacency",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
||||
cached(
|
||||
"species_adjacency",
|
||||
c(
|
||||
species_ids,
|
||||
gene_ids,
|
||||
reference_gene_ids,
|
||||
distance_estimate,
|
||||
summarize
|
||||
),
|
||||
{ # nolint
|
||||
# Prefilter distances.
|
||||
data <- geposan::distances[
|
||||
species %chin% species_ids & gene %chin% gene_ids
|
||||
]
|
||||
cached(
|
||||
"species_adjacency",
|
||||
c(
|
||||
species_ids,
|
||||
gene_ids,
|
||||
reference_gene_ids,
|
||||
distance_estimate,
|
||||
summarize
|
||||
),
|
||||
{ # nolint
|
||||
# Prefilter distances.
|
||||
data <- geposan::distances[
|
||||
species %chin% species_ids & gene %chin% gene_ids
|
||||
]
|
||||
|
||||
progress_state <- 0.0
|
||||
progress_step <- 0.9 / length(species_ids)
|
||||
progress_state <- 0.0
|
||||
progress_step <- 0.9 / length(species_ids)
|
||||
|
||||
# Iterate through all species and find the distance
|
||||
# estimates within that species.
|
||||
for (species_id in species_ids) {
|
||||
# For all genes, compute the distance to one reference
|
||||
# gene at a time in one go.
|
||||
for (reference_gene_id in reference_gene_ids) {
|
||||
comparison_distance <- data[
|
||||
species == species_id &
|
||||
gene == reference_gene_id,
|
||||
distance
|
||||
]
|
||||
# Iterate through all species and find the distance
|
||||
# estimates within that species.
|
||||
for (species_id in species_ids) {
|
||||
# For all genes, compute the distance to one reference
|
||||
# gene at a time in one go.
|
||||
for (reference_gene_id in reference_gene_ids) {
|
||||
comparison_distance <- data[
|
||||
species == species_id &
|
||||
gene == reference_gene_id,
|
||||
distance
|
||||
]
|
||||
|
||||
column <- quote(reference_gene_id)
|
||||
column <- quote(reference_gene_id)
|
||||
|
||||
if (length(comparison_distance) != 1) {
|
||||
# If we don't have a comparison distance, we
|
||||
# can't compute a difference. This happens, if
|
||||
# the species doesn't have the reference gene.
|
||||
data[
|
||||
species == species_id &
|
||||
gene %chin% gene_ids,
|
||||
eval(column) := NA_integer_
|
||||
]
|
||||
} else {
|
||||
data[
|
||||
species == species_id &
|
||||
gene %chin% gene_ids,
|
||||
eval(column) :=
|
||||
abs(distance - comparison_distance)
|
||||
]
|
||||
}
|
||||
}
|
||||
if (length(comparison_distance) != 1) {
|
||||
# If we don't have a comparison distance, we
|
||||
# can't compute a difference. This happens, if
|
||||
# the species doesn't have the reference gene.
|
||||
data[
|
||||
species == species_id &
|
||||
gene %chin% gene_ids,
|
||||
eval(column) := NA_integer_
|
||||
]
|
||||
} else {
|
||||
data[
|
||||
species == species_id &
|
||||
gene %chin% gene_ids,
|
||||
eval(column) :=
|
||||
abs(distance - comparison_distance)
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
# Combine the distances to the different reference genes
|
||||
# into one value using the provided function.
|
||||
data[
|
||||
species == species_id &
|
||||
gene %chin% gene_ids,
|
||||
combined_distance := as.numeric(
|
||||
distance_estimate(stats::na.omit(
|
||||
# Convert the data.table subset into a
|
||||
# vector to get the correct na.omit
|
||||
# behavior.
|
||||
as.matrix(.SD)[1, ]
|
||||
))
|
||||
),
|
||||
.SDcols = reference_gene_ids,
|
||||
by = gene
|
||||
]
|
||||
# Combine the distances to the different reference genes
|
||||
# into one value using the provided function.
|
||||
data[
|
||||
species == species_id &
|
||||
gene %chin% gene_ids,
|
||||
combined_distance := as.numeric(
|
||||
distance_estimate(stats::na.omit(
|
||||
# Convert the data.table subset into a
|
||||
# vector to get the correct na.omit
|
||||
# behavior.
|
||||
as.matrix(.SD)[1, ]
|
||||
))
|
||||
),
|
||||
.SDcols = reference_gene_ids,
|
||||
by = gene
|
||||
]
|
||||
|
||||
progress_state <- progress_state + progress_step
|
||||
progress(progress_state)
|
||||
}
|
||||
progress_state <- progress_state + progress_step
|
||||
progress(progress_state)
|
||||
}
|
||||
|
||||
progress(0.9)
|
||||
progress(0.9)
|
||||
|
||||
# Remove the distances between the reference genes.
|
||||
for (reference_gene_id in reference_gene_ids) {
|
||||
column <- quote(reference_gene_id)
|
||||
data[gene == reference_gene_id, eval(column) := NA]
|
||||
}
|
||||
# Remove the distances between the reference genes.
|
||||
for (reference_gene_id in reference_gene_ids) {
|
||||
column <- quote(reference_gene_id)
|
||||
data[gene == reference_gene_id, eval(column) := NA]
|
||||
}
|
||||
|
||||
# Recompute the combined distance for the reference genes.
|
||||
data[
|
||||
gene %chin% reference_gene_ids,
|
||||
combined_distance := as.numeric(
|
||||
distance_estimate(stats::na.omit(
|
||||
as.matrix(.SD)[1, ])
|
||||
)
|
||||
),
|
||||
.SDcols = reference_gene_ids,
|
||||
by = list(species, gene)
|
||||
]
|
||||
# Recompute the combined distance for the reference genes.
|
||||
data[
|
||||
gene %chin% reference_gene_ids,
|
||||
combined_distance := as.numeric(
|
||||
distance_estimate(stats::na.omit(
|
||||
as.matrix(.SD)[1, ]
|
||||
))
|
||||
),
|
||||
.SDcols = reference_gene_ids,
|
||||
by = list(species, gene)
|
||||
]
|
||||
|
||||
# Combine the distances into one value.
|
||||
results <- data[,
|
||||
.(
|
||||
summarized_distances = as.numeric(
|
||||
summarize(stats::na.omit(combined_distance))
|
||||
)
|
||||
),
|
||||
by = gene
|
||||
]
|
||||
# Combine the distances into one value.
|
||||
results <- data[,
|
||||
.(
|
||||
summarized_distances = as.numeric(
|
||||
summarize(stats::na.omit(combined_distance))
|
||||
)
|
||||
),
|
||||
by = gene
|
||||
]
|
||||
|
||||
# Compute the final score by normalizing the difference.
|
||||
results[
|
||||
,
|
||||
score := 1 - summarized_distances /
|
||||
max(summarized_distances)
|
||||
]
|
||||
# Compute the final score by normalizing the difference.
|
||||
results[
|
||||
,
|
||||
score := 1 - summarized_distances /
|
||||
max(summarized_distances)
|
||||
]
|
||||
|
||||
progress(1.0)
|
||||
progress(1.0)
|
||||
|
||||
result(
|
||||
method = "species_adjacency",
|
||||
scores = results[, .(gene, score)],
|
||||
details = list(
|
||||
data = data,
|
||||
results = results
|
||||
)
|
||||
)
|
||||
}
|
||||
result(
|
||||
method = "species_adjacency",
|
||||
scores = results[, .(gene, score)],
|
||||
details = list(
|
||||
data = data,
|
||||
results = results
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
674
R/plots.R
674
R/plots.R
|
|
@ -9,7 +9,7 @@ base_color_transparent <- function() "#1964bf80"
|
|||
#' Color palette for gene sets.
|
||||
#' @noRd
|
||||
gene_set_color <- function(index) {
|
||||
c("#FF7F00", "#4DAF4A", "#984EA3")[index]
|
||||
c("#FF7F00", "#4DAF4A", "#984EA3")[index]
|
||||
}
|
||||
|
||||
#' Plot gene positions.
|
||||
|
|
@ -22,74 +22,74 @@ gene_set_color <- function(index) {
|
|||
#'
|
||||
#' @export
|
||||
plot_positions <- function(species_ids, gene_sets) {
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
|
||||
# Prefilter data by species.
|
||||
data <- geposan::distances[species %chin% species_ids]
|
||||
# Prefilter data by species.
|
||||
data <- geposan::distances[species %chin% species_ids]
|
||||
|
||||
species_max_distance <- data[,
|
||||
.(max_distance = max(distance)),
|
||||
by = species
|
||||
]
|
||||
species_max_distance <- data[,
|
||||
.(max_distance = max(distance)),
|
||||
by = species
|
||||
]
|
||||
|
||||
# Prefilter species.
|
||||
species <- geposan::species[id %chin% species_ids]
|
||||
# Prefilter species.
|
||||
species <- geposan::species[id %chin% species_ids]
|
||||
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::layout(
|
||||
xaxis = list(
|
||||
title = "Species",
|
||||
tickvals = species$id,
|
||||
ticktext = species$name
|
||||
),
|
||||
yaxis = list(title = "Distance to telomeres [Bp]"),
|
||||
bargap = 0.9
|
||||
) |>
|
||||
plotly::add_bars(
|
||||
data = species_max_distance,
|
||||
x = ~species,
|
||||
y = ~max_distance,
|
||||
name = "All genes",
|
||||
marker = list(color = base_color())
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::layout(
|
||||
xaxis = list(
|
||||
title = "Species",
|
||||
tickvals = species$id,
|
||||
ticktext = species$name
|
||||
),
|
||||
yaxis = list(title = "Distance to telomeres [Bp]"),
|
||||
bargap = 0.9
|
||||
) |>
|
||||
plotly::add_bars(
|
||||
data = species_max_distance,
|
||||
x = ~species,
|
||||
y = ~max_distance,
|
||||
name = "All genes",
|
||||
marker = list(color = base_color())
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
# Include gene information which will be used for labeling
|
||||
gene_set_data <- merge(
|
||||
data[gene %chin% unlist(gene_sets)],
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
)
|
||||
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_markers(
|
||||
data = gene_set_data[gene %chin% gene_set],
|
||||
x = ~species,
|
||||
y = ~distance,
|
||||
name = gene_set_name,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
"{round(distance / 1000000, digits = 2)} MBp"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
marker = list(
|
||||
size = 10,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
# Include gene information which will be used for labeling
|
||||
gene_set_data <- merge(
|
||||
data[gene %chin% unlist(gene_sets)],
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
)
|
||||
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_markers(
|
||||
data = gene_set_data[gene %chin% gene_set],
|
||||
x = ~species,
|
||||
y = ~distance,
|
||||
name = gene_set_name,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
"{round(distance / 1000000, digits = 2)} MBp"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
marker = list(
|
||||
size = 10,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
)
|
||||
|
||||
index <- index + 1
|
||||
}
|
||||
index <- index + 1
|
||||
}
|
||||
}
|
||||
|
||||
plot
|
||||
plot
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -108,75 +108,75 @@ plot_positions <- function(species_ids, gene_sets) {
|
|||
#'
|
||||
#' @export
|
||||
plot_rankings <- function(rankings, gene_sets) {
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::layout(
|
||||
xaxis = list(tickvals = names(rankings)),
|
||||
yaxis = list(title = "Score")
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::layout(
|
||||
xaxis = list(tickvals = names(rankings)),
|
||||
yaxis = list(title = "Score")
|
||||
)
|
||||
|
||||
is_first <- TRUE
|
||||
|
||||
for (ranking_name in names(rankings)) {
|
||||
ranking <- rankings[[ranking_name]]
|
||||
|
||||
plot <- plot |> plotly::add_trace(
|
||||
data = ranking,
|
||||
x = ranking_name,
|
||||
y = ~score,
|
||||
name = "All genes",
|
||||
type = "violin",
|
||||
spanmode = "hard",
|
||||
points = FALSE,
|
||||
showlegend = is_first,
|
||||
hoverinfo = "skip",
|
||||
line = list(color = base_color()),
|
||||
fillcolor = base_color_transparent()
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
gene_set_data <- merge(
|
||||
ranking[gene %chin% unlist(gene_sets)],
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
)
|
||||
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_markers(
|
||||
data = gene_set_data[gene %chin% gene_set],
|
||||
x = ranking_name,
|
||||
y = ~score,
|
||||
name = gene_set_name,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
"Score: {round(score, digits = 2)}<br>",
|
||||
"Rank: {rank}<br>",
|
||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
showlegend = is_first,
|
||||
marker = list(
|
||||
size = 10,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
)
|
||||
|
||||
is_first <- TRUE
|
||||
|
||||
for (ranking_name in names(rankings)) {
|
||||
ranking <- rankings[[ranking_name]]
|
||||
|
||||
plot <- plot |> plotly::add_trace(
|
||||
data = ranking,
|
||||
x = ranking_name,
|
||||
y = ~score,
|
||||
name = "All genes",
|
||||
type = "violin",
|
||||
spanmode = "hard",
|
||||
points = FALSE,
|
||||
showlegend = is_first,
|
||||
hoverinfo = "skip",
|
||||
line = list(color = base_color()),
|
||||
fillcolor = base_color_transparent()
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
gene_set_data <- merge(
|
||||
ranking[gene %chin% unlist(gene_sets)],
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
)
|
||||
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_markers(
|
||||
data = gene_set_data[gene %chin% gene_set],
|
||||
x = ranking_name,
|
||||
y = ~score,
|
||||
name = gene_set_name,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
"Score: {round(score, digits = 2)}<br>",
|
||||
"Rank: {rank}<br>",
|
||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
showlegend = is_first,
|
||||
marker = list(
|
||||
size = 10,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
)
|
||||
|
||||
index <- index + 1
|
||||
}
|
||||
}
|
||||
|
||||
is_first <- FALSE
|
||||
index <- index + 1
|
||||
}
|
||||
}
|
||||
|
||||
plot
|
||||
is_first <- FALSE
|
||||
}
|
||||
|
||||
plot
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -194,88 +194,88 @@ plot_rankings <- function(rankings, gene_sets) {
|
|||
#'
|
||||
#' @export
|
||||
plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
|
||||
# To speed up rendering, don't show every single gene.
|
||||
n_ranks <- nrow(ranking)
|
||||
sample_ranking <- ranking[seq(1, n_ranks, 5)]
|
||||
# To speed up rendering, don't show every single gene.
|
||||
n_ranks <- nrow(ranking)
|
||||
sample_ranking <- ranking[seq(1, n_ranks, 5)]
|
||||
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::add_lines(
|
||||
data = sample_ranking,
|
||||
x = ~percentile,
|
||||
y = ~score,
|
||||
name = "All genes",
|
||||
hoverinfo = "skip",
|
||||
line = list(width = 4, color = base_color())
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(
|
||||
title = "Percentile",
|
||||
tickformat = ".0%"
|
||||
),
|
||||
yaxis = list(title = "Score")
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::add_lines(
|
||||
data = sample_ranking,
|
||||
x = ~percentile,
|
||||
y = ~score,
|
||||
name = "All genes",
|
||||
hoverinfo = "skip",
|
||||
line = list(width = 4, color = base_color())
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(
|
||||
title = "Percentile",
|
||||
tickformat = ".0%"
|
||||
),
|
||||
yaxis = list(title = "Score")
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
# Include gene information which will be used for labeling
|
||||
gene_set_data <- merge(
|
||||
ranking[gene %chin% unlist(gene_sets)],
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
)
|
||||
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_markers(
|
||||
data = gene_set_data[gene %chin% gene_set],
|
||||
x = ~percentile,
|
||||
y = ~score,
|
||||
name = gene_set_name,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
"Score: {round(score, digits = 2)}<br>",
|
||||
"Rank: {rank}<br>",
|
||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
marker = list(
|
||||
size = 10,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
# Include gene information which will be used for labeling
|
||||
gene_set_data <- merge(
|
||||
ranking[gene %chin% unlist(gene_sets)],
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
index <- index + 1
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!is.null(max_rank)) {
|
||||
first_not_included_rank <- max_rank + 1
|
||||
last_rank <- ranking[, .N]
|
||||
|
||||
if (first_not_included_rank <= last_rank) {
|
||||
plot <- plot |> plotly::layout(
|
||||
shapes = list(
|
||||
type = "rect",
|
||||
fillcolor = "black",
|
||||
opacity = 0.1,
|
||||
x0 = 1 - first_not_included_rank / n_ranks,
|
||||
x1 = 1 - last_rank / n_ranks,
|
||||
y0 = 0.0,
|
||||
y1 = 1.0
|
||||
)
|
||||
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_markers(
|
||||
data = gene_set_data[gene %chin% gene_set],
|
||||
x = ~percentile,
|
||||
y = ~score,
|
||||
name = gene_set_name,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
"Score: {round(score, digits = 2)}<br>",
|
||||
"Rank: {rank}<br>",
|
||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
marker = list(
|
||||
size = 10,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
)
|
||||
|
||||
index <- index + 1
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!is.null(max_rank)) {
|
||||
first_not_included_rank <- max_rank + 1
|
||||
last_rank <- ranking[, .N]
|
||||
|
||||
if (first_not_included_rank <= last_rank) {
|
||||
plot <- plot |> plotly::layout(
|
||||
shapes = list(
|
||||
type = "rect",
|
||||
fillcolor = "black",
|
||||
opacity = 0.1,
|
||||
x0 = 1 - first_not_included_rank / n_ranks,
|
||||
x1 = 1 - last_rank / n_ranks,
|
||||
y0 = 0.0,
|
||||
y1 = 1.0
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
plot
|
||||
plot
|
||||
}
|
||||
|
||||
#' Visualize a ranking by comparing gene sets in a boxplot.
|
||||
|
|
@ -290,44 +290,44 @@ plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
|
|||
#'
|
||||
#' @export
|
||||
plot_boxplot <- function(ranking, gene_sets = NULL) {
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::add_boxplot(
|
||||
data = ranking,
|
||||
x = "All genes",
|
||||
y = ~score,
|
||||
name = "All genes",
|
||||
showlegend = FALSE,
|
||||
line = list(color = base_color())
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(tickvals = c("All genes", names(gene_sets))),
|
||||
yaxis = list(title = "Score")
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_boxplot(
|
||||
data = ranking[gene %chin% gene_set],
|
||||
x = gene_set_name,
|
||||
y = ~score,
|
||||
name = gene_set_name,
|
||||
showlegend = FALSE,
|
||||
line = list(color = gene_set_color(index))
|
||||
)
|
||||
|
||||
index <- index + 1
|
||||
}
|
||||
}
|
||||
|
||||
plot <- plotly::plot_ly() |>
|
||||
plotly::add_boxplot(
|
||||
data = ranking,
|
||||
x = "All genes",
|
||||
y = ~score,
|
||||
name = "All genes",
|
||||
showlegend = FALSE,
|
||||
line = list(color = base_color())
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(tickvals = c("All genes", names(gene_sets))),
|
||||
yaxis = list(title = "Score")
|
||||
)
|
||||
|
||||
if (length(gene_sets) > 0) {
|
||||
index <- 1
|
||||
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set <- gene_sets[[gene_set_name]]
|
||||
|
||||
plot <- plot |> plotly::add_boxplot(
|
||||
data = ranking[gene %chin% gene_set],
|
||||
x = gene_set_name,
|
||||
y = ~score,
|
||||
name = gene_set_name,
|
||||
showlegend = FALSE,
|
||||
line = list(color = gene_set_color(index))
|
||||
)
|
||||
|
||||
index <- index + 1
|
||||
}
|
||||
}
|
||||
|
||||
plot
|
||||
plot
|
||||
}
|
||||
|
||||
#' Show the distribution of scores across chromosomes.
|
||||
|
|
@ -340,46 +340,46 @@ plot_boxplot <- function(ranking, gene_sets = NULL) {
|
|||
#'
|
||||
#' @export
|
||||
plot_chromosomes <- function(ranking) {
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
|
||||
data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
|
||||
data <- data[, .(score = mean(score)), by = "chromosome"]
|
||||
data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
|
||||
data <- data[, .(score = mean(score)), by = "chromosome"]
|
||||
|
||||
# Get an orderable integer from a chromosome name.
|
||||
chromosome_index <- function(chromosome) {
|
||||
index <- suppressWarnings(as.integer(chromosome))
|
||||
# Get an orderable integer from a chromosome name.
|
||||
chromosome_index <- function(chromosome) {
|
||||
index <- suppressWarnings(as.integer(chromosome))
|
||||
|
||||
ifelse(
|
||||
!is.na(index),
|
||||
index,
|
||||
ifelse(
|
||||
chromosome == "X",
|
||||
998,
|
||||
999
|
||||
)
|
||||
)
|
||||
}
|
||||
ifelse(
|
||||
!is.na(index),
|
||||
index,
|
||||
ifelse(
|
||||
chromosome == "X",
|
||||
998,
|
||||
999
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
data[, index := chromosome_index(chromosome)]
|
||||
setorder(data, "index")
|
||||
data[, index := chromosome_index(chromosome)]
|
||||
setorder(data, "index")
|
||||
|
||||
plotly::plot_ly(
|
||||
data = data,
|
||||
x = ~chromosome,
|
||||
y = ~score,
|
||||
type = "bar",
|
||||
marker = list(color = base_color())
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(
|
||||
title = "Chromosome",
|
||||
categoryorder = "array",
|
||||
categoryarray = ~chromosome
|
||||
),
|
||||
yaxis = list(title = "Mean score")
|
||||
)
|
||||
plotly::plot_ly(
|
||||
data = data,
|
||||
x = ~chromosome,
|
||||
y = ~score,
|
||||
type = "bar",
|
||||
marker = list(color = base_color())
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(
|
||||
title = "Chromosome",
|
||||
categoryorder = "array",
|
||||
categoryarray = ~chromosome
|
||||
),
|
||||
yaxis = list(title = "Mean score")
|
||||
)
|
||||
}
|
||||
|
||||
#' Plot scores in relation to chromosomal position of genes.
|
||||
|
|
@ -398,74 +398,74 @@ plot_chromosomes <- function(ranking) {
|
|||
plot_scores_by_position <- function(ranking,
|
||||
chromosome_name = NULL,
|
||||
gene_sets = NULL) {
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
if (!requireNamespace("plotly", quietly = TRUE)) {
|
||||
stop("Please install \"plotly\" to use this function.")
|
||||
}
|
||||
|
||||
distance_data <- if (!is.null(chromosome_name)) {
|
||||
chromosome_name_ <- chromosome_name
|
||||
geposan::distances[
|
||||
species == "hsapiens" &
|
||||
chromosome_name == chromosome_name_
|
||||
]
|
||||
} else {
|
||||
geposan::distances[species == "hsapiens"]
|
||||
}
|
||||
distance_data <- if (!is.null(chromosome_name)) {
|
||||
chromosome_name_ <- chromosome_name
|
||||
geposan::distances[
|
||||
species == "hsapiens" &
|
||||
chromosome_name == chromosome_name_
|
||||
]
|
||||
} else {
|
||||
geposan::distances[species == "hsapiens"]
|
||||
}
|
||||
|
||||
data <- merge(ranking, distance_data, by = "gene")
|
||||
data <- merge(ranking, distance_data, by = "gene")
|
||||
|
||||
data <- merge(
|
||||
data,
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
data <- merge(
|
||||
data,
|
||||
geposan::genes,
|
||||
by.x = "gene",
|
||||
by.y = "id"
|
||||
)
|
||||
|
||||
data[, `:=`(gene_set = "All genes", color = base_color())]
|
||||
|
||||
index <- 1
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set_genes <- gene_sets[[gene_set_name]]
|
||||
data[
|
||||
gene %chin% gene_set_genes,
|
||||
`:=`(
|
||||
gene_set = gene_set_name,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
]
|
||||
|
||||
index <- index + 1
|
||||
}
|
||||
|
||||
# Use distances instead of positions in case all chromosomes are included.
|
||||
if (is.null(chromosome_name)) {
|
||||
data[, x := distance]
|
||||
} else {
|
||||
data[, x := start_position]
|
||||
}
|
||||
|
||||
plotly::plot_ly() |>
|
||||
plotly::add_markers(
|
||||
data = data,
|
||||
x = ~x,
|
||||
y = ~score,
|
||||
name = ~gene_set,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
if (is.null(chromosome_name)) "Distance: " else "Position: ",
|
||||
"{round(x / 1000000, digits = 2)} MBp<br>",
|
||||
"Score: {round(score, digits = 2)}<br>",
|
||||
"Rank: {rank}<br>",
|
||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(title = if (is.null(chromosome_name)) {
|
||||
"Distance (Bp)"
|
||||
} else {
|
||||
"Position (Bp)"
|
||||
}),
|
||||
yaxis = list(title = "Score")
|
||||
)
|
||||
|
||||
data[, `:=`(gene_set = "All genes", color = base_color())]
|
||||
|
||||
index <- 1
|
||||
for (gene_set_name in names(gene_sets)) {
|
||||
gene_set_genes <- gene_sets[[gene_set_name]]
|
||||
data[
|
||||
gene %chin% gene_set_genes,
|
||||
`:=`(
|
||||
gene_set = gene_set_name,
|
||||
color = gene_set_color(index)
|
||||
)
|
||||
]
|
||||
|
||||
index <- index + 1
|
||||
}
|
||||
|
||||
# Use distances instead of positions in case all chromosomes are included.
|
||||
if (is.null(chromosome_name)) {
|
||||
data[, x := distance]
|
||||
} else {
|
||||
data[, x := start_position]
|
||||
}
|
||||
|
||||
plotly::plot_ly() |>
|
||||
plotly::add_markers(
|
||||
data = data,
|
||||
x = ~x,
|
||||
y = ~score,
|
||||
name = ~gene_set,
|
||||
text = ~ glue::glue(
|
||||
"<b>{name}</b><br>",
|
||||
if (is.null(chromosome_name)) "Distance: " else "Position: ",
|
||||
"{round(x / 1000000, digits = 2)} MBp<br>",
|
||||
"Score: {round(score, digits = 2)}<br>",
|
||||
"Rank: {rank}<br>",
|
||||
"Percentile: {round(percentile * 100, digits = 2)}%"
|
||||
),
|
||||
hoverinfo = "text",
|
||||
) |>
|
||||
plotly::layout(
|
||||
xaxis = list(title = if (is.null(chromosome_name)) {
|
||||
"Distance (Bp)"
|
||||
} else {
|
||||
"Position (Bp)"
|
||||
}),
|
||||
yaxis = list(title = "Score")
|
||||
)
|
||||
}
|
||||
|
|
|
|||
116
R/preset.R
116
R/preset.R
|
|
@ -21,55 +21,55 @@ preset <- function(reference_gene_ids,
|
|||
methods = all_methods(),
|
||||
species_ids = geposan::species$id,
|
||||
gene_ids = geposan::genes$id) {
|
||||
# Count included species per gene.
|
||||
genes_n_species <- geposan::distances[
|
||||
species %chin% species_ids,
|
||||
.(n_species = .N),
|
||||
by = "gene"
|
||||
]
|
||||
# Count included species per gene.
|
||||
genes_n_species <- geposan::distances[
|
||||
species %chin% species_ids,
|
||||
.(n_species = .N),
|
||||
by = "gene"
|
||||
]
|
||||
|
||||
# Filter out genes with less than 25% existing orthologs.
|
||||
gene_ids_filtered <- genes_n_species[
|
||||
gene %chin% gene_ids &
|
||||
n_species >= 0.25 * length(species_ids),
|
||||
gene
|
||||
]
|
||||
# Filter out genes with less than 25% existing orthologs.
|
||||
gene_ids_filtered <- genes_n_species[
|
||||
gene %chin% gene_ids &
|
||||
n_species >= 0.25 * length(species_ids),
|
||||
gene
|
||||
]
|
||||
|
||||
reference_gene_ids_excluded <- reference_gene_ids[
|
||||
!reference_gene_ids %chin% gene_ids_filtered
|
||||
]
|
||||
reference_gene_ids_excluded <- reference_gene_ids[
|
||||
!reference_gene_ids %chin% gene_ids_filtered
|
||||
]
|
||||
|
||||
if (length(reference_gene_ids_excluded > 0)) {
|
||||
warning(paste0(
|
||||
"The following reference gene IDs are excluded from the preset ",
|
||||
"because they don't have enough data: ",
|
||||
paste(reference_gene_ids_excluded, collapse = ", ")
|
||||
))
|
||||
}
|
||||
if (length(reference_gene_ids_excluded > 0)) {
|
||||
warning(paste0(
|
||||
"The following reference gene IDs are excluded from the preset ",
|
||||
"because they don't have enough data: ",
|
||||
paste(reference_gene_ids_excluded, collapse = ", ")
|
||||
))
|
||||
}
|
||||
|
||||
reference_gene_ids_included <- reference_gene_ids[
|
||||
reference_gene_ids %chin% gene_ids_filtered
|
||||
]
|
||||
reference_gene_ids_included <- reference_gene_ids[
|
||||
reference_gene_ids %chin% gene_ids_filtered
|
||||
]
|
||||
|
||||
if (length(reference_gene_ids_included) < 1) {
|
||||
stop(paste0(
|
||||
"There has to be at least one reference gene for the preset to be ",
|
||||
"valid. Please note that some methods may require more reference ",
|
||||
"genes."
|
||||
))
|
||||
}
|
||||
if (length(reference_gene_ids_included) < 1) {
|
||||
stop(paste0(
|
||||
"There has to be at least one reference gene for the preset to be ",
|
||||
"valid. Please note that some methods may require more reference ",
|
||||
"genes."
|
||||
))
|
||||
}
|
||||
|
||||
# The included data gets sorted to be able to produce predictable hashes
|
||||
# for the object later.
|
||||
structure(
|
||||
list(
|
||||
reference_gene_ids = sort(reference_gene_ids_included),
|
||||
methods = methods,
|
||||
species_ids = sort(species_ids),
|
||||
gene_ids = sort(gene_ids_filtered)
|
||||
),
|
||||
class = "geposan_preset"
|
||||
)
|
||||
# The included data gets sorted to be able to produce predictable hashes
|
||||
# for the object later.
|
||||
structure(
|
||||
list(
|
||||
reference_gene_ids = sort(reference_gene_ids_included),
|
||||
methods = methods,
|
||||
species_ids = sort(species_ids),
|
||||
gene_ids = sort(gene_ids_filtered)
|
||||
),
|
||||
class = "geposan_preset"
|
||||
)
|
||||
}
|
||||
|
||||
#' S3 method to print a preset object.
|
||||
|
|
@ -81,20 +81,20 @@ preset <- function(reference_gene_ids,
|
|||
#'
|
||||
#' @export
|
||||
print.geposan_preset <- function(x, ...) {
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan preset:",
|
||||
"\n Reference genes: %i",
|
||||
"\n Included methods: %s",
|
||||
"\n Number of species: %i",
|
||||
"\n Number of genes: %i",
|
||||
"\n"
|
||||
),
|
||||
length(x$reference_gene_ids),
|
||||
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
|
||||
length(x$species_ids),
|
||||
length(x$gene_ids)
|
||||
))
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan preset:",
|
||||
"\n Reference genes: %i",
|
||||
"\n Included methods: %s",
|
||||
"\n Number of species: %i",
|
||||
"\n Number of genes: %i",
|
||||
"\n"
|
||||
),
|
||||
length(x$reference_gene_ids),
|
||||
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
|
||||
length(x$species_ids),
|
||||
length(x$gene_ids)
|
||||
))
|
||||
|
||||
invisible(x)
|
||||
invisible(x)
|
||||
}
|
||||
|
|
|
|||
112
R/ranking.R
112
R/ranking.R
|
|
@ -13,35 +13,35 @@
|
|||
#'
|
||||
#' @export
|
||||
ranking <- function(analysis, weights) {
|
||||
ranking <- if (inherits(analysis, "geposan_analysis")) {
|
||||
copy(analysis$scores)
|
||||
} else if (inherits(analysis, "geposan_ranking")) {
|
||||
copy(analysis)
|
||||
} else {
|
||||
stop("Invalid analyis. Use geposan::analyze().")
|
||||
}
|
||||
ranking <- if (inherits(analysis, "geposan_analysis")) {
|
||||
copy(analysis$scores)
|
||||
} else if (inherits(analysis, "geposan_ranking")) {
|
||||
copy(analysis)
|
||||
} else {
|
||||
stop("Invalid analyis. Use geposan::analyze().")
|
||||
}
|
||||
|
||||
ranking[, score := 0.0]
|
||||
ranking[, score := 0.0]
|
||||
|
||||
for (method in names(weights)) {
|
||||
weighted <- weights[[method]] * ranking[, ..method]
|
||||
ranking[, score := score + weighted]
|
||||
}
|
||||
for (method in names(weights)) {
|
||||
weighted <- weights[[method]] * ranking[, ..method]
|
||||
ranking[, score := score + weighted]
|
||||
}
|
||||
|
||||
# Normalize scores to be between 0.0 and 1.0.
|
||||
min_score <- ranking[, min(score)]
|
||||
max_score <- ranking[, max(score)]
|
||||
score_range <- max_score - min_score
|
||||
ranking[, score := (score - min_score) / score_range]
|
||||
# Normalize scores to be between 0.0 and 1.0.
|
||||
min_score <- ranking[, min(score)]
|
||||
max_score <- ranking[, max(score)]
|
||||
score_range <- max_score - min_score
|
||||
ranking[, score := (score - min_score) / score_range]
|
||||
|
||||
setorder(ranking, -score)
|
||||
ranking[, rank := .I]
|
||||
ranking[, percentile := 1 - rank / nrow(ranking)]
|
||||
setorder(ranking, -score)
|
||||
ranking[, rank := .I]
|
||||
ranking[, percentile := 1 - rank / nrow(ranking)]
|
||||
|
||||
structure(
|
||||
ranking,
|
||||
class = c("geposan_ranking", class(ranking))
|
||||
)
|
||||
structure(
|
||||
ranking,
|
||||
class = c("geposan_ranking", class(ranking))
|
||||
)
|
||||
}
|
||||
|
||||
#' Find the best weights to rank the results.
|
||||
|
|
@ -61,40 +61,40 @@ ranking <- function(analysis, weights) {
|
|||
#' @export
|
||||
optimal_weights <- function(analysis, methods, reference_gene_ids,
|
||||
target = "mean") {
|
||||
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
|
||||
stop("Invalid analyis. Use geposan::analyze().")
|
||||
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
|
||||
stop("Invalid analyis. Use geposan::analyze().")
|
||||
}
|
||||
|
||||
|
||||
# Compute the target rank of the reference genes when applying the
|
||||
# weights.
|
||||
target_rank <- function(factors) {
|
||||
data <- ranking(analysis, as.list(factors))
|
||||
|
||||
result <- data[
|
||||
gene %chin% reference_gene_ids,
|
||||
if (target == "min") {
|
||||
min(rank)
|
||||
} else if (target == "max") {
|
||||
max(rank)
|
||||
} else if (target == "mean") {
|
||||
mean(rank)
|
||||
} else {
|
||||
stats::median(rank)
|
||||
}
|
||||
]
|
||||
|
||||
if (result > 0) {
|
||||
result
|
||||
} else {
|
||||
Inf
|
||||
}
|
||||
}
|
||||
|
||||
initial_factors <- rep(1.0, length(methods))
|
||||
names(initial_factors) <- methods
|
||||
|
||||
# Compute the target rank of the reference genes when applying the
|
||||
# weights.
|
||||
target_rank <- function(factors) {
|
||||
data <- ranking(analysis, as.list(factors))
|
||||
optimal_factors <- stats::optim(initial_factors, target_rank)$par
|
||||
|
||||
result <- data[
|
||||
gene %chin% reference_gene_ids,
|
||||
if (target == "min") {
|
||||
min(rank)
|
||||
} else if (target == "max") {
|
||||
max(rank)
|
||||
} else if (target == "mean") {
|
||||
mean(rank)
|
||||
} else {
|
||||
stats::median(rank)
|
||||
}
|
||||
]
|
||||
|
||||
if (result > 0) {
|
||||
result
|
||||
} else {
|
||||
Inf
|
||||
}
|
||||
}
|
||||
|
||||
initial_factors <- rep(1.0, length(methods))
|
||||
names(initial_factors) <- methods
|
||||
|
||||
optimal_factors <- stats::optim(initial_factors, target_rank)$par
|
||||
|
||||
as.list(optimal_factors / max(abs(optimal_factors)))
|
||||
as.list(optimal_factors / max(abs(optimal_factors)))
|
||||
}
|
||||
|
|
|
|||
48
R/result.R
48
R/result.R
|
|
@ -10,18 +10,18 @@
|
|||
#'
|
||||
#' @export
|
||||
result <- function(method_id, scores, details = list()) {
|
||||
stopifnot(is.data.frame(scores) &
|
||||
c("gene", "score") %chin% colnames(scores))
|
||||
stopifnot(is.list(details))
|
||||
stopifnot(is.data.frame(scores) &
|
||||
c("gene", "score") %chin% colnames(scores))
|
||||
stopifnot(is.list(details))
|
||||
|
||||
structure(
|
||||
list(
|
||||
method_id = method_id,
|
||||
scores = scores,
|
||||
details = details
|
||||
),
|
||||
class = "geposan_result"
|
||||
)
|
||||
structure(
|
||||
list(
|
||||
method_id = method_id,
|
||||
scores = scores,
|
||||
details = details
|
||||
),
|
||||
class = "geposan_result"
|
||||
)
|
||||
}
|
||||
|
||||
#' Print a result object.
|
||||
|
|
@ -33,18 +33,18 @@ result <- function(method_id, scores, details = list()) {
|
|||
#'
|
||||
#' @export
|
||||
print.geposan_result <- function(x, ...) {
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan result:",
|
||||
"\n Method: %s",
|
||||
"\n Number of genes: %i",
|
||||
"\n Available details: %s",
|
||||
"\n"
|
||||
),
|
||||
x$method_id,
|
||||
nrow(x$scores),
|
||||
paste(names(x$details), collapse = ", ")
|
||||
))
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan result:",
|
||||
"\n Method: %s",
|
||||
"\n Number of genes: %i",
|
||||
"\n Available details: %s",
|
||||
"\n"
|
||||
),
|
||||
x$method_id,
|
||||
nrow(x$scores),
|
||||
paste(names(x$details), collapse = ", ")
|
||||
))
|
||||
|
||||
invisible(x)
|
||||
invisible(x)
|
||||
}
|
||||
|
|
|
|||
30
R/utils.R
30
R/utils.R
|
|
@ -8,25 +8,25 @@
|
|||
# @param objects A vector of objects that this expression depends on. The hash
|
||||
# of those objects will be used for identifying the cache file.
|
||||
cached <- function(name, objects, expr) {
|
||||
if (!dir.exists("cache")) {
|
||||
dir.create("cache")
|
||||
}
|
||||
if (!dir.exists("cache")) {
|
||||
dir.create("cache")
|
||||
}
|
||||
|
||||
id <- rlang::hash(objects)
|
||||
cache_file <- sprintf("cache/%s_%s.rda", name, id)
|
||||
id <- rlang::hash(objects)
|
||||
cache_file <- sprintf("cache/%s_%s.rda", name, id)
|
||||
|
||||
if (file.exists(cache_file)) {
|
||||
# If the cache file exists, we restore the data from it.
|
||||
load(cache_file)
|
||||
} else {
|
||||
# If the cache file doesn't exist, we have to do the computation.
|
||||
data <- expr
|
||||
if (file.exists(cache_file)) {
|
||||
# If the cache file exists, we restore the data from it.
|
||||
load(cache_file)
|
||||
} else {
|
||||
# If the cache file doesn't exist, we have to do the computation.
|
||||
data <- expr
|
||||
|
||||
# The results are cached for the next run.
|
||||
save(data, file = cache_file, compress = "xz")
|
||||
}
|
||||
# The results are cached for the next run.
|
||||
save(data, file = cache_file, compress = "xz")
|
||||
}
|
||||
|
||||
data
|
||||
data
|
||||
}
|
||||
|
||||
# This is needed to make data.table's symbols available within the package.
|
||||
|
|
|
|||
132
R/validate.R
132
R/validate.R
|
|
@ -27,68 +27,68 @@
|
|||
#'
|
||||
#' @export
|
||||
validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
|
||||
if (!inherits(ranking, "geposan_ranking")) {
|
||||
stop("Ranking is invalid. Use geposan::ranking().")
|
||||
}
|
||||
if (!inherits(ranking, "geposan_ranking")) {
|
||||
stop("Ranking is invalid. Use geposan::ranking().")
|
||||
}
|
||||
|
||||
if (is.null(progress)) {
|
||||
progress_bar <- progress::progress_bar$new()
|
||||
progress_bar$update(0.0)
|
||||
if (is.null(progress)) {
|
||||
progress_bar <- progress::progress_bar$new()
|
||||
progress_bar$update(0.0)
|
||||
|
||||
progress <- function(progress_value) {
|
||||
if (!progress_bar$finished) {
|
||||
progress_bar$update(progress_value)
|
||||
if (progress_value >= 1.0) {
|
||||
progress_bar$terminate()
|
||||
}
|
||||
}
|
||||
progress <- function(progress_value) {
|
||||
if (!progress_bar$finished) {
|
||||
progress_bar$update(progress_value)
|
||||
if (progress_value >= 1.0) {
|
||||
progress_bar$terminate()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
progress_state <- 0.0
|
||||
progress_step <- 1.0 / length(reference_gene_ids)
|
||||
progress_state <- 0.0
|
||||
progress_step <- 1.0 / length(reference_gene_ids)
|
||||
|
||||
results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
|
||||
results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
|
||||
|
||||
for (gene_id in reference_gene_ids) {
|
||||
included_gene_ids <- reference_gene_ids[
|
||||
reference_gene_ids != gene_id
|
||||
]
|
||||
for (gene_id in reference_gene_ids) {
|
||||
included_gene_ids <- reference_gene_ids[
|
||||
reference_gene_ids != gene_id
|
||||
]
|
||||
|
||||
weights <- optimal_weights(
|
||||
ranking,
|
||||
method_ids,
|
||||
included_gene_ids
|
||||
)
|
||||
|
||||
ranking_validation <- ranking(ranking, weights)
|
||||
|
||||
results[
|
||||
gene == gene_id,
|
||||
percentile_validation := ranking_validation[
|
||||
gene == gene_id,
|
||||
percentile
|
||||
]
|
||||
]
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress_state <- progress_state + progress_step
|
||||
progress(progress_state)
|
||||
}
|
||||
}
|
||||
|
||||
results[, error := percentile - percentile_validation]
|
||||
setorder(results, error)
|
||||
|
||||
structure(
|
||||
list(
|
||||
validation = results,
|
||||
mean_percentile_original = results[, mean(percentile)],
|
||||
mean_percentile_validation = results[, mean(percentile_validation)],
|
||||
mean_error = results[, mean(error)]
|
||||
),
|
||||
class = "geposan_validation"
|
||||
weights <- optimal_weights(
|
||||
ranking,
|
||||
method_ids,
|
||||
included_gene_ids
|
||||
)
|
||||
|
||||
ranking_validation <- ranking(ranking, weights)
|
||||
|
||||
results[
|
||||
gene == gene_id,
|
||||
percentile_validation := ranking_validation[
|
||||
gene == gene_id,
|
||||
percentile
|
||||
]
|
||||
]
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress_state <- progress_state + progress_step
|
||||
progress(progress_state)
|
||||
}
|
||||
}
|
||||
|
||||
results[, error := percentile - percentile_validation]
|
||||
setorder(results, error)
|
||||
|
||||
structure(
|
||||
list(
|
||||
validation = results,
|
||||
mean_percentile_original = results[, mean(percentile)],
|
||||
mean_percentile_validation = results[, mean(percentile_validation)],
|
||||
mean_error = results[, mean(error)]
|
||||
),
|
||||
class = "geposan_validation"
|
||||
)
|
||||
}
|
||||
|
||||
#' S3 method to print a validation object.
|
||||
|
|
@ -100,18 +100,18 @@ validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
|
|||
#'
|
||||
#' @export
|
||||
print.geposan_validation <- function(x, ...) {
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan validation:",
|
||||
"\n Mean percentile original: %.1f%%",
|
||||
"\n Mean percentile validation: %.1f%%",
|
||||
"\n Mean error: %.1f percent points",
|
||||
"\n"
|
||||
),
|
||||
x$mean_percentile_original * 100,
|
||||
x$mean_percentile_validation * 100,
|
||||
x$mean_error * 100
|
||||
))
|
||||
cat(sprintf(
|
||||
paste0(
|
||||
"geposan validation:",
|
||||
"\n Mean percentile original: %.1f%%",
|
||||
"\n Mean percentile validation: %.1f%%",
|
||||
"\n Mean error: %.1f percent points",
|
||||
"\n"
|
||||
),
|
||||
x$mean_percentile_original * 100,
|
||||
x$mean_percentile_validation * 100,
|
||||
x$mean_error * 100
|
||||
))
|
||||
|
||||
invisible(x)
|
||||
invisible(x)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,23 +5,23 @@ ensembl_api_url <- "https://rest.ensembl.org"
|
|||
|
||||
#' Perform a request to the Ensembl REST API.
|
||||
ensembl_request <- function(api_path) {
|
||||
content(stop_for_status(GET(
|
||||
paste0(ensembl_api_url, api_path),
|
||||
content_type_json()
|
||||
)))
|
||||
content(stop_for_status(GET(
|
||||
paste0(ensembl_api_url, api_path),
|
||||
content_type_json()
|
||||
)))
|
||||
}
|
||||
|
||||
#' Get IDs of all available vertebrates.
|
||||
get_species_ids <- function() {
|
||||
species <- ensembl_request("/info/species")$species
|
||||
sapply(species, function(species) species$name)
|
||||
species <- ensembl_request("/info/species")$species
|
||||
sapply(species, function(species) species$name)
|
||||
}
|
||||
|
||||
#' Get all chromosomes names for a species.
|
||||
get_species_chromosomes <- function(species_id) {
|
||||
chromosomes <- unlist(ensembl_request(
|
||||
paste0("/info/assembly/", species_id)
|
||||
)$karyotype)
|
||||
chromosomes <- unlist(ensembl_request(
|
||||
paste0("/info/assembly/", species_id)
|
||||
)$karyotype)
|
||||
}
|
||||
|
||||
#' Get a vector of all available unqiue chromosome names.
|
||||
|
|
@ -29,6 +29,6 @@ get_species_chromosomes <- function(species_id) {
|
|||
#' There are multiple names for mitochondrial DNA which have to be removed
|
||||
#' manually, unfortunately.
|
||||
get_all_chromosomes <- function() {
|
||||
chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
|
||||
unique(unlist(chromosomes))
|
||||
chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
|
||||
unique(unlist(chromosomes))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,8 +12,8 @@ ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
|
|||
|
||||
# Filter out species ID and name from the result.
|
||||
species <- ensembl_datasets[, .(
|
||||
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
|
||||
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
|
||||
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
|
||||
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
|
||||
)]
|
||||
|
||||
# List of assemblies that the Ensembl Rest API advertises as chromosomes.
|
||||
|
|
@ -23,232 +23,232 @@ species <- ensembl_datasets[, .(
|
|||
#
|
||||
# See get_all_chromosomes()
|
||||
valid_chromosome_names <- c(
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
"10",
|
||||
"11",
|
||||
"12",
|
||||
"13",
|
||||
"14",
|
||||
"15",
|
||||
"16",
|
||||
"17",
|
||||
"18",
|
||||
"19",
|
||||
"20",
|
||||
"X",
|
||||
"Y",
|
||||
"21",
|
||||
"4A",
|
||||
"1A",
|
||||
"22",
|
||||
"23",
|
||||
"24",
|
||||
"25LG1",
|
||||
"25LG2",
|
||||
"26",
|
||||
"27",
|
||||
"28",
|
||||
"LGE22",
|
||||
"Z",
|
||||
"25",
|
||||
"29",
|
||||
"A1",
|
||||
"A2",
|
||||
"A3",
|
||||
"B1",
|
||||
"B2",
|
||||
"B3",
|
||||
"B4",
|
||||
"C1",
|
||||
"C2",
|
||||
"D1",
|
||||
"D2",
|
||||
"D3",
|
||||
"D4",
|
||||
"E1",
|
||||
"E2",
|
||||
"E3",
|
||||
"F1",
|
||||
"F2",
|
||||
"2A",
|
||||
"2B",
|
||||
"LG01",
|
||||
"LG02",
|
||||
"LG03",
|
||||
"LG04",
|
||||
"LG05",
|
||||
"LG06",
|
||||
"LG07",
|
||||
"LG08",
|
||||
"LG09",
|
||||
"LG10",
|
||||
"LG11",
|
||||
"LG12",
|
||||
"LG13",
|
||||
"LG14",
|
||||
"LG15",
|
||||
"LG16",
|
||||
"LG17",
|
||||
"LG18",
|
||||
"LG19",
|
||||
"LG20",
|
||||
"LG21",
|
||||
"LG22",
|
||||
"LG23",
|
||||
"LG24",
|
||||
"LG25",
|
||||
"I",
|
||||
"II",
|
||||
"III",
|
||||
"IV",
|
||||
"V",
|
||||
"VI",
|
||||
"VII",
|
||||
"VIII",
|
||||
"IX",
|
||||
"XI",
|
||||
"XII",
|
||||
"XIII",
|
||||
"XIV",
|
||||
"XV",
|
||||
"XVI",
|
||||
"XVII",
|
||||
"XVIII",
|
||||
"XIX",
|
||||
"XX",
|
||||
"XXI",
|
||||
"XXII",
|
||||
"XXIII",
|
||||
"XXIV",
|
||||
"W",
|
||||
"30",
|
||||
"31",
|
||||
"32",
|
||||
"33",
|
||||
"34",
|
||||
"35",
|
||||
"36",
|
||||
"37",
|
||||
"38",
|
||||
"39",
|
||||
"40",
|
||||
"2L",
|
||||
"2R",
|
||||
"3L",
|
||||
"3R",
|
||||
"LG1",
|
||||
"LG2",
|
||||
"LG3",
|
||||
"LG4",
|
||||
"LG5",
|
||||
"LG6",
|
||||
"LG7",
|
||||
"LG8",
|
||||
"LG9",
|
||||
"LG26",
|
||||
"LG27",
|
||||
"LG28",
|
||||
"LG29",
|
||||
"LG30",
|
||||
"1a",
|
||||
"7b",
|
||||
"22a",
|
||||
"LGE22C19W28_E50C23",
|
||||
"LGE64",
|
||||
"7a",
|
||||
"MIC_1",
|
||||
"MIC_10",
|
||||
"MIC_11",
|
||||
"MIC_2",
|
||||
"MIC_3",
|
||||
"MIC_4",
|
||||
"MIC_5",
|
||||
"MIC_6",
|
||||
"MIC_7",
|
||||
"MIC_8",
|
||||
"MIC_9",
|
||||
"sgr01",
|
||||
"sgr02",
|
||||
"sgr03",
|
||||
"sgr04",
|
||||
"sgr05",
|
||||
"sgr06",
|
||||
"sgr07",
|
||||
"sgr08",
|
||||
"sgr09",
|
||||
"sgr10",
|
||||
"sgr11",
|
||||
"sgr12",
|
||||
"sgr13",
|
||||
"sgr14",
|
||||
"sgr15",
|
||||
"sgr16",
|
||||
"sgr17",
|
||||
"sgr18",
|
||||
"sgr19",
|
||||
"X1",
|
||||
"X2",
|
||||
"X3",
|
||||
"X4",
|
||||
"X5",
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d",
|
||||
"f",
|
||||
"g",
|
||||
"h",
|
||||
"41",
|
||||
"42",
|
||||
"43",
|
||||
"44",
|
||||
"45",
|
||||
"46",
|
||||
"47",
|
||||
"48",
|
||||
"49",
|
||||
"50",
|
||||
"LG28B",
|
||||
"LG30F",
|
||||
"LG36F",
|
||||
"LG37M",
|
||||
"LG42F",
|
||||
"LG44F",
|
||||
"LG45M",
|
||||
"LG48F",
|
||||
"LG49B",
|
||||
"LG34",
|
||||
"LG35",
|
||||
"LG7_11",
|
||||
"groupI",
|
||||
"groupII",
|
||||
"groupIII",
|
||||
"groupIV",
|
||||
"groupV",
|
||||
"groupVI",
|
||||
"groupVII",
|
||||
"groupVIII",
|
||||
"groupIX",
|
||||
"groupX",
|
||||
"groupXI",
|
||||
"groupXII",
|
||||
"groupXIII",
|
||||
"groupXIV",
|
||||
"groupXV",
|
||||
"groupXVI",
|
||||
"groupXVII",
|
||||
"groupXVIII",
|
||||
"groupXIX",
|
||||
"groupXX",
|
||||
"groupXXI"
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
"10",
|
||||
"11",
|
||||
"12",
|
||||
"13",
|
||||
"14",
|
||||
"15",
|
||||
"16",
|
||||
"17",
|
||||
"18",
|
||||
"19",
|
||||
"20",
|
||||
"X",
|
||||
"Y",
|
||||
"21",
|
||||
"4A",
|
||||
"1A",
|
||||
"22",
|
||||
"23",
|
||||
"24",
|
||||
"25LG1",
|
||||
"25LG2",
|
||||
"26",
|
||||
"27",
|
||||
"28",
|
||||
"LGE22",
|
||||
"Z",
|
||||
"25",
|
||||
"29",
|
||||
"A1",
|
||||
"A2",
|
||||
"A3",
|
||||
"B1",
|
||||
"B2",
|
||||
"B3",
|
||||
"B4",
|
||||
"C1",
|
||||
"C2",
|
||||
"D1",
|
||||
"D2",
|
||||
"D3",
|
||||
"D4",
|
||||
"E1",
|
||||
"E2",
|
||||
"E3",
|
||||
"F1",
|
||||
"F2",
|
||||
"2A",
|
||||
"2B",
|
||||
"LG01",
|
||||
"LG02",
|
||||
"LG03",
|
||||
"LG04",
|
||||
"LG05",
|
||||
"LG06",
|
||||
"LG07",
|
||||
"LG08",
|
||||
"LG09",
|
||||
"LG10",
|
||||
"LG11",
|
||||
"LG12",
|
||||
"LG13",
|
||||
"LG14",
|
||||
"LG15",
|
||||
"LG16",
|
||||
"LG17",
|
||||
"LG18",
|
||||
"LG19",
|
||||
"LG20",
|
||||
"LG21",
|
||||
"LG22",
|
||||
"LG23",
|
||||
"LG24",
|
||||
"LG25",
|
||||
"I",
|
||||
"II",
|
||||
"III",
|
||||
"IV",
|
||||
"V",
|
||||
"VI",
|
||||
"VII",
|
||||
"VIII",
|
||||
"IX",
|
||||
"XI",
|
||||
"XII",
|
||||
"XIII",
|
||||
"XIV",
|
||||
"XV",
|
||||
"XVI",
|
||||
"XVII",
|
||||
"XVIII",
|
||||
"XIX",
|
||||
"XX",
|
||||
"XXI",
|
||||
"XXII",
|
||||
"XXIII",
|
||||
"XXIV",
|
||||
"W",
|
||||
"30",
|
||||
"31",
|
||||
"32",
|
||||
"33",
|
||||
"34",
|
||||
"35",
|
||||
"36",
|
||||
"37",
|
||||
"38",
|
||||
"39",
|
||||
"40",
|
||||
"2L",
|
||||
"2R",
|
||||
"3L",
|
||||
"3R",
|
||||
"LG1",
|
||||
"LG2",
|
||||
"LG3",
|
||||
"LG4",
|
||||
"LG5",
|
||||
"LG6",
|
||||
"LG7",
|
||||
"LG8",
|
||||
"LG9",
|
||||
"LG26",
|
||||
"LG27",
|
||||
"LG28",
|
||||
"LG29",
|
||||
"LG30",
|
||||
"1a",
|
||||
"7b",
|
||||
"22a",
|
||||
"LGE22C19W28_E50C23",
|
||||
"LGE64",
|
||||
"7a",
|
||||
"MIC_1",
|
||||
"MIC_10",
|
||||
"MIC_11",
|
||||
"MIC_2",
|
||||
"MIC_3",
|
||||
"MIC_4",
|
||||
"MIC_5",
|
||||
"MIC_6",
|
||||
"MIC_7",
|
||||
"MIC_8",
|
||||
"MIC_9",
|
||||
"sgr01",
|
||||
"sgr02",
|
||||
"sgr03",
|
||||
"sgr04",
|
||||
"sgr05",
|
||||
"sgr06",
|
||||
"sgr07",
|
||||
"sgr08",
|
||||
"sgr09",
|
||||
"sgr10",
|
||||
"sgr11",
|
||||
"sgr12",
|
||||
"sgr13",
|
||||
"sgr14",
|
||||
"sgr15",
|
||||
"sgr16",
|
||||
"sgr17",
|
||||
"sgr18",
|
||||
"sgr19",
|
||||
"X1",
|
||||
"X2",
|
||||
"X3",
|
||||
"X4",
|
||||
"X5",
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d",
|
||||
"f",
|
||||
"g",
|
||||
"h",
|
||||
"41",
|
||||
"42",
|
||||
"43",
|
||||
"44",
|
||||
"45",
|
||||
"46",
|
||||
"47",
|
||||
"48",
|
||||
"49",
|
||||
"50",
|
||||
"LG28B",
|
||||
"LG30F",
|
||||
"LG36F",
|
||||
"LG37M",
|
||||
"LG42F",
|
||||
"LG44F",
|
||||
"LG45M",
|
||||
"LG48F",
|
||||
"LG49B",
|
||||
"LG34",
|
||||
"LG35",
|
||||
"LG7_11",
|
||||
"groupI",
|
||||
"groupII",
|
||||
"groupIII",
|
||||
"groupIV",
|
||||
"groupV",
|
||||
"groupVI",
|
||||
"groupVII",
|
||||
"groupVIII",
|
||||
"groupIX",
|
||||
"groupX",
|
||||
"groupXI",
|
||||
"groupXII",
|
||||
"groupXIII",
|
||||
"groupXIV",
|
||||
"groupXV",
|
||||
"groupXVI",
|
||||
"groupXVII",
|
||||
"groupXVIII",
|
||||
"groupXIX",
|
||||
"groupXX",
|
||||
"groupXXI"
|
||||
)
|
||||
|
||||
#' Get all chromosome names for an Ensembl dataset.
|
||||
|
|
@ -256,8 +256,8 @@ valid_chromosome_names <- c(
|
|||
#' The function tries to filter out valid chromosome names from the available
|
||||
#' assemblies in the dataset.
|
||||
get_chromosome_names <- function(dataset) {
|
||||
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
|
||||
chromosome_names[chromosome_names %chin% valid_chromosome_names]
|
||||
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
|
||||
chromosome_names[chromosome_names %chin% valid_chromosome_names]
|
||||
}
|
||||
|
||||
# Retrieve information on human genes. This will only include genes on
|
||||
|
|
@ -267,16 +267,16 @@ rlog::log_info("Retrieving information on human genes")
|
|||
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
||||
|
||||
human_data <- data.table(biomaRt::getBM(
|
||||
attributes = c(
|
||||
"ensembl_gene_id",
|
||||
"hgnc_symbol",
|
||||
"chromosome_name",
|
||||
"start_position",
|
||||
"end_position"
|
||||
),
|
||||
filters = "chromosome_name",
|
||||
values = get_chromosome_names(dataset),
|
||||
mart = dataset
|
||||
attributes = c(
|
||||
"ensembl_gene_id",
|
||||
"hgnc_symbol",
|
||||
"chromosome_name",
|
||||
"start_position",
|
||||
"end_position"
|
||||
),
|
||||
filters = "chromosome_name",
|
||||
values = get_chromosome_names(dataset),
|
||||
mart = dataset
|
||||
))
|
||||
|
||||
# Remove duplicated gene IDs (at the time of writing, there are a handful).
|
||||
|
|
@ -284,9 +284,9 @@ human_data <- unique(human_data, by = "ensembl_gene_id")
|
|||
|
||||
# Only keep relevant information on genes.
|
||||
genes <- human_data[, .(
|
||||
id = ensembl_gene_id,
|
||||
name = hgnc_symbol,
|
||||
chromosome = chromosome_name
|
||||
id = ensembl_gene_id,
|
||||
name = hgnc_symbol,
|
||||
chromosome = chromosome_name
|
||||
)]
|
||||
|
||||
# Retrieve gene distance data across species.
|
||||
|
|
@ -296,39 +296,39 @@ distances <- data.table()
|
|||
|
||||
#' Handle data for one species.
|
||||
handle_species <- function(species_id, species_data) {
|
||||
chromosomes <- species_data[,
|
||||
.(chromosome_length = max(end_position)),
|
||||
by = chromosome_name
|
||||
]
|
||||
chromosomes <- species_data[,
|
||||
.(chromosome_length = max(end_position)),
|
||||
by = chromosome_name
|
||||
]
|
||||
|
||||
# Store the number of chromosomes in the species table.
|
||||
species[id == species_id, n_chromosomes := nrow(chromosomes)]
|
||||
# Store the number of chromosomes in the species table.
|
||||
species[id == species_id, n_chromosomes := nrow(chromosomes)]
|
||||
|
||||
# Store the median chromosome length in the species table.
|
||||
species[
|
||||
id == species_id,
|
||||
median_chromosome_length := chromosomes[, median(chromosome_length)]
|
||||
]
|
||||
# Store the median chromosome length in the species table.
|
||||
species[
|
||||
id == species_id,
|
||||
median_chromosome_length := chromosomes[, median(chromosome_length)]
|
||||
]
|
||||
|
||||
# Precompute the genes' distance to the nearest telomere.
|
||||
species_distances <- species_data[
|
||||
chromosomes,
|
||||
.(
|
||||
species = species_id,
|
||||
gene = ensembl_gene_id,
|
||||
chromosome_name = chromosome_name,
|
||||
start_position = start_position,
|
||||
end_position = end_position,
|
||||
distance = pmin(
|
||||
start_position,
|
||||
chromosome_length - end_position
|
||||
)
|
||||
),
|
||||
on = "chromosome_name"
|
||||
]
|
||||
# Precompute the genes' distance to the nearest telomere.
|
||||
species_distances <- species_data[
|
||||
chromosomes,
|
||||
.(
|
||||
species = species_id,
|
||||
gene = ensembl_gene_id,
|
||||
chromosome_name = chromosome_name,
|
||||
start_position = start_position,
|
||||
end_position = end_position,
|
||||
distance = pmin(
|
||||
start_position,
|
||||
chromosome_length - end_position
|
||||
)
|
||||
),
|
||||
on = "chromosome_name"
|
||||
]
|
||||
|
||||
# Add species distances to the distances table.
|
||||
distances <<- rbindlist(list(distances, species_distances))
|
||||
# Add species distances to the distances table.
|
||||
distances <<- rbindlist(list(distances, species_distances))
|
||||
}
|
||||
|
||||
# Handle the human first, as we already retrieved the data and don't need to
|
||||
|
|
@ -337,67 +337,67 @@ handle_species("hsapiens", human_data)
|
|||
|
||||
# Iterate through all other species and retrieve their distance data.
|
||||
for (species_id in species[id != "hsapiens", id]) {
|
||||
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
|
||||
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
|
||||
|
||||
dataset <- biomaRt::useDataset(
|
||||
sprintf("%s_gene_ensembl", species_id),
|
||||
mart = ensembl
|
||||
)
|
||||
dataset <- biomaRt::useDataset(
|
||||
sprintf("%s_gene_ensembl", species_id),
|
||||
mart = ensembl
|
||||
)
|
||||
|
||||
# Besides the attributes that are always present, we need to check for
|
||||
# human orthologs. Some species don't have that information and will be
|
||||
# skipped.
|
||||
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
||||
biomaRt::listAttributes(dataset, what = "name")) {
|
||||
rlog::log_info("No data on human orthologs")
|
||||
species <- species[id != species_id]
|
||||
# Besides the attributes that are always present, we need to check for
|
||||
# human orthologs. Some species don't have that information and will be
|
||||
# skipped.
|
||||
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
||||
biomaRt::listAttributes(dataset, what = "name")) {
|
||||
rlog::log_info("No data on human orthologs")
|
||||
species <- species[id != species_id]
|
||||
|
||||
next
|
||||
}
|
||||
next
|
||||
}
|
||||
|
||||
chromosome_names <- get_chromosome_names(dataset)
|
||||
chromosome_names <- get_chromosome_names(dataset)
|
||||
|
||||
# Skip the species, if there are no assembled chromosomes.
|
||||
if (length(chromosome_names) <= 0) {
|
||||
rlog::log_info("No matching chromosome assemblies")
|
||||
species <- species[id != species_id]
|
||||
# Skip the species, if there are no assembled chromosomes.
|
||||
if (length(chromosome_names) <= 0) {
|
||||
rlog::log_info("No matching chromosome assemblies")
|
||||
species <- species[id != species_id]
|
||||
|
||||
next
|
||||
}
|
||||
next
|
||||
}
|
||||
|
||||
# Retrieve information on all genes of the current species, that have
|
||||
# human orthologs. This is called "homolog" in the Ensembl schema.
|
||||
species_distances <- data.table(biomaRt::getBM(
|
||||
attributes = c(
|
||||
"hsapiens_homolog_ensembl_gene",
|
||||
"chromosome_name",
|
||||
"start_position",
|
||||
"end_position"
|
||||
),
|
||||
filters = c("with_hsapiens_homolog", "chromosome_name"),
|
||||
values = list(TRUE, chromosome_names),
|
||||
mart = dataset
|
||||
))
|
||||
# Retrieve information on all genes of the current species, that have
|
||||
# human orthologs. This is called "homolog" in the Ensembl schema.
|
||||
species_distances <- data.table(biomaRt::getBM(
|
||||
attributes = c(
|
||||
"hsapiens_homolog_ensembl_gene",
|
||||
"chromosome_name",
|
||||
"start_position",
|
||||
"end_position"
|
||||
),
|
||||
filters = c("with_hsapiens_homolog", "chromosome_name"),
|
||||
values = list(TRUE, chromosome_names),
|
||||
mart = dataset
|
||||
))
|
||||
|
||||
# Only include human genes that we have information on.
|
||||
species_distances <- species_distances[
|
||||
hsapiens_homolog_ensembl_gene %chin% genes$id
|
||||
]
|
||||
# Only include human genes that we have information on.
|
||||
species_distances <- species_distances[
|
||||
hsapiens_homolog_ensembl_gene %chin% genes$id
|
||||
]
|
||||
|
||||
# Only include one ortholog per human gene.
|
||||
species_distances <- unique(
|
||||
species_distances,
|
||||
by = "hsapiens_homolog_ensembl_gene"
|
||||
)
|
||||
# Only include one ortholog per human gene.
|
||||
species_distances <- unique(
|
||||
species_distances,
|
||||
by = "hsapiens_homolog_ensembl_gene"
|
||||
)
|
||||
|
||||
# Rename gene ID column to match the human data.
|
||||
setnames(
|
||||
species_distances,
|
||||
"hsapiens_homolog_ensembl_gene",
|
||||
"ensembl_gene_id"
|
||||
)
|
||||
# Rename gene ID column to match the human data.
|
||||
setnames(
|
||||
species_distances,
|
||||
"hsapiens_homolog_ensembl_gene",
|
||||
"ensembl_gene_id"
|
||||
)
|
||||
|
||||
handle_species(species_id, species_distances)
|
||||
handle_species(species_id, species_distances)
|
||||
}
|
||||
|
||||
# Save data in the appropriate place.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue