diff --git a/R/analyze.R b/R/analyze.R
index b17dbc8..1605427 100644
--- a/R/analyze.R
+++ b/R/analyze.R
@@ -18,54 +18,54 @@
#'
#' @export
analyze <- function(preset, progress = NULL, include_results = TRUE) {
- if (!inherits(preset, "geposan_preset")) {
- stop("Preset is invalid. Use geposan::preset() to create one.")
- }
+ if (!inherits(preset, "geposan_preset")) {
+ stop("Preset is invalid. Use geposan::preset() to create one.")
+ }
- if (is.null(progress)) {
- progress_bar <- progress::progress_bar$new()
- progress_bar$update(0.0)
+ if (is.null(progress)) {
+ progress_bar <- progress::progress_bar$new()
+ progress_bar$update(0.0)
- progress <- function(progress_value) {
- if (!progress_bar$finished) {
- progress_bar$update(progress_value)
- if (progress_value >= 1.0) {
- progress_bar$terminate()
- }
- }
+ progress <- function(progress_value) {
+ if (!progress_bar$finished) {
+ progress_bar$update(progress_value)
+ if (progress_value >= 1.0) {
+ progress_bar$terminate()
}
+ }
}
+ }
- progress_buffer <- 0.0
- method_count <- length(preset$methods)
+ progress_buffer <- 0.0
+ method_count <- length(preset$methods)
- method_progress <- function(progress_value) {
- progress(progress_buffer + progress_value / method_count)
- }
+ method_progress <- function(progress_value) {
+ progress(progress_buffer + progress_value / method_count)
+ }
- scores <- data.table(gene = preset$gene_id)
- results <- list()
+ scores <- data.table(gene = preset$gene_id)
+ results <- list()
- for (method in preset$methods) {
- method_results <- method$func(preset, method_progress)
+ for (method in preset$methods) {
+ method_results <- method$func(preset, method_progress)
- scores <- merge(scores, method_results$scores, by = "gene")
- setnames(scores, "score", method$id)
+ scores <- merge(scores, method_results$scores, by = "gene")
+ setnames(scores, "score", method$id)
- results <- c(results, list(method_results))
+ results <- c(results, list(method_results))
- progress_buffer <- progress_buffer + 1 / method_count
- progress(progress_buffer)
- }
+ progress_buffer <- progress_buffer + 1 / method_count
+ progress(progress_buffer)
+ }
- structure(
- list(
- preset = preset,
- scores = scores,
- results = if (include_results) results else NULL
- ),
- class = "geposan_analysis"
- )
+ structure(
+ list(
+ preset = preset,
+ scores = scores,
+ results = if (include_results) results else NULL
+ ),
+ class = "geposan_analysis"
+ )
}
#' Print an analysis object.
@@ -77,14 +77,14 @@ analyze <- function(preset, progress = NULL, include_results = TRUE) {
#'
#' @export
print.geposan_analysis <- function(x, ...) {
- cat("geposan analysis:\n\n")
- print(x$preset)
+ cat("geposan analysis:\n\n")
+ print(x$preset)
+ cat("\n")
+
+ for (result in x$results) {
+ print(result)
cat("\n")
+ }
- for (result in x$results) {
- print(result)
- cat("\n")
- }
-
- invisible(x)
+ invisible(x)
}
diff --git a/R/comparison.R b/R/comparison.R
index 49e73af..62d21d4 100644
--- a/R/comparison.R
+++ b/R/comparison.R
@@ -18,38 +18,38 @@
#'
#' @export
compare <- function(ranking, comparison_gene_ids) {
- if (!inherits(ranking, "geposan_ranking")) {
- stop("Invalid ranking. Use geposan::ranking().")
- }
+ if (!inherits(ranking, "geposan_ranking")) {
+ stop("Invalid ranking. Use geposan::ranking().")
+ }
- comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
+ comparison_ranking <- ranking[gene %chin% comparison_gene_ids]
- quantiles <- data.table(
- quantile = c("0%", "25%", "50%", "75%", "100%"),
- score = stats::quantile(comparison_ranking[, score]),
- rank = stats::quantile(
- comparison_ranking[, rank],
- probs = seq(1, 0, -0.25)
- ),
- percentile = stats::quantile(comparison_ranking[, percentile])
- )
+ quantiles <- data.table(
+ quantile = c("0%", "25%", "50%", "75%", "100%"),
+ score = stats::quantile(comparison_ranking[, score]),
+ rank = stats::quantile(
+ comparison_ranking[, rank],
+ probs = seq(1, 0, -0.25)
+ ),
+ percentile = stats::quantile(comparison_ranking[, percentile])
+ )
- p_value <- stats::wilcox.test(
- x = comparison_ranking[, score],
- y = ranking[!gene %chin% comparison_gene_ids, score],
- alternative = "greater"
- )$p.value
+ p_value <- stats::wilcox.test(
+ x = comparison_ranking[, score],
+ y = ranking[!gene %chin% comparison_gene_ids, score],
+ alternative = "greater"
+ )$p.value
- structure(
- list(
- quantiles = quantiles,
- mean_score = comparison_ranking[, mean(score)],
- mean_rank = comparison_ranking[, mean(rank)],
- mean_percentile = comparison_ranking[, mean(percentile)],
- p_value = p_value
- ),
- class = "geposan_comparison"
- )
+ structure(
+ list(
+ quantiles = quantiles,
+ mean_score = comparison_ranking[, mean(score)],
+ mean_rank = comparison_ranking[, mean(rank)],
+ mean_percentile = comparison_ranking[, mean(percentile)],
+ p_value = p_value
+ ),
+ class = "geposan_comparison"
+ )
}
#' S3 method to print a comparison object.
@@ -61,32 +61,32 @@ compare <- function(ranking, comparison_gene_ids) {
#'
#' @export
print.geposan_comparison <- function(x, ...) {
- cat("geposan comparison:\n\n")
+ cat("geposan comparison:\n\n")
- quantiles_formatted <- x$quantiles[, .(
- "Quantile" = quantile,
- "Score" = round(score, 3),
- "Rank" = rank,
- "Percentile" = paste0(
- format(round(percentile * 100, 1), nsmall = 1),
- "%"
- )
- )]
+ quantiles_formatted <- x$quantiles[, .(
+ "Quantile" = quantile,
+ "Score" = round(score, 3),
+ "Rank" = rank,
+ "Percentile" = paste0(
+ format(round(percentile * 100, 1), nsmall = 1),
+ "%"
+ )
+ )]
- print(quantiles_formatted, row.names = FALSE)
+ print(quantiles_formatted, row.names = FALSE)
- cat(sprintf(
- paste0(
- "\n Mean score: %.3f",
- "\n Mean rank: %.1f",
- "\n Mean percentile: %.1f%%",
- "\n p-value for better scores: %.4f\n"
- ),
- x$mean_score,
- x$mean_rank,
- x$mean_percentile * 100,
- x$p_value
- ))
+ cat(sprintf(
+ paste0(
+ "\n Mean score: %.3f",
+ "\n Mean rank: %.1f",
+ "\n Mean percentile: %.1f%%",
+ "\n p-value for better scores: %.4f\n"
+ ),
+ x$mean_score,
+ x$mean_rank,
+ x$mean_percentile * 100,
+ x$p_value
+ ))
- invisible(x)
+ invisible(x)
}
diff --git a/R/method.R b/R/method.R
index cff0f61..2d909b1 100644
--- a/R/method.R
+++ b/R/method.R
@@ -12,34 +12,34 @@
#'
#' @export
method <- function(id, name, description, func) {
- stopifnot(is.character(id) & length(id) == 1)
- stopifnot(is.character(name) & length(name) == 1)
- stopifnot(is.character(description) & length(description) == 1)
- stopifnot(is.function(func))
+ stopifnot(is.character(id) & length(id) == 1)
+ stopifnot(is.character(name) & length(name) == 1)
+ stopifnot(is.character(description) & length(description) == 1)
+ stopifnot(is.function(func))
- structure(
- list(
- id = id,
- name = name,
- description = description,
- func = func
- ),
- class = "geposan_method"
- )
+ structure(
+ list(
+ id = id,
+ name = name,
+ description = description,
+ func = func
+ ),
+ class = "geposan_method"
+ )
}
#' Get a list of all available methods.
#'
#' @export
all_methods <- function() {
- list(
- clustering(),
- correlation(),
- neural(),
- adjacency(),
- species_adjacency(),
- proximity()
- )
+ list(
+ clustering(),
+ correlation(),
+ neural(),
+ adjacency(),
+ species_adjacency(),
+ proximity()
+ )
}
#' Print a method object.
@@ -51,18 +51,18 @@ all_methods <- function() {
#'
#' @export
print.geposan_method <- function(x, ...) {
- cat(sprintf(
- paste0(
- "geposan method:",
- "\n Method ID: %s",
- "\n Name: %s",
- "\n Description: %s",
- "\n"
- ),
- x$id,
- x$name,
- x$description
- ))
+ cat(sprintf(
+ paste0(
+ "geposan method:",
+ "\n Method ID: %s",
+ "\n Name: %s",
+ "\n Description: %s",
+ "\n"
+ ),
+ x$id,
+ x$name,
+ x$description
+ ))
- invisible(x)
+ invisible(x)
}
diff --git a/R/method_adjacency.R b/R/method_adjacency.R
index 26ad099..2ece9e6 100644
--- a/R/method_adjacency.R
+++ b/R/method_adjacency.R
@@ -12,14 +12,14 @@
#'
#' @export
densest <- function(data) {
- as.numeric(if (length(data) <= 0) {
- NULL
- } else if (length(data) == 1) {
- data
- } else {
- density <- stats::density(data)
- mean(density$x[density$y == max(density$y)])
- })
+ as.numeric(if (length(data) <= 0) {
+ NULL
+ } else if (length(data) == 1) {
+ data
+ } else {
+ density <- stats::density(data)
+ mean(density$x[density$y == max(density$y)])
+ })
}
#' Score genes based on their proximity to the reference genes.
@@ -40,80 +40,80 @@ densest <- function(data) {
#'
#' @export
adjacency <- function(distance_estimate = densest, summarize = stats::median) {
- method(
- id = "adjacency",
- name = "Adjacency",
- description = "Adjacency to reference genes",
- function(preset, progress) {
- species_ids <- preset$species_ids
- gene_ids <- preset$gene_ids
- reference_gene_ids <- preset$reference_gene_ids
+ method(
+ id = "adjacency",
+ name = "Adjacency",
+ description = "Adjacency to reference genes",
+ function(preset, progress) {
+ species_ids <- preset$species_ids
+ gene_ids <- preset$gene_ids
+ reference_gene_ids <- preset$reference_gene_ids
- cached(
- "adjacency",
- c(
- species_ids,
- gene_ids,
- reference_gene_ids,
- distance_estimate,
- summarize
- ),
- { # nolint
- # Filter distances by species and gene and summarize each
- # gene's distance values using the estimation function.
- data <- geposan::distances[
- species %chin% species_ids & gene %chin% gene_ids,
- .(distance = as.numeric(distance_estimate(distance))),
- by = gene
- ]
+ cached(
+ "adjacency",
+ c(
+ species_ids,
+ gene_ids,
+ reference_gene_ids,
+ distance_estimate,
+ summarize
+ ),
+ { # nolint
+ # Filter distances by species and gene and summarize each
+ # gene's distance values using the estimation function.
+ data <- geposan::distances[
+ species %chin% species_ids & gene %chin% gene_ids,
+ .(distance = as.numeric(distance_estimate(distance))),
+ by = gene
+ ]
- # Compute the absolute value of the difference between the
- # estimated distances of each gene to the reference genes.
- compute_difference <- function(distance_value,
- comparison_ids) {
- differences <- data[
- gene %chin% comparison_ids,
- .(difference = abs(distance_value - distance))
- ]
+ # Compute the absolute value of the difference between the
+ # estimated distances of each gene to the reference genes.
+ compute_difference <- function(distance_value,
+ comparison_ids) {
+ differences <- data[
+ gene %chin% comparison_ids,
+ .(difference = abs(distance_value - distance))
+ ]
- summarize(differences$difference)
- }
+ summarize(differences$difference)
+ }
- # Compute the differences to the reference genes.
- data[
- !gene %chin% reference_gene_ids,
- difference := compute_difference(
- distance,
- reference_gene_ids
- ),
- by = gene
- ]
+ # Compute the differences to the reference genes.
+ data[
+ !gene %chin% reference_gene_ids,
+ difference := compute_difference(
+ distance,
+ reference_gene_ids
+ ),
+ by = gene
+ ]
- progress(0.5)
+ progress(0.5)
- # Exclude the reference gene itself when computing its
- # difference.
- data[
- gene %chin% reference_gene_ids,
- difference := compute_difference(
- distance,
- reference_gene_ids[reference_gene_ids != gene]
- ),
- by = gene
- ]
+ # Exclude the reference gene itself when computing its
+ # difference.
+ data[
+ gene %chin% reference_gene_ids,
+ difference := compute_difference(
+ distance,
+ reference_gene_ids[reference_gene_ids != gene]
+ ),
+ by = gene
+ ]
- # Compute the final score by normalizing the difference.
- data[, score := 1 - difference / max(difference)]
+ # Compute the final score by normalizing the difference.
+ data[, score := 1 - difference / max(difference)]
- progress(1.0)
+ progress(1.0)
- result(
- method = "adjacency",
- scores = data[, .(gene, score)],
- details = list(data = data)
- )
- }
- )
+ result(
+ method = "adjacency",
+ scores = data[, .(gene, score)],
+ details = list(data = data)
+ )
}
- )
+ )
+ }
+ )
}
diff --git a/R/method_clustering.R b/R/method_clustering.R
index 51087f5..7c34052 100644
--- a/R/method_clustering.R
+++ b/R/method_clustering.R
@@ -13,33 +13,33 @@
#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
#' etc.
clusteriness <- function(data, span = 100000, weight = 0.7) {
- n <- length(data)
+ n <- length(data)
- # Return a score of 0.0 if there is just one or no value at all.
- if (n < 2) {
- return(0.0)
+ # Return a score of 0.0 if there is just one or no value at all.
+ if (n < 2) {
+ return(0.0)
+ }
+
+ # Cluster the data and compute the cluster sizes.
+
+ tree <- stats::hclust(stats::dist(data))
+ clusters <- stats::cutree(tree, h = span)
+ cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
+
+ # Compute the "clusteriness" score.
+
+ score <- 0.0
+
+ for (i in seq_along(cluster_sizes)) {
+ cluster_size <- cluster_sizes[i]
+
+ if (cluster_size >= 2) {
+ cluster_score <- cluster_size / n
+ score <- score + weight^(i - 1) * cluster_score
}
+ }
- # Cluster the data and compute the cluster sizes.
-
- tree <- stats::hclust(stats::dist(data))
- clusters <- stats::cutree(tree, h = span)
- cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
-
- # Compute the "clusteriness" score.
-
- score <- 0.0
-
- for (i in seq_along(cluster_sizes)) {
- cluster_size <- cluster_sizes[i]
-
- if (cluster_size >= 2) {
- cluster_score <- cluster_size / n
- score <- score + weight^(i - 1) * cluster_score
- }
- }
-
- score
+ score
}
#' Process genes clustering their distance to telomeres.
@@ -53,41 +53,41 @@ clusteriness <- function(data, span = 100000, weight = 0.7) {
#'
#' @export
clustering <- function() {
- method(
- id = "clustering",
- name = "Clustering",
- description = "Clustering of genes",
- function(preset, progress) {
- species_ids <- preset$species_ids
- gene_ids <- preset$gene_ids
+ method(
+ id = "clustering",
+ name = "Clustering",
+ description = "Clustering of genes",
+ function(preset, progress) {
+ species_ids <- preset$species_ids
+ gene_ids <- preset$gene_ids
- cached("clustering", c(species_ids, gene_ids), {
- scores <- data.table(gene = gene_ids)
+ cached("clustering", c(species_ids, gene_ids), {
+ scores <- data.table(gene = gene_ids)
- # Prefilter the input data by species.
- distances <- geposan::distances[species %chin% species_ids]
+ # Prefilter the input data by species.
+ distances <- geposan::distances[species %chin% species_ids]
- genes_done <- 0
- genes_total <- length(gene_ids)
+ genes_done <- 0
+ genes_total <- length(gene_ids)
- # Perform the cluster analysis for one gene.
- compute <- function(gene_id) {
- data <- distances[gene == gene_id, distance]
- score <- clusteriness(data)
+ # Perform the cluster analysis for one gene.
+ compute <- function(gene_id) {
+ data <- distances[gene == gene_id, distance]
+ score <- clusteriness(data)
- genes_done <<- genes_done + 1
- progress(genes_done / genes_total)
+ genes_done <<- genes_done + 1
+ progress(genes_done / genes_total)
- score
- }
-
- scores[, score := compute(gene), by = gene]
-
- result(
- method = "clustering",
- scores = scores
- )
- })
+ score
}
- )
+
+ scores[, score := compute(gene), by = gene]
+
+ result(
+ method = "clustering",
+ scores = scores
+ )
+ })
+ }
+ )
}
diff --git a/R/method_correlation.R b/R/method_correlation.R
index 44b36f6..1b69861 100644
--- a/R/method_correlation.R
+++ b/R/method_correlation.R
@@ -8,96 +8,97 @@
#'
#' @export
correlation <- function(summarize = stats::median) {
- method(
- id = "correlation",
- name = "Correlation",
- description = "Correlation with reference genes",
- function(preset, progress) {
- species_ids <- preset$species_ids
- gene_ids <- preset$gene_ids
- reference_gene_ids <- preset$reference_gene_ids
+ method(
+ id = "correlation",
+ name = "Correlation",
+ description = "Correlation with reference genes",
+ function(preset, progress) {
+ species_ids <- preset$species_ids
+ gene_ids <- preset$gene_ids
+ reference_gene_ids <- preset$reference_gene_ids
- cached(
- "correlation",
- c(species_ids, gene_ids, reference_gene_ids, summarize),
- { # nolint
- # Prefilter distances by species.
- distances <- geposan::distances[species %chin% species_ids]
+ cached(
+ "correlation",
+ c(species_ids, gene_ids, reference_gene_ids, summarize),
+ { # nolint
+ # Prefilter distances by species.
+ distances <- geposan::distances[species %chin% species_ids]
- # Tranform data to get species as rows and genes as columns.
- # We construct columns per species, because it requires
- # fewer iterations, and transpose the table afterwards.
+ # Tranform data to get species as rows and genes as columns.
+ # We construct columns per species, because it requires
+ # fewer iterations, and transpose the table afterwards.
- data <- data.table(gene = gene_ids)
+ data <- data.table(gene = gene_ids)
- # Make a column containing distance data for each species.
- for (species_id in species_ids) {
- species_data <- distances[
- species == species_id,
- .(gene, distance)
- ]
+ # Make a column containing distance data for each species.
+ for (species_id in species_ids) {
+ species_data <- distances[
+ species == species_id,
+ .(gene, distance)
+ ]
- data <- merge(data, species_data, all.x = TRUE)
- setnames(data, "distance", species_id)
- }
+ data <- merge(data, species_data, all.x = TRUE)
+ setnames(data, "distance", species_id)
+ }
- # Transpose to the desired format.
- data <- transpose(data, make.names = "gene")
+ # Transpose to the desired format.
+ data <- transpose(data, make.names = "gene")
- progress(0.33)
+ progress(0.33)
- # Take the reference data.
- reference_data <- data[, ..reference_gene_ids]
+ # Take the reference data.
+ reference_data <- data[, ..reference_gene_ids]
- # Perform the correlation between all possible pairs.
- results <- stats::cor(
- data[, ..gene_ids],
- reference_data,
- use = "pairwise.complete.obs",
- method = "spearman"
- )
+ # Perform the correlation between all possible pairs.
+ results <- stats::cor(
+ data[, ..gene_ids],
+ reference_data,
+ use = "pairwise.complete.obs",
+ method = "spearman"
+ )
- results <- data.table(results, keep.rownames = TRUE)
- setnames(results, "rn", "gene")
+ results <- data.table(results, keep.rownames = TRUE)
+ setnames(results, "rn", "gene")
- # Remove correlations between the reference genes
- # themselves.
- for (reference_gene_id in reference_gene_ids) {
- column <- quote(reference_gene_id)
- results[gene == reference_gene_id, eval(column) := NA]
- }
+ # Remove correlations between the reference genes
+ # themselves.
+ for (reference_gene_id in reference_gene_ids) {
+ column <- quote(reference_gene_id)
+ results[gene == reference_gene_id, eval(column) := NA]
+ }
- progress(0.66)
+ progress(0.66)
- # Combine the correlation coefficients.
- results[,
- max_correlation := as.double(summarize(stats::na.omit(
- # Convert the data.table subset into a
- # vector to get the correct na.omit
- # behavior.
- as.matrix(.SD)[1, ]
- ))),
- .SDcols = reference_gene_ids,
- by = gene
- ]
+ # Combine the correlation coefficients.
+ results[,
+ max_correlation := as.double(summarize(stats::na.omit(
+ # Convert the data.table subset into a
+ # vector to get the correct na.omit
+ # behavior.
+ as.matrix(.SD)[1, ]
+ ))),
+ .SDcols = reference_gene_ids,
+ by = gene
+ ]
- # Normalize scores.
- results[,
- score := (max_correlation - min(max_correlation)) /
- (max(max_correlation) - min(max_correlation))
- ]
+ # Normalize scores.
+ results[
+ ,
+ score := (max_correlation - min(max_correlation)) /
+ (max(max_correlation) - min(max_correlation))
+ ]
- # Normalize scores.
+ # Normalize scores.
- results[, .(gene, score)]
+ results[, .(gene, score)]
- result(
- method = "correlation",
- scores = results[, .(gene, score)],
- details = list(all_correlations = results)
- )
- }
- )
+ result(
+ method = "correlation",
+ scores = results[, .(gene, score)],
+ details = list(all_correlations = results)
+ )
}
- )
+ )
+ }
+ )
}
diff --git a/R/method_neural.R b/R/method_neural.R
index 2b3323c..6f62d49 100644
--- a/R/method_neural.R
+++ b/R/method_neural.R
@@ -12,244 +12,244 @@
#'
#' @export
neural <- function(seed = 180199, n_models = 5) {
- method(
- id = "neural",
- name = "Neural",
- description = "Assessment by neural network",
- function(preset, progress) {
- species_ids <- preset$species_ids
- gene_ids <- preset$gene_ids
- reference_gene_ids <- preset$reference_gene_ids
+ method(
+ id = "neural",
+ name = "Neural",
+ description = "Assessment by neural network",
+ function(preset, progress) {
+ species_ids <- preset$species_ids
+ gene_ids <- preset$gene_ids
+ reference_gene_ids <- preset$reference_gene_ids
- cached(
- "neural",
- c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
- { # nolint
- reference_count <- length(reference_gene_ids)
- stopifnot(n_models %in% 2:reference_count)
+ cached(
+ "neural",
+ c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
+ { # nolint
+ reference_count <- length(reference_gene_ids)
+ stopifnot(n_models %in% 2:reference_count)
- # Make results reproducible.
- tensorflow::set_random_seed(seed)
+ # Make results reproducible.
+ tensorflow::set_random_seed(seed)
- # Step 1: Prepare input data.
- # ---------------------------
+ # Step 1: Prepare input data.
+ # ---------------------------
- # Prefilter distances by species.
- distances <- geposan::distances[species %chin% species_ids]
+ # Prefilter distances by species.
+ distances <- geposan::distances[species %chin% species_ids]
- # Input data for the network. This contains the gene ID as
- # an identifier as well as the per-species gene distances as
- # input variables.
- data <- data.table(gene = gene_ids)
+ # Input data for the network. This contains the gene ID as
+ # an identifier as well as the per-species gene distances as
+ # input variables.
+ data <- data.table(gene = gene_ids)
- # Buffer to keep track of the names of the input variables.
- input_vars <- NULL
+ # Buffer to keep track of the names of the input variables.
+ input_vars <- NULL
- # Make a columns containing positions and distances for each
- # species.
- for (species_id in species_ids) {
- species_data <- distances[
- species == species_id,
- .(gene, distance)
- ]
+ # Make a columns containing positions and distances for each
+ # species.
+ for (species_id in species_ids) {
+ species_data <- distances[
+ species == species_id,
+ .(gene, distance)
+ ]
- # Only include species with at least 25% known values.
- # As positions and distances always coexist, we don't
- # loose any data here.
+ # Only include species with at least 25% known values.
+ # As positions and distances always coexist, we don't
+ # loose any data here.
- species_data <- stats::na.omit(species_data)
+ species_data <- stats::na.omit(species_data)
- if (nrow(species_data) >= 0.25 * length(gene_ids)) {
- data <- merge(data, species_data, all.x = TRUE)
+ if (nrow(species_data) >= 0.25 * length(gene_ids)) {
+ data <- merge(data, species_data, all.x = TRUE)
- # Replace missing data with mean values. The neural
- # network can't handle NAs in a meaningful way.
- # Choosing extreme values here would result in
- # heavily biased results. Therefore, the mean value
- # is chosen as a compromise. However, this will of
- # course lessen the significance of the results.
+ # Replace missing data with mean values. The neural
+ # network can't handle NAs in a meaningful way.
+ # Choosing extreme values here would result in
+ # heavily biased results. Therefore, the mean value
+ # is chosen as a compromise. However, this will of
+ # course lessen the significance of the results.
- mean_distance <- round(
- species_data[, mean(distance)]
- )
+ mean_distance <- round(
+ species_data[, mean(distance)]
+ )
- data[is.na(distance), distance := mean_distance]
+ data[is.na(distance), distance := mean_distance]
- # Name the new column after the species.
- setnames(data, "distance", species_id)
+ # Name the new column after the species.
+ setnames(data, "distance", species_id)
- # Add the input variable to the buffer.
- input_vars <- c(input_vars, species_id)
- }
- }
+ # Add the input variable to the buffer.
+ input_vars <- c(input_vars, species_id)
+ }
+ }
- progress(0.1)
+ progress(0.1)
- # Step 2: Prepare training data.
- # ------------------------------
+ # Step 2: Prepare training data.
+ # ------------------------------
- # Take out the reference data.
+ # Take out the reference data.
- reference_data <- data[gene %chin% reference_gene_ids]
- reference_data[, score := 1.0]
+ reference_data <- data[gene %chin% reference_gene_ids]
+ reference_data[, score := 1.0]
- # Take out random samples from the remaining genes. This is
- # another compromise with a negative impact on
- # significance. We assume that a random gene is not likely
- # to match the reference genes.
+ # Take out random samples from the remaining genes. This is
+ # another compromise with a negative impact on
+ # significance. We assume that a random gene is not likely
+ # to match the reference genes.
- without_reference_data <- data[
- !gene %chin% reference_gene_ids
- ]
+ without_reference_data <- data[
+ !gene %chin% reference_gene_ids
+ ]
- control_data <- without_reference_data[
- sample(
- nrow(without_reference_data),
- reference_count
- )
- ]
-
- control_data[, score := 0.0]
-
- # Split the training data into random sets to have
- # validation data for each model.
-
- # Scramble the source tables.
- reference_data <- reference_data[sample(reference_count)]
- control_data <- control_data[sample(reference_count)]
-
- networks <- list()
-
- indices <- seq_len(reference_count)
- indices_split <- split(indices, indices %% n_models)
-
- for (i in seq_len(n_models)) {
- training_data <- rbindlist(list(
- reference_data[!indices_split[[i]]],
- control_data[!indices_split[[i]]]
- ))
-
- validation_data <- rbindlist(list(
- reference_data[indices_split[[i]]],
- control_data[indices_split[[i]]]
- ))
-
- networks[[i]] <- list(
- training_data = training_data,
- validation_data = validation_data
- )
- }
-
- # Step 3: Create, train and apply neural network.
- # -----------------------------------------------
-
- # Layers for the neural network.
- input_layer <- length(input_vars)
- layer1 <- input_layer
- layer2 <- 0.5 * input_layer
- layer3 <- 0.5 * layer2
-
- # Convert data to matrix and normalize it.
- to_matrix <- function(data) {
- data_matrix <- as.matrix(data[, ..input_vars])
- colnames(data_matrix) <- NULL
- keras::normalize(data_matrix)
- }
-
- data_matrix <- to_matrix(data)
- output_vars <- NULL
-
- for (i in seq_along(networks)) {
- # Create a new model for each training session, because
- # the model would keep its state across training
- # sessions otherwise.
- model <- keras::keras_model_sequential() |>
- keras::layer_dense(
- units = layer1,
- activation = "relu",
- input_shape = input_layer,
- ) |>
- keras::layer_dense(
- units = layer2,
- activation = "relu",
- kernel_regularizer = keras::regularizer_l2()
- ) |>
- keras::layer_dense(
- units = layer3,
- activation = "relu",
- kernel_regularizer = keras::regularizer_l2()
- ) |>
- keras::layer_dense(
- units = 1,
- activation = "sigmoid"
- ) |>
- keras::compile(
- loss = keras::loss_mean_absolute_error(),
- optimizer = keras::optimizer_adam()
- )
-
- # Train the model.
-
- network <- networks[[i]]
-
- training_data <- network$training_data
- training_matrix <- to_matrix(training_data)
- validation_data <- network$validation_data
- validation_matrix <- to_matrix(validation_data)
-
- fit <- keras::fit(
- model,
- x = training_matrix,
- y = training_data$score,
- validation_data = list(
- x_val = validation_matrix,
- y_val = validation_data$score
- ),
- epochs = 500,
- verbose = FALSE
- )
-
- # Apply the model.
-
- data[, new_score := stats::predict(model, data_matrix)]
-
- # Remove the values of the training data itself.
- data[gene %chin% training_data$gene, new_score := NA]
-
- output_var <- sprintf("score%i", i)
- setnames(data, "new_score", output_var)
- output_vars <- c(output_vars, output_var)
-
-
- # Store the details.
-
- networks[[i]]$model <- keras::serialize_model(model)
- networks[[i]]$fit <- fit
-
- progress(0.1 + i * (0.9 / n_models))
- }
-
- # Compute the final score as the mean score.
- data[,
- score := mean(as.numeric(.SD), na.rm = TRUE),
- .SDcols = output_vars,
- by = gene
- ]
-
- progress(1.0)
-
- result(
- method = "neural",
- scores = data[, .(gene, score)],
- details = list(
- seed = seed,
- n_models = n_models,
- all_results = data[, !..input_vars],
- networks = networks
- )
- )
- }
+ control_data <- without_reference_data[
+ sample(
+ nrow(without_reference_data),
+ reference_count
)
+ ]
+
+ control_data[, score := 0.0]
+
+ # Split the training data into random sets to have
+ # validation data for each model.
+
+ # Scramble the source tables.
+ reference_data <- reference_data[sample(reference_count)]
+ control_data <- control_data[sample(reference_count)]
+
+ networks <- list()
+
+ indices <- seq_len(reference_count)
+ indices_split <- split(indices, indices %% n_models)
+
+ for (i in seq_len(n_models)) {
+ training_data <- rbindlist(list(
+ reference_data[!indices_split[[i]]],
+ control_data[!indices_split[[i]]]
+ ))
+
+ validation_data <- rbindlist(list(
+ reference_data[indices_split[[i]]],
+ control_data[indices_split[[i]]]
+ ))
+
+ networks[[i]] <- list(
+ training_data = training_data,
+ validation_data = validation_data
+ )
+ }
+
+ # Step 3: Create, train and apply neural network.
+ # -----------------------------------------------
+
+ # Layers for the neural network.
+ input_layer <- length(input_vars)
+ layer1 <- input_layer
+ layer2 <- 0.5 * input_layer
+ layer3 <- 0.5 * layer2
+
+ # Convert data to matrix and normalize it.
+ to_matrix <- function(data) {
+ data_matrix <- as.matrix(data[, ..input_vars])
+ colnames(data_matrix) <- NULL
+ keras::normalize(data_matrix)
+ }
+
+ data_matrix <- to_matrix(data)
+ output_vars <- NULL
+
+ for (i in seq_along(networks)) {
+ # Create a new model for each training session, because
+ # the model would keep its state across training
+ # sessions otherwise.
+ model <- keras::keras_model_sequential() |>
+ keras::layer_dense(
+ units = layer1,
+ activation = "relu",
+ input_shape = input_layer,
+ ) |>
+ keras::layer_dense(
+ units = layer2,
+ activation = "relu",
+ kernel_regularizer = keras::regularizer_l2()
+ ) |>
+ keras::layer_dense(
+ units = layer3,
+ activation = "relu",
+ kernel_regularizer = keras::regularizer_l2()
+ ) |>
+ keras::layer_dense(
+ units = 1,
+ activation = "sigmoid"
+ ) |>
+ keras::compile(
+ loss = keras::loss_mean_absolute_error(),
+ optimizer = keras::optimizer_adam()
+ )
+
+ # Train the model.
+
+ network <- networks[[i]]
+
+ training_data <- network$training_data
+ training_matrix <- to_matrix(training_data)
+ validation_data <- network$validation_data
+ validation_matrix <- to_matrix(validation_data)
+
+ fit <- keras::fit(
+ model,
+ x = training_matrix,
+ y = training_data$score,
+ validation_data = list(
+ x_val = validation_matrix,
+ y_val = validation_data$score
+ ),
+ epochs = 500,
+ verbose = FALSE
+ )
+
+ # Apply the model.
+
+ data[, new_score := stats::predict(model, data_matrix)]
+
+ # Remove the values of the training data itself.
+ data[gene %chin% training_data$gene, new_score := NA]
+
+ output_var <- sprintf("score%i", i)
+ setnames(data, "new_score", output_var)
+ output_vars <- c(output_vars, output_var)
+
+
+ # Store the details.
+
+ networks[[i]]$model <- keras::serialize_model(model)
+ networks[[i]]$fit <- fit
+
+ progress(0.1 + i * (0.9 / n_models))
+ }
+
+ # Compute the final score as the mean score.
+ data[,
+ score := mean(as.numeric(.SD), na.rm = TRUE),
+ .SDcols = output_vars,
+ by = gene
+ ]
+
+ progress(1.0)
+
+ result(
+ method = "neural",
+ scores = data[, .(gene, score)],
+ details = list(
+ seed = seed,
+ n_models = n_models,
+ all_results = data[, !..input_vars],
+ networks = networks
+ )
+ )
}
- )
+ )
+ }
+ )
}
diff --git a/R/method_proximity.R b/R/method_proximity.R
index 3861517..20a6d55 100644
--- a/R/method_proximity.R
+++ b/R/method_proximity.R
@@ -11,38 +11,38 @@
#'
#' @export
proximity <- function(summarize = stats::median) {
- method(
- id = "proximity",
- name = "Proximity",
- description = "Proximity to telomeres",
- function(preset, progress) {
- species_ids <- preset$species_ids
- gene_ids <- preset$gene_ids
+ method(
+ id = "proximity",
+ name = "Proximity",
+ description = "Proximity to telomeres",
+ function(preset, progress) {
+ species_ids <- preset$species_ids
+ gene_ids <- preset$gene_ids
- cached("proximity", c(species_ids, gene_ids), {
- # Prefilter distances by species and gene.
- data <- geposan::distances[
- species %chin% preset$species_ids &
- gene %chin% preset$gene_ids
- ]
+ cached("proximity", c(species_ids, gene_ids), {
+ # Prefilter distances by species and gene.
+ data <- geposan::distances[
+ species %chin% preset$species_ids &
+ gene %chin% preset$gene_ids
+ ]
- # Compute the score as described above.
- data <- data[,
- .(combined_distance = as.double(summarize(distance))),
- by = "gene"
- ]
+ # Compute the score as described above.
+ data <- data[,
+ .(combined_distance = as.double(summarize(distance))),
+ by = "gene"
+ ]
- # Normalize scores.
- data[, score := 1 - combined_distance / max(combined_distance)]
+ # Normalize scores.
+ data[, score := 1 - combined_distance / max(combined_distance)]
- progress(1.0)
+ progress(1.0)
- result(
- method = "proximity",
- scores = data[, .(gene, score)],
- details = list(data = data)
- )
- })
- }
- )
+ result(
+ method = "proximity",
+ scores = data[, .(gene, score)],
+ details = list(data = data)
+ )
+ })
+ }
+ )
}
diff --git a/R/method_species_adjacency.R b/R/method_species_adjacency.R
index d4d042f..76f06b5 100644
--- a/R/method_species_adjacency.R
+++ b/R/method_species_adjacency.R
@@ -15,136 +15,136 @@
#' @export
species_adjacency <- function(distance_estimate = stats::median,
summarize = stats::median) {
- method(
- id = "species_adjacency",
- name = "Species adj.",
- description = "Species adjacency",
- function(preset, progress) {
- species_ids <- preset$species_ids
- gene_ids <- preset$gene_ids
- reference_gene_ids <- preset$reference_gene_ids
+ method(
+ id = "species_adjacency",
+ name = "Species adj.",
+ description = "Species adjacency",
+ function(preset, progress) {
+ species_ids <- preset$species_ids
+ gene_ids <- preset$gene_ids
+ reference_gene_ids <- preset$reference_gene_ids
- cached(
- "species_adjacency",
- c(
- species_ids,
- gene_ids,
- reference_gene_ids,
- distance_estimate,
- summarize
- ),
- { # nolint
- # Prefilter distances.
- data <- geposan::distances[
- species %chin% species_ids & gene %chin% gene_ids
- ]
+ cached(
+ "species_adjacency",
+ c(
+ species_ids,
+ gene_ids,
+ reference_gene_ids,
+ distance_estimate,
+ summarize
+ ),
+ { # nolint
+ # Prefilter distances.
+ data <- geposan::distances[
+ species %chin% species_ids & gene %chin% gene_ids
+ ]
- progress_state <- 0.0
- progress_step <- 0.9 / length(species_ids)
+ progress_state <- 0.0
+ progress_step <- 0.9 / length(species_ids)
- # Iterate through all species and find the distance
- # estimates within that species.
- for (species_id in species_ids) {
- # For all genes, compute the distance to one reference
- # gene at a time in one go.
- for (reference_gene_id in reference_gene_ids) {
- comparison_distance <- data[
- species == species_id &
- gene == reference_gene_id,
- distance
- ]
+ # Iterate through all species and find the distance
+ # estimates within that species.
+ for (species_id in species_ids) {
+ # For all genes, compute the distance to one reference
+ # gene at a time in one go.
+ for (reference_gene_id in reference_gene_ids) {
+ comparison_distance <- data[
+ species == species_id &
+ gene == reference_gene_id,
+ distance
+ ]
- column <- quote(reference_gene_id)
+ column <- quote(reference_gene_id)
- if (length(comparison_distance) != 1) {
- # If we don't have a comparison distance, we
- # can't compute a difference. This happens, if
- # the species doesn't have the reference gene.
- data[
- species == species_id &
- gene %chin% gene_ids,
- eval(column) := NA_integer_
- ]
- } else {
- data[
- species == species_id &
- gene %chin% gene_ids,
- eval(column) :=
- abs(distance - comparison_distance)
- ]
- }
- }
+ if (length(comparison_distance) != 1) {
+ # If we don't have a comparison distance, we
+ # can't compute a difference. This happens, if
+ # the species doesn't have the reference gene.
+ data[
+ species == species_id &
+ gene %chin% gene_ids,
+ eval(column) := NA_integer_
+ ]
+ } else {
+ data[
+ species == species_id &
+ gene %chin% gene_ids,
+ eval(column) :=
+ abs(distance - comparison_distance)
+ ]
+ }
+ }
- # Combine the distances to the different reference genes
- # into one value using the provided function.
- data[
- species == species_id &
- gene %chin% gene_ids,
- combined_distance := as.numeric(
- distance_estimate(stats::na.omit(
- # Convert the data.table subset into a
- # vector to get the correct na.omit
- # behavior.
- as.matrix(.SD)[1, ]
- ))
- ),
- .SDcols = reference_gene_ids,
- by = gene
- ]
+ # Combine the distances to the different reference genes
+ # into one value using the provided function.
+ data[
+ species == species_id &
+ gene %chin% gene_ids,
+ combined_distance := as.numeric(
+ distance_estimate(stats::na.omit(
+ # Convert the data.table subset into a
+ # vector to get the correct na.omit
+ # behavior.
+ as.matrix(.SD)[1, ]
+ ))
+ ),
+ .SDcols = reference_gene_ids,
+ by = gene
+ ]
- progress_state <- progress_state + progress_step
- progress(progress_state)
- }
+ progress_state <- progress_state + progress_step
+ progress(progress_state)
+ }
- progress(0.9)
+ progress(0.9)
- # Remove the distances between the reference genes.
- for (reference_gene_id in reference_gene_ids) {
- column <- quote(reference_gene_id)
- data[gene == reference_gene_id, eval(column) := NA]
- }
+ # Remove the distances between the reference genes.
+ for (reference_gene_id in reference_gene_ids) {
+ column <- quote(reference_gene_id)
+ data[gene == reference_gene_id, eval(column) := NA]
+ }
- # Recompute the combined distance for the reference genes.
- data[
- gene %chin% reference_gene_ids,
- combined_distance := as.numeric(
- distance_estimate(stats::na.omit(
- as.matrix(.SD)[1, ])
- )
- ),
- .SDcols = reference_gene_ids,
- by = list(species, gene)
- ]
+ # Recompute the combined distance for the reference genes.
+ data[
+ gene %chin% reference_gene_ids,
+ combined_distance := as.numeric(
+ distance_estimate(stats::na.omit(
+ as.matrix(.SD)[1, ]
+ ))
+ ),
+ .SDcols = reference_gene_ids,
+ by = list(species, gene)
+ ]
- # Combine the distances into one value.
- results <- data[,
- .(
- summarized_distances = as.numeric(
- summarize(stats::na.omit(combined_distance))
- )
- ),
- by = gene
- ]
+ # Combine the distances into one value.
+ results <- data[,
+ .(
+ summarized_distances = as.numeric(
+ summarize(stats::na.omit(combined_distance))
+ )
+ ),
+ by = gene
+ ]
- # Compute the final score by normalizing the difference.
- results[
- ,
- score := 1 - summarized_distances /
- max(summarized_distances)
- ]
+ # Compute the final score by normalizing the difference.
+ results[
+ ,
+ score := 1 - summarized_distances /
+ max(summarized_distances)
+ ]
- progress(1.0)
+ progress(1.0)
- result(
- method = "species_adjacency",
- scores = results[, .(gene, score)],
- details = list(
- data = data,
- results = results
- )
- )
- }
+ result(
+ method = "species_adjacency",
+ scores = results[, .(gene, score)],
+ details = list(
+ data = data,
+ results = results
)
+ )
}
- )
+ )
+ }
+ )
}
diff --git a/R/plots.R b/R/plots.R
index 6b59352..0dd8c1a 100644
--- a/R/plots.R
+++ b/R/plots.R
@@ -9,7 +9,7 @@ base_color_transparent <- function() "#1964bf80"
#' Color palette for gene sets.
#' @noRd
gene_set_color <- function(index) {
- c("#FF7F00", "#4DAF4A", "#984EA3")[index]
+ c("#FF7F00", "#4DAF4A", "#984EA3")[index]
}
#' Plot gene positions.
@@ -22,74 +22,74 @@ gene_set_color <- function(index) {
#'
#' @export
plot_positions <- function(species_ids, gene_sets) {
- if (!requireNamespace("plotly", quietly = TRUE)) {
- stop("Please install \"plotly\" to use this function.")
- }
+ if (!requireNamespace("plotly", quietly = TRUE)) {
+ stop("Please install \"plotly\" to use this function.")
+ }
- # Prefilter data by species.
- data <- geposan::distances[species %chin% species_ids]
+ # Prefilter data by species.
+ data <- geposan::distances[species %chin% species_ids]
- species_max_distance <- data[,
- .(max_distance = max(distance)),
- by = species
- ]
+ species_max_distance <- data[,
+ .(max_distance = max(distance)),
+ by = species
+ ]
- # Prefilter species.
- species <- geposan::species[id %chin% species_ids]
+ # Prefilter species.
+ species <- geposan::species[id %chin% species_ids]
- plot <- plotly::plot_ly() |>
- plotly::layout(
- xaxis = list(
- title = "Species",
- tickvals = species$id,
- ticktext = species$name
- ),
- yaxis = list(title = "Distance to telomeres [Bp]"),
- bargap = 0.9
- ) |>
- plotly::add_bars(
- data = species_max_distance,
- x = ~species,
- y = ~max_distance,
- name = "All genes",
- marker = list(color = base_color())
+ plot <- plotly::plot_ly() |>
+ plotly::layout(
+ xaxis = list(
+ title = "Species",
+ tickvals = species$id,
+ ticktext = species$name
+ ),
+ yaxis = list(title = "Distance to telomeres [Bp]"),
+ bargap = 0.9
+ ) |>
+ plotly::add_bars(
+ data = species_max_distance,
+ x = ~species,
+ y = ~max_distance,
+ name = "All genes",
+ marker = list(color = base_color())
+ )
+
+ if (length(gene_sets) > 0) {
+ # Include gene information which will be used for labeling
+ gene_set_data <- merge(
+ data[gene %chin% unlist(gene_sets)],
+ geposan::genes,
+ by.x = "gene",
+ by.y = "id"
+ )
+
+ index <- 1
+
+ for (gene_set_name in names(gene_sets)) {
+ gene_set <- gene_sets[[gene_set_name]]
+
+ plot <- plot |> plotly::add_markers(
+ data = gene_set_data[gene %chin% gene_set],
+ x = ~species,
+ y = ~distance,
+ name = gene_set_name,
+ text = ~ glue::glue(
+ "{name}
",
+ "{round(distance / 1000000, digits = 2)} MBp"
+ ),
+ hoverinfo = "text",
+ marker = list(
+ size = 10,
+ color = gene_set_color(index)
)
+ )
- if (length(gene_sets) > 0) {
- # Include gene information which will be used for labeling
- gene_set_data <- merge(
- data[gene %chin% unlist(gene_sets)],
- geposan::genes,
- by.x = "gene",
- by.y = "id"
- )
-
- index <- 1
-
- for (gene_set_name in names(gene_sets)) {
- gene_set <- gene_sets[[gene_set_name]]
-
- plot <- plot |> plotly::add_markers(
- data = gene_set_data[gene %chin% gene_set],
- x = ~species,
- y = ~distance,
- name = gene_set_name,
- text = ~ glue::glue(
- "{name}
",
- "{round(distance / 1000000, digits = 2)} MBp"
- ),
- hoverinfo = "text",
- marker = list(
- size = 10,
- color = gene_set_color(index)
- )
- )
-
- index <- index + 1
- }
+ index <- index + 1
}
+ }
- plot
+ plot
}
@@ -108,75 +108,75 @@ plot_positions <- function(species_ids, gene_sets) {
#'
#' @export
plot_rankings <- function(rankings, gene_sets) {
- if (!requireNamespace("plotly", quietly = TRUE)) {
- stop("Please install \"plotly\" to use this function.")
- }
+ if (!requireNamespace("plotly", quietly = TRUE)) {
+ stop("Please install \"plotly\" to use this function.")
+ }
- plot <- plotly::plot_ly() |>
- plotly::layout(
- xaxis = list(tickvals = names(rankings)),
- yaxis = list(title = "Score")
+ plot <- plotly::plot_ly() |>
+ plotly::layout(
+ xaxis = list(tickvals = names(rankings)),
+ yaxis = list(title = "Score")
+ )
+
+ is_first <- TRUE
+
+ for (ranking_name in names(rankings)) {
+ ranking <- rankings[[ranking_name]]
+
+ plot <- plot |> plotly::add_trace(
+ data = ranking,
+ x = ranking_name,
+ y = ~score,
+ name = "All genes",
+ type = "violin",
+ spanmode = "hard",
+ points = FALSE,
+ showlegend = is_first,
+ hoverinfo = "skip",
+ line = list(color = base_color()),
+ fillcolor = base_color_transparent()
+ )
+
+ if (length(gene_sets) > 0) {
+ gene_set_data <- merge(
+ ranking[gene %chin% unlist(gene_sets)],
+ geposan::genes,
+ by.x = "gene",
+ by.y = "id"
+ )
+
+ index <- 1
+
+ for (gene_set_name in names(gene_sets)) {
+ gene_set <- gene_sets[[gene_set_name]]
+
+ plot <- plot |> plotly::add_markers(
+ data = gene_set_data[gene %chin% gene_set],
+ x = ranking_name,
+ y = ~score,
+ name = gene_set_name,
+ text = ~ glue::glue(
+ "{name}
",
+ "Score: {round(score, digits = 2)}
",
+ "Rank: {rank}
",
+ "Percentile: {round(percentile * 100, digits = 2)}%"
+ ),
+ hoverinfo = "text",
+ showlegend = is_first,
+ marker = list(
+ size = 10,
+ color = gene_set_color(index)
+ )
)
- is_first <- TRUE
-
- for (ranking_name in names(rankings)) {
- ranking <- rankings[[ranking_name]]
-
- plot <- plot |> plotly::add_trace(
- data = ranking,
- x = ranking_name,
- y = ~score,
- name = "All genes",
- type = "violin",
- spanmode = "hard",
- points = FALSE,
- showlegend = is_first,
- hoverinfo = "skip",
- line = list(color = base_color()),
- fillcolor = base_color_transparent()
- )
-
- if (length(gene_sets) > 0) {
- gene_set_data <- merge(
- ranking[gene %chin% unlist(gene_sets)],
- geposan::genes,
- by.x = "gene",
- by.y = "id"
- )
-
- index <- 1
-
- for (gene_set_name in names(gene_sets)) {
- gene_set <- gene_sets[[gene_set_name]]
-
- plot <- plot |> plotly::add_markers(
- data = gene_set_data[gene %chin% gene_set],
- x = ranking_name,
- y = ~score,
- name = gene_set_name,
- text = ~ glue::glue(
- "{name}
",
- "Score: {round(score, digits = 2)}
",
- "Rank: {rank}
",
- "Percentile: {round(percentile * 100, digits = 2)}%"
- ),
- hoverinfo = "text",
- showlegend = is_first,
- marker = list(
- size = 10,
- color = gene_set_color(index)
- )
- )
-
- index <- index + 1
- }
- }
-
- is_first <- FALSE
+ index <- index + 1
+ }
}
- plot
+ is_first <- FALSE
+ }
+
+ plot
}
@@ -194,88 +194,88 @@ plot_rankings <- function(rankings, gene_sets) {
#'
#' @export
plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
- if (!requireNamespace("plotly", quietly = TRUE)) {
- stop("Please install \"plotly\" to use this function.")
- }
+ if (!requireNamespace("plotly", quietly = TRUE)) {
+ stop("Please install \"plotly\" to use this function.")
+ }
- # To speed up rendering, don't show every single gene.
- n_ranks <- nrow(ranking)
- sample_ranking <- ranking[seq(1, n_ranks, 5)]
+ # To speed up rendering, don't show every single gene.
+ n_ranks <- nrow(ranking)
+ sample_ranking <- ranking[seq(1, n_ranks, 5)]
- plot <- plotly::plot_ly() |>
- plotly::add_lines(
- data = sample_ranking,
- x = ~percentile,
- y = ~score,
- name = "All genes",
- hoverinfo = "skip",
- line = list(width = 4, color = base_color())
- ) |>
- plotly::layout(
- xaxis = list(
- title = "Percentile",
- tickformat = ".0%"
- ),
- yaxis = list(title = "Score")
+ plot <- plotly::plot_ly() |>
+ plotly::add_lines(
+ data = sample_ranking,
+ x = ~percentile,
+ y = ~score,
+ name = "All genes",
+ hoverinfo = "skip",
+ line = list(width = 4, color = base_color())
+ ) |>
+ plotly::layout(
+ xaxis = list(
+ title = "Percentile",
+ tickformat = ".0%"
+ ),
+ yaxis = list(title = "Score")
+ )
+
+ if (length(gene_sets) > 0) {
+ # Include gene information which will be used for labeling
+ gene_set_data <- merge(
+ ranking[gene %chin% unlist(gene_sets)],
+ geposan::genes,
+ by.x = "gene",
+ by.y = "id"
+ )
+
+ index <- 1
+
+ for (gene_set_name in names(gene_sets)) {
+ gene_set <- gene_sets[[gene_set_name]]
+
+ plot <- plot |> plotly::add_markers(
+ data = gene_set_data[gene %chin% gene_set],
+ x = ~percentile,
+ y = ~score,
+ name = gene_set_name,
+ text = ~ glue::glue(
+ "{name}
",
+ "Score: {round(score, digits = 2)}
",
+ "Rank: {rank}
",
+ "Percentile: {round(percentile * 100, digits = 2)}%"
+ ),
+ hoverinfo = "text",
+ marker = list(
+ size = 10,
+ color = gene_set_color(index)
)
+ )
- if (length(gene_sets) > 0) {
- # Include gene information which will be used for labeling
- gene_set_data <- merge(
- ranking[gene %chin% unlist(gene_sets)],
- geposan::genes,
- by.x = "gene",
- by.y = "id"
+ index <- index + 1
+ }
+ }
+
+
+ if (!is.null(max_rank)) {
+ first_not_included_rank <- max_rank + 1
+ last_rank <- ranking[, .N]
+
+ if (first_not_included_rank <= last_rank) {
+ plot <- plot |> plotly::layout(
+ shapes = list(
+ type = "rect",
+ fillcolor = "black",
+ opacity = 0.1,
+ x0 = 1 - first_not_included_rank / n_ranks,
+ x1 = 1 - last_rank / n_ranks,
+ y0 = 0.0,
+ y1 = 1.0
)
-
- index <- 1
-
- for (gene_set_name in names(gene_sets)) {
- gene_set <- gene_sets[[gene_set_name]]
-
- plot <- plot |> plotly::add_markers(
- data = gene_set_data[gene %chin% gene_set],
- x = ~percentile,
- y = ~score,
- name = gene_set_name,
- text = ~ glue::glue(
- "{name}
",
- "Score: {round(score, digits = 2)}
",
- "Rank: {rank}
",
- "Percentile: {round(percentile * 100, digits = 2)}%"
- ),
- hoverinfo = "text",
- marker = list(
- size = 10,
- color = gene_set_color(index)
- )
- )
-
- index <- index + 1
- }
+ )
}
+ }
-
- if (!is.null(max_rank)) {
- first_not_included_rank <- max_rank + 1
- last_rank <- ranking[, .N]
-
- if (first_not_included_rank <= last_rank) {
- plot <- plot |> plotly::layout(
- shapes = list(
- type = "rect",
- fillcolor = "black",
- opacity = 0.1,
- x0 = 1 - first_not_included_rank / n_ranks,
- x1 = 1 - last_rank / n_ranks,
- y0 = 0.0,
- y1 = 1.0
- )
- )
- }
- }
-
- plot
+ plot
}
#' Visualize a ranking by comparing gene sets in a boxplot.
@@ -290,44 +290,44 @@ plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) {
#'
#' @export
plot_boxplot <- function(ranking, gene_sets = NULL) {
- if (!requireNamespace("plotly", quietly = TRUE)) {
- stop("Please install \"plotly\" to use this function.")
+ if (!requireNamespace("plotly", quietly = TRUE)) {
+ stop("Please install \"plotly\" to use this function.")
+ }
+
+ plot <- plotly::plot_ly() |>
+ plotly::add_boxplot(
+ data = ranking,
+ x = "All genes",
+ y = ~score,
+ name = "All genes",
+ showlegend = FALSE,
+ line = list(color = base_color())
+ ) |>
+ plotly::layout(
+ xaxis = list(tickvals = c("All genes", names(gene_sets))),
+ yaxis = list(title = "Score")
+ )
+
+ if (length(gene_sets) > 0) {
+ index <- 1
+
+ for (gene_set_name in names(gene_sets)) {
+ gene_set <- gene_sets[[gene_set_name]]
+
+ plot <- plot |> plotly::add_boxplot(
+ data = ranking[gene %chin% gene_set],
+ x = gene_set_name,
+ y = ~score,
+ name = gene_set_name,
+ showlegend = FALSE,
+ line = list(color = gene_set_color(index))
+ )
+
+ index <- index + 1
}
+ }
- plot <- plotly::plot_ly() |>
- plotly::add_boxplot(
- data = ranking,
- x = "All genes",
- y = ~score,
- name = "All genes",
- showlegend = FALSE,
- line = list(color = base_color())
- ) |>
- plotly::layout(
- xaxis = list(tickvals = c("All genes", names(gene_sets))),
- yaxis = list(title = "Score")
- )
-
- if (length(gene_sets) > 0) {
- index <- 1
-
- for (gene_set_name in names(gene_sets)) {
- gene_set <- gene_sets[[gene_set_name]]
-
- plot <- plot |> plotly::add_boxplot(
- data = ranking[gene %chin% gene_set],
- x = gene_set_name,
- y = ~score,
- name = gene_set_name,
- showlegend = FALSE,
- line = list(color = gene_set_color(index))
- )
-
- index <- index + 1
- }
- }
-
- plot
+ plot
}
#' Show the distribution of scores across chromosomes.
@@ -340,46 +340,46 @@ plot_boxplot <- function(ranking, gene_sets = NULL) {
#'
#' @export
plot_chromosomes <- function(ranking) {
- if (!requireNamespace("plotly", quietly = TRUE)) {
- stop("Please install \"plotly\" to use this function.")
- }
+ if (!requireNamespace("plotly", quietly = TRUE)) {
+ stop("Please install \"plotly\" to use this function.")
+ }
- data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
- data <- data[, .(score = mean(score)), by = "chromosome"]
+ data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id")
+ data <- data[, .(score = mean(score)), by = "chromosome"]
- # Get an orderable integer from a chromosome name.
- chromosome_index <- function(chromosome) {
- index <- suppressWarnings(as.integer(chromosome))
+ # Get an orderable integer from a chromosome name.
+ chromosome_index <- function(chromosome) {
+ index <- suppressWarnings(as.integer(chromosome))
- ifelse(
- !is.na(index),
- index,
- ifelse(
- chromosome == "X",
- 998,
- 999
- )
- )
- }
+ ifelse(
+ !is.na(index),
+ index,
+ ifelse(
+ chromosome == "X",
+ 998,
+ 999
+ )
+ )
+ }
- data[, index := chromosome_index(chromosome)]
- setorder(data, "index")
+ data[, index := chromosome_index(chromosome)]
+ setorder(data, "index")
- plotly::plot_ly(
- data = data,
- x = ~chromosome,
- y = ~score,
- type = "bar",
- marker = list(color = base_color())
- ) |>
- plotly::layout(
- xaxis = list(
- title = "Chromosome",
- categoryorder = "array",
- categoryarray = ~chromosome
- ),
- yaxis = list(title = "Mean score")
- )
+ plotly::plot_ly(
+ data = data,
+ x = ~chromosome,
+ y = ~score,
+ type = "bar",
+ marker = list(color = base_color())
+ ) |>
+ plotly::layout(
+ xaxis = list(
+ title = "Chromosome",
+ categoryorder = "array",
+ categoryarray = ~chromosome
+ ),
+ yaxis = list(title = "Mean score")
+ )
}
#' Plot scores in relation to chromosomal position of genes.
@@ -398,74 +398,74 @@ plot_chromosomes <- function(ranking) {
plot_scores_by_position <- function(ranking,
chromosome_name = NULL,
gene_sets = NULL) {
- if (!requireNamespace("plotly", quietly = TRUE)) {
- stop("Please install \"plotly\" to use this function.")
- }
+ if (!requireNamespace("plotly", quietly = TRUE)) {
+ stop("Please install \"plotly\" to use this function.")
+ }
- distance_data <- if (!is.null(chromosome_name)) {
- chromosome_name_ <- chromosome_name
- geposan::distances[
- species == "hsapiens" &
- chromosome_name == chromosome_name_
- ]
- } else {
- geposan::distances[species == "hsapiens"]
- }
+ distance_data <- if (!is.null(chromosome_name)) {
+ chromosome_name_ <- chromosome_name
+ geposan::distances[
+ species == "hsapiens" &
+ chromosome_name == chromosome_name_
+ ]
+ } else {
+ geposan::distances[species == "hsapiens"]
+ }
- data <- merge(ranking, distance_data, by = "gene")
+ data <- merge(ranking, distance_data, by = "gene")
- data <- merge(
- data,
- geposan::genes,
- by.x = "gene",
- by.y = "id"
+ data <- merge(
+ data,
+ geposan::genes,
+ by.x = "gene",
+ by.y = "id"
+ )
+
+ data[, `:=`(gene_set = "All genes", color = base_color())]
+
+ index <- 1
+ for (gene_set_name in names(gene_sets)) {
+ gene_set_genes <- gene_sets[[gene_set_name]]
+ data[
+ gene %chin% gene_set_genes,
+ `:=`(
+ gene_set = gene_set_name,
+ color = gene_set_color(index)
+ )
+ ]
+
+ index <- index + 1
+ }
+
+ # Use distances instead of positions in case all chromosomes are included.
+ if (is.null(chromosome_name)) {
+ data[, x := distance]
+ } else {
+ data[, x := start_position]
+ }
+
+ plotly::plot_ly() |>
+ plotly::add_markers(
+ data = data,
+ x = ~x,
+ y = ~score,
+ name = ~gene_set,
+ text = ~ glue::glue(
+ "{name}
",
+ if (is.null(chromosome_name)) "Distance: " else "Position: ",
+ "{round(x / 1000000, digits = 2)} MBp
",
+ "Score: {round(score, digits = 2)}
",
+ "Rank: {rank}
",
+ "Percentile: {round(percentile * 100, digits = 2)}%"
+ ),
+ hoverinfo = "text",
+ ) |>
+ plotly::layout(
+ xaxis = list(title = if (is.null(chromosome_name)) {
+ "Distance (Bp)"
+ } else {
+ "Position (Bp)"
+ }),
+ yaxis = list(title = "Score")
)
-
- data[, `:=`(gene_set = "All genes", color = base_color())]
-
- index <- 1
- for (gene_set_name in names(gene_sets)) {
- gene_set_genes <- gene_sets[[gene_set_name]]
- data[
- gene %chin% gene_set_genes,
- `:=`(
- gene_set = gene_set_name,
- color = gene_set_color(index)
- )
- ]
-
- index <- index + 1
- }
-
- # Use distances instead of positions in case all chromosomes are included.
- if (is.null(chromosome_name)) {
- data[, x := distance]
- } else {
- data[, x := start_position]
- }
-
- plotly::plot_ly() |>
- plotly::add_markers(
- data = data,
- x = ~x,
- y = ~score,
- name = ~gene_set,
- text = ~ glue::glue(
- "{name}
",
- if (is.null(chromosome_name)) "Distance: " else "Position: ",
- "{round(x / 1000000, digits = 2)} MBp
",
- "Score: {round(score, digits = 2)}
",
- "Rank: {rank}
",
- "Percentile: {round(percentile * 100, digits = 2)}%"
- ),
- hoverinfo = "text",
- ) |>
- plotly::layout(
- xaxis = list(title = if (is.null(chromosome_name)) {
- "Distance (Bp)"
- } else {
- "Position (Bp)"
- }),
- yaxis = list(title = "Score")
- )
}
diff --git a/R/preset.R b/R/preset.R
index ad15018..e798bbb 100644
--- a/R/preset.R
+++ b/R/preset.R
@@ -21,55 +21,55 @@ preset <- function(reference_gene_ids,
methods = all_methods(),
species_ids = geposan::species$id,
gene_ids = geposan::genes$id) {
- # Count included species per gene.
- genes_n_species <- geposan::distances[
- species %chin% species_ids,
- .(n_species = .N),
- by = "gene"
- ]
+ # Count included species per gene.
+ genes_n_species <- geposan::distances[
+ species %chin% species_ids,
+ .(n_species = .N),
+ by = "gene"
+ ]
- # Filter out genes with less than 25% existing orthologs.
- gene_ids_filtered <- genes_n_species[
- gene %chin% gene_ids &
- n_species >= 0.25 * length(species_ids),
- gene
- ]
+ # Filter out genes with less than 25% existing orthologs.
+ gene_ids_filtered <- genes_n_species[
+ gene %chin% gene_ids &
+ n_species >= 0.25 * length(species_ids),
+ gene
+ ]
- reference_gene_ids_excluded <- reference_gene_ids[
- !reference_gene_ids %chin% gene_ids_filtered
- ]
+ reference_gene_ids_excluded <- reference_gene_ids[
+ !reference_gene_ids %chin% gene_ids_filtered
+ ]
- if (length(reference_gene_ids_excluded > 0)) {
- warning(paste0(
- "The following reference gene IDs are excluded from the preset ",
- "because they don't have enough data: ",
- paste(reference_gene_ids_excluded, collapse = ", ")
- ))
- }
+ if (length(reference_gene_ids_excluded > 0)) {
+ warning(paste0(
+ "The following reference gene IDs are excluded from the preset ",
+ "because they don't have enough data: ",
+ paste(reference_gene_ids_excluded, collapse = ", ")
+ ))
+ }
- reference_gene_ids_included <- reference_gene_ids[
- reference_gene_ids %chin% gene_ids_filtered
- ]
+ reference_gene_ids_included <- reference_gene_ids[
+ reference_gene_ids %chin% gene_ids_filtered
+ ]
- if (length(reference_gene_ids_included) < 1) {
- stop(paste0(
- "There has to be at least one reference gene for the preset to be ",
- "valid. Please note that some methods may require more reference ",
- "genes."
- ))
- }
+ if (length(reference_gene_ids_included) < 1) {
+ stop(paste0(
+ "There has to be at least one reference gene for the preset to be ",
+ "valid. Please note that some methods may require more reference ",
+ "genes."
+ ))
+ }
- # The included data gets sorted to be able to produce predictable hashes
- # for the object later.
- structure(
- list(
- reference_gene_ids = sort(reference_gene_ids_included),
- methods = methods,
- species_ids = sort(species_ids),
- gene_ids = sort(gene_ids_filtered)
- ),
- class = "geposan_preset"
- )
+ # The included data gets sorted to be able to produce predictable hashes
+ # for the object later.
+ structure(
+ list(
+ reference_gene_ids = sort(reference_gene_ids_included),
+ methods = methods,
+ species_ids = sort(species_ids),
+ gene_ids = sort(gene_ids_filtered)
+ ),
+ class = "geposan_preset"
+ )
}
#' S3 method to print a preset object.
@@ -81,20 +81,20 @@ preset <- function(reference_gene_ids,
#'
#' @export
print.geposan_preset <- function(x, ...) {
- cat(sprintf(
- paste0(
- "geposan preset:",
- "\n Reference genes: %i",
- "\n Included methods: %s",
- "\n Number of species: %i",
- "\n Number of genes: %i",
- "\n"
- ),
- length(x$reference_gene_ids),
- paste(sapply(x$methods, function(m) m$id), collapse = ", "),
- length(x$species_ids),
- length(x$gene_ids)
- ))
+ cat(sprintf(
+ paste0(
+ "geposan preset:",
+ "\n Reference genes: %i",
+ "\n Included methods: %s",
+ "\n Number of species: %i",
+ "\n Number of genes: %i",
+ "\n"
+ ),
+ length(x$reference_gene_ids),
+ paste(sapply(x$methods, function(m) m$id), collapse = ", "),
+ length(x$species_ids),
+ length(x$gene_ids)
+ ))
- invisible(x)
+ invisible(x)
}
diff --git a/R/ranking.R b/R/ranking.R
index 47ec098..2721f57 100644
--- a/R/ranking.R
+++ b/R/ranking.R
@@ -13,35 +13,35 @@
#'
#' @export
ranking <- function(analysis, weights) {
- ranking <- if (inherits(analysis, "geposan_analysis")) {
- copy(analysis$scores)
- } else if (inherits(analysis, "geposan_ranking")) {
- copy(analysis)
- } else {
- stop("Invalid analyis. Use geposan::analyze().")
- }
+ ranking <- if (inherits(analysis, "geposan_analysis")) {
+ copy(analysis$scores)
+ } else if (inherits(analysis, "geposan_ranking")) {
+ copy(analysis)
+ } else {
+ stop("Invalid analyis. Use geposan::analyze().")
+ }
- ranking[, score := 0.0]
+ ranking[, score := 0.0]
- for (method in names(weights)) {
- weighted <- weights[[method]] * ranking[, ..method]
- ranking[, score := score + weighted]
- }
+ for (method in names(weights)) {
+ weighted <- weights[[method]] * ranking[, ..method]
+ ranking[, score := score + weighted]
+ }
- # Normalize scores to be between 0.0 and 1.0.
- min_score <- ranking[, min(score)]
- max_score <- ranking[, max(score)]
- score_range <- max_score - min_score
- ranking[, score := (score - min_score) / score_range]
+ # Normalize scores to be between 0.0 and 1.0.
+ min_score <- ranking[, min(score)]
+ max_score <- ranking[, max(score)]
+ score_range <- max_score - min_score
+ ranking[, score := (score - min_score) / score_range]
- setorder(ranking, -score)
- ranking[, rank := .I]
- ranking[, percentile := 1 - rank / nrow(ranking)]
+ setorder(ranking, -score)
+ ranking[, rank := .I]
+ ranking[, percentile := 1 - rank / nrow(ranking)]
- structure(
- ranking,
- class = c("geposan_ranking", class(ranking))
- )
+ structure(
+ ranking,
+ class = c("geposan_ranking", class(ranking))
+ )
}
#' Find the best weights to rank the results.
@@ -61,40 +61,40 @@ ranking <- function(analysis, weights) {
#' @export
optimal_weights <- function(analysis, methods, reference_gene_ids,
target = "mean") {
- if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
- stop("Invalid analyis. Use geposan::analyze().")
+ if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
+ stop("Invalid analyis. Use geposan::analyze().")
+ }
+
+
+ # Compute the target rank of the reference genes when applying the
+ # weights.
+ target_rank <- function(factors) {
+ data <- ranking(analysis, as.list(factors))
+
+ result <- data[
+ gene %chin% reference_gene_ids,
+ if (target == "min") {
+ min(rank)
+ } else if (target == "max") {
+ max(rank)
+ } else if (target == "mean") {
+ mean(rank)
+ } else {
+ stats::median(rank)
+ }
+ ]
+
+ if (result > 0) {
+ result
+ } else {
+ Inf
}
+ }
+ initial_factors <- rep(1.0, length(methods))
+ names(initial_factors) <- methods
- # Compute the target rank of the reference genes when applying the
- # weights.
- target_rank <- function(factors) {
- data <- ranking(analysis, as.list(factors))
+ optimal_factors <- stats::optim(initial_factors, target_rank)$par
- result <- data[
- gene %chin% reference_gene_ids,
- if (target == "min") {
- min(rank)
- } else if (target == "max") {
- max(rank)
- } else if (target == "mean") {
- mean(rank)
- } else {
- stats::median(rank)
- }
- ]
-
- if (result > 0) {
- result
- } else {
- Inf
- }
- }
-
- initial_factors <- rep(1.0, length(methods))
- names(initial_factors) <- methods
-
- optimal_factors <- stats::optim(initial_factors, target_rank)$par
-
- as.list(optimal_factors / max(abs(optimal_factors)))
+ as.list(optimal_factors / max(abs(optimal_factors)))
}
diff --git a/R/result.R b/R/result.R
index c16ffc0..7c370e7 100644
--- a/R/result.R
+++ b/R/result.R
@@ -10,18 +10,18 @@
#'
#' @export
result <- function(method_id, scores, details = list()) {
- stopifnot(is.data.frame(scores) &
- c("gene", "score") %chin% colnames(scores))
- stopifnot(is.list(details))
+ stopifnot(is.data.frame(scores) &
+ c("gene", "score") %chin% colnames(scores))
+ stopifnot(is.list(details))
- structure(
- list(
- method_id = method_id,
- scores = scores,
- details = details
- ),
- class = "geposan_result"
- )
+ structure(
+ list(
+ method_id = method_id,
+ scores = scores,
+ details = details
+ ),
+ class = "geposan_result"
+ )
}
#' Print a result object.
@@ -33,18 +33,18 @@ result <- function(method_id, scores, details = list()) {
#'
#' @export
print.geposan_result <- function(x, ...) {
- cat(sprintf(
- paste0(
- "geposan result:",
- "\n Method: %s",
- "\n Number of genes: %i",
- "\n Available details: %s",
- "\n"
- ),
- x$method_id,
- nrow(x$scores),
- paste(names(x$details), collapse = ", ")
- ))
+ cat(sprintf(
+ paste0(
+ "geposan result:",
+ "\n Method: %s",
+ "\n Number of genes: %i",
+ "\n Available details: %s",
+ "\n"
+ ),
+ x$method_id,
+ nrow(x$scores),
+ paste(names(x$details), collapse = ", ")
+ ))
- invisible(x)
+ invisible(x)
}
diff --git a/R/utils.R b/R/utils.R
index a1f0820..b590219 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -8,25 +8,25 @@
# @param objects A vector of objects that this expression depends on. The hash
# of those objects will be used for identifying the cache file.
cached <- function(name, objects, expr) {
- if (!dir.exists("cache")) {
- dir.create("cache")
- }
+ if (!dir.exists("cache")) {
+ dir.create("cache")
+ }
- id <- rlang::hash(objects)
- cache_file <- sprintf("cache/%s_%s.rda", name, id)
+ id <- rlang::hash(objects)
+ cache_file <- sprintf("cache/%s_%s.rda", name, id)
- if (file.exists(cache_file)) {
- # If the cache file exists, we restore the data from it.
- load(cache_file)
- } else {
- # If the cache file doesn't exist, we have to do the computation.
- data <- expr
+ if (file.exists(cache_file)) {
+ # If the cache file exists, we restore the data from it.
+ load(cache_file)
+ } else {
+ # If the cache file doesn't exist, we have to do the computation.
+ data <- expr
- # The results are cached for the next run.
- save(data, file = cache_file, compress = "xz")
- }
+ # The results are cached for the next run.
+ save(data, file = cache_file, compress = "xz")
+ }
- data
+ data
}
# This is needed to make data.table's symbols available within the package.
diff --git a/R/validate.R b/R/validate.R
index 848c079..2c673f6 100644
--- a/R/validate.R
+++ b/R/validate.R
@@ -27,68 +27,68 @@
#'
#' @export
validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
- if (!inherits(ranking, "geposan_ranking")) {
- stop("Ranking is invalid. Use geposan::ranking().")
- }
+ if (!inherits(ranking, "geposan_ranking")) {
+ stop("Ranking is invalid. Use geposan::ranking().")
+ }
- if (is.null(progress)) {
- progress_bar <- progress::progress_bar$new()
- progress_bar$update(0.0)
+ if (is.null(progress)) {
+ progress_bar <- progress::progress_bar$new()
+ progress_bar$update(0.0)
- progress <- function(progress_value) {
- if (!progress_bar$finished) {
- progress_bar$update(progress_value)
- if (progress_value >= 1.0) {
- progress_bar$terminate()
- }
- }
+ progress <- function(progress_value) {
+ if (!progress_bar$finished) {
+ progress_bar$update(progress_value)
+ if (progress_value >= 1.0) {
+ progress_bar$terminate()
}
+ }
}
+ }
- progress_state <- 0.0
- progress_step <- 1.0 / length(reference_gene_ids)
+ progress_state <- 0.0
+ progress_step <- 1.0 / length(reference_gene_ids)
- results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
+ results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
- for (gene_id in reference_gene_ids) {
- included_gene_ids <- reference_gene_ids[
- reference_gene_ids != gene_id
- ]
+ for (gene_id in reference_gene_ids) {
+ included_gene_ids <- reference_gene_ids[
+ reference_gene_ids != gene_id
+ ]
- weights <- optimal_weights(
- ranking,
- method_ids,
- included_gene_ids
- )
-
- ranking_validation <- ranking(ranking, weights)
-
- results[
- gene == gene_id,
- percentile_validation := ranking_validation[
- gene == gene_id,
- percentile
- ]
- ]
-
- if (!is.null(progress)) {
- progress_state <- progress_state + progress_step
- progress(progress_state)
- }
- }
-
- results[, error := percentile - percentile_validation]
- setorder(results, error)
-
- structure(
- list(
- validation = results,
- mean_percentile_original = results[, mean(percentile)],
- mean_percentile_validation = results[, mean(percentile_validation)],
- mean_error = results[, mean(error)]
- ),
- class = "geposan_validation"
+ weights <- optimal_weights(
+ ranking,
+ method_ids,
+ included_gene_ids
)
+
+ ranking_validation <- ranking(ranking, weights)
+
+ results[
+ gene == gene_id,
+ percentile_validation := ranking_validation[
+ gene == gene_id,
+ percentile
+ ]
+ ]
+
+ if (!is.null(progress)) {
+ progress_state <- progress_state + progress_step
+ progress(progress_state)
+ }
+ }
+
+ results[, error := percentile - percentile_validation]
+ setorder(results, error)
+
+ structure(
+ list(
+ validation = results,
+ mean_percentile_original = results[, mean(percentile)],
+ mean_percentile_validation = results[, mean(percentile_validation)],
+ mean_error = results[, mean(error)]
+ ),
+ class = "geposan_validation"
+ )
}
#' S3 method to print a validation object.
@@ -100,18 +100,18 @@ validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
#'
#' @export
print.geposan_validation <- function(x, ...) {
- cat(sprintf(
- paste0(
- "geposan validation:",
- "\n Mean percentile original: %.1f%%",
- "\n Mean percentile validation: %.1f%%",
- "\n Mean error: %.1f percent points",
- "\n"
- ),
- x$mean_percentile_original * 100,
- x$mean_percentile_validation * 100,
- x$mean_error * 100
- ))
+ cat(sprintf(
+ paste0(
+ "geposan validation:",
+ "\n Mean percentile original: %.1f%%",
+ "\n Mean percentile validation: %.1f%%",
+ "\n Mean error: %.1f percent points",
+ "\n"
+ ),
+ x$mean_percentile_original * 100,
+ x$mean_percentile_validation * 100,
+ x$mean_error * 100
+ ))
- invisible(x)
+ invisible(x)
}
diff --git a/scripts/chromosome_names.R b/scripts/chromosome_names.R
index cf1a59a..e516f49 100644
--- a/scripts/chromosome_names.R
+++ b/scripts/chromosome_names.R
@@ -5,23 +5,23 @@ ensembl_api_url <- "https://rest.ensembl.org"
#' Perform a request to the Ensembl REST API.
ensembl_request <- function(api_path) {
- content(stop_for_status(GET(
- paste0(ensembl_api_url, api_path),
- content_type_json()
- )))
+ content(stop_for_status(GET(
+ paste0(ensembl_api_url, api_path),
+ content_type_json()
+ )))
}
#' Get IDs of all available vertebrates.
get_species_ids <- function() {
- species <- ensembl_request("/info/species")$species
- sapply(species, function(species) species$name)
+ species <- ensembl_request("/info/species")$species
+ sapply(species, function(species) species$name)
}
#' Get all chromosomes names for a species.
get_species_chromosomes <- function(species_id) {
- chromosomes <- unlist(ensembl_request(
- paste0("/info/assembly/", species_id)
- )$karyotype)
+ chromosomes <- unlist(ensembl_request(
+ paste0("/info/assembly/", species_id)
+ )$karyotype)
}
#' Get a vector of all available unqiue chromosome names.
@@ -29,6 +29,6 @@ get_species_chromosomes <- function(species_id) {
#' There are multiple names for mitochondrial DNA which have to be removed
#' manually, unfortunately.
get_all_chromosomes <- function() {
- chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
- unique(unlist(chromosomes))
+ chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
+ unique(unlist(chromosomes))
}
diff --git a/scripts/ensembl.R b/scripts/ensembl.R
index c66d706..ab052fc 100644
--- a/scripts/ensembl.R
+++ b/scripts/ensembl.R
@@ -12,8 +12,8 @@ ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
# Filter out species ID and name from the result.
species <- ensembl_datasets[, .(
- id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
- name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
+ id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
+ name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
)]
# List of assemblies that the Ensembl Rest API advertises as chromosomes.
@@ -23,232 +23,232 @@ species <- ensembl_datasets[, .(
#
# See get_all_chromosomes()
valid_chromosome_names <- c(
- "1",
- "2",
- "3",
- "4",
- "5",
- "6",
- "7",
- "8",
- "9",
- "10",
- "11",
- "12",
- "13",
- "14",
- "15",
- "16",
- "17",
- "18",
- "19",
- "20",
- "X",
- "Y",
- "21",
- "4A",
- "1A",
- "22",
- "23",
- "24",
- "25LG1",
- "25LG2",
- "26",
- "27",
- "28",
- "LGE22",
- "Z",
- "25",
- "29",
- "A1",
- "A2",
- "A3",
- "B1",
- "B2",
- "B3",
- "B4",
- "C1",
- "C2",
- "D1",
- "D2",
- "D3",
- "D4",
- "E1",
- "E2",
- "E3",
- "F1",
- "F2",
- "2A",
- "2B",
- "LG01",
- "LG02",
- "LG03",
- "LG04",
- "LG05",
- "LG06",
- "LG07",
- "LG08",
- "LG09",
- "LG10",
- "LG11",
- "LG12",
- "LG13",
- "LG14",
- "LG15",
- "LG16",
- "LG17",
- "LG18",
- "LG19",
- "LG20",
- "LG21",
- "LG22",
- "LG23",
- "LG24",
- "LG25",
- "I",
- "II",
- "III",
- "IV",
- "V",
- "VI",
- "VII",
- "VIII",
- "IX",
- "XI",
- "XII",
- "XIII",
- "XIV",
- "XV",
- "XVI",
- "XVII",
- "XVIII",
- "XIX",
- "XX",
- "XXI",
- "XXII",
- "XXIII",
- "XXIV",
- "W",
- "30",
- "31",
- "32",
- "33",
- "34",
- "35",
- "36",
- "37",
- "38",
- "39",
- "40",
- "2L",
- "2R",
- "3L",
- "3R",
- "LG1",
- "LG2",
- "LG3",
- "LG4",
- "LG5",
- "LG6",
- "LG7",
- "LG8",
- "LG9",
- "LG26",
- "LG27",
- "LG28",
- "LG29",
- "LG30",
- "1a",
- "7b",
- "22a",
- "LGE22C19W28_E50C23",
- "LGE64",
- "7a",
- "MIC_1",
- "MIC_10",
- "MIC_11",
- "MIC_2",
- "MIC_3",
- "MIC_4",
- "MIC_5",
- "MIC_6",
- "MIC_7",
- "MIC_8",
- "MIC_9",
- "sgr01",
- "sgr02",
- "sgr03",
- "sgr04",
- "sgr05",
- "sgr06",
- "sgr07",
- "sgr08",
- "sgr09",
- "sgr10",
- "sgr11",
- "sgr12",
- "sgr13",
- "sgr14",
- "sgr15",
- "sgr16",
- "sgr17",
- "sgr18",
- "sgr19",
- "X1",
- "X2",
- "X3",
- "X4",
- "X5",
- "a",
- "b",
- "c",
- "d",
- "f",
- "g",
- "h",
- "41",
- "42",
- "43",
- "44",
- "45",
- "46",
- "47",
- "48",
- "49",
- "50",
- "LG28B",
- "LG30F",
- "LG36F",
- "LG37M",
- "LG42F",
- "LG44F",
- "LG45M",
- "LG48F",
- "LG49B",
- "LG34",
- "LG35",
- "LG7_11",
- "groupI",
- "groupII",
- "groupIII",
- "groupIV",
- "groupV",
- "groupVI",
- "groupVII",
- "groupVIII",
- "groupIX",
- "groupX",
- "groupXI",
- "groupXII",
- "groupXIII",
- "groupXIV",
- "groupXV",
- "groupXVI",
- "groupXVII",
- "groupXVIII",
- "groupXIX",
- "groupXX",
- "groupXXI"
+ "1",
+ "2",
+ "3",
+ "4",
+ "5",
+ "6",
+ "7",
+ "8",
+ "9",
+ "10",
+ "11",
+ "12",
+ "13",
+ "14",
+ "15",
+ "16",
+ "17",
+ "18",
+ "19",
+ "20",
+ "X",
+ "Y",
+ "21",
+ "4A",
+ "1A",
+ "22",
+ "23",
+ "24",
+ "25LG1",
+ "25LG2",
+ "26",
+ "27",
+ "28",
+ "LGE22",
+ "Z",
+ "25",
+ "29",
+ "A1",
+ "A2",
+ "A3",
+ "B1",
+ "B2",
+ "B3",
+ "B4",
+ "C1",
+ "C2",
+ "D1",
+ "D2",
+ "D3",
+ "D4",
+ "E1",
+ "E2",
+ "E3",
+ "F1",
+ "F2",
+ "2A",
+ "2B",
+ "LG01",
+ "LG02",
+ "LG03",
+ "LG04",
+ "LG05",
+ "LG06",
+ "LG07",
+ "LG08",
+ "LG09",
+ "LG10",
+ "LG11",
+ "LG12",
+ "LG13",
+ "LG14",
+ "LG15",
+ "LG16",
+ "LG17",
+ "LG18",
+ "LG19",
+ "LG20",
+ "LG21",
+ "LG22",
+ "LG23",
+ "LG24",
+ "LG25",
+ "I",
+ "II",
+ "III",
+ "IV",
+ "V",
+ "VI",
+ "VII",
+ "VIII",
+ "IX",
+ "XI",
+ "XII",
+ "XIII",
+ "XIV",
+ "XV",
+ "XVI",
+ "XVII",
+ "XVIII",
+ "XIX",
+ "XX",
+ "XXI",
+ "XXII",
+ "XXIII",
+ "XXIV",
+ "W",
+ "30",
+ "31",
+ "32",
+ "33",
+ "34",
+ "35",
+ "36",
+ "37",
+ "38",
+ "39",
+ "40",
+ "2L",
+ "2R",
+ "3L",
+ "3R",
+ "LG1",
+ "LG2",
+ "LG3",
+ "LG4",
+ "LG5",
+ "LG6",
+ "LG7",
+ "LG8",
+ "LG9",
+ "LG26",
+ "LG27",
+ "LG28",
+ "LG29",
+ "LG30",
+ "1a",
+ "7b",
+ "22a",
+ "LGE22C19W28_E50C23",
+ "LGE64",
+ "7a",
+ "MIC_1",
+ "MIC_10",
+ "MIC_11",
+ "MIC_2",
+ "MIC_3",
+ "MIC_4",
+ "MIC_5",
+ "MIC_6",
+ "MIC_7",
+ "MIC_8",
+ "MIC_9",
+ "sgr01",
+ "sgr02",
+ "sgr03",
+ "sgr04",
+ "sgr05",
+ "sgr06",
+ "sgr07",
+ "sgr08",
+ "sgr09",
+ "sgr10",
+ "sgr11",
+ "sgr12",
+ "sgr13",
+ "sgr14",
+ "sgr15",
+ "sgr16",
+ "sgr17",
+ "sgr18",
+ "sgr19",
+ "X1",
+ "X2",
+ "X3",
+ "X4",
+ "X5",
+ "a",
+ "b",
+ "c",
+ "d",
+ "f",
+ "g",
+ "h",
+ "41",
+ "42",
+ "43",
+ "44",
+ "45",
+ "46",
+ "47",
+ "48",
+ "49",
+ "50",
+ "LG28B",
+ "LG30F",
+ "LG36F",
+ "LG37M",
+ "LG42F",
+ "LG44F",
+ "LG45M",
+ "LG48F",
+ "LG49B",
+ "LG34",
+ "LG35",
+ "LG7_11",
+ "groupI",
+ "groupII",
+ "groupIII",
+ "groupIV",
+ "groupV",
+ "groupVI",
+ "groupVII",
+ "groupVIII",
+ "groupIX",
+ "groupX",
+ "groupXI",
+ "groupXII",
+ "groupXIII",
+ "groupXIV",
+ "groupXV",
+ "groupXVI",
+ "groupXVII",
+ "groupXVIII",
+ "groupXIX",
+ "groupXX",
+ "groupXXI"
)
#' Get all chromosome names for an Ensembl dataset.
@@ -256,8 +256,8 @@ valid_chromosome_names <- c(
#' The function tries to filter out valid chromosome names from the available
#' assemblies in the dataset.
get_chromosome_names <- function(dataset) {
- chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
- chromosome_names[chromosome_names %chin% valid_chromosome_names]
+ chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
+ chromosome_names[chromosome_names %chin% valid_chromosome_names]
}
# Retrieve information on human genes. This will only include genes on
@@ -267,16 +267,16 @@ rlog::log_info("Retrieving information on human genes")
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_data <- data.table(biomaRt::getBM(
- attributes = c(
- "ensembl_gene_id",
- "hgnc_symbol",
- "chromosome_name",
- "start_position",
- "end_position"
- ),
- filters = "chromosome_name",
- values = get_chromosome_names(dataset),
- mart = dataset
+ attributes = c(
+ "ensembl_gene_id",
+ "hgnc_symbol",
+ "chromosome_name",
+ "start_position",
+ "end_position"
+ ),
+ filters = "chromosome_name",
+ values = get_chromosome_names(dataset),
+ mart = dataset
))
# Remove duplicated gene IDs (at the time of writing, there are a handful).
@@ -284,9 +284,9 @@ human_data <- unique(human_data, by = "ensembl_gene_id")
# Only keep relevant information on genes.
genes <- human_data[, .(
- id = ensembl_gene_id,
- name = hgnc_symbol,
- chromosome = chromosome_name
+ id = ensembl_gene_id,
+ name = hgnc_symbol,
+ chromosome = chromosome_name
)]
# Retrieve gene distance data across species.
@@ -296,39 +296,39 @@ distances <- data.table()
#' Handle data for one species.
handle_species <- function(species_id, species_data) {
- chromosomes <- species_data[,
- .(chromosome_length = max(end_position)),
- by = chromosome_name
- ]
+ chromosomes <- species_data[,
+ .(chromosome_length = max(end_position)),
+ by = chromosome_name
+ ]
- # Store the number of chromosomes in the species table.
- species[id == species_id, n_chromosomes := nrow(chromosomes)]
+ # Store the number of chromosomes in the species table.
+ species[id == species_id, n_chromosomes := nrow(chromosomes)]
- # Store the median chromosome length in the species table.
- species[
- id == species_id,
- median_chromosome_length := chromosomes[, median(chromosome_length)]
- ]
+ # Store the median chromosome length in the species table.
+ species[
+ id == species_id,
+ median_chromosome_length := chromosomes[, median(chromosome_length)]
+ ]
- # Precompute the genes' distance to the nearest telomere.
- species_distances <- species_data[
- chromosomes,
- .(
- species = species_id,
- gene = ensembl_gene_id,
- chromosome_name = chromosome_name,
- start_position = start_position,
- end_position = end_position,
- distance = pmin(
- start_position,
- chromosome_length - end_position
- )
- ),
- on = "chromosome_name"
- ]
+ # Precompute the genes' distance to the nearest telomere.
+ species_distances <- species_data[
+ chromosomes,
+ .(
+ species = species_id,
+ gene = ensembl_gene_id,
+ chromosome_name = chromosome_name,
+ start_position = start_position,
+ end_position = end_position,
+ distance = pmin(
+ start_position,
+ chromosome_length - end_position
+ )
+ ),
+ on = "chromosome_name"
+ ]
- # Add species distances to the distances table.
- distances <<- rbindlist(list(distances, species_distances))
+ # Add species distances to the distances table.
+ distances <<- rbindlist(list(distances, species_distances))
}
# Handle the human first, as we already retrieved the data and don't need to
@@ -337,67 +337,67 @@ handle_species("hsapiens", human_data)
# Iterate through all other species and retrieve their distance data.
for (species_id in species[id != "hsapiens", id]) {
- rlog::log_info(sprintf("Loading species \"%s\"", species_id))
+ rlog::log_info(sprintf("Loading species \"%s\"", species_id))
- dataset <- biomaRt::useDataset(
- sprintf("%s_gene_ensembl", species_id),
- mart = ensembl
- )
+ dataset <- biomaRt::useDataset(
+ sprintf("%s_gene_ensembl", species_id),
+ mart = ensembl
+ )
- # Besides the attributes that are always present, we need to check for
- # human orthologs. Some species don't have that information and will be
- # skipped.
- if (!"hsapiens_homolog_ensembl_gene" %chin%
- biomaRt::listAttributes(dataset, what = "name")) {
- rlog::log_info("No data on human orthologs")
- species <- species[id != species_id]
+ # Besides the attributes that are always present, we need to check for
+ # human orthologs. Some species don't have that information and will be
+ # skipped.
+ if (!"hsapiens_homolog_ensembl_gene" %chin%
+ biomaRt::listAttributes(dataset, what = "name")) {
+ rlog::log_info("No data on human orthologs")
+ species <- species[id != species_id]
- next
- }
+ next
+ }
- chromosome_names <- get_chromosome_names(dataset)
+ chromosome_names <- get_chromosome_names(dataset)
- # Skip the species, if there are no assembled chromosomes.
- if (length(chromosome_names) <= 0) {
- rlog::log_info("No matching chromosome assemblies")
- species <- species[id != species_id]
+ # Skip the species, if there are no assembled chromosomes.
+ if (length(chromosome_names) <= 0) {
+ rlog::log_info("No matching chromosome assemblies")
+ species <- species[id != species_id]
- next
- }
+ next
+ }
- # Retrieve information on all genes of the current species, that have
- # human orthologs. This is called "homolog" in the Ensembl schema.
- species_distances <- data.table(biomaRt::getBM(
- attributes = c(
- "hsapiens_homolog_ensembl_gene",
- "chromosome_name",
- "start_position",
- "end_position"
- ),
- filters = c("with_hsapiens_homolog", "chromosome_name"),
- values = list(TRUE, chromosome_names),
- mart = dataset
- ))
+ # Retrieve information on all genes of the current species, that have
+ # human orthologs. This is called "homolog" in the Ensembl schema.
+ species_distances <- data.table(biomaRt::getBM(
+ attributes = c(
+ "hsapiens_homolog_ensembl_gene",
+ "chromosome_name",
+ "start_position",
+ "end_position"
+ ),
+ filters = c("with_hsapiens_homolog", "chromosome_name"),
+ values = list(TRUE, chromosome_names),
+ mart = dataset
+ ))
- # Only include human genes that we have information on.
- species_distances <- species_distances[
- hsapiens_homolog_ensembl_gene %chin% genes$id
- ]
+ # Only include human genes that we have information on.
+ species_distances <- species_distances[
+ hsapiens_homolog_ensembl_gene %chin% genes$id
+ ]
- # Only include one ortholog per human gene.
- species_distances <- unique(
- species_distances,
- by = "hsapiens_homolog_ensembl_gene"
- )
+ # Only include one ortholog per human gene.
+ species_distances <- unique(
+ species_distances,
+ by = "hsapiens_homolog_ensembl_gene"
+ )
- # Rename gene ID column to match the human data.
- setnames(
- species_distances,
- "hsapiens_homolog_ensembl_gene",
- "ensembl_gene_id"
- )
+ # Rename gene ID column to match the human data.
+ setnames(
+ species_distances,
+ "hsapiens_homolog_ensembl_gene",
+ "ensembl_gene_id"
+ )
- handle_species(species_id, species_distances)
+ handle_species(species_id, species_distances)
}
# Save data in the appropriate place.