From c04b6337e9f5eeff7adebe7818234704ada9af8b Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Thu, 26 May 2022 12:42:19 +0200 Subject: [PATCH] Reindent code to use just two spaces --- R/analyze.R | 88 ++--- R/comparison.R | 104 +++--- R/method.R | 68 ++-- R/method_adjacency.R | 148 ++++---- R/method_clustering.R | 108 +++--- R/method_correlation.R | 149 ++++---- R/method_neural.R | 432 +++++++++++----------- R/method_proximity.R | 58 +-- R/method_species_adjacency.R | 232 ++++++------ R/plots.R | 674 +++++++++++++++++------------------ R/preset.R | 116 +++--- R/ranking.R | 112 +++--- R/result.R | 48 +-- R/utils.R | 30 +- R/validate.R | 132 +++---- scripts/chromosome_names.R | 22 +- scripts/ensembl.R | 644 ++++++++++++++++----------------- 17 files changed, 1583 insertions(+), 1582 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index b17dbc8..1605427 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -18,54 +18,54 @@ #' #' @export analyze <- function(preset, progress = NULL, include_results = TRUE) { - if (!inherits(preset, "geposan_preset")) { - stop("Preset is invalid. Use geposan::preset() to create one.") - } + if (!inherits(preset, "geposan_preset")) { + stop("Preset is invalid. Use geposan::preset() to create one.") + } - if (is.null(progress)) { - progress_bar <- progress::progress_bar$new() - progress_bar$update(0.0) + if (is.null(progress)) { + progress_bar <- progress::progress_bar$new() + progress_bar$update(0.0) - progress <- function(progress_value) { - if (!progress_bar$finished) { - progress_bar$update(progress_value) - if (progress_value >= 1.0) { - progress_bar$terminate() - } - } + progress <- function(progress_value) { + if (!progress_bar$finished) { + progress_bar$update(progress_value) + if (progress_value >= 1.0) { + progress_bar$terminate() } + } } + } - progress_buffer <- 0.0 - method_count <- length(preset$methods) + progress_buffer <- 0.0 + method_count <- length(preset$methods) - method_progress <- function(progress_value) { - progress(progress_buffer + progress_value / method_count) - } + method_progress <- function(progress_value) { + progress(progress_buffer + progress_value / method_count) + } - scores <- data.table(gene = preset$gene_id) - results <- list() + scores <- data.table(gene = preset$gene_id) + results <- list() - for (method in preset$methods) { - method_results <- method$func(preset, method_progress) + for (method in preset$methods) { + method_results <- method$func(preset, method_progress) - scores <- merge(scores, method_results$scores, by = "gene") - setnames(scores, "score", method$id) + scores <- merge(scores, method_results$scores, by = "gene") + setnames(scores, "score", method$id) - results <- c(results, list(method_results)) + results <- c(results, list(method_results)) - progress_buffer <- progress_buffer + 1 / method_count - progress(progress_buffer) - } + progress_buffer <- progress_buffer + 1 / method_count + progress(progress_buffer) + } - structure( - list( - preset = preset, - scores = scores, - results = if (include_results) results else NULL - ), - class = "geposan_analysis" - ) + structure( + list( + preset = preset, + scores = scores, + results = if (include_results) results else NULL + ), + class = "geposan_analysis" + ) } #' Print an analysis object. @@ -77,14 +77,14 @@ analyze <- function(preset, progress = NULL, include_results = TRUE) { #' #' @export print.geposan_analysis <- function(x, ...) { - cat("geposan analysis:\n\n") - print(x$preset) + cat("geposan analysis:\n\n") + print(x$preset) + cat("\n") + + for (result in x$results) { + print(result) cat("\n") + } - for (result in x$results) { - print(result) - cat("\n") - } - - invisible(x) + invisible(x) } diff --git a/R/comparison.R b/R/comparison.R index 49e73af..62d21d4 100644 --- a/R/comparison.R +++ b/R/comparison.R @@ -18,38 +18,38 @@ #' #' @export compare <- function(ranking, comparison_gene_ids) { - if (!inherits(ranking, "geposan_ranking")) { - stop("Invalid ranking. Use geposan::ranking().") - } + if (!inherits(ranking, "geposan_ranking")) { + stop("Invalid ranking. Use geposan::ranking().") + } - comparison_ranking <- ranking[gene %chin% comparison_gene_ids] + comparison_ranking <- ranking[gene %chin% comparison_gene_ids] - quantiles <- data.table( - quantile = c("0%", "25%", "50%", "75%", "100%"), - score = stats::quantile(comparison_ranking[, score]), - rank = stats::quantile( - comparison_ranking[, rank], - probs = seq(1, 0, -0.25) - ), - percentile = stats::quantile(comparison_ranking[, percentile]) - ) + quantiles <- data.table( + quantile = c("0%", "25%", "50%", "75%", "100%"), + score = stats::quantile(comparison_ranking[, score]), + rank = stats::quantile( + comparison_ranking[, rank], + probs = seq(1, 0, -0.25) + ), + percentile = stats::quantile(comparison_ranking[, percentile]) + ) - p_value <- stats::wilcox.test( - x = comparison_ranking[, score], - y = ranking[!gene %chin% comparison_gene_ids, score], - alternative = "greater" - )$p.value + p_value <- stats::wilcox.test( + x = comparison_ranking[, score], + y = ranking[!gene %chin% comparison_gene_ids, score], + alternative = "greater" + )$p.value - structure( - list( - quantiles = quantiles, - mean_score = comparison_ranking[, mean(score)], - mean_rank = comparison_ranking[, mean(rank)], - mean_percentile = comparison_ranking[, mean(percentile)], - p_value = p_value - ), - class = "geposan_comparison" - ) + structure( + list( + quantiles = quantiles, + mean_score = comparison_ranking[, mean(score)], + mean_rank = comparison_ranking[, mean(rank)], + mean_percentile = comparison_ranking[, mean(percentile)], + p_value = p_value + ), + class = "geposan_comparison" + ) } #' S3 method to print a comparison object. @@ -61,32 +61,32 @@ compare <- function(ranking, comparison_gene_ids) { #' #' @export print.geposan_comparison <- function(x, ...) { - cat("geposan comparison:\n\n") + cat("geposan comparison:\n\n") - quantiles_formatted <- x$quantiles[, .( - "Quantile" = quantile, - "Score" = round(score, 3), - "Rank" = rank, - "Percentile" = paste0( - format(round(percentile * 100, 1), nsmall = 1), - "%" - ) - )] + quantiles_formatted <- x$quantiles[, .( + "Quantile" = quantile, + "Score" = round(score, 3), + "Rank" = rank, + "Percentile" = paste0( + format(round(percentile * 100, 1), nsmall = 1), + "%" + ) + )] - print(quantiles_formatted, row.names = FALSE) + print(quantiles_formatted, row.names = FALSE) - cat(sprintf( - paste0( - "\n Mean score: %.3f", - "\n Mean rank: %.1f", - "\n Mean percentile: %.1f%%", - "\n p-value for better scores: %.4f\n" - ), - x$mean_score, - x$mean_rank, - x$mean_percentile * 100, - x$p_value - )) + cat(sprintf( + paste0( + "\n Mean score: %.3f", + "\n Mean rank: %.1f", + "\n Mean percentile: %.1f%%", + "\n p-value for better scores: %.4f\n" + ), + x$mean_score, + x$mean_rank, + x$mean_percentile * 100, + x$p_value + )) - invisible(x) + invisible(x) } diff --git a/R/method.R b/R/method.R index cff0f61..2d909b1 100644 --- a/R/method.R +++ b/R/method.R @@ -12,34 +12,34 @@ #' #' @export method <- function(id, name, description, func) { - stopifnot(is.character(id) & length(id) == 1) - stopifnot(is.character(name) & length(name) == 1) - stopifnot(is.character(description) & length(description) == 1) - stopifnot(is.function(func)) + stopifnot(is.character(id) & length(id) == 1) + stopifnot(is.character(name) & length(name) == 1) + stopifnot(is.character(description) & length(description) == 1) + stopifnot(is.function(func)) - structure( - list( - id = id, - name = name, - description = description, - func = func - ), - class = "geposan_method" - ) + structure( + list( + id = id, + name = name, + description = description, + func = func + ), + class = "geposan_method" + ) } #' Get a list of all available methods. #' #' @export all_methods <- function() { - list( - clustering(), - correlation(), - neural(), - adjacency(), - species_adjacency(), - proximity() - ) + list( + clustering(), + correlation(), + neural(), + adjacency(), + species_adjacency(), + proximity() + ) } #' Print a method object. @@ -51,18 +51,18 @@ all_methods <- function() { #' #' @export print.geposan_method <- function(x, ...) { - cat(sprintf( - paste0( - "geposan method:", - "\n Method ID: %s", - "\n Name: %s", - "\n Description: %s", - "\n" - ), - x$id, - x$name, - x$description - )) + cat(sprintf( + paste0( + "geposan method:", + "\n Method ID: %s", + "\n Name: %s", + "\n Description: %s", + "\n" + ), + x$id, + x$name, + x$description + )) - invisible(x) + invisible(x) } diff --git a/R/method_adjacency.R b/R/method_adjacency.R index 26ad099..2ece9e6 100644 --- a/R/method_adjacency.R +++ b/R/method_adjacency.R @@ -12,14 +12,14 @@ #' #' @export densest <- function(data) { - as.numeric(if (length(data) <= 0) { - NULL - } else if (length(data) == 1) { - data - } else { - density <- stats::density(data) - mean(density$x[density$y == max(density$y)]) - }) + as.numeric(if (length(data) <= 0) { + NULL + } else if (length(data) == 1) { + data + } else { + density <- stats::density(data) + mean(density$x[density$y == max(density$y)]) + }) } #' Score genes based on their proximity to the reference genes. @@ -40,80 +40,80 @@ densest <- function(data) { #' #' @export adjacency <- function(distance_estimate = densest, summarize = stats::median) { - method( - id = "adjacency", - name = "Adjacency", - description = "Adjacency to reference genes", - function(preset, progress) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - reference_gene_ids <- preset$reference_gene_ids + method( + id = "adjacency", + name = "Adjacency", + description = "Adjacency to reference genes", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + reference_gene_ids <- preset$reference_gene_ids - cached( - "adjacency", - c( - species_ids, - gene_ids, - reference_gene_ids, - distance_estimate, - summarize - ), - { # nolint - # Filter distances by species and gene and summarize each - # gene's distance values using the estimation function. - data <- geposan::distances[ - species %chin% species_ids & gene %chin% gene_ids, - .(distance = as.numeric(distance_estimate(distance))), - by = gene - ] + cached( + "adjacency", + c( + species_ids, + gene_ids, + reference_gene_ids, + distance_estimate, + summarize + ), + { # nolint + # Filter distances by species and gene and summarize each + # gene's distance values using the estimation function. + data <- geposan::distances[ + species %chin% species_ids & gene %chin% gene_ids, + .(distance = as.numeric(distance_estimate(distance))), + by = gene + ] - # Compute the absolute value of the difference between the - # estimated distances of each gene to the reference genes. - compute_difference <- function(distance_value, - comparison_ids) { - differences <- data[ - gene %chin% comparison_ids, - .(difference = abs(distance_value - distance)) - ] + # Compute the absolute value of the difference between the + # estimated distances of each gene to the reference genes. + compute_difference <- function(distance_value, + comparison_ids) { + differences <- data[ + gene %chin% comparison_ids, + .(difference = abs(distance_value - distance)) + ] - summarize(differences$difference) - } + summarize(differences$difference) + } - # Compute the differences to the reference genes. - data[ - !gene %chin% reference_gene_ids, - difference := compute_difference( - distance, - reference_gene_ids - ), - by = gene - ] + # Compute the differences to the reference genes. + data[ + !gene %chin% reference_gene_ids, + difference := compute_difference( + distance, + reference_gene_ids + ), + by = gene + ] - progress(0.5) + progress(0.5) - # Exclude the reference gene itself when computing its - # difference. - data[ - gene %chin% reference_gene_ids, - difference := compute_difference( - distance, - reference_gene_ids[reference_gene_ids != gene] - ), - by = gene - ] + # Exclude the reference gene itself when computing its + # difference. + data[ + gene %chin% reference_gene_ids, + difference := compute_difference( + distance, + reference_gene_ids[reference_gene_ids != gene] + ), + by = gene + ] - # Compute the final score by normalizing the difference. - data[, score := 1 - difference / max(difference)] + # Compute the final score by normalizing the difference. + data[, score := 1 - difference / max(difference)] - progress(1.0) + progress(1.0) - result( - method = "adjacency", - scores = data[, .(gene, score)], - details = list(data = data) - ) - } - ) + result( + method = "adjacency", + scores = data[, .(gene, score)], + details = list(data = data) + ) } - ) + ) + } + ) } diff --git a/R/method_clustering.R b/R/method_clustering.R index 51087f5..7c34052 100644 --- a/R/method_clustering.R +++ b/R/method_clustering.R @@ -13,33 +13,33 @@ #' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49 #' etc. clusteriness <- function(data, span = 100000, weight = 0.7) { - n <- length(data) + n <- length(data) - # Return a score of 0.0 if there is just one or no value at all. - if (n < 2) { - return(0.0) + # Return a score of 0.0 if there is just one or no value at all. + if (n < 2) { + return(0.0) + } + + # Cluster the data and compute the cluster sizes. + + tree <- stats::hclust(stats::dist(data)) + clusters <- stats::cutree(tree, h = span) + cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) + + # Compute the "clusteriness" score. + + score <- 0.0 + + for (i in seq_along(cluster_sizes)) { + cluster_size <- cluster_sizes[i] + + if (cluster_size >= 2) { + cluster_score <- cluster_size / n + score <- score + weight^(i - 1) * cluster_score } + } - # Cluster the data and compute the cluster sizes. - - tree <- stats::hclust(stats::dist(data)) - clusters <- stats::cutree(tree, h = span) - cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) - - # Compute the "clusteriness" score. - - score <- 0.0 - - for (i in seq_along(cluster_sizes)) { - cluster_size <- cluster_sizes[i] - - if (cluster_size >= 2) { - cluster_score <- cluster_size / n - score <- score + weight^(i - 1) * cluster_score - } - } - - score + score } #' Process genes clustering their distance to telomeres. @@ -53,41 +53,41 @@ clusteriness <- function(data, span = 100000, weight = 0.7) { #' #' @export clustering <- function() { - method( - id = "clustering", - name = "Clustering", - description = "Clustering of genes", - function(preset, progress) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids + method( + id = "clustering", + name = "Clustering", + description = "Clustering of genes", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids - cached("clustering", c(species_ids, gene_ids), { - scores <- data.table(gene = gene_ids) + cached("clustering", c(species_ids, gene_ids), { + scores <- data.table(gene = gene_ids) - # Prefilter the input data by species. - distances <- geposan::distances[species %chin% species_ids] + # Prefilter the input data by species. + distances <- geposan::distances[species %chin% species_ids] - genes_done <- 0 - genes_total <- length(gene_ids) + genes_done <- 0 + genes_total <- length(gene_ids) - # Perform the cluster analysis for one gene. - compute <- function(gene_id) { - data <- distances[gene == gene_id, distance] - score <- clusteriness(data) + # Perform the cluster analysis for one gene. + compute <- function(gene_id) { + data <- distances[gene == gene_id, distance] + score <- clusteriness(data) - genes_done <<- genes_done + 1 - progress(genes_done / genes_total) + genes_done <<- genes_done + 1 + progress(genes_done / genes_total) - score - } - - scores[, score := compute(gene), by = gene] - - result( - method = "clustering", - scores = scores - ) - }) + score } - ) + + scores[, score := compute(gene), by = gene] + + result( + method = "clustering", + scores = scores + ) + }) + } + ) } diff --git a/R/method_correlation.R b/R/method_correlation.R index 44b36f6..1b69861 100644 --- a/R/method_correlation.R +++ b/R/method_correlation.R @@ -8,96 +8,97 @@ #' #' @export correlation <- function(summarize = stats::median) { - method( - id = "correlation", - name = "Correlation", - description = "Correlation with reference genes", - function(preset, progress) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - reference_gene_ids <- preset$reference_gene_ids + method( + id = "correlation", + name = "Correlation", + description = "Correlation with reference genes", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + reference_gene_ids <- preset$reference_gene_ids - cached( - "correlation", - c(species_ids, gene_ids, reference_gene_ids, summarize), - { # nolint - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + cached( + "correlation", + c(species_ids, gene_ids, reference_gene_ids, summarize), + { # nolint + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Tranform data to get species as rows and genes as columns. - # We construct columns per species, because it requires - # fewer iterations, and transpose the table afterwards. + # Tranform data to get species as rows and genes as columns. + # We construct columns per species, because it requires + # fewer iterations, and transpose the table afterwards. - data <- data.table(gene = gene_ids) + data <- data.table(gene = gene_ids) - # Make a column containing distance data for each species. - for (species_id in species_ids) { - species_data <- distances[ - species == species_id, - .(gene, distance) - ] + # Make a column containing distance data for each species. + for (species_id in species_ids) { + species_data <- distances[ + species == species_id, + .(gene, distance) + ] - data <- merge(data, species_data, all.x = TRUE) - setnames(data, "distance", species_id) - } + data <- merge(data, species_data, all.x = TRUE) + setnames(data, "distance", species_id) + } - # Transpose to the desired format. - data <- transpose(data, make.names = "gene") + # Transpose to the desired format. + data <- transpose(data, make.names = "gene") - progress(0.33) + progress(0.33) - # Take the reference data. - reference_data <- data[, ..reference_gene_ids] + # Take the reference data. + reference_data <- data[, ..reference_gene_ids] - # Perform the correlation between all possible pairs. - results <- stats::cor( - data[, ..gene_ids], - reference_data, - use = "pairwise.complete.obs", - method = "spearman" - ) + # Perform the correlation between all possible pairs. + results <- stats::cor( + data[, ..gene_ids], + reference_data, + use = "pairwise.complete.obs", + method = "spearman" + ) - results <- data.table(results, keep.rownames = TRUE) - setnames(results, "rn", "gene") + results <- data.table(results, keep.rownames = TRUE) + setnames(results, "rn", "gene") - # Remove correlations between the reference genes - # themselves. - for (reference_gene_id in reference_gene_ids) { - column <- quote(reference_gene_id) - results[gene == reference_gene_id, eval(column) := NA] - } + # Remove correlations between the reference genes + # themselves. + for (reference_gene_id in reference_gene_ids) { + column <- quote(reference_gene_id) + results[gene == reference_gene_id, eval(column) := NA] + } - progress(0.66) + progress(0.66) - # Combine the correlation coefficients. - results[, - max_correlation := as.double(summarize(stats::na.omit( - # Convert the data.table subset into a - # vector to get the correct na.omit - # behavior. - as.matrix(.SD)[1, ] - ))), - .SDcols = reference_gene_ids, - by = gene - ] + # Combine the correlation coefficients. + results[, + max_correlation := as.double(summarize(stats::na.omit( + # Convert the data.table subset into a + # vector to get the correct na.omit + # behavior. + as.matrix(.SD)[1, ] + ))), + .SDcols = reference_gene_ids, + by = gene + ] - # Normalize scores. - results[, - score := (max_correlation - min(max_correlation)) / - (max(max_correlation) - min(max_correlation)) - ] + # Normalize scores. + results[ + , + score := (max_correlation - min(max_correlation)) / + (max(max_correlation) - min(max_correlation)) + ] - # Normalize scores. + # Normalize scores. - results[, .(gene, score)] + results[, .(gene, score)] - result( - method = "correlation", - scores = results[, .(gene, score)], - details = list(all_correlations = results) - ) - } - ) + result( + method = "correlation", + scores = results[, .(gene, score)], + details = list(all_correlations = results) + ) } - ) + ) + } + ) } diff --git a/R/method_neural.R b/R/method_neural.R index 2b3323c..6f62d49 100644 --- a/R/method_neural.R +++ b/R/method_neural.R @@ -12,244 +12,244 @@ #' #' @export neural <- function(seed = 180199, n_models = 5) { - method( - id = "neural", - name = "Neural", - description = "Assessment by neural network", - function(preset, progress) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - reference_gene_ids <- preset$reference_gene_ids + method( + id = "neural", + name = "Neural", + description = "Assessment by neural network", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + reference_gene_ids <- preset$reference_gene_ids - cached( - "neural", - c(species_ids, gene_ids, reference_gene_ids, seed, n_models), - { # nolint - reference_count <- length(reference_gene_ids) - stopifnot(n_models %in% 2:reference_count) + cached( + "neural", + c(species_ids, gene_ids, reference_gene_ids, seed, n_models), + { # nolint + reference_count <- length(reference_gene_ids) + stopifnot(n_models %in% 2:reference_count) - # Make results reproducible. - tensorflow::set_random_seed(seed) + # Make results reproducible. + tensorflow::set_random_seed(seed) - # Step 1: Prepare input data. - # --------------------------- + # Step 1: Prepare input data. + # --------------------------- - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Input data for the network. This contains the gene ID as - # an identifier as well as the per-species gene distances as - # input variables. - data <- data.table(gene = gene_ids) + # Input data for the network. This contains the gene ID as + # an identifier as well as the per-species gene distances as + # input variables. + data <- data.table(gene = gene_ids) - # Buffer to keep track of the names of the input variables. - input_vars <- NULL + # Buffer to keep track of the names of the input variables. + input_vars <- NULL - # Make a columns containing positions and distances for each - # species. - for (species_id in species_ids) { - species_data <- distances[ - species == species_id, - .(gene, distance) - ] + # Make a columns containing positions and distances for each + # species. + for (species_id in species_ids) { + species_data <- distances[ + species == species_id, + .(gene, distance) + ] - # Only include species with at least 25% known values. - # As positions and distances always coexist, we don't - # loose any data here. + # Only include species with at least 25% known values. + # As positions and distances always coexist, we don't + # loose any data here. - species_data <- stats::na.omit(species_data) + species_data <- stats::na.omit(species_data) - if (nrow(species_data) >= 0.25 * length(gene_ids)) { - data <- merge(data, species_data, all.x = TRUE) + if (nrow(species_data) >= 0.25 * length(gene_ids)) { + data <- merge(data, species_data, all.x = TRUE) - # Replace missing data with mean values. The neural - # network can't handle NAs in a meaningful way. - # Choosing extreme values here would result in - # heavily biased results. Therefore, the mean value - # is chosen as a compromise. However, this will of - # course lessen the significance of the results. + # Replace missing data with mean values. The neural + # network can't handle NAs in a meaningful way. + # Choosing extreme values here would result in + # heavily biased results. Therefore, the mean value + # is chosen as a compromise. However, this will of + # course lessen the significance of the results. - mean_distance <- round( - species_data[, mean(distance)] - ) + mean_distance <- round( + species_data[, mean(distance)] + ) - data[is.na(distance), distance := mean_distance] + data[is.na(distance), distance := mean_distance] - # Name the new column after the species. - setnames(data, "distance", species_id) + # Name the new column after the species. + setnames(data, "distance", species_id) - # Add the input variable to the buffer. - input_vars <- c(input_vars, species_id) - } - } + # Add the input variable to the buffer. + input_vars <- c(input_vars, species_id) + } + } - progress(0.1) + progress(0.1) - # Step 2: Prepare training data. - # ------------------------------ + # Step 2: Prepare training data. + # ------------------------------ - # Take out the reference data. + # Take out the reference data. - reference_data <- data[gene %chin% reference_gene_ids] - reference_data[, score := 1.0] + reference_data <- data[gene %chin% reference_gene_ids] + reference_data[, score := 1.0] - # Take out random samples from the remaining genes. This is - # another compromise with a negative impact on - # significance. We assume that a random gene is not likely - # to match the reference genes. + # Take out random samples from the remaining genes. This is + # another compromise with a negative impact on + # significance. We assume that a random gene is not likely + # to match the reference genes. - without_reference_data <- data[ - !gene %chin% reference_gene_ids - ] + without_reference_data <- data[ + !gene %chin% reference_gene_ids + ] - control_data <- without_reference_data[ - sample( - nrow(without_reference_data), - reference_count - ) - ] - - control_data[, score := 0.0] - - # Split the training data into random sets to have - # validation data for each model. - - # Scramble the source tables. - reference_data <- reference_data[sample(reference_count)] - control_data <- control_data[sample(reference_count)] - - networks <- list() - - indices <- seq_len(reference_count) - indices_split <- split(indices, indices %% n_models) - - for (i in seq_len(n_models)) { - training_data <- rbindlist(list( - reference_data[!indices_split[[i]]], - control_data[!indices_split[[i]]] - )) - - validation_data <- rbindlist(list( - reference_data[indices_split[[i]]], - control_data[indices_split[[i]]] - )) - - networks[[i]] <- list( - training_data = training_data, - validation_data = validation_data - ) - } - - # Step 3: Create, train and apply neural network. - # ----------------------------------------------- - - # Layers for the neural network. - input_layer <- length(input_vars) - layer1 <- input_layer - layer2 <- 0.5 * input_layer - layer3 <- 0.5 * layer2 - - # Convert data to matrix and normalize it. - to_matrix <- function(data) { - data_matrix <- as.matrix(data[, ..input_vars]) - colnames(data_matrix) <- NULL - keras::normalize(data_matrix) - } - - data_matrix <- to_matrix(data) - output_vars <- NULL - - for (i in seq_along(networks)) { - # Create a new model for each training session, because - # the model would keep its state across training - # sessions otherwise. - model <- keras::keras_model_sequential() |> - keras::layer_dense( - units = layer1, - activation = "relu", - input_shape = input_layer, - ) |> - keras::layer_dense( - units = layer2, - activation = "relu", - kernel_regularizer = keras::regularizer_l2() - ) |> - keras::layer_dense( - units = layer3, - activation = "relu", - kernel_regularizer = keras::regularizer_l2() - ) |> - keras::layer_dense( - units = 1, - activation = "sigmoid" - ) |> - keras::compile( - loss = keras::loss_mean_absolute_error(), - optimizer = keras::optimizer_adam() - ) - - # Train the model. - - network <- networks[[i]] - - training_data <- network$training_data - training_matrix <- to_matrix(training_data) - validation_data <- network$validation_data - validation_matrix <- to_matrix(validation_data) - - fit <- keras::fit( - model, - x = training_matrix, - y = training_data$score, - validation_data = list( - x_val = validation_matrix, - y_val = validation_data$score - ), - epochs = 500, - verbose = FALSE - ) - - # Apply the model. - - data[, new_score := stats::predict(model, data_matrix)] - - # Remove the values of the training data itself. - data[gene %chin% training_data$gene, new_score := NA] - - output_var <- sprintf("score%i", i) - setnames(data, "new_score", output_var) - output_vars <- c(output_vars, output_var) - - - # Store the details. - - networks[[i]]$model <- keras::serialize_model(model) - networks[[i]]$fit <- fit - - progress(0.1 + i * (0.9 / n_models)) - } - - # Compute the final score as the mean score. - data[, - score := mean(as.numeric(.SD), na.rm = TRUE), - .SDcols = output_vars, - by = gene - ] - - progress(1.0) - - result( - method = "neural", - scores = data[, .(gene, score)], - details = list( - seed = seed, - n_models = n_models, - all_results = data[, !..input_vars], - networks = networks - ) - ) - } + control_data <- without_reference_data[ + sample( + nrow(without_reference_data), + reference_count ) + ] + + control_data[, score := 0.0] + + # Split the training data into random sets to have + # validation data for each model. + + # Scramble the source tables. + reference_data <- reference_data[sample(reference_count)] + control_data <- control_data[sample(reference_count)] + + networks <- list() + + indices <- seq_len(reference_count) + indices_split <- split(indices, indices %% n_models) + + for (i in seq_len(n_models)) { + training_data <- rbindlist(list( + reference_data[!indices_split[[i]]], + control_data[!indices_split[[i]]] + )) + + validation_data <- rbindlist(list( + reference_data[indices_split[[i]]], + control_data[indices_split[[i]]] + )) + + networks[[i]] <- list( + training_data = training_data, + validation_data = validation_data + ) + } + + # Step 3: Create, train and apply neural network. + # ----------------------------------------------- + + # Layers for the neural network. + input_layer <- length(input_vars) + layer1 <- input_layer + layer2 <- 0.5 * input_layer + layer3 <- 0.5 * layer2 + + # Convert data to matrix and normalize it. + to_matrix <- function(data) { + data_matrix <- as.matrix(data[, ..input_vars]) + colnames(data_matrix) <- NULL + keras::normalize(data_matrix) + } + + data_matrix <- to_matrix(data) + output_vars <- NULL + + for (i in seq_along(networks)) { + # Create a new model for each training session, because + # the model would keep its state across training + # sessions otherwise. + model <- keras::keras_model_sequential() |> + keras::layer_dense( + units = layer1, + activation = "relu", + input_shape = input_layer, + ) |> + keras::layer_dense( + units = layer2, + activation = "relu", + kernel_regularizer = keras::regularizer_l2() + ) |> + keras::layer_dense( + units = layer3, + activation = "relu", + kernel_regularizer = keras::regularizer_l2() + ) |> + keras::layer_dense( + units = 1, + activation = "sigmoid" + ) |> + keras::compile( + loss = keras::loss_mean_absolute_error(), + optimizer = keras::optimizer_adam() + ) + + # Train the model. + + network <- networks[[i]] + + training_data <- network$training_data + training_matrix <- to_matrix(training_data) + validation_data <- network$validation_data + validation_matrix <- to_matrix(validation_data) + + fit <- keras::fit( + model, + x = training_matrix, + y = training_data$score, + validation_data = list( + x_val = validation_matrix, + y_val = validation_data$score + ), + epochs = 500, + verbose = FALSE + ) + + # Apply the model. + + data[, new_score := stats::predict(model, data_matrix)] + + # Remove the values of the training data itself. + data[gene %chin% training_data$gene, new_score := NA] + + output_var <- sprintf("score%i", i) + setnames(data, "new_score", output_var) + output_vars <- c(output_vars, output_var) + + + # Store the details. + + networks[[i]]$model <- keras::serialize_model(model) + networks[[i]]$fit <- fit + + progress(0.1 + i * (0.9 / n_models)) + } + + # Compute the final score as the mean score. + data[, + score := mean(as.numeric(.SD), na.rm = TRUE), + .SDcols = output_vars, + by = gene + ] + + progress(1.0) + + result( + method = "neural", + scores = data[, .(gene, score)], + details = list( + seed = seed, + n_models = n_models, + all_results = data[, !..input_vars], + networks = networks + ) + ) } - ) + ) + } + ) } diff --git a/R/method_proximity.R b/R/method_proximity.R index 3861517..20a6d55 100644 --- a/R/method_proximity.R +++ b/R/method_proximity.R @@ -11,38 +11,38 @@ #' #' @export proximity <- function(summarize = stats::median) { - method( - id = "proximity", - name = "Proximity", - description = "Proximity to telomeres", - function(preset, progress) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids + method( + id = "proximity", + name = "Proximity", + description = "Proximity to telomeres", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids - cached("proximity", c(species_ids, gene_ids), { - # Prefilter distances by species and gene. - data <- geposan::distances[ - species %chin% preset$species_ids & - gene %chin% preset$gene_ids - ] + cached("proximity", c(species_ids, gene_ids), { + # Prefilter distances by species and gene. + data <- geposan::distances[ + species %chin% preset$species_ids & + gene %chin% preset$gene_ids + ] - # Compute the score as described above. - data <- data[, - .(combined_distance = as.double(summarize(distance))), - by = "gene" - ] + # Compute the score as described above. + data <- data[, + .(combined_distance = as.double(summarize(distance))), + by = "gene" + ] - # Normalize scores. - data[, score := 1 - combined_distance / max(combined_distance)] + # Normalize scores. + data[, score := 1 - combined_distance / max(combined_distance)] - progress(1.0) + progress(1.0) - result( - method = "proximity", - scores = data[, .(gene, score)], - details = list(data = data) - ) - }) - } - ) + result( + method = "proximity", + scores = data[, .(gene, score)], + details = list(data = data) + ) + }) + } + ) } diff --git a/R/method_species_adjacency.R b/R/method_species_adjacency.R index d4d042f..76f06b5 100644 --- a/R/method_species_adjacency.R +++ b/R/method_species_adjacency.R @@ -15,136 +15,136 @@ #' @export species_adjacency <- function(distance_estimate = stats::median, summarize = stats::median) { - method( - id = "species_adjacency", - name = "Species adj.", - description = "Species adjacency", - function(preset, progress) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - reference_gene_ids <- preset$reference_gene_ids + method( + id = "species_adjacency", + name = "Species adj.", + description = "Species adjacency", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + reference_gene_ids <- preset$reference_gene_ids - cached( - "species_adjacency", - c( - species_ids, - gene_ids, - reference_gene_ids, - distance_estimate, - summarize - ), - { # nolint - # Prefilter distances. - data <- geposan::distances[ - species %chin% species_ids & gene %chin% gene_ids - ] + cached( + "species_adjacency", + c( + species_ids, + gene_ids, + reference_gene_ids, + distance_estimate, + summarize + ), + { # nolint + # Prefilter distances. + data <- geposan::distances[ + species %chin% species_ids & gene %chin% gene_ids + ] - progress_state <- 0.0 - progress_step <- 0.9 / length(species_ids) + progress_state <- 0.0 + progress_step <- 0.9 / length(species_ids) - # Iterate through all species and find the distance - # estimates within that species. - for (species_id in species_ids) { - # For all genes, compute the distance to one reference - # gene at a time in one go. - for (reference_gene_id in reference_gene_ids) { - comparison_distance <- data[ - species == species_id & - gene == reference_gene_id, - distance - ] + # Iterate through all species and find the distance + # estimates within that species. + for (species_id in species_ids) { + # For all genes, compute the distance to one reference + # gene at a time in one go. + for (reference_gene_id in reference_gene_ids) { + comparison_distance <- data[ + species == species_id & + gene == reference_gene_id, + distance + ] - column <- quote(reference_gene_id) + column <- quote(reference_gene_id) - if (length(comparison_distance) != 1) { - # If we don't have a comparison distance, we - # can't compute a difference. This happens, if - # the species doesn't have the reference gene. - data[ - species == species_id & - gene %chin% gene_ids, - eval(column) := NA_integer_ - ] - } else { - data[ - species == species_id & - gene %chin% gene_ids, - eval(column) := - abs(distance - comparison_distance) - ] - } - } + if (length(comparison_distance) != 1) { + # If we don't have a comparison distance, we + # can't compute a difference. This happens, if + # the species doesn't have the reference gene. + data[ + species == species_id & + gene %chin% gene_ids, + eval(column) := NA_integer_ + ] + } else { + data[ + species == species_id & + gene %chin% gene_ids, + eval(column) := + abs(distance - comparison_distance) + ] + } + } - # Combine the distances to the different reference genes - # into one value using the provided function. - data[ - species == species_id & - gene %chin% gene_ids, - combined_distance := as.numeric( - distance_estimate(stats::na.omit( - # Convert the data.table subset into a - # vector to get the correct na.omit - # behavior. - as.matrix(.SD)[1, ] - )) - ), - .SDcols = reference_gene_ids, - by = gene - ] + # Combine the distances to the different reference genes + # into one value using the provided function. + data[ + species == species_id & + gene %chin% gene_ids, + combined_distance := as.numeric( + distance_estimate(stats::na.omit( + # Convert the data.table subset into a + # vector to get the correct na.omit + # behavior. + as.matrix(.SD)[1, ] + )) + ), + .SDcols = reference_gene_ids, + by = gene + ] - progress_state <- progress_state + progress_step - progress(progress_state) - } + progress_state <- progress_state + progress_step + progress(progress_state) + } - progress(0.9) + progress(0.9) - # Remove the distances between the reference genes. - for (reference_gene_id in reference_gene_ids) { - column <- quote(reference_gene_id) - data[gene == reference_gene_id, eval(column) := NA] - } + # Remove the distances between the reference genes. + for (reference_gene_id in reference_gene_ids) { + column <- quote(reference_gene_id) + data[gene == reference_gene_id, eval(column) := NA] + } - # Recompute the combined distance for the reference genes. - data[ - gene %chin% reference_gene_ids, - combined_distance := as.numeric( - distance_estimate(stats::na.omit( - as.matrix(.SD)[1, ]) - ) - ), - .SDcols = reference_gene_ids, - by = list(species, gene) - ] + # Recompute the combined distance for the reference genes. + data[ + gene %chin% reference_gene_ids, + combined_distance := as.numeric( + distance_estimate(stats::na.omit( + as.matrix(.SD)[1, ] + )) + ), + .SDcols = reference_gene_ids, + by = list(species, gene) + ] - # Combine the distances into one value. - results <- data[, - .( - summarized_distances = as.numeric( - summarize(stats::na.omit(combined_distance)) - ) - ), - by = gene - ] + # Combine the distances into one value. + results <- data[, + .( + summarized_distances = as.numeric( + summarize(stats::na.omit(combined_distance)) + ) + ), + by = gene + ] - # Compute the final score by normalizing the difference. - results[ - , - score := 1 - summarized_distances / - max(summarized_distances) - ] + # Compute the final score by normalizing the difference. + results[ + , + score := 1 - summarized_distances / + max(summarized_distances) + ] - progress(1.0) + progress(1.0) - result( - method = "species_adjacency", - scores = results[, .(gene, score)], - details = list( - data = data, - results = results - ) - ) - } + result( + method = "species_adjacency", + scores = results[, .(gene, score)], + details = list( + data = data, + results = results ) + ) } - ) + ) + } + ) } diff --git a/R/plots.R b/R/plots.R index 6b59352..0dd8c1a 100644 --- a/R/plots.R +++ b/R/plots.R @@ -9,7 +9,7 @@ base_color_transparent <- function() "#1964bf80" #' Color palette for gene sets. #' @noRd gene_set_color <- function(index) { - c("#FF7F00", "#4DAF4A", "#984EA3")[index] + c("#FF7F00", "#4DAF4A", "#984EA3")[index] } #' Plot gene positions. @@ -22,74 +22,74 @@ gene_set_color <- function(index) { #' #' @export plot_positions <- function(species_ids, gene_sets) { - if (!requireNamespace("plotly", quietly = TRUE)) { - stop("Please install \"plotly\" to use this function.") - } + if (!requireNamespace("plotly", quietly = TRUE)) { + stop("Please install \"plotly\" to use this function.") + } - # Prefilter data by species. - data <- geposan::distances[species %chin% species_ids] + # Prefilter data by species. + data <- geposan::distances[species %chin% species_ids] - species_max_distance <- data[, - .(max_distance = max(distance)), - by = species - ] + species_max_distance <- data[, + .(max_distance = max(distance)), + by = species + ] - # Prefilter species. - species <- geposan::species[id %chin% species_ids] + # Prefilter species. + species <- geposan::species[id %chin% species_ids] - plot <- plotly::plot_ly() |> - plotly::layout( - xaxis = list( - title = "Species", - tickvals = species$id, - ticktext = species$name - ), - yaxis = list(title = "Distance to telomeres [Bp]"), - bargap = 0.9 - ) |> - plotly::add_bars( - data = species_max_distance, - x = ~species, - y = ~max_distance, - name = "All genes", - marker = list(color = base_color()) + plot <- plotly::plot_ly() |> + plotly::layout( + xaxis = list( + title = "Species", + tickvals = species$id, + ticktext = species$name + ), + yaxis = list(title = "Distance to telomeres [Bp]"), + bargap = 0.9 + ) |> + plotly::add_bars( + data = species_max_distance, + x = ~species, + y = ~max_distance, + name = "All genes", + marker = list(color = base_color()) + ) + + if (length(gene_sets) > 0) { + # Include gene information which will be used for labeling + gene_set_data <- merge( + data[gene %chin% unlist(gene_sets)], + geposan::genes, + by.x = "gene", + by.y = "id" + ) + + index <- 1 + + for (gene_set_name in names(gene_sets)) { + gene_set <- gene_sets[[gene_set_name]] + + plot <- plot |> plotly::add_markers( + data = gene_set_data[gene %chin% gene_set], + x = ~species, + y = ~distance, + name = gene_set_name, + text = ~ glue::glue( + "{name}
", + "{round(distance / 1000000, digits = 2)} MBp" + ), + hoverinfo = "text", + marker = list( + size = 10, + color = gene_set_color(index) ) + ) - if (length(gene_sets) > 0) { - # Include gene information which will be used for labeling - gene_set_data <- merge( - data[gene %chin% unlist(gene_sets)], - geposan::genes, - by.x = "gene", - by.y = "id" - ) - - index <- 1 - - for (gene_set_name in names(gene_sets)) { - gene_set <- gene_sets[[gene_set_name]] - - plot <- plot |> plotly::add_markers( - data = gene_set_data[gene %chin% gene_set], - x = ~species, - y = ~distance, - name = gene_set_name, - text = ~ glue::glue( - "{name}
", - "{round(distance / 1000000, digits = 2)} MBp" - ), - hoverinfo = "text", - marker = list( - size = 10, - color = gene_set_color(index) - ) - ) - - index <- index + 1 - } + index <- index + 1 } + } - plot + plot } @@ -108,75 +108,75 @@ plot_positions <- function(species_ids, gene_sets) { #' #' @export plot_rankings <- function(rankings, gene_sets) { - if (!requireNamespace("plotly", quietly = TRUE)) { - stop("Please install \"plotly\" to use this function.") - } + if (!requireNamespace("plotly", quietly = TRUE)) { + stop("Please install \"plotly\" to use this function.") + } - plot <- plotly::plot_ly() |> - plotly::layout( - xaxis = list(tickvals = names(rankings)), - yaxis = list(title = "Score") + plot <- plotly::plot_ly() |> + plotly::layout( + xaxis = list(tickvals = names(rankings)), + yaxis = list(title = "Score") + ) + + is_first <- TRUE + + for (ranking_name in names(rankings)) { + ranking <- rankings[[ranking_name]] + + plot <- plot |> plotly::add_trace( + data = ranking, + x = ranking_name, + y = ~score, + name = "All genes", + type = "violin", + spanmode = "hard", + points = FALSE, + showlegend = is_first, + hoverinfo = "skip", + line = list(color = base_color()), + fillcolor = base_color_transparent() + ) + + if (length(gene_sets) > 0) { + gene_set_data <- merge( + ranking[gene %chin% unlist(gene_sets)], + geposan::genes, + by.x = "gene", + by.y = "id" + ) + + index <- 1 + + for (gene_set_name in names(gene_sets)) { + gene_set <- gene_sets[[gene_set_name]] + + plot <- plot |> plotly::add_markers( + data = gene_set_data[gene %chin% gene_set], + x = ranking_name, + y = ~score, + name = gene_set_name, + text = ~ glue::glue( + "{name}
", + "Score: {round(score, digits = 2)}
", + "Rank: {rank}
", + "Percentile: {round(percentile * 100, digits = 2)}%" + ), + hoverinfo = "text", + showlegend = is_first, + marker = list( + size = 10, + color = gene_set_color(index) + ) ) - is_first <- TRUE - - for (ranking_name in names(rankings)) { - ranking <- rankings[[ranking_name]] - - plot <- plot |> plotly::add_trace( - data = ranking, - x = ranking_name, - y = ~score, - name = "All genes", - type = "violin", - spanmode = "hard", - points = FALSE, - showlegend = is_first, - hoverinfo = "skip", - line = list(color = base_color()), - fillcolor = base_color_transparent() - ) - - if (length(gene_sets) > 0) { - gene_set_data <- merge( - ranking[gene %chin% unlist(gene_sets)], - geposan::genes, - by.x = "gene", - by.y = "id" - ) - - index <- 1 - - for (gene_set_name in names(gene_sets)) { - gene_set <- gene_sets[[gene_set_name]] - - plot <- plot |> plotly::add_markers( - data = gene_set_data[gene %chin% gene_set], - x = ranking_name, - y = ~score, - name = gene_set_name, - text = ~ glue::glue( - "{name}
", - "Score: {round(score, digits = 2)}
", - "Rank: {rank}
", - "Percentile: {round(percentile * 100, digits = 2)}%" - ), - hoverinfo = "text", - showlegend = is_first, - marker = list( - size = 10, - color = gene_set_color(index) - ) - ) - - index <- index + 1 - } - } - - is_first <- FALSE + index <- index + 1 + } } - plot + is_first <- FALSE + } + + plot } @@ -194,88 +194,88 @@ plot_rankings <- function(rankings, gene_sets) { #' #' @export plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) { - if (!requireNamespace("plotly", quietly = TRUE)) { - stop("Please install \"plotly\" to use this function.") - } + if (!requireNamespace("plotly", quietly = TRUE)) { + stop("Please install \"plotly\" to use this function.") + } - # To speed up rendering, don't show every single gene. - n_ranks <- nrow(ranking) - sample_ranking <- ranking[seq(1, n_ranks, 5)] + # To speed up rendering, don't show every single gene. + n_ranks <- nrow(ranking) + sample_ranking <- ranking[seq(1, n_ranks, 5)] - plot <- plotly::plot_ly() |> - plotly::add_lines( - data = sample_ranking, - x = ~percentile, - y = ~score, - name = "All genes", - hoverinfo = "skip", - line = list(width = 4, color = base_color()) - ) |> - plotly::layout( - xaxis = list( - title = "Percentile", - tickformat = ".0%" - ), - yaxis = list(title = "Score") + plot <- plotly::plot_ly() |> + plotly::add_lines( + data = sample_ranking, + x = ~percentile, + y = ~score, + name = "All genes", + hoverinfo = "skip", + line = list(width = 4, color = base_color()) + ) |> + plotly::layout( + xaxis = list( + title = "Percentile", + tickformat = ".0%" + ), + yaxis = list(title = "Score") + ) + + if (length(gene_sets) > 0) { + # Include gene information which will be used for labeling + gene_set_data <- merge( + ranking[gene %chin% unlist(gene_sets)], + geposan::genes, + by.x = "gene", + by.y = "id" + ) + + index <- 1 + + for (gene_set_name in names(gene_sets)) { + gene_set <- gene_sets[[gene_set_name]] + + plot <- plot |> plotly::add_markers( + data = gene_set_data[gene %chin% gene_set], + x = ~percentile, + y = ~score, + name = gene_set_name, + text = ~ glue::glue( + "{name}
", + "Score: {round(score, digits = 2)}
", + "Rank: {rank}
", + "Percentile: {round(percentile * 100, digits = 2)}%" + ), + hoverinfo = "text", + marker = list( + size = 10, + color = gene_set_color(index) ) + ) - if (length(gene_sets) > 0) { - # Include gene information which will be used for labeling - gene_set_data <- merge( - ranking[gene %chin% unlist(gene_sets)], - geposan::genes, - by.x = "gene", - by.y = "id" + index <- index + 1 + } + } + + + if (!is.null(max_rank)) { + first_not_included_rank <- max_rank + 1 + last_rank <- ranking[, .N] + + if (first_not_included_rank <= last_rank) { + plot <- plot |> plotly::layout( + shapes = list( + type = "rect", + fillcolor = "black", + opacity = 0.1, + x0 = 1 - first_not_included_rank / n_ranks, + x1 = 1 - last_rank / n_ranks, + y0 = 0.0, + y1 = 1.0 ) - - index <- 1 - - for (gene_set_name in names(gene_sets)) { - gene_set <- gene_sets[[gene_set_name]] - - plot <- plot |> plotly::add_markers( - data = gene_set_data[gene %chin% gene_set], - x = ~percentile, - y = ~score, - name = gene_set_name, - text = ~ glue::glue( - "{name}
", - "Score: {round(score, digits = 2)}
", - "Rank: {rank}
", - "Percentile: {round(percentile * 100, digits = 2)}%" - ), - hoverinfo = "text", - marker = list( - size = 10, - color = gene_set_color(index) - ) - ) - - index <- index + 1 - } + ) } + } - - if (!is.null(max_rank)) { - first_not_included_rank <- max_rank + 1 - last_rank <- ranking[, .N] - - if (first_not_included_rank <= last_rank) { - plot <- plot |> plotly::layout( - shapes = list( - type = "rect", - fillcolor = "black", - opacity = 0.1, - x0 = 1 - first_not_included_rank / n_ranks, - x1 = 1 - last_rank / n_ranks, - y0 = 0.0, - y1 = 1.0 - ) - ) - } - } - - plot + plot } #' Visualize a ranking by comparing gene sets in a boxplot. @@ -290,44 +290,44 @@ plot_scores <- function(ranking, gene_sets = NULL, max_rank = NULL) { #' #' @export plot_boxplot <- function(ranking, gene_sets = NULL) { - if (!requireNamespace("plotly", quietly = TRUE)) { - stop("Please install \"plotly\" to use this function.") + if (!requireNamespace("plotly", quietly = TRUE)) { + stop("Please install \"plotly\" to use this function.") + } + + plot <- plotly::plot_ly() |> + plotly::add_boxplot( + data = ranking, + x = "All genes", + y = ~score, + name = "All genes", + showlegend = FALSE, + line = list(color = base_color()) + ) |> + plotly::layout( + xaxis = list(tickvals = c("All genes", names(gene_sets))), + yaxis = list(title = "Score") + ) + + if (length(gene_sets) > 0) { + index <- 1 + + for (gene_set_name in names(gene_sets)) { + gene_set <- gene_sets[[gene_set_name]] + + plot <- plot |> plotly::add_boxplot( + data = ranking[gene %chin% gene_set], + x = gene_set_name, + y = ~score, + name = gene_set_name, + showlegend = FALSE, + line = list(color = gene_set_color(index)) + ) + + index <- index + 1 } + } - plot <- plotly::plot_ly() |> - plotly::add_boxplot( - data = ranking, - x = "All genes", - y = ~score, - name = "All genes", - showlegend = FALSE, - line = list(color = base_color()) - ) |> - plotly::layout( - xaxis = list(tickvals = c("All genes", names(gene_sets))), - yaxis = list(title = "Score") - ) - - if (length(gene_sets) > 0) { - index <- 1 - - for (gene_set_name in names(gene_sets)) { - gene_set <- gene_sets[[gene_set_name]] - - plot <- plot |> plotly::add_boxplot( - data = ranking[gene %chin% gene_set], - x = gene_set_name, - y = ~score, - name = gene_set_name, - showlegend = FALSE, - line = list(color = gene_set_color(index)) - ) - - index <- index + 1 - } - } - - plot + plot } #' Show the distribution of scores across chromosomes. @@ -340,46 +340,46 @@ plot_boxplot <- function(ranking, gene_sets = NULL) { #' #' @export plot_chromosomes <- function(ranking) { - if (!requireNamespace("plotly", quietly = TRUE)) { - stop("Please install \"plotly\" to use this function.") - } + if (!requireNamespace("plotly", quietly = TRUE)) { + stop("Please install \"plotly\" to use this function.") + } - data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id") - data <- data[, .(score = mean(score)), by = "chromosome"] + data <- merge(ranking, geposan::genes, by.x = "gene", by.y = "id") + data <- data[, .(score = mean(score)), by = "chromosome"] - # Get an orderable integer from a chromosome name. - chromosome_index <- function(chromosome) { - index <- suppressWarnings(as.integer(chromosome)) + # Get an orderable integer from a chromosome name. + chromosome_index <- function(chromosome) { + index <- suppressWarnings(as.integer(chromosome)) - ifelse( - !is.na(index), - index, - ifelse( - chromosome == "X", - 998, - 999 - ) - ) - } + ifelse( + !is.na(index), + index, + ifelse( + chromosome == "X", + 998, + 999 + ) + ) + } - data[, index := chromosome_index(chromosome)] - setorder(data, "index") + data[, index := chromosome_index(chromosome)] + setorder(data, "index") - plotly::plot_ly( - data = data, - x = ~chromosome, - y = ~score, - type = "bar", - marker = list(color = base_color()) - ) |> - plotly::layout( - xaxis = list( - title = "Chromosome", - categoryorder = "array", - categoryarray = ~chromosome - ), - yaxis = list(title = "Mean score") - ) + plotly::plot_ly( + data = data, + x = ~chromosome, + y = ~score, + type = "bar", + marker = list(color = base_color()) + ) |> + plotly::layout( + xaxis = list( + title = "Chromosome", + categoryorder = "array", + categoryarray = ~chromosome + ), + yaxis = list(title = "Mean score") + ) } #' Plot scores in relation to chromosomal position of genes. @@ -398,74 +398,74 @@ plot_chromosomes <- function(ranking) { plot_scores_by_position <- function(ranking, chromosome_name = NULL, gene_sets = NULL) { - if (!requireNamespace("plotly", quietly = TRUE)) { - stop("Please install \"plotly\" to use this function.") - } + if (!requireNamespace("plotly", quietly = TRUE)) { + stop("Please install \"plotly\" to use this function.") + } - distance_data <- if (!is.null(chromosome_name)) { - chromosome_name_ <- chromosome_name - geposan::distances[ - species == "hsapiens" & - chromosome_name == chromosome_name_ - ] - } else { - geposan::distances[species == "hsapiens"] - } + distance_data <- if (!is.null(chromosome_name)) { + chromosome_name_ <- chromosome_name + geposan::distances[ + species == "hsapiens" & + chromosome_name == chromosome_name_ + ] + } else { + geposan::distances[species == "hsapiens"] + } - data <- merge(ranking, distance_data, by = "gene") + data <- merge(ranking, distance_data, by = "gene") - data <- merge( - data, - geposan::genes, - by.x = "gene", - by.y = "id" + data <- merge( + data, + geposan::genes, + by.x = "gene", + by.y = "id" + ) + + data[, `:=`(gene_set = "All genes", color = base_color())] + + index <- 1 + for (gene_set_name in names(gene_sets)) { + gene_set_genes <- gene_sets[[gene_set_name]] + data[ + gene %chin% gene_set_genes, + `:=`( + gene_set = gene_set_name, + color = gene_set_color(index) + ) + ] + + index <- index + 1 + } + + # Use distances instead of positions in case all chromosomes are included. + if (is.null(chromosome_name)) { + data[, x := distance] + } else { + data[, x := start_position] + } + + plotly::plot_ly() |> + plotly::add_markers( + data = data, + x = ~x, + y = ~score, + name = ~gene_set, + text = ~ glue::glue( + "{name}
", + if (is.null(chromosome_name)) "Distance: " else "Position: ", + "{round(x / 1000000, digits = 2)} MBp
", + "Score: {round(score, digits = 2)}
", + "Rank: {rank}
", + "Percentile: {round(percentile * 100, digits = 2)}%" + ), + hoverinfo = "text", + ) |> + plotly::layout( + xaxis = list(title = if (is.null(chromosome_name)) { + "Distance (Bp)" + } else { + "Position (Bp)" + }), + yaxis = list(title = "Score") ) - - data[, `:=`(gene_set = "All genes", color = base_color())] - - index <- 1 - for (gene_set_name in names(gene_sets)) { - gene_set_genes <- gene_sets[[gene_set_name]] - data[ - gene %chin% gene_set_genes, - `:=`( - gene_set = gene_set_name, - color = gene_set_color(index) - ) - ] - - index <- index + 1 - } - - # Use distances instead of positions in case all chromosomes are included. - if (is.null(chromosome_name)) { - data[, x := distance] - } else { - data[, x := start_position] - } - - plotly::plot_ly() |> - plotly::add_markers( - data = data, - x = ~x, - y = ~score, - name = ~gene_set, - text = ~ glue::glue( - "{name}
", - if (is.null(chromosome_name)) "Distance: " else "Position: ", - "{round(x / 1000000, digits = 2)} MBp
", - "Score: {round(score, digits = 2)}
", - "Rank: {rank}
", - "Percentile: {round(percentile * 100, digits = 2)}%" - ), - hoverinfo = "text", - ) |> - plotly::layout( - xaxis = list(title = if (is.null(chromosome_name)) { - "Distance (Bp)" - } else { - "Position (Bp)" - }), - yaxis = list(title = "Score") - ) } diff --git a/R/preset.R b/R/preset.R index ad15018..e798bbb 100644 --- a/R/preset.R +++ b/R/preset.R @@ -21,55 +21,55 @@ preset <- function(reference_gene_ids, methods = all_methods(), species_ids = geposan::species$id, gene_ids = geposan::genes$id) { - # Count included species per gene. - genes_n_species <- geposan::distances[ - species %chin% species_ids, - .(n_species = .N), - by = "gene" - ] + # Count included species per gene. + genes_n_species <- geposan::distances[ + species %chin% species_ids, + .(n_species = .N), + by = "gene" + ] - # Filter out genes with less than 25% existing orthologs. - gene_ids_filtered <- genes_n_species[ - gene %chin% gene_ids & - n_species >= 0.25 * length(species_ids), - gene - ] + # Filter out genes with less than 25% existing orthologs. + gene_ids_filtered <- genes_n_species[ + gene %chin% gene_ids & + n_species >= 0.25 * length(species_ids), + gene + ] - reference_gene_ids_excluded <- reference_gene_ids[ - !reference_gene_ids %chin% gene_ids_filtered - ] + reference_gene_ids_excluded <- reference_gene_ids[ + !reference_gene_ids %chin% gene_ids_filtered + ] - if (length(reference_gene_ids_excluded > 0)) { - warning(paste0( - "The following reference gene IDs are excluded from the preset ", - "because they don't have enough data: ", - paste(reference_gene_ids_excluded, collapse = ", ") - )) - } + if (length(reference_gene_ids_excluded > 0)) { + warning(paste0( + "The following reference gene IDs are excluded from the preset ", + "because they don't have enough data: ", + paste(reference_gene_ids_excluded, collapse = ", ") + )) + } - reference_gene_ids_included <- reference_gene_ids[ - reference_gene_ids %chin% gene_ids_filtered - ] + reference_gene_ids_included <- reference_gene_ids[ + reference_gene_ids %chin% gene_ids_filtered + ] - if (length(reference_gene_ids_included) < 1) { - stop(paste0( - "There has to be at least one reference gene for the preset to be ", - "valid. Please note that some methods may require more reference ", - "genes." - )) - } + if (length(reference_gene_ids_included) < 1) { + stop(paste0( + "There has to be at least one reference gene for the preset to be ", + "valid. Please note that some methods may require more reference ", + "genes." + )) + } - # The included data gets sorted to be able to produce predictable hashes - # for the object later. - structure( - list( - reference_gene_ids = sort(reference_gene_ids_included), - methods = methods, - species_ids = sort(species_ids), - gene_ids = sort(gene_ids_filtered) - ), - class = "geposan_preset" - ) + # The included data gets sorted to be able to produce predictable hashes + # for the object later. + structure( + list( + reference_gene_ids = sort(reference_gene_ids_included), + methods = methods, + species_ids = sort(species_ids), + gene_ids = sort(gene_ids_filtered) + ), + class = "geposan_preset" + ) } #' S3 method to print a preset object. @@ -81,20 +81,20 @@ preset <- function(reference_gene_ids, #' #' @export print.geposan_preset <- function(x, ...) { - cat(sprintf( - paste0( - "geposan preset:", - "\n Reference genes: %i", - "\n Included methods: %s", - "\n Number of species: %i", - "\n Number of genes: %i", - "\n" - ), - length(x$reference_gene_ids), - paste(sapply(x$methods, function(m) m$id), collapse = ", "), - length(x$species_ids), - length(x$gene_ids) - )) + cat(sprintf( + paste0( + "geposan preset:", + "\n Reference genes: %i", + "\n Included methods: %s", + "\n Number of species: %i", + "\n Number of genes: %i", + "\n" + ), + length(x$reference_gene_ids), + paste(sapply(x$methods, function(m) m$id), collapse = ", "), + length(x$species_ids), + length(x$gene_ids) + )) - invisible(x) + invisible(x) } diff --git a/R/ranking.R b/R/ranking.R index 47ec098..2721f57 100644 --- a/R/ranking.R +++ b/R/ranking.R @@ -13,35 +13,35 @@ #' #' @export ranking <- function(analysis, weights) { - ranking <- if (inherits(analysis, "geposan_analysis")) { - copy(analysis$scores) - } else if (inherits(analysis, "geposan_ranking")) { - copy(analysis) - } else { - stop("Invalid analyis. Use geposan::analyze().") - } + ranking <- if (inherits(analysis, "geposan_analysis")) { + copy(analysis$scores) + } else if (inherits(analysis, "geposan_ranking")) { + copy(analysis) + } else { + stop("Invalid analyis. Use geposan::analyze().") + } - ranking[, score := 0.0] + ranking[, score := 0.0] - for (method in names(weights)) { - weighted <- weights[[method]] * ranking[, ..method] - ranking[, score := score + weighted] - } + for (method in names(weights)) { + weighted <- weights[[method]] * ranking[, ..method] + ranking[, score := score + weighted] + } - # Normalize scores to be between 0.0 and 1.0. - min_score <- ranking[, min(score)] - max_score <- ranking[, max(score)] - score_range <- max_score - min_score - ranking[, score := (score - min_score) / score_range] + # Normalize scores to be between 0.0 and 1.0. + min_score <- ranking[, min(score)] + max_score <- ranking[, max(score)] + score_range <- max_score - min_score + ranking[, score := (score - min_score) / score_range] - setorder(ranking, -score) - ranking[, rank := .I] - ranking[, percentile := 1 - rank / nrow(ranking)] + setorder(ranking, -score) + ranking[, rank := .I] + ranking[, percentile := 1 - rank / nrow(ranking)] - structure( - ranking, - class = c("geposan_ranking", class(ranking)) - ) + structure( + ranking, + class = c("geposan_ranking", class(ranking)) + ) } #' Find the best weights to rank the results. @@ -61,40 +61,40 @@ ranking <- function(analysis, weights) { #' @export optimal_weights <- function(analysis, methods, reference_gene_ids, target = "mean") { - if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) { - stop("Invalid analyis. Use geposan::analyze().") + if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) { + stop("Invalid analyis. Use geposan::analyze().") + } + + + # Compute the target rank of the reference genes when applying the + # weights. + target_rank <- function(factors) { + data <- ranking(analysis, as.list(factors)) + + result <- data[ + gene %chin% reference_gene_ids, + if (target == "min") { + min(rank) + } else if (target == "max") { + max(rank) + } else if (target == "mean") { + mean(rank) + } else { + stats::median(rank) + } + ] + + if (result > 0) { + result + } else { + Inf } + } + initial_factors <- rep(1.0, length(methods)) + names(initial_factors) <- methods - # Compute the target rank of the reference genes when applying the - # weights. - target_rank <- function(factors) { - data <- ranking(analysis, as.list(factors)) + optimal_factors <- stats::optim(initial_factors, target_rank)$par - result <- data[ - gene %chin% reference_gene_ids, - if (target == "min") { - min(rank) - } else if (target == "max") { - max(rank) - } else if (target == "mean") { - mean(rank) - } else { - stats::median(rank) - } - ] - - if (result > 0) { - result - } else { - Inf - } - } - - initial_factors <- rep(1.0, length(methods)) - names(initial_factors) <- methods - - optimal_factors <- stats::optim(initial_factors, target_rank)$par - - as.list(optimal_factors / max(abs(optimal_factors))) + as.list(optimal_factors / max(abs(optimal_factors))) } diff --git a/R/result.R b/R/result.R index c16ffc0..7c370e7 100644 --- a/R/result.R +++ b/R/result.R @@ -10,18 +10,18 @@ #' #' @export result <- function(method_id, scores, details = list()) { - stopifnot(is.data.frame(scores) & - c("gene", "score") %chin% colnames(scores)) - stopifnot(is.list(details)) + stopifnot(is.data.frame(scores) & + c("gene", "score") %chin% colnames(scores)) + stopifnot(is.list(details)) - structure( - list( - method_id = method_id, - scores = scores, - details = details - ), - class = "geposan_result" - ) + structure( + list( + method_id = method_id, + scores = scores, + details = details + ), + class = "geposan_result" + ) } #' Print a result object. @@ -33,18 +33,18 @@ result <- function(method_id, scores, details = list()) { #' #' @export print.geposan_result <- function(x, ...) { - cat(sprintf( - paste0( - "geposan result:", - "\n Method: %s", - "\n Number of genes: %i", - "\n Available details: %s", - "\n" - ), - x$method_id, - nrow(x$scores), - paste(names(x$details), collapse = ", ") - )) + cat(sprintf( + paste0( + "geposan result:", + "\n Method: %s", + "\n Number of genes: %i", + "\n Available details: %s", + "\n" + ), + x$method_id, + nrow(x$scores), + paste(names(x$details), collapse = ", ") + )) - invisible(x) + invisible(x) } diff --git a/R/utils.R b/R/utils.R index a1f0820..b590219 100644 --- a/R/utils.R +++ b/R/utils.R @@ -8,25 +8,25 @@ # @param objects A vector of objects that this expression depends on. The hash # of those objects will be used for identifying the cache file. cached <- function(name, objects, expr) { - if (!dir.exists("cache")) { - dir.create("cache") - } + if (!dir.exists("cache")) { + dir.create("cache") + } - id <- rlang::hash(objects) - cache_file <- sprintf("cache/%s_%s.rda", name, id) + id <- rlang::hash(objects) + cache_file <- sprintf("cache/%s_%s.rda", name, id) - if (file.exists(cache_file)) { - # If the cache file exists, we restore the data from it. - load(cache_file) - } else { - # If the cache file doesn't exist, we have to do the computation. - data <- expr + if (file.exists(cache_file)) { + # If the cache file exists, we restore the data from it. + load(cache_file) + } else { + # If the cache file doesn't exist, we have to do the computation. + data <- expr - # The results are cached for the next run. - save(data, file = cache_file, compress = "xz") - } + # The results are cached for the next run. + save(data, file = cache_file, compress = "xz") + } - data + data } # This is needed to make data.table's symbols available within the package. diff --git a/R/validate.R b/R/validate.R index 848c079..2c673f6 100644 --- a/R/validate.R +++ b/R/validate.R @@ -27,68 +27,68 @@ #' #' @export validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) { - if (!inherits(ranking, "geposan_ranking")) { - stop("Ranking is invalid. Use geposan::ranking().") - } + if (!inherits(ranking, "geposan_ranking")) { + stop("Ranking is invalid. Use geposan::ranking().") + } - if (is.null(progress)) { - progress_bar <- progress::progress_bar$new() - progress_bar$update(0.0) + if (is.null(progress)) { + progress_bar <- progress::progress_bar$new() + progress_bar$update(0.0) - progress <- function(progress_value) { - if (!progress_bar$finished) { - progress_bar$update(progress_value) - if (progress_value >= 1.0) { - progress_bar$terminate() - } - } + progress <- function(progress_value) { + if (!progress_bar$finished) { + progress_bar$update(progress_value) + if (progress_value >= 1.0) { + progress_bar$terminate() } + } } + } - progress_state <- 0.0 - progress_step <- 1.0 / length(reference_gene_ids) + progress_state <- 0.0 + progress_step <- 1.0 / length(reference_gene_ids) - results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)] + results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)] - for (gene_id in reference_gene_ids) { - included_gene_ids <- reference_gene_ids[ - reference_gene_ids != gene_id - ] + for (gene_id in reference_gene_ids) { + included_gene_ids <- reference_gene_ids[ + reference_gene_ids != gene_id + ] - weights <- optimal_weights( - ranking, - method_ids, - included_gene_ids - ) - - ranking_validation <- ranking(ranking, weights) - - results[ - gene == gene_id, - percentile_validation := ranking_validation[ - gene == gene_id, - percentile - ] - ] - - if (!is.null(progress)) { - progress_state <- progress_state + progress_step - progress(progress_state) - } - } - - results[, error := percentile - percentile_validation] - setorder(results, error) - - structure( - list( - validation = results, - mean_percentile_original = results[, mean(percentile)], - mean_percentile_validation = results[, mean(percentile_validation)], - mean_error = results[, mean(error)] - ), - class = "geposan_validation" + weights <- optimal_weights( + ranking, + method_ids, + included_gene_ids ) + + ranking_validation <- ranking(ranking, weights) + + results[ + gene == gene_id, + percentile_validation := ranking_validation[ + gene == gene_id, + percentile + ] + ] + + if (!is.null(progress)) { + progress_state <- progress_state + progress_step + progress(progress_state) + } + } + + results[, error := percentile - percentile_validation] + setorder(results, error) + + structure( + list( + validation = results, + mean_percentile_original = results[, mean(percentile)], + mean_percentile_validation = results[, mean(percentile_validation)], + mean_error = results[, mean(error)] + ), + class = "geposan_validation" + ) } #' S3 method to print a validation object. @@ -100,18 +100,18 @@ validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) { #' #' @export print.geposan_validation <- function(x, ...) { - cat(sprintf( - paste0( - "geposan validation:", - "\n Mean percentile original: %.1f%%", - "\n Mean percentile validation: %.1f%%", - "\n Mean error: %.1f percent points", - "\n" - ), - x$mean_percentile_original * 100, - x$mean_percentile_validation * 100, - x$mean_error * 100 - )) + cat(sprintf( + paste0( + "geposan validation:", + "\n Mean percentile original: %.1f%%", + "\n Mean percentile validation: %.1f%%", + "\n Mean error: %.1f percent points", + "\n" + ), + x$mean_percentile_original * 100, + x$mean_percentile_validation * 100, + x$mean_error * 100 + )) - invisible(x) + invisible(x) } diff --git a/scripts/chromosome_names.R b/scripts/chromosome_names.R index cf1a59a..e516f49 100644 --- a/scripts/chromosome_names.R +++ b/scripts/chromosome_names.R @@ -5,23 +5,23 @@ ensembl_api_url <- "https://rest.ensembl.org" #' Perform a request to the Ensembl REST API. ensembl_request <- function(api_path) { - content(stop_for_status(GET( - paste0(ensembl_api_url, api_path), - content_type_json() - ))) + content(stop_for_status(GET( + paste0(ensembl_api_url, api_path), + content_type_json() + ))) } #' Get IDs of all available vertebrates. get_species_ids <- function() { - species <- ensembl_request("/info/species")$species - sapply(species, function(species) species$name) + species <- ensembl_request("/info/species")$species + sapply(species, function(species) species$name) } #' Get all chromosomes names for a species. get_species_chromosomes <- function(species_id) { - chromosomes <- unlist(ensembl_request( - paste0("/info/assembly/", species_id) - )$karyotype) + chromosomes <- unlist(ensembl_request( + paste0("/info/assembly/", species_id) + )$karyotype) } #' Get a vector of all available unqiue chromosome names. @@ -29,6 +29,6 @@ get_species_chromosomes <- function(species_id) { #' There are multiple names for mitochondrial DNA which have to be removed #' manually, unfortunately. get_all_chromosomes <- function() { - chromosomes <- sapply(get_species_ids(), get_species_chromosomes) - unique(unlist(chromosomes)) + chromosomes <- sapply(get_species_ids(), get_species_chromosomes) + unique(unlist(chromosomes)) } diff --git a/scripts/ensembl.R b/scripts/ensembl.R index c66d706..ab052fc 100644 --- a/scripts/ensembl.R +++ b/scripts/ensembl.R @@ -12,8 +12,8 @@ ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl)) # Filter out species ID and name from the result. species <- ensembl_datasets[, .( - id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2], - name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2] + id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2], + name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2] )] # List of assemblies that the Ensembl Rest API advertises as chromosomes. @@ -23,232 +23,232 @@ species <- ensembl_datasets[, .( # # See get_all_chromosomes() valid_chromosome_names <- c( - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "X", - "Y", - "21", - "4A", - "1A", - "22", - "23", - "24", - "25LG1", - "25LG2", - "26", - "27", - "28", - "LGE22", - "Z", - "25", - "29", - "A1", - "A2", - "A3", - "B1", - "B2", - "B3", - "B4", - "C1", - "C2", - "D1", - "D2", - "D3", - "D4", - "E1", - "E2", - "E3", - "F1", - "F2", - "2A", - "2B", - "LG01", - "LG02", - "LG03", - "LG04", - "LG05", - "LG06", - "LG07", - "LG08", - "LG09", - "LG10", - "LG11", - "LG12", - "LG13", - "LG14", - "LG15", - "LG16", - "LG17", - "LG18", - "LG19", - "LG20", - "LG21", - "LG22", - "LG23", - "LG24", - "LG25", - "I", - "II", - "III", - "IV", - "V", - "VI", - "VII", - "VIII", - "IX", - "XI", - "XII", - "XIII", - "XIV", - "XV", - "XVI", - "XVII", - "XVIII", - "XIX", - "XX", - "XXI", - "XXII", - "XXIII", - "XXIV", - "W", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "2L", - "2R", - "3L", - "3R", - "LG1", - "LG2", - "LG3", - "LG4", - "LG5", - "LG6", - "LG7", - "LG8", - "LG9", - "LG26", - "LG27", - "LG28", - "LG29", - "LG30", - "1a", - "7b", - "22a", - "LGE22C19W28_E50C23", - "LGE64", - "7a", - "MIC_1", - "MIC_10", - "MIC_11", - "MIC_2", - "MIC_3", - "MIC_4", - "MIC_5", - "MIC_6", - "MIC_7", - "MIC_8", - "MIC_9", - "sgr01", - "sgr02", - "sgr03", - "sgr04", - "sgr05", - "sgr06", - "sgr07", - "sgr08", - "sgr09", - "sgr10", - "sgr11", - "sgr12", - "sgr13", - "sgr14", - "sgr15", - "sgr16", - "sgr17", - "sgr18", - "sgr19", - "X1", - "X2", - "X3", - "X4", - "X5", - "a", - "b", - "c", - "d", - "f", - "g", - "h", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "LG28B", - "LG30F", - "LG36F", - "LG37M", - "LG42F", - "LG44F", - "LG45M", - "LG48F", - "LG49B", - "LG34", - "LG35", - "LG7_11", - "groupI", - "groupII", - "groupIII", - "groupIV", - "groupV", - "groupVI", - "groupVII", - "groupVIII", - "groupIX", - "groupX", - "groupXI", - "groupXII", - "groupXIII", - "groupXIV", - "groupXV", - "groupXVI", - "groupXVII", - "groupXVIII", - "groupXIX", - "groupXX", - "groupXXI" + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "X", + "Y", + "21", + "4A", + "1A", + "22", + "23", + "24", + "25LG1", + "25LG2", + "26", + "27", + "28", + "LGE22", + "Z", + "25", + "29", + "A1", + "A2", + "A3", + "B1", + "B2", + "B3", + "B4", + "C1", + "C2", + "D1", + "D2", + "D3", + "D4", + "E1", + "E2", + "E3", + "F1", + "F2", + "2A", + "2B", + "LG01", + "LG02", + "LG03", + "LG04", + "LG05", + "LG06", + "LG07", + "LG08", + "LG09", + "LG10", + "LG11", + "LG12", + "LG13", + "LG14", + "LG15", + "LG16", + "LG17", + "LG18", + "LG19", + "LG20", + "LG21", + "LG22", + "LG23", + "LG24", + "LG25", + "I", + "II", + "III", + "IV", + "V", + "VI", + "VII", + "VIII", + "IX", + "XI", + "XII", + "XIII", + "XIV", + "XV", + "XVI", + "XVII", + "XVIII", + "XIX", + "XX", + "XXI", + "XXII", + "XXIII", + "XXIV", + "W", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "2L", + "2R", + "3L", + "3R", + "LG1", + "LG2", + "LG3", + "LG4", + "LG5", + "LG6", + "LG7", + "LG8", + "LG9", + "LG26", + "LG27", + "LG28", + "LG29", + "LG30", + "1a", + "7b", + "22a", + "LGE22C19W28_E50C23", + "LGE64", + "7a", + "MIC_1", + "MIC_10", + "MIC_11", + "MIC_2", + "MIC_3", + "MIC_4", + "MIC_5", + "MIC_6", + "MIC_7", + "MIC_8", + "MIC_9", + "sgr01", + "sgr02", + "sgr03", + "sgr04", + "sgr05", + "sgr06", + "sgr07", + "sgr08", + "sgr09", + "sgr10", + "sgr11", + "sgr12", + "sgr13", + "sgr14", + "sgr15", + "sgr16", + "sgr17", + "sgr18", + "sgr19", + "X1", + "X2", + "X3", + "X4", + "X5", + "a", + "b", + "c", + "d", + "f", + "g", + "h", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "LG28B", + "LG30F", + "LG36F", + "LG37M", + "LG42F", + "LG44F", + "LG45M", + "LG48F", + "LG49B", + "LG34", + "LG35", + "LG7_11", + "groupI", + "groupII", + "groupIII", + "groupIV", + "groupV", + "groupVI", + "groupVII", + "groupVIII", + "groupIX", + "groupX", + "groupXI", + "groupXII", + "groupXIII", + "groupXIV", + "groupXV", + "groupXVI", + "groupXVII", + "groupXVIII", + "groupXIX", + "groupXX", + "groupXXI" ) #' Get all chromosome names for an Ensembl dataset. @@ -256,8 +256,8 @@ valid_chromosome_names <- c( #' The function tries to filter out valid chromosome names from the available #' assemblies in the dataset. get_chromosome_names <- function(dataset) { - chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") - chromosome_names[chromosome_names %chin% valid_chromosome_names] + chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") + chromosome_names[chromosome_names %chin% valid_chromosome_names] } # Retrieve information on human genes. This will only include genes on @@ -267,16 +267,16 @@ rlog::log_info("Retrieving information on human genes") dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl) human_data <- data.table(biomaRt::getBM( - attributes = c( - "ensembl_gene_id", - "hgnc_symbol", - "chromosome_name", - "start_position", - "end_position" - ), - filters = "chromosome_name", - values = get_chromosome_names(dataset), - mart = dataset + attributes = c( + "ensembl_gene_id", + "hgnc_symbol", + "chromosome_name", + "start_position", + "end_position" + ), + filters = "chromosome_name", + values = get_chromosome_names(dataset), + mart = dataset )) # Remove duplicated gene IDs (at the time of writing, there are a handful). @@ -284,9 +284,9 @@ human_data <- unique(human_data, by = "ensembl_gene_id") # Only keep relevant information on genes. genes <- human_data[, .( - id = ensembl_gene_id, - name = hgnc_symbol, - chromosome = chromosome_name + id = ensembl_gene_id, + name = hgnc_symbol, + chromosome = chromosome_name )] # Retrieve gene distance data across species. @@ -296,39 +296,39 @@ distances <- data.table() #' Handle data for one species. handle_species <- function(species_id, species_data) { - chromosomes <- species_data[, - .(chromosome_length = max(end_position)), - by = chromosome_name - ] + chromosomes <- species_data[, + .(chromosome_length = max(end_position)), + by = chromosome_name + ] - # Store the number of chromosomes in the species table. - species[id == species_id, n_chromosomes := nrow(chromosomes)] + # Store the number of chromosomes in the species table. + species[id == species_id, n_chromosomes := nrow(chromosomes)] - # Store the median chromosome length in the species table. - species[ - id == species_id, - median_chromosome_length := chromosomes[, median(chromosome_length)] - ] + # Store the median chromosome length in the species table. + species[ + id == species_id, + median_chromosome_length := chromosomes[, median(chromosome_length)] + ] - # Precompute the genes' distance to the nearest telomere. - species_distances <- species_data[ - chromosomes, - .( - species = species_id, - gene = ensembl_gene_id, - chromosome_name = chromosome_name, - start_position = start_position, - end_position = end_position, - distance = pmin( - start_position, - chromosome_length - end_position - ) - ), - on = "chromosome_name" - ] + # Precompute the genes' distance to the nearest telomere. + species_distances <- species_data[ + chromosomes, + .( + species = species_id, + gene = ensembl_gene_id, + chromosome_name = chromosome_name, + start_position = start_position, + end_position = end_position, + distance = pmin( + start_position, + chromosome_length - end_position + ) + ), + on = "chromosome_name" + ] - # Add species distances to the distances table. - distances <<- rbindlist(list(distances, species_distances)) + # Add species distances to the distances table. + distances <<- rbindlist(list(distances, species_distances)) } # Handle the human first, as we already retrieved the data and don't need to @@ -337,67 +337,67 @@ handle_species("hsapiens", human_data) # Iterate through all other species and retrieve their distance data. for (species_id in species[id != "hsapiens", id]) { - rlog::log_info(sprintf("Loading species \"%s\"", species_id)) + rlog::log_info(sprintf("Loading species \"%s\"", species_id)) - dataset <- biomaRt::useDataset( - sprintf("%s_gene_ensembl", species_id), - mart = ensembl - ) + dataset <- biomaRt::useDataset( + sprintf("%s_gene_ensembl", species_id), + mart = ensembl + ) - # Besides the attributes that are always present, we need to check for - # human orthologs. Some species don't have that information and will be - # skipped. - if (!"hsapiens_homolog_ensembl_gene" %chin% - biomaRt::listAttributes(dataset, what = "name")) { - rlog::log_info("No data on human orthologs") - species <- species[id != species_id] + # Besides the attributes that are always present, we need to check for + # human orthologs. Some species don't have that information and will be + # skipped. + if (!"hsapiens_homolog_ensembl_gene" %chin% + biomaRt::listAttributes(dataset, what = "name")) { + rlog::log_info("No data on human orthologs") + species <- species[id != species_id] - next - } + next + } - chromosome_names <- get_chromosome_names(dataset) + chromosome_names <- get_chromosome_names(dataset) - # Skip the species, if there are no assembled chromosomes. - if (length(chromosome_names) <= 0) { - rlog::log_info("No matching chromosome assemblies") - species <- species[id != species_id] + # Skip the species, if there are no assembled chromosomes. + if (length(chromosome_names) <= 0) { + rlog::log_info("No matching chromosome assemblies") + species <- species[id != species_id] - next - } + next + } - # Retrieve information on all genes of the current species, that have - # human orthologs. This is called "homolog" in the Ensembl schema. - species_distances <- data.table(biomaRt::getBM( - attributes = c( - "hsapiens_homolog_ensembl_gene", - "chromosome_name", - "start_position", - "end_position" - ), - filters = c("with_hsapiens_homolog", "chromosome_name"), - values = list(TRUE, chromosome_names), - mart = dataset - )) + # Retrieve information on all genes of the current species, that have + # human orthologs. This is called "homolog" in the Ensembl schema. + species_distances <- data.table(biomaRt::getBM( + attributes = c( + "hsapiens_homolog_ensembl_gene", + "chromosome_name", + "start_position", + "end_position" + ), + filters = c("with_hsapiens_homolog", "chromosome_name"), + values = list(TRUE, chromosome_names), + mart = dataset + )) - # Only include human genes that we have information on. - species_distances <- species_distances[ - hsapiens_homolog_ensembl_gene %chin% genes$id - ] + # Only include human genes that we have information on. + species_distances <- species_distances[ + hsapiens_homolog_ensembl_gene %chin% genes$id + ] - # Only include one ortholog per human gene. - species_distances <- unique( - species_distances, - by = "hsapiens_homolog_ensembl_gene" - ) + # Only include one ortholog per human gene. + species_distances <- unique( + species_distances, + by = "hsapiens_homolog_ensembl_gene" + ) - # Rename gene ID column to match the human data. - setnames( - species_distances, - "hsapiens_homolog_ensembl_gene", - "ensembl_gene_id" - ) + # Rename gene ID column to match the human data. + setnames( + species_distances, + "hsapiens_homolog_ensembl_gene", + "ensembl_gene_id" + ) - handle_species(species_id, species_distances) + handle_species(species_id, species_distances) } # Save data in the appropriate place.