From cfc5e7a6bf19726f4be5cdf9ec84c63eb6c46658 Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Fri, 5 Nov 2021 19:49:54 +0100 Subject: [PATCH] Implement all methods using positions additionally --- R/analyze.R | 20 ++++- R/clusteriness.R | 12 ++- R/correlation.R | 143 ++++++++++++++++++---------------- R/neural.R | 195 ++++++++++++++++++++++++++--------------------- R/proximity.R | 19 +++-- 5 files changed, 222 insertions(+), 167 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index f14cf96..64feb80 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -35,7 +35,19 @@ analyze <- function(preset, progress = NULL) { "clusteriness" = clusteriness, "correlation" = correlation, "proximity" = proximity, - "neural" = neural + "neural" = neural, + "clusteriness_positions" = function(...) { + clusteriness(..., use_positions = TRUE) + }, + "correlation_positions" = function(...) { + correlation(..., use_positions = TRUE) + }, + "proximity_positions" = function(...) { + proximity(..., use_positions = TRUE) + }, + "neural_positions" = function(...) { + neural(..., use_positions = TRUE) + } ) results <- cached("analysis", preset, { @@ -50,7 +62,11 @@ analyze <- function(preset, progress = NULL) { } } - method_results <- methods[[method_id]](preset, method_progress) + method_results <- methods[[method_id]]( + preset, + progress = method_progress + ) + setnames(method_results, "score", method_id) results <- merge( diff --git a/R/clusteriness.R b/R/clusteriness.R index 20ca849..f2490a1 100644 --- a/R/clusteriness.R +++ b/R/clusteriness.R @@ -36,11 +36,11 @@ clusteriness_priv <- function(data, height = 1000000) { } # Process genes clustering their distance to telomeres. -clusteriness <- function(preset, progress = NULL) { +clusteriness <- function(preset, use_positions = FALSE, progress = NULL) { species_ids <- preset$species_ids gene_ids <- preset$gene_ids - cached("clusteriness", c(species_ids, gene_ids), { + cached("clusteriness", c(species_ids, gene_ids, use_positions), { results <- data.table(gene = gene_ids) # Prefilter the input data by species. @@ -54,7 +54,13 @@ clusteriness <- function(preset, progress = NULL) { # Perform the cluster analysis for one gene. compute <- function(gene_id) { - score <- clusteriness_priv(distances[gene_id, distance]) + data <- if (use_positions) { + distances[gene_id, position] + } else { + distances[gene_id, distance] + } + + score <- clusteriness_priv(data) if (!is.null(progress)) { genes_done <<- genes_done + 1 diff --git a/R/correlation.R b/R/correlation.R index 6f93a1e..bfcdddc 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -1,79 +1,90 @@ # Compute the mean correlation coefficient comparing gene distances with a set # of reference genes. -correlation <- function(preset, progress = NULL) { +correlation <- function(preset, use_positions = FALSE, progress = NULL) { species_ids <- preset$species_ids gene_ids <- preset$gene_ids reference_gene_ids <- preset$reference_gene_ids - cached("correlation", c(species_ids, gene_ids, reference_gene_ids), { - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + cached( + "correlation", + c(species_ids, gene_ids, reference_gene_ids, use_positions), + { # nolint + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Tranform data to get species as rows and genes as columns. We - # construct columns per species, because it requires fewer iterations, - # and transpose the table afterwards. + # Tranform data to get species as rows and genes as columns. We + # construct columns per species, because it requires fewer + # iterations, and transpose the table afterwards. - data <- data.table(gene = gene_ids) + data <- data.table(gene = gene_ids) - # Make a column containing distance data for each species. - for (species_id in species_ids) { - species_distances <- distances[ - species == species_id, - .(gene, distance) - ] - - data <- merge(data, species_distances, all.x = TRUE) - setnames(data, "distance", species_id) - } - - # Transpose to the desired format. - data <- transpose(data, make.names = "gene") - - if (!is.null(progress)) progress(0.33) - - # Take the reference data. - reference_data <- data[, ..reference_gene_ids] - - # Perform the correlation between all possible pairs. - results <- stats::cor( - data[, ..gene_ids], - reference_data, - use = "pairwise.complete.obs", - method = "spearman" - ) - - results <- data.table(results, keep.rownames = TRUE) - setnames(results, "rn", "gene") - - # Remove correlations between the reference genes themselves. - for (reference_gene_id in reference_gene_ids) { - column <- quote(reference_gene_id) - results[gene == reference_gene_id, eval(column) := NA] - } - - if (!is.null(progress)) progress(0.66) - - # Compute the final score as the mean of known correlation scores. - # Negative correlations will correctly lessen the score, which will be - # clamped to zero as its lower bound. Genes with no possible - # correlations at all will be assumed to have a score of 0.0. - - compute_score <- function(scores) { - score <- mean(scores, na.rm = TRUE) - - if (is.na(score) | score < 0.0) { - score <- 0.0 + # Make a column containing distance data for each species. + for (species_id in species_ids) { + species_data <- if (use_positions) { + setnames(distances[ + species == species_id, + .(gene, position) + ], "position", "distance") + } else { + distances[ + species == species_id, + .(gene, distance) + ] + } } - score + data <- merge(data, species_data, all.x = TRUE) + setnames(data, "distance", species_id) + + # Transpose to the desired format. + data <- transpose(data, make.names = "gene") + + if (!is.null(progress)) progress(0.33) + + # Take the reference data. + reference_data <- data[, ..reference_gene_ids] + + # Perform the correlation between all possible pairs. + results <- stats::cor( + data[, ..gene_ids], + reference_data, + use = "pairwise.complete.obs", + method = "spearman" + ) + + results <- data.table(results, keep.rownames = TRUE) + setnames(results, "rn", "gene") + + # Remove correlations between the reference genes themselves. + for (reference_gene_id in reference_gene_ids) { + column <- quote(reference_gene_id) + results[gene == reference_gene_id, eval(column) := NA] + } + + if (!is.null(progress)) progress(0.66) + + # Compute the final score as the mean of known correlation scores. + # Negative correlations will correctly lessen the score, which will + # be clamped to zero as its lower bound. Genes with no possible + # correlations at all will be assumed to have a score of 0.0. + + compute_score <- function(scores) { + score <- mean(scores, na.rm = TRUE) + + if (is.na(score) | score < 0.0) { + score <- 0.0 + } + + score + } + + results[, + score := compute_score(as.matrix(.SD)), + .SDcols = reference_gene_ids, + by = gene + ] + + results[, .(gene, score)] } - - results[, - score := compute_score(as.matrix(.SD)), - .SDcols = reference_gene_ids, - by = gene - ] - - results[, .(gene, score)] - }) + ) } diff --git a/R/neural.R b/R/neural.R index 1c98d1c..680f2f0 100644 --- a/R/neural.R +++ b/R/neural.R @@ -1,114 +1,131 @@ # Find genes by training a neural network on reference position data. # # @param seed A seed to get reproducible results. -neural <- function(preset, progress = NULL, seed = 448077) { +neural <- function(preset, + use_positions = FALSE, + progress = NULL, + seed = 448077) { species_ids <- preset$species_ids gene_ids <- preset$gene_ids reference_gene_ids <- preset$reference_gene_ids - cached("neural", c(species_ids, gene_ids, reference_gene_ids), { - set.seed(seed) - gene_count <- length(gene_ids) + cached( + "neural", + c(species_ids, gene_ids, reference_gene_ids, use_positions), + { # nolint + set.seed(seed) + gene_count <- length(gene_ids) - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Input data for the network. This contains the gene ID as an identifier - # as well as the per-species gene distances as input variables. - data <- data.table(gene = gene_ids) + # Input data for the network. This contains the gene ID as an + # identifier as well as the per-species gene distances as input + # variables. + data <- data.table(gene = gene_ids) - # Buffer to keep track of species included in the computation. Species - # from `species_ids` may be excluded if they don't have enough data. - species_ids_included <- NULL + # Buffer to keep track of species included in the computation. + # Species from `species_ids` may be excluded if they don't have + # enough data. + species_ids_included <- NULL - # Make a column containing distance data for each species. - for (species_id in species_ids) { - species_distances <- distances[ - species == species_id, - .(gene, distance) + # Make a column containing distance data for each species. + for (species_id in species_ids) { + species_data <- if (use_positions) { + setnames(distances[ + species == species_id, + .(gene, position) + ], "position", "distance") + } else { + distances[ + species == species_id, + .(gene, distance) + ] + } + + # Only include species with at least 25% known values. + + species_distances <- stats::na.omit(species_data) + + if (nrow(species_distances) >= 0.25 * gene_count) { + species_ids_included <- c(species_ids_included, species_id) + data <- merge(data, species_distances, all.x = TRUE) + + # Replace missing data with mean values. The neural network + # can't handle NAs in a meaningful way. Choosing extreme + # values here would result in heavily biased results. + # Therefore, the mean value is chosen as a compromise. + # However, this will of course lessen the significance of + # the results. + + mean_distance <- round(species_distances[, mean(distance)]) + data[is.na(distance), distance := mean_distance] + + # Name the new column after the species. + setnames(data, "distance", species_id) + } + } + + # Extract the reference genes. + + reference_data <- data[gene %chin% reference_gene_ids] + reference_data[, neural := 1.0] + + # Take out random samples from the remaining genes. This is another + # compromise with a negative impact on significance. Because there + # is no information on genes with are explicitely *not* TPE-OLD + # genes, we have to assume that a random sample of genes has a low + # probability of including TPE-OLD genes. + + without_reference_data <- data[!gene %chin% reference_gene_ids] + + reference_samples <- without_reference_data[ + sample( + nrow(without_reference_data), + nrow(reference_data) + ) ] - # Only include species with at least 25% known values. + reference_samples[, neural := 0.0] - species_distances <- stats::na.omit(species_distances) + # Merge training data. The training data includes all reference + # genes as well as an equal number of random sample genes. + training_data <- rbindlist(list(reference_data, reference_samples)) - if (nrow(species_distances) >= 0.25 * gene_count) { - species_ids_included <- c(species_ids_included, species_id) - data <- merge(data, species_distances, all.x = TRUE) + # Construct and train the neural network. - # Replace missing data with mean values. The neural network - # can't handle NAs in a meaningful way. Choosing extreme values - # here would result in heavily biased results. Therefore, the - # mean value is chosen as a compromise. However, this will of - # course lessen the significance of the results. + nn_formula <- stats::as.formula(sprintf( + "neural~%s", + paste(species_ids_included, collapse = "+") + )) - mean_distance <- round(species_distances[, mean(distance)]) - data[is.na(distance), distance := mean_distance] + layer1 <- length(species_ids) * 0.66 + layer2 <- layer1 * 0.66 + layer3 <- layer2 * 0.66 - # Name the new column after the species. - setnames(data, "distance", species_id) - } - } - - # Extract the reference genes. - - reference_data <- data[gene %chin% reference_gene_ids] - reference_data[, neural := 1.0] - - # Take out random samples from the remaining genes. This is another - # compromise with a negative impact on significance. Because there is - # no information on genes with are explicitely *not* TPE-OLD genes, we - # have to assume that a random sample of genes has a low probability of - # including TPE-OLD genes. - - without_reference_data <- data[!gene %chin% reference_gene_ids] - - reference_samples <- without_reference_data[ - sample( - nrow(without_reference_data), - nrow(reference_data) + nn <- neuralnet::neuralnet( + nn_formula, + training_data, + hidden = c(layer1, layer2, layer3), + linear.output = FALSE ) - ] - reference_samples[, neural := 0.0] + if (!is.null(progress)) { + # We do everything in one go, so it's not possible to report + # detailed progress information. As the method is relatively + # quick, this should not be a problem. + progress(0.5) + } - # Merge training data. The training data includes all reference genes as - # well as an equal number of random sample genes. - training_data <- rbindlist(list(reference_data, reference_samples)) + # Apply the neural network. + data[, score := neuralnet::compute(nn, data)$net.result] - # Construct and train the neural network. + if (!is.null(progress)) { + # See above. + progress(1.0) + } - nn_formula <- stats::as.formula(sprintf( - "neural~%s", - paste(species_ids_included, collapse = "+") - )) - - layer1 <- length(species_ids) * 0.66 - layer2 <- layer1 * 0.66 - layer3 <- layer2 * 0.66 - - nn <- neuralnet::neuralnet( - nn_formula, - training_data, - hidden = c(layer1, layer2, layer3), - linear.output = FALSE - ) - - if (!is.null(progress)) { - # We do everything in one go, so it's not possible to report - # detailed progress information. As the method is relatively quick, - # this should not be a problem. - progress(0.5) + data[, .(gene, score)] } - - # Apply the neural network. - data[, score := neuralnet::compute(nn, data)$net.result] - - if (!is.null(progress)) { - # See above. - progress(1.0) - } - - data[, .(gene, score)] - }) + ) } diff --git a/R/proximity.R b/R/proximity.R index bcbbfd4..ed91660 100644 --- a/R/proximity.R +++ b/R/proximity.R @@ -2,21 +2,26 @@ # # A score will be given to each gene such that 0.0 corresponds to the maximal # mean distance across all genes and 1.0 corresponds to a distance of 0. -proximity <- function(preset, progress = NULL) { +proximity <- function(preset, use_positions = FALSE, progress = NULL) { species_ids <- preset$species_ids gene_ids <- preset$gene_ids - cached("proximity", c(species_ids, gene_ids), { + cached("proximity", c(species_ids, gene_ids, use_positions), { # Prefilter distances by species and gene. - distances <- geposan::distances[ + data <- geposan::distances[ species %chin% preset$species_ids & gene %chin% preset$gene_ids ] # Compute the score as described above. - distances <- distances[, .(mean_distance = mean(distance)), by = "gene"] - max_distance <- distances[, max(mean_distance)] - distances[, score := 1 - mean_distance / max_distance] + data <- if (use_positions) { + data[, .(mean_distance = mean(position)), by = "gene"] + } else { + data[, .(mean_distance = mean(distance)), by = "gene"] + } + + max_distance <- data[, max(mean_distance)] + data[, score := 1 - mean_distance / max_distance] if (!is.null(progress)) { # We do everything in one go, so it's not possible to report @@ -25,6 +30,6 @@ proximity <- function(preset, progress = NULL) { progress(1.0) } - distances[, .(gene, score)] + data[, .(gene, score)] }) }