diff --git a/DESCRIPTION b/DESCRIPTION index 283d115..4738307 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,8 @@ Depends: R (>= 2.10) Imports: data.table, - neuralnet + neuralnet, + rlang Suggests: biomaRt, rlog, diff --git a/R/analyze.R b/R/analyze.R index 249ead1..b8930e3 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -59,30 +59,32 @@ analyze <- function(preset, progress = NULL) { "neural" = neural ) - total_progress <- 0.0 - method_count <- length(preset$method_ids) - results <- data.table(gene = preset$gene_ids) + cached("results", preset, { + total_progress <- 0.0 + method_count <- length(preset$method_ids) + results <- data.table(gene = preset$gene_ids) - for (method_id in preset$method_ids) { - method_progress <- if (!is.null(progress)) function(p) { - progress(total_progress + p / method_count) + for (method_id in preset$method_ids) { + method_progress <- if (!is.null(progress)) function(p) { + progress(total_progress + p / method_count) + } + + method_results <- methods[[method_id]](preset, method_progress) + setnames(method_results, "score", method_id) + + results <- merge( + results, + method_results, + by = "gene" + ) + + total_progress <- total_progress + 1 / method_count } - method_results <- methods[[method_id]](preset, method_progress) - setnames(method_results, "score", method_id) + if (!is.null(progress)) { + progress(1.0) + } - results <- merge( - results, - method_results, - by = "gene" - ) - - total_progress <- total_progress + 1 / method_count - } - - if (!is.null(progress)) { - progress(1.0) - } - - results + results + }) } diff --git a/R/clusteriness.R b/R/clusteriness.R index 91c0154..20ca849 100644 --- a/R/clusteriness.R +++ b/R/clusteriness.R @@ -37,28 +37,33 @@ clusteriness_priv <- function(data, height = 1000000) { # Process genes clustering their distance to telomeres. clusteriness <- function(preset, progress = NULL) { - results <- data.table(gene = preset$gene_ids) + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids - # Prefilter the input data by species. - distances <- geposan::distances[species %chin% preset$species_ids] + cached("clusteriness", c(species_ids, gene_ids), { + results <- data.table(gene = gene_ids) - # Add an index for quickly accessing data per gene. - setkey(distances, gene) + # Prefilter the input data by species. + distances <- geposan::distances[species %chin% species_ids] - genes_done <- 0 - genes_total <- length(preset$gene_ids) + # Add an index for quickly accessing data per gene. + setkey(distances, gene) - # Perform the cluster analysis for one gene. - compute <- function(gene_id) { - score <- clusteriness_priv(distances[gene_id, distance]) + genes_done <- 0 + genes_total <- length(gene_ids) - if (!is.null(progress)) { - genes_done <<- genes_done + 1 - progress(genes_done / genes_total) + # Perform the cluster analysis for one gene. + compute <- function(gene_id) { + score <- clusteriness_priv(distances[gene_id, distance]) + + if (!is.null(progress)) { + genes_done <<- genes_done + 1 + progress(genes_done / genes_total) + } + + score } - score - } - - results[, score := compute(gene), by = 1:nrow(results)] + results[, score := compute(gene), by = 1:nrow(results)] + }) } diff --git a/R/correlation.R b/R/correlation.R index a5f9422..6f93a1e 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -5,69 +5,75 @@ correlation <- function(preset, progress = NULL) { gene_ids <- preset$gene_ids reference_gene_ids <- preset$reference_gene_ids - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + cached("correlation", c(species_ids, gene_ids, reference_gene_ids), { + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Tranform data to get species as rows and genes as columns. We construct - # columns per species, because it requires fewer iterations, and transpose - # the table afterwards. + # Tranform data to get species as rows and genes as columns. We + # construct columns per species, because it requires fewer iterations, + # and transpose the table afterwards. - data <- data.table(gene = gene_ids) + data <- data.table(gene = gene_ids) - # Make a column containing distance data for each species. - for (species_id in species_ids) { - species_distances <- distances[species == species_id, .(gene, distance)] - data <- merge(data, species_distances, all.x = TRUE) - setnames(data, "distance", species_id) - } + # Make a column containing distance data for each species. + for (species_id in species_ids) { + species_distances <- distances[ + species == species_id, + .(gene, distance) + ] - # Transpose to the desired format. - data <- transpose(data, make.names = "gene") - - if (!is.null(progress)) progress(0.33) - - # Take the reference data. - reference_data <- data[, ..reference_gene_ids] - - # Perform the correlation between all possible pairs. - results <- stats::cor( - data[, ..gene_ids], - reference_data, - use = "pairwise.complete.obs", - method = "spearman" - ) - - results <- data.table(results, keep.rownames = TRUE) - setnames(results, "rn", "gene") - - # Remove correlations between the reference genes themselves. - for (reference_gene_id in reference_gene_ids) { - column <- quote(reference_gene_id) - results[gene == reference_gene_id, eval(column) := NA] - } - - if (!is.null(progress)) progress(0.66) - - # Compute the final score as the mean of known correlation scores. Negative - # correlations will correctly lessen the score, which will be clamped to - # zero as its lower bound. Genes with no possible correlations at all will - # be assumed to have a score of 0.0. - - compute_score <- function(scores) { - score <- mean(scores, na.rm = TRUE) - - if (is.na(score) | score < 0.0) { - score <- 0.0 + data <- merge(data, species_distances, all.x = TRUE) + setnames(data, "distance", species_id) } - score - } + # Transpose to the desired format. + data <- transpose(data, make.names = "gene") - results[, - score := compute_score(as.matrix(.SD)), - .SDcols = reference_gene_ids, - by = gene - ] + if (!is.null(progress)) progress(0.33) - results[, .(gene, score)] + # Take the reference data. + reference_data <- data[, ..reference_gene_ids] + + # Perform the correlation between all possible pairs. + results <- stats::cor( + data[, ..gene_ids], + reference_data, + use = "pairwise.complete.obs", + method = "spearman" + ) + + results <- data.table(results, keep.rownames = TRUE) + setnames(results, "rn", "gene") + + # Remove correlations between the reference genes themselves. + for (reference_gene_id in reference_gene_ids) { + column <- quote(reference_gene_id) + results[gene == reference_gene_id, eval(column) := NA] + } + + if (!is.null(progress)) progress(0.66) + + # Compute the final score as the mean of known correlation scores. + # Negative correlations will correctly lessen the score, which will be + # clamped to zero as its lower bound. Genes with no possible + # correlations at all will be assumed to have a score of 0.0. + + compute_score <- function(scores) { + score <- mean(scores, na.rm = TRUE) + + if (is.na(score) | score < 0.0) { + score <- 0.0 + } + + score + } + + results[, + score := compute_score(as.matrix(.SD)), + .SDcols = reference_gene_ids, + by = gene + ] + + results[, .(gene, score)] + }) } diff --git a/R/neural.R b/R/neural.R index 0f7b05e..1c98d1c 100644 --- a/R/neural.R +++ b/R/neural.R @@ -3,106 +3,112 @@ # @param seed A seed to get reproducible results. neural <- function(preset, progress = NULL, seed = 448077) { species_ids <- preset$species_ids + gene_ids <- preset$gene_ids reference_gene_ids <- preset$reference_gene_ids - set.seed(seed) - gene_count <- length(preset$gene_ids) + cached("neural", c(species_ids, gene_ids, reference_gene_ids), { + set.seed(seed) + gene_count <- length(gene_ids) - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Input data for the network. This contains the gene ID as an identifier - # as well as the per-species gene distances as input variables. - data <- data.table(gene = preset$gene_ids) + # Input data for the network. This contains the gene ID as an identifier + # as well as the per-species gene distances as input variables. + data <- data.table(gene = gene_ids) - # Buffer to keep track of species included in the computation. Species - # from `species_ids` may be excluded if they don't have enough data. - species_ids_included <- NULL + # Buffer to keep track of species included in the computation. Species + # from `species_ids` may be excluded if they don't have enough data. + species_ids_included <- NULL - # Make a column containing distance data for each species. - for (species_id in species_ids) { - species_distances <- distances[species == species_id, .(gene, distance)] + # Make a column containing distance data for each species. + for (species_id in species_ids) { + species_distances <- distances[ + species == species_id, + .(gene, distance) + ] - # Only include species with at least 25% known values. + # Only include species with at least 25% known values. - species_distances <- stats::na.omit(species_distances) + species_distances <- stats::na.omit(species_distances) - if (nrow(species_distances) >= 0.25 * gene_count) { - species_ids_included <- c(species_ids_included, species_id) - data <- merge(data, species_distances, all.x = TRUE) + if (nrow(species_distances) >= 0.25 * gene_count) { + species_ids_included <- c(species_ids_included, species_id) + data <- merge(data, species_distances, all.x = TRUE) - # Replace missing data with mean values. The neural network can't - # handle NAs in a meaningful way. Choosing extreme values here - # would result in heavily biased results. Therefore, the mean value - # is chosen as a compromise. However, this will of course lessen the - # significance of the results. + # Replace missing data with mean values. The neural network + # can't handle NAs in a meaningful way. Choosing extreme values + # here would result in heavily biased results. Therefore, the + # mean value is chosen as a compromise. However, this will of + # course lessen the significance of the results. - mean_distance <- round(species_distances[, mean(distance)]) - data[is.na(distance), distance := mean_distance] + mean_distance <- round(species_distances[, mean(distance)]) + data[is.na(distance), distance := mean_distance] - # Name the new column after the species. - setnames(data, "distance", species_id) + # Name the new column after the species. + setnames(data, "distance", species_id) + } } - } - # Extract the reference genes. + # Extract the reference genes. - reference_data <- data[gene %chin% reference_gene_ids] - reference_data[, neural := 1.0] + reference_data <- data[gene %chin% reference_gene_ids] + reference_data[, neural := 1.0] - # Take out random samples from the remaining genes. This is another - # compromise with a negative impact on significance. Because there is no - # information on genes with are explicitely *not* TPE-OLD genes, we have to - # assume that a random sample of genes has a low probability of including - # TPE-OLD genes. + # Take out random samples from the remaining genes. This is another + # compromise with a negative impact on significance. Because there is + # no information on genes with are explicitely *not* TPE-OLD genes, we + # have to assume that a random sample of genes has a low probability of + # including TPE-OLD genes. - without_reference_data <- data[!gene %chin% reference_gene_ids] + without_reference_data <- data[!gene %chin% reference_gene_ids] - reference_samples <- without_reference_data[ - sample( - nrow(without_reference_data), - nrow(reference_data) + reference_samples <- without_reference_data[ + sample( + nrow(without_reference_data), + nrow(reference_data) + ) + ] + + reference_samples[, neural := 0.0] + + # Merge training data. The training data includes all reference genes as + # well as an equal number of random sample genes. + training_data <- rbindlist(list(reference_data, reference_samples)) + + # Construct and train the neural network. + + nn_formula <- stats::as.formula(sprintf( + "neural~%s", + paste(species_ids_included, collapse = "+") + )) + + layer1 <- length(species_ids) * 0.66 + layer2 <- layer1 * 0.66 + layer3 <- layer2 * 0.66 + + nn <- neuralnet::neuralnet( + nn_formula, + training_data, + hidden = c(layer1, layer2, layer3), + linear.output = FALSE ) - ] - reference_samples[, neural := 0.0] + if (!is.null(progress)) { + # We do everything in one go, so it's not possible to report + # detailed progress information. As the method is relatively quick, + # this should not be a problem. + progress(0.5) + } - # Merge training data. The training data includes all reference genes as - # well as an equal number of random sample genes. - training_data <- rbindlist(list(reference_data, reference_samples)) + # Apply the neural network. + data[, score := neuralnet::compute(nn, data)$net.result] - # Construct and train the neural network. + if (!is.null(progress)) { + # See above. + progress(1.0) + } - nn_formula <- stats::as.formula(sprintf( - "neural~%s", - paste(species_ids_included, collapse = "+") - )) - - layer1 <- length(species_ids) * 0.66 - layer2 <- layer1 * 0.66 - layer3 <- layer2 * 0.66 - - nn <- neuralnet::neuralnet( - nn_formula, - training_data, - hidden = c(layer1, layer2, layer3), - linear.output = FALSE - ) - - if (!is.null(progress)) { - # We do everything in one go, so it's not possible to report detailed - # progress information. As the method is relatively quick, this should - # not be a problem. - progress(0.5) - } - - # Apply the neural network. - data[, score := neuralnet::compute(nn, data)$net.result] - - if (!is.null(progress)) { - # See above. - progress(1.0) - } - - data[, .(gene, score)] + data[, .(gene, score)] + }) } diff --git a/R/proximity.R b/R/proximity.R index d3f7b53..bcbbfd4 100644 --- a/R/proximity.R +++ b/R/proximity.R @@ -3,23 +3,28 @@ # A score will be given to each gene such that 0.0 corresponds to the maximal # mean distance across all genes and 1.0 corresponds to a distance of 0. proximity <- function(preset, progress = NULL) { - # Prefilter distances by species and gene. - distances <- geposan::distances[ - species %chin% preset$species_ids & gene %chin% preset$gene_ids - ] + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids - # Compute the score as described above. + cached("proximity", c(species_ids, gene_ids), { + # Prefilter distances by species and gene. + distances <- geposan::distances[ + species %chin% preset$species_ids & gene %chin% preset$gene_ids + ] - distances <- distances[, .(mean_distance = mean(distance)), by = "gene"] - max_distance <- distances[, max(mean_distance)] - distances[, score := 1 - mean_distance / max_distance] + # Compute the score as described above. - if (!is.null(progress)) { - # We do everything in one go, so it's not possible to report detailed - # progress information. As the method is relatively quick, this should - # not be a problem. - progress(1.0) - } + distances <- distances[, .(mean_distance = mean(distance)), by = "gene"] + max_distance <- distances[, max(mean_distance)] + distances[, score := 1 - mean_distance / max_distance] - distances[, .(gene, score)] + if (!is.null(progress)) { + # We do everything in one go, so it's not possible to report + # detailed progress information. As the method is relatively quick, + # this should not be a problem. + progress(1.0) + } + + distances[, .(gene, score)] + }) } diff --git a/R/utils.R b/R/utils.R index 887a466..a1f0820 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,34 @@ +# Cache the value of an expression on the file system. +# +# The expression will be evaluated if there is no matching cache file found. +# The cache files will be located in a directory "cache" located in the current +# working directory. +# +# @param name Human readable part of the cache file name. +# @param objects A vector of objects that this expression depends on. The hash +# of those objects will be used for identifying the cache file. +cached <- function(name, objects, expr) { + if (!dir.exists("cache")) { + dir.create("cache") + } + + id <- rlang::hash(objects) + cache_file <- sprintf("cache/%s_%s.rda", name, id) + + if (file.exists(cache_file)) { + # If the cache file exists, we restore the data from it. + load(cache_file) + } else { + # If the cache file doesn't exist, we have to do the computation. + data <- expr + + # The results are cached for the next run. + save(data, file = cache_file, compress = "xz") + } + + data +} + # This is needed to make data.table's symbols available within the package. #' @import data.table NULL