From 53f955f3dac4a81988c499135f17a975fa03abd4 Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Mon, 17 Jan 2022 20:11:07 +0100 Subject: [PATCH] Add species adjacency method --- NAMESPACE | 1 + R/adjacency.R | 23 +++--- R/method.R | 3 +- R/species_adjacency.R | 148 +++++++++++++++++++++++++++++++++++++++ man/adjacency.Rd | 16 +++-- man/species_adjacency.Rd | 25 +++++++ 6 files changed, 202 insertions(+), 14 deletions(-) create mode 100644 R/species_adjacency.R create mode 100644 man/species_adjacency.Rd diff --git a/NAMESPACE b/NAMESPACE index 685d468..22449fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,5 +24,6 @@ export(plot_scores) export(preset) export(ranking) export(result) +export(species_adjacency) export(validate) import(data.table) diff --git a/R/adjacency.R b/R/adjacency.R index 6d2a1e6..aceffa8 100644 --- a/R/adjacency.R +++ b/R/adjacency.R @@ -24,16 +24,23 @@ densest <- function(data) { #' Score genes based on their proximity to the reference genes. #' -#' @param estimate A function that will be used to summarize the distance -#' values for each gene. See [densest()] for the default implementation. -#' @param combination A function that will be used to combine the different +#' In this case, the distance data that is available for one gene is first +#' combined. The resulting value is compared to the reference genes and +#' determines the gene's score in relation to other genes. +#' +#' @param distance_estimate A function that will be used to summarize the +#' distance values for each gene. See [densest()] for the default +#' implementation. +#' @param summarize A function that will be used to combine the different #' distances to the reference genes. By default [min()] is used. That means #' the distance to the nearest reference gene will be scored. #' #' @return An object of class `geposan_method`. 
#'
+#' @seealso [species_adjacency()]
+#'
 #' @export
-adjacency <- function(estimate = densest, combination = min) {
+adjacency <- function(distance_estimate = densest, summarize = min) {
     method(
         id = "adjacency",
         name = "Adjacency",
@@ -49,15 +56,15 @@ adjacency <- function(estimate = densest, combination = min) {
                     species_ids,
                     gene_ids,
                     reference_gene_ids,
-                    estimate,
-                    combination
+                    distance_estimate,
+                    summarize
                 ),
                 { # nolint
                     # Filter distances by species and gene and summarize each
                     # gene's distance values using the estimation function.
                     data <- geposan::distances[
                         species %chin% species_ids & gene %chin% gene_ids,
-                        .(distance = as.numeric(estimate(distance))),
+                        .(distance = as.numeric(distance_estimate(distance))),
                         by = gene
                     ]
 
@@ -70,7 +77,7 @@ adjacency <- function(estimate = densest, combination = min) {
                             .(difference = abs(distance_value - distance))
                         ]
 
-                        combination(differences$difference)
+                        summarize(differences$difference)
                     }
 
                     # Compute the differences to the reference genes.
diff --git a/R/method.R b/R/method.R
index c0be5ad..d1a0a84 100644
--- a/R/method.R
+++ b/R/method.R
@@ -36,7 +36,8 @@ all_methods <- function() {
         clustering(),
         correlation(),
         neural(),
-        adjacency()
+        adjacency(),
+        species_adjacency()
     )
 }
 
diff --git a/R/species_adjacency.R b/R/species_adjacency.R
new file mode 100644
index 0000000..587ffa6
--- /dev/null
+++ b/R/species_adjacency.R
@@ -0,0 +1,175 @@
+#' Score genes based on their adjacency to the reference genes within species.
+#'
+#' For each gene and species, the method will first combine the gene's distances
+#' to the reference genes within that species. Afterwards, the results are
+#' summarized across species and determine the gene's score.
+#'
+#' @param distance_estimate Function for combining the distance differences
+#'   within one species.
+#' @param summarize Function for summarizing the distance values across species.
+#'
+#' @return An object of class `geposan_method`.
+#'
+#' @seealso [adjacency()]
+#'
+#' @export
+species_adjacency <- function(distance_estimate = min,
+                              summarize = stats::median) {
+    # Apply `fun` to the finite entries of `values` and return one number.
+    # `NA` is returned if there is nothing to combine or if the result is not
+    # finite. Otherwise, e.g. `min()` of an empty vector would yield `Inf` and
+    # distort both the summary across species and the normalization.
+    finite_summary <- function(values, fun) {
+        values <- values[is.finite(values)]
+
+        if (length(values) == 0) {
+            return(NA_real_)
+        }
+
+        value <- as.numeric(fun(values))
+
+        if (length(value) != 1) {
+            stop(
+                "`distance_estimate` and `summarize` must return one value",
+                call. = FALSE
+            )
+        }
+
+        if (is.finite(value)) value else NA_real_
+    }
+
+    method(
+        id = "species_adjacency",
+        name = "Species adj.",
+        description = "Species adjacency",
+        function(preset, progress) {
+            species_ids <- preset$species_ids
+            gene_ids <- preset$gene_ids
+            reference_gene_ids <- preset$reference_gene_ids
+
+            cached(
+                "species_adjacency",
+                c(
+                    species_ids,
+                    gene_ids,
+                    reference_gene_ids,
+                    distance_estimate,
+                    summarize
+                ),
+                { # nolint
+                    # Prefilter distances. All later filters only need to
+                    # select the species, as the genes are already filtered.
+                    data <- geposan::distances[
+                        species %chin% species_ids & gene %chin% gene_ids
+                    ]
+
+                    progress_state <- 0.0
+                    progress_step <- 0.9 / length(species_ids)
+
+                    # Iterate through all species and find the distance
+                    # estimates within that species.
+                    for (species_id in species_ids) {
+                        # For all genes, compute the distance to one reference
+                        # gene at a time in one go.
+                        for (reference_gene_id in reference_gene_ids) {
+                            comparison_distance <- data[
+                                species == species_id &
+                                    gene == reference_gene_id,
+                                distance
+                            ]
+
+                            if (length(comparison_distance) != 1) {
+                                # If we don't have a comparison distance, we
+                                # can't compute a difference. This happens if
+                                # the species doesn't have the reference gene.
+                                data[
+                                    species == species_id,
+                                    (reference_gene_id) := NA_real_
+                                ]
+                            } else {
+                                data[
+                                    species == species_id,
+                                    (reference_gene_id) := {
+                                        difference <- as.numeric(
+                                            abs(distance - comparison_distance)
+                                        )
+
+                                        # The distance of a reference gene to
+                                        # itself must not count.
+                                        difference[
+                                            gene == reference_gene_id
+                                        ] <- NA_real_
+
+                                        difference
+                                    }
+                                ]
+                            }
+                        }
+
+                        # Combine the distances to the different reference genes
+                        # into one value using the provided function.
+                        data[
+                            species == species_id,
+                            combined_distance := finite_summary(
+                                # Convert the data.table subset into a vector.
+                                as.matrix(.SD)[1, ],
+                                distance_estimate
+                            ),
+                            .SDcols = reference_gene_ids,
+                            by = gene
+                        ]
+
+                        progress_state <- progress_state + progress_step
+                        progress(progress_state)
+                    }
+
+                    progress(0.9)
+
+                    # Summarize the combined distances across species.
+                    results <- data[
+                        ,
+                        .(
+                            summarized_distances = finite_summary(
+                                combined_distance,
+                                summarize
+                            )
+                        ),
+                        by = gene
+                    ]
+
+                    # Genes without any usable distance get the lowest score.
+                    # NOTE(review): confirm that this is the desired handling.
+                    results[, score := 0.0]
+
+                    max_distance <- finite_summary(
+                        results$summarized_distances,
+                        max
+                    )
+
+                    # Compute the final score by normalizing the distances.
+                    if (is.na(max_distance) || max_distance == 0) {
+                        # Nothing to scale: There are either no known
+                        # distances or all of them are zero.
+                        results[!is.na(summarized_distances), score := 1.0]
+                    } else {
+                        results[
+                            !is.na(summarized_distances),
+                            score := 1 - summarized_distances / max_distance
+                        ]
+                    }
+
+                    progress(1.0)
+
+                    result(
+                        method = "species_adjacency",
+                        scores = results[, .(gene, score)],
+                        details = list(
+                            data = data,
+                            results = results
+                        )
+                    )
+                }
+            )
+        }
+    )
+}
diff --git a/man/adjacency.Rd b/man/adjacency.Rd
index f68d759..4eac7b9 100644
--- a/man/adjacency.Rd
+++ b/man/adjacency.Rd
@@ -4,13 +4,14 @@
 \alias{adjacency}
 \title{Score genes based on their proximity to the reference genes.}
 \usage{
-adjacency(estimate = densest, combination = min)
+adjacency(distance_estimate = densest, summarize = min)
 }
 \arguments{
-\item{estimate}{A function that will be used to summarize the distance
-values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
+\item{distance_estimate}{A function that will be used to summarize the
+distance values for each gene.
See \code{\link[=densest]{densest()}} for the default +implementation.} -\item{combination}{A function that will be used to combine the different +\item{summarize}{A function that will be used to combine the different distances to the reference genes. By default \code{\link[=min]{min()}} is used. That means the distance to the nearest reference gene will be scored.} } @@ -18,5 +19,10 @@ the distance to the nearest reference gene will be scored.} An object of class \code{geposan_method}. } \description{ -Score genes based on their proximity to the reference genes. +In this case, the distance data that is available for one gene is first +combined. The resulting value is compared to the reference genes and +determines the gene's score in relation to other genes. +} +\seealso{ +\code{\link[=species_adjacency]{species_adjacency()}} } diff --git a/man/species_adjacency.Rd b/man/species_adjacency.Rd new file mode 100644 index 0000000..0607559 --- /dev/null +++ b/man/species_adjacency.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/species_adjacency.R +\name{species_adjacency} +\alias{species_adjacency} +\title{Score genes based on their adjacency to the reference genes within species.} +\usage{ +species_adjacency(distance_estimate = min, summarize = stats::median) +} +\arguments{ +\item{distance_estimate}{Function for combining the distance differences +within one species.} + +\item{summarize}{Function for summarizing the distance values across species.} +} +\value{ +An object of class \code{geposan_method}. +} +\description{ +For each gene and species, the method will first combine the gene's distances +to the reference genes within that species. Afterwards, the results are +summarized across species and determine the gene's score. +} +\seealso{ +\code{\link[=adjacency]{adjacency()}} +}