Add species adjacency method

This commit is contained in:
Elias Projahn 2022-01-17 20:11:07 +01:00
parent c8f1e522f9
commit 53f955f3da
6 changed files with 202 additions and 14 deletions

View file

@ -24,16 +24,23 @@ densest <- function(data) {
#' Score genes based on their proximity to the reference genes.
#'
#' @param estimate A function that will be used to summarize the distance
#' values for each gene. See [densest()] for the default implementation.
#' @param combination A function that will be used to combine the different
#' In this case, the distance data that is available for one gene is first
#' combined. The resulting value is compared to the reference genes and
#' determines the gene's score in relation to other genes.
#'
#' @param distance_estimate A function that will be used to summarize the
#' distance values for each gene. See [densest()] for the default
#' implementation.
#' @param summarize A function that will be used to combine the different
#' distances to the reference genes. By default [min()] is used. That means
#' the distance to the nearest reference gene will be scored.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [species_adjacency()]
#'
#' @export
adjacency <- function(estimate = densest, combination = min) {
adjacency <- function(distance_estimate = densest, summarize = min) {
method(
id = "adjacency",
name = "Adjacency",
@ -49,15 +56,15 @@ adjacency <- function(estimate = densest, combination = min) {
species_ids,
gene_ids,
reference_gene_ids,
estimate,
combination
distance_estimate,
summarize
),
{ # nolint
# Filter distances by species and gene and summarize each
# gene's distance values using the estimation function.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids,
.(distance = as.numeric(estimate(distance))),
.(distance = as.numeric(distance_estimate(distance))),
by = gene
]
@ -70,7 +77,7 @@ adjacency <- function(estimate = densest, combination = min) {
.(difference = abs(distance_value - distance))
]
combination(differences$difference)
summarize(differences$difference)
}
# Compute the differences to the reference genes.

View file

@ -36,7 +36,8 @@ all_methods <- function() {
clustering(),
correlation(),
neural(),
adjacency()
adjacency(),
species_adjacency()
)
}

148
R/species_adjacency.R Normal file
View file

@ -0,0 +1,148 @@
#' Score genes based on their adjacency to the reference genes within species.
#'
#' For each gene and species, the method first computes the absolute
#' differences between the gene's distance value and the distance values of the
#' reference genes within that species and combines them into one value.
#' Afterwards, the combined values are summarized across species. The result is
#' divided by the largest summarized value among all genes and subtracted from
#' one to obtain the gene's score.
#'
#' Differences that can't be computed, because the species doesn't have exactly
#' one distance value for a reference gene, are ignored. The difference between
#' a reference gene and itself is ignored as well. If no difference remains for
#' a gene within a species, that species doesn't contribute to the gene's
#' score. A gene without any usable species gets the score `NA`.
#'
#' @param distance_estimate Function for combining the distance differences
#'   within one species. It receives a vector without missing values that
#'   contains at least one element.
#' @param summarize Function for summarizing the combined values across
#'   species. It receives a vector without missing values that contains at
#'   least one element.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [adjacency()]
#'
#' @export
species_adjacency <- function(distance_estimate = min,
                              summarize = stats::median) {
    # Apply `fun` to the non-missing entries of `values`. If there are none,
    # the result is `NA` instead of whatever `fun` returns for empty input
    # (`min()` would return `Inf` and emit a warning, which would then be
    # mistaken for a real distance in the following steps).
    apply_to_available <- function(fun, values) {
        available <- stats::na.omit(values)

        if (length(available) == 0) {
            NA_real_
        } else {
            as.numeric(fun(available))
        }
    }

    method(
        id = "species_adjacency",
        name = "Species adj.",
        description = "Species adjacency",
        function(preset, progress) {
            species_ids <- preset$species_ids
            gene_ids <- preset$gene_ids
            reference_gene_ids <- preset$reference_gene_ids

            cached(
                "species_adjacency",
                c(
                    species_ids,
                    gene_ids,
                    reference_gene_ids,
                    distance_estimate,
                    summarize
                ),
                { # nolint
                    # Prefilter distances. Subsetting yields a new table, so
                    # the assignments by reference below operate on this
                    # subset. From here on, `data` only contains the requested
                    # species and genes and doesn't have to be filtered by
                    # `gene_ids` again.
                    data <- geposan::distances[
                        species %chin% species_ids & gene %chin% gene_ids
                    ]

                    progress_state <- 0.0
                    progress_step <- 0.9 / length(species_ids)

                    # Iterate through all species and find the distance
                    # estimates within that species.
                    for (species_id in species_ids) {
                        # For all genes, compute the difference to one
                        # reference gene at a time in one go. The result is
                        # stored in a column named after the reference gene.
                        for (reference_gene_id in reference_gene_ids) {
                            comparison_distance <- data[
                                species == species_id &
                                    gene == reference_gene_id,
                                distance
                            ]

                            if (length(comparison_distance) != 1) {
                                # If we don't have a comparison distance, we
                                # can't compute a difference. This happens, if
                                # the species doesn't have the reference gene.
                                data[
                                    species == species_id,
                                    (reference_gene_id) := NA_integer_
                                ]
                            } else {
                                data[
                                    species == species_id,
                                    (reference_gene_id) :=
                                        abs(distance - comparison_distance)
                                ]
                            }
                        }

                        # Combine the differences to the different reference
                        # genes into one value using the provided function.
                        data[
                            species == species_id,
                            combined_distance := apply_to_available(
                                distance_estimate,
                                # Convert the data.table subset into a vector
                                # so that missing values are removed per
                                # element and not per row.
                                as.matrix(.SD)[1, ]
                            ),
                            .SDcols = reference_gene_ids,
                            by = gene
                        ]

                        progress_state <- progress_state + progress_step
                        progress(progress_state)
                    }

                    progress(0.9)

                    # Remove the difference of each reference gene to itself.
                    # It is always zero and would otherwise dominate the
                    # combined value of the reference genes.
                    for (reference_gene_id in reference_gene_ids) {
                        data[
                            gene == reference_gene_id,
                            (reference_gene_id) := NA
                        ]
                    }

                    # Recompute the combined distance for the reference genes.
                    data[
                        gene %chin% reference_gene_ids,
                        combined_distance := apply_to_available(
                            distance_estimate,
                            as.matrix(.SD)[1, ]
                        ),
                        .SDcols = reference_gene_ids,
                        by = list(species, gene)
                    ]

                    # Summarize the combined distances across species into one
                    # value per gene.
                    results <- data[,
                        .(
                            summarized_distances = apply_to_available(
                                summarize,
                                combined_distance
                            )
                        ),
                        by = gene
                    ]

                    # Compute the final score by normalizing the summarized
                    # distances. Only finite values are considered for the
                    # maximum, so that genes without data don't invalidate the
                    # scores of all other genes.
                    usable <- is.finite(results$summarized_distances)

                    max_distance <- if (any(usable)) {
                        max(results$summarized_distances[usable])
                    } else {
                        NA_real_
                    }

                    results[
                        ,
                        score := 1 - summarized_distances / max_distance
                    ]

                    progress(1.0)

                    result(
                        method = "species_adjacency",
                        scores = results[, .(gene, score)],
                        details = list(
                            data = data,
                            results = results
                        )
                    )
                }
            )
        }
    )
}