Add species adjacency method

This commit is contained in:
Elias Projahn 2022-01-17 20:11:07 +01:00
parent c8f1e522f9
commit 53f955f3da
6 changed files with 202 additions and 14 deletions

View file

@ -24,5 +24,6 @@ export(plot_scores)
export(preset) export(preset)
export(ranking) export(ranking)
export(result) export(result)
export(species_adjacency)
export(validate) export(validate)
import(data.table) import(data.table)

View file

@ -24,16 +24,23 @@ densest <- function(data) {
#' Score genes based on their proximity to the reference genes. #' Score genes based on their proximity to the reference genes.
#' #'
#' @param estimate A function that will be used to summarize the distance #' In this case, the distance data that is available for one gene is first
#' values for each gene. See [densest()] for the default implementation. #' combined. The resulting value is compared to the reference genes and
#' @param combination A function that will be used to combine the different #' determines the gene's score in relation to other genes.
#'
#' @param distance_estimate A function that will be used to summarize the
#' distance values for each gene. See [densest()] for the default
#' implementation.
#' @param summarize A function that will be used to combine the different
#' distances to the reference genes. By default [min()] is used. That means #' distances to the reference genes. By default [min()] is used. That means
#' the distance to the nearest reference gene will be scored. #' the distance to the nearest reference gene will be scored.
#' #'
#' @return An object of class `geposan_method`. #' @return An object of class `geposan_method`.
#' #'
#' @seealso [species_adjacency()]
#'
#' @export #' @export
adjacency <- function(estimate = densest, combination = min) { adjacency <- function(distance_estimate = densest, summarize = min) {
method( method(
id = "adjacency", id = "adjacency",
name = "Adjacency", name = "Adjacency",
@ -49,15 +56,15 @@ adjacency <- function(estimate = densest, combination = min) {
species_ids, species_ids,
gene_ids, gene_ids,
reference_gene_ids, reference_gene_ids,
estimate, distance_estimate,
combination summarize
), ),
{ # nolint { # nolint
# Filter distances by species and gene and summarize each # Filter distances by species and gene and summarize each
# gene's distance values using the estimation function. # gene's distance values using the estimation function.
data <- geposan::distances[ data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids, species %chin% species_ids & gene %chin% gene_ids,
.(distance = as.numeric(estimate(distance))), .(distance = as.numeric(distance_estimate(distance))),
by = gene by = gene
] ]
@ -70,7 +77,7 @@ adjacency <- function(estimate = densest, combination = min) {
.(difference = abs(distance_value - distance)) .(difference = abs(distance_value - distance))
] ]
combination(differences$difference) summarize(differences$difference)
} }
# Compute the differences to the reference genes. # Compute the differences to the reference genes.

View file

@ -36,7 +36,8 @@ all_methods <- function() {
clustering(), clustering(),
correlation(), correlation(),
neural(), neural(),
adjacency() adjacency(),
species_adjacency()
) )
} }

148
R/species_adjacency.R Normal file
View file

@ -0,0 +1,148 @@
#' Score genes based on their adjacency to the reference genes within species.
#'
#' For each gene and species, the method first computes the absolute
#' differences between the gene's distance value and the distance values of
#' the reference genes within that species. These differences are combined
#' into one value per gene and species using `distance_estimate`. Afterwards,
#' the combined values are summarized across species using `summarize`. The
#' score of a gene is one minus its summarized value divided by the largest
#' summarized value of all genes.
#'
#' The difference between a reference gene and itself is not taken into
#' account. If there is no usable difference at all (for example, because a
#' species does not contain any of the reference genes), the affected value is
#' treated as missing instead of being passed to the provided functions. Genes
#' without any usable value receive a score of `NA`.
#'
#' @param distance_estimate Function for combining the distance differences
#'   within one species. It is only called with vectors that contain at least
#'   one non-missing value.
#' @param summarize Function for summarizing the distance values across
#'   species. It is only called with vectors that contain at least one
#'   non-missing value.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [adjacency()]
#'
#' @export
species_adjacency <- function(distance_estimate = min,
                              summarize = stats::median) {
    # Apply `fn` to the non-missing entries of `values`. If nothing is left,
    # return NA instead of calling `fn`: `min(numeric(0))` is `Inf`, which
    # would otherwise end up in the normalization below and distort all
    # scores.
    apply_if_any <- function(fn, values) {
        values <- stats::na.omit(values)

        if (length(values) == 0) {
            NA_real_
        } else {
            as.numeric(fn(values))
        }
    }

    method(
        id = "species_adjacency",
        name = "Species adj.",
        description = "Species adjacency",
        function(preset, progress) {
            species_ids <- preset$species_ids
            gene_ids <- preset$gene_ids
            reference_gene_ids <- preset$reference_gene_ids

            cached(
                "species_adjacency",
                c(
                    species_ids,
                    gene_ids,
                    reference_gene_ids,
                    distance_estimate,
                    summarize
                ),
                { # nolint
                    # Prefilter distances.
                    data <- geposan::distances[
                        species %chin% species_ids & gene %chin% gene_ids
                    ]

                    progress_state <- 0.0
                    progress_step <- 0.9 / length(species_ids)

                    # Iterate through all species and find the distance
                    # estimates within that species.
                    for (species_id in species_ids) {
                        # For all genes, compute the distance to one reference
                        # gene at a time in one go. The result is stored in a
                        # column named after the reference gene.
                        for (reference_gene_id in reference_gene_ids) {
                            comparison_distance <- data[
                                species == species_id &
                                    gene == reference_gene_id,
                                distance
                            ]

                            if (length(comparison_distance) != 1) {
                                # If we don't have a comparison distance, we
                                # can't compute a difference. This happens if
                                # the species doesn't have the reference gene.
                                data[
                                    species == species_id &
                                        gene %chin% gene_ids,
                                    (reference_gene_id) := NA_integer_
                                ]
                            } else {
                                data[
                                    species == species_id &
                                        gene %chin% gene_ids,
                                    (reference_gene_id) :=
                                        abs(distance - comparison_distance)
                                ]
                            }
                        }

                        # Combine the distances to the different reference genes
                        # into one value using the provided function.
                        data[
                            species == species_id &
                                gene %chin% gene_ids,
                            combined_distance := apply_if_any(
                                distance_estimate,
                                # Convert the data.table subset into a
                                # vector to get the correct na.omit
                                # behavior.
                                as.matrix(.SD)[1, ]
                            ),
                            .SDcols = reference_gene_ids,
                            by = gene
                        ]

                        progress_state <- progress_state + progress_step
                        progress(progress_state)
                    }

                    progress(0.9)

                    # Remove the distances between the reference genes and
                    # themselves. They are always zero and would otherwise
                    # dominate the combined value of each reference gene.
                    for (reference_gene_id in reference_gene_ids) {
                        data[
                            gene == reference_gene_id,
                            (reference_gene_id) := NA
                        ]
                    }

                    # Recompute the combined distance for the reference genes.
                    data[
                        gene %chin% reference_gene_ids,
                        combined_distance := apply_if_any(
                            distance_estimate,
                            as.matrix(.SD)[1, ]
                        ),
                        .SDcols = reference_gene_ids,
                        by = list(species, gene)
                    ]

                    # Combine the distances into one value per gene.
                    results <- data[,
                        .(
                            summarized_distances = apply_if_any(
                                summarize,
                                combined_distance
                            )
                        ),
                        by = gene
                    ]

                    # Compute the final score by normalizing the difference.
                    # Missing values are ignored when determining the maximum
                    # so that a single gene without data doesn't turn every
                    # score into NA.
                    results[
                        ,
                        score := 1 - summarized_distances /
                            max(summarized_distances, na.rm = TRUE)
                    ]

                    progress(1.0)

                    result(
                        method = "species_adjacency",
                        scores = results[, .(gene, score)],
                        details = list(
                            data = data,
                            results = results
                        )
                    )
                }
            )
        }
    )
}

View file

@ -4,13 +4,14 @@
\alias{adjacency} \alias{adjacency}
\title{Score genes based on their proximity to the reference genes.} \title{Score genes based on their proximity to the reference genes.}
\usage{ \usage{
adjacency(estimate = densest, combination = min) adjacency(distance_estimate = densest, summarize = min)
} }
\arguments{ \arguments{
\item{estimate}{A function that will be used to summarize the distance \item{distance_estimate}{A function that will be used to summarize the
values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.} distance values for each gene. See \code{\link[=densest]{densest()}} for the default
implementation.}
\item{combination}{A function that will be used to combine the different \item{summarize}{A function that will be used to combine the different
distances to the reference genes. By default \code{\link[=min]{min()}} is used. That means distances to the reference genes. By default \code{\link[=min]{min()}} is used. That means
the distance to the nearest reference gene will be scored.} the distance to the nearest reference gene will be scored.}
} }
@ -18,5 +19,10 @@ the distance to the nearest reference gene will be scored.}
An object of class \code{geposan_method}. An object of class \code{geposan_method}.
} }
\description{ \description{
Score genes based on their proximity to the reference genes. In this case, the distance data that is available for one gene is first
combined. The resulting value is compared to the reference genes and
determines the gene's score in relation to other genes.
}
\seealso{
\code{\link[=species_adjacency]{species_adjacency()}}
} }

25
man/species_adjacency.Rd Normal file
View file

@ -0,0 +1,25 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/species_adjacency.R
\name{species_adjacency}
\alias{species_adjacency}
\title{Score genes based on their adjacency to the reference genes within species.}
\usage{
species_adjacency(distance_estimate = min, summarize = stats::median)
}
\arguments{
\item{distance_estimate}{Function for combining the distance differences
within one species.}
\item{summarize}{Function for summarizing the distance values across species.}
}
\value{
An object of class \code{geposan_method}.
}
\description{
For each gene and species, the method will first combine the gene's distances
to the reference genes within that species. Afterwards, the results are
summarized across species and determine the gene's score.
}
\seealso{
\code{\link[=adjacency]{adjacency()}}
}