adjacency: Make distance estimation customizable

This commit is contained in:
Elias Projahn 2022-01-09 20:21:27 +01:00
parent ac9894e988
commit 2ceda0691b
4 changed files with 109 additions and 71 deletions

View file

@ -12,6 +12,7 @@ export(analyze)
export(clustering) export(clustering)
export(compare) export(compare)
export(correlation) export(correlation)
export(densest)
export(method) export(method)
export(neural) export(neural)
export(optimal_weights) export(optimal_weights)

View file

@ -1,13 +1,36 @@
#' Find the densest value in the data.
#'
#' This function assumes that data represents a continuous variable and finds
#' a single value with the highest estimated density. This can be used to
#' estimate the mode of the data. If there is only one value that value is
#' returned. If multiple density maxima with the same density exist, their mean
#' is returned.
#'
#' @param data The input data.
#'
#' @return The densest value of data.
#'
#' @export
densest <- function(data) {
    n_values <- length(data)

    # Without any observations there is nothing to estimate; the result is a
    # zero-length numeric vector.
    if (n_values == 0) {
        return(numeric(0))
    }

    # A single observation is returned as is (coerced to a plain numeric).
    if (n_values == 1) {
        return(as.numeric(data))
    }

    # Estimate the density on a grid and locate the grid points where the
    # estimate reaches its maximum. Several grid points may share the maximum,
    # in which case their positions are averaged.
    estimate <- stats::density(data)
    at_peak <- estimate$y == max(estimate$y)

    as.numeric(mean(estimate$x[at_peak]))
}
#' Score genes based on their proximity to the reference genes. #' Score genes based on their proximity to the reference genes.
#' #'
#' This method finds the distance value with the maximum density for each gene #' @param estimate A function that will be used to summarize the distance
#' (i.e. the mode of its estimated distribution). Genes are scored by comparing #' values for each gene. See [densest()] for the default implementation.
#' those distance values with the values of the reference genes.
#' #'
#' @return An object of class `geposan_method`. #' @return An object of class `geposan_method`.
#' #'
#' @export #' @export
adjacency <- function() { adjacency <- function(estimate = densest) {
method( method(
id = "adjacency", id = "adjacency",
name = "Adjacency", name = "Adjacency",
@ -17,73 +40,64 @@ adjacency <- function() {
gene_ids <- preset$gene_ids gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids reference_gene_ids <- preset$reference_gene_ids
cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), { cached(
# Get the virtual distance value with the highest density. "adjacency",
compute_densest_distance <- function(distances) { c(species_ids, gene_ids, reference_gene_ids, estimate),
if (length(distances) <= 2) { { # nolint
mean(distances) # Filter distances by species and gene and summarize each
} else { # gene's distance values using the estimation function.
d <- stats::density(distances) data <- geposan::distances[
d$x[which.max(d$y)] species %chin% species_ids & gene %chin% gene_ids,
} .(distance = estimate(distance)),
} by = gene
# Filter distances by species and gene and find the distance
# with the highest density of values for each gene.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids,
.(densest_distance = compute_densest_distance(distance)),
by = gene
]
# Compute the absolute value of the difference between the
# provided densest distance value in comparison to the mean of
# the densest distances of the comparison genes.
compute_difference <- function(densest_distance,
comparison_ids) {
# Get the mean of the densest distances of the reference
# genes.
mean_densest_distance <- data[
gene %chin% comparison_ids,
mean(densest_distance)
] ]
abs(densest_distance - mean_densest_distance) # Compute the absolute value of the difference between the
} # estimated distances of each gene to the reference genes.
compute_difference <- function(distance,
comparison_ids) {
reference_distance <- data[
gene %chin% comparison_ids,
mean(distance)
]
# Compute the differences to the reference genes. abs(distance - reference_distance)
data[ }
!gene %chin% reference_gene_ids,
difference := compute_difference( # Compute the differences to the reference genes.
densest_distance, data[
reference_gene_ids !gene %chin% reference_gene_ids,
difference := compute_difference(
distance,
reference_gene_ids
)
]
progress(0.5)
# Exclude the reference gene itself when computing its
# difference.
data[
gene %chin% reference_gene_ids,
difference := compute_difference(
distance,
reference_gene_ids[reference_gene_ids != gene]
),
by = gene
]
# Compute the final score by normalizing the difference.
data[, score := 1 - difference / max(difference)]
progress(1.0)
result(
method = "adjacency",
scores = data[, .(gene, score)],
details = list(data = data)
) )
] }
)
progress(0.5)
# Exclude the reference gene itself when computing its
# difference.
data[
gene %chin% reference_gene_ids,
difference := compute_difference(
densest_distance,
reference_gene_ids[reference_gene_ids != gene]
),
by = gene
]
# Compute the final score by normalizing the difference.
data[, score := 1 - difference / max(difference)]
progress(1.0)
result(
method = "adjacency",
scores = data[, .(gene, score)],
details = list(data = data)
)
})
} }
) )
} }

View file

@ -4,13 +4,15 @@
\alias{adjacency} \alias{adjacency}
\title{Score genes based on their proximity to the reference genes.} \title{Score genes based on their proximity to the reference genes.}
\usage{ \usage{
adjacency() adjacency(estimate = densest)
}
\arguments{
\item{estimate}{A function that will be used to summarize the distance
values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
} }
\value{ \value{
An object of class \code{geposan_method}. An object of class \code{geposan_method}.
} }
\description{ \description{
This method finds the distance value with the maximum density for each gene Score genes based on their proximity to the reference genes.
(i.e. the mode of its estimated distribution). Genes are scored by comparing
those distance values with the values of the reference genes.
} }

21
man/densest.Rd Normal file
View file

@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/adjacency.R
\name{densest}
\alias{densest}
\title{Find the densest value in the data.}
\usage{
densest(data)
}
\arguments{
\item{data}{The input data.}
}
\value{
The densest value of data.
}
\description{
This function assumes that data represents a continuous variable and finds
a single value with the highest estimated density. This can be used to
estimate the mode of the data. If there is only one value that value is
returned. If multiple density maxima with the same density exist, their mean
is returned.
}