mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 18:57:25 +01:00
adjacency: Make distance estimation customizable
This commit is contained in:
parent
ac9894e988
commit
2ceda0691b
4 changed files with 109 additions and 71 deletions
|
|
@ -12,6 +12,7 @@ export(analyze)
|
||||||
export(clustering)
|
export(clustering)
|
||||||
export(compare)
|
export(compare)
|
||||||
export(correlation)
|
export(correlation)
|
||||||
|
export(densest)
|
||||||
export(method)
|
export(method)
|
||||||
export(neural)
|
export(neural)
|
||||||
export(optimal_weights)
|
export(optimal_weights)
|
||||||
|
|
|
||||||
148
R/adjacency.R
148
R/adjacency.R
|
|
@ -1,13 +1,36 @@
|
||||||
|
#' Estimate the mode of a continuous variable.
#'
#' The input is treated as a sample of a continuous variable. A kernel density
#' estimate is computed using [stats::density()] with its default settings and
#' the position at which the estimated density is highest is returned.
#'
#' Special cases:
#'  - Empty input yields an empty numeric vector.
#'  - A single value is returned as is (coerced to numeric).
#'  - If the maximum density is reached at more than one position, the mean of
#'    those positions is returned.
#'
#' @param data The input data.
#'
#' @return The densest value of data.
#'
#' @export
densest <- function(data) {
  n <- length(data)

  # Nothing to estimate from.
  if (n == 0) {
    return(numeric(0))
  }

  # A density estimate needs at least two points; a lone value is its own
  # mode.
  if (n == 1) {
    return(as.numeric(data))
  }

  estimate <- stats::density(data)

  # All grid positions sharing the maximum density (usually exactly one).
  peaks <- estimate$x[estimate$y == max(estimate$y)]

  as.numeric(mean(peaks))
}
|
||||||
|
|
||||||
#' Score genes based on their proximity to the reference genes.
|
#' Score genes based on their proximity to the reference genes.
|
||||||
#'
|
#'
|
||||||
#' This method finds the distance value with the maximum density for each gene
|
#' @param estimate A function that will be used to summarize the distance
|
||||||
#' (i.e. the mode of its estimated distribution). Genes are scored by comparing
|
#' values for each gene. See [densest()] for the default implementation.
|
||||||
#' those distance values with the values of the reference genes.
|
|
||||||
#'
|
#'
|
||||||
#' @return An object of class `geposan_method`.
|
#' @return An object of class `geposan_method`.
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
adjacency <- function() {
|
adjacency <- function(estimate = densest) {
|
||||||
method(
|
method(
|
||||||
id = "adjacency",
|
id = "adjacency",
|
||||||
name = "Adjacency",
|
name = "Adjacency",
|
||||||
|
|
@ -17,73 +40,64 @@ adjacency <- function() {
|
||||||
gene_ids <- preset$gene_ids
|
gene_ids <- preset$gene_ids
|
||||||
reference_gene_ids <- preset$reference_gene_ids
|
reference_gene_ids <- preset$reference_gene_ids
|
||||||
|
|
||||||
cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), {
|
cached(
|
||||||
# Get the virtual distance value with the highest density.
|
"adjacency",
|
||||||
compute_densest_distance <- function(distances) {
|
c(species_ids, gene_ids, reference_gene_ids, estimate),
|
||||||
if (length(distances) <= 2) {
|
{ # nolint
|
||||||
mean(distances)
|
# Filter distances by species and gene and summarize each
|
||||||
} else {
|
# gene's distance values using the estimation function.
|
||||||
d <- stats::density(distances)
|
data <- geposan::distances[
|
||||||
d$x[which.max(d$y)]
|
species %chin% species_ids & gene %chin% gene_ids,
|
||||||
}
|
.(distance = estimate(distance)),
|
||||||
}
|
by = gene
|
||||||
|
|
||||||
# Filter distances by species and gene and find the distance
|
|
||||||
# with the highest density of values for each gene.
|
|
||||||
data <- geposan::distances[
|
|
||||||
species %chin% species_ids & gene %chin% gene_ids,
|
|
||||||
.(densest_distance = compute_densest_distance(distance)),
|
|
||||||
by = gene
|
|
||||||
]
|
|
||||||
|
|
||||||
# Compute the absolute value of the difference between the
|
|
||||||
# provided densest distance value in comparison to the mean of
|
|
||||||
# the densest distances of the comparison genes.
|
|
||||||
compute_difference <- function(densest_distance,
|
|
||||||
comparison_ids) {
|
|
||||||
# Get the mean of the densest distances of the reference
|
|
||||||
# genes.
|
|
||||||
mean_densest_distance <- data[
|
|
||||||
gene %chin% comparison_ids,
|
|
||||||
mean(densest_distance)
|
|
||||||
]
|
]
|
||||||
|
|
||||||
abs(densest_distance - mean_densest_distance)
|
# Compute the absolute value of the difference between the
|
||||||
}
|
# estimated distances of each gene to the reference genes.
|
||||||
|
compute_difference <- function(distance,
|
||||||
|
comparison_ids) {
|
||||||
|
reference_distance <- data[
|
||||||
|
gene %chin% comparison_ids,
|
||||||
|
mean(distance)
|
||||||
|
]
|
||||||
|
|
||||||
# Compute the differences to the reference genes.
|
abs(distance - reference_distance)
|
||||||
data[
|
}
|
||||||
!gene %chin% reference_gene_ids,
|
|
||||||
difference := compute_difference(
|
# Compute the differences to the reference genes.
|
||||||
densest_distance,
|
data[
|
||||||
reference_gene_ids
|
!gene %chin% reference_gene_ids,
|
||||||
|
difference := compute_difference(
|
||||||
|
distance,
|
||||||
|
reference_gene_ids
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
progress(0.5)
|
||||||
|
|
||||||
|
# Exclude the reference gene itself when computing its
|
||||||
|
# difference.
|
||||||
|
data[
|
||||||
|
gene %chin% reference_gene_ids,
|
||||||
|
difference := compute_difference(
|
||||||
|
distance,
|
||||||
|
reference_gene_ids[reference_gene_ids != gene]
|
||||||
|
),
|
||||||
|
by = gene
|
||||||
|
]
|
||||||
|
|
||||||
|
# Compute the final score by normalizing the difference.
|
||||||
|
data[, score := 1 - difference / max(difference)]
|
||||||
|
|
||||||
|
progress(1.0)
|
||||||
|
|
||||||
|
result(
|
||||||
|
method = "adjacency",
|
||||||
|
scores = data[, .(gene, score)],
|
||||||
|
details = list(data = data)
|
||||||
)
|
)
|
||||||
]
|
}
|
||||||
|
)
|
||||||
progress(0.5)
|
|
||||||
|
|
||||||
# Exclude the reference gene itself when computing its
|
|
||||||
# difference.
|
|
||||||
data[
|
|
||||||
gene %chin% reference_gene_ids,
|
|
||||||
difference := compute_difference(
|
|
||||||
densest_distance,
|
|
||||||
reference_gene_ids[reference_gene_ids != gene]
|
|
||||||
),
|
|
||||||
by = gene
|
|
||||||
]
|
|
||||||
|
|
||||||
# Compute the final score by normalizing the difference.
|
|
||||||
data[, score := 1 - difference / max(difference)]
|
|
||||||
|
|
||||||
progress(1.0)
|
|
||||||
|
|
||||||
result(
|
|
||||||
method = "adjacency",
|
|
||||||
scores = data[, .(gene, score)],
|
|
||||||
details = list(data = data)
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,13 +4,15 @@
|
||||||
\alias{adjacency}
|
\alias{adjacency}
|
||||||
\title{Score genes based on their proximity to the reference genes.}
|
\title{Score genes based on their proximity to the reference genes.}
|
||||||
\usage{
|
\usage{
|
||||||
adjacency()
|
adjacency(estimate = densest)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{estimate}{A function that will be used to summarize the distance
|
||||||
|
values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
|
||||||
}
|
}
|
||||||
\value{
|
\value{
|
||||||
An object of class \code{geposan_method}.
|
An object of class \code{geposan_method}.
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
This method finds the distance value with the maximum density for each gene
|
Score genes based on their proximity to the reference genes.
|
||||||
(i.e. the mode of its estimated distribution). Genes are scored by comparing
|
|
||||||
those distance values with the values of the reference genes.
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
21
man/densest.Rd
Normal file
21
man/densest.Rd
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/adjacency.R
|
||||||
|
\name{densest}
|
||||||
|
\alias{densest}
|
||||||
|
\title{Find the densest value in the data.}
|
||||||
|
\usage{
|
||||||
|
densest(data)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{The input data.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
The densest value of data.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
This function assumes that data represents a continuous variable and finds
|
||||||
|
a single value with the highest estimated density. This can be used to
|
||||||
|
estimate the mode of the data. If there is only one value that value is
|
||||||
|
returned. If multiple density maxima with the same density exist, their mean
|
||||||
|
is returned.
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue