mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 10:47:25 +01:00
Add species adjacency method
This commit is contained in:
parent
c8f1e522f9
commit
53f955f3da
6 changed files with 202 additions and 14 deletions
|
|
@ -24,5 +24,6 @@ export(plot_scores)
|
||||||
export(preset)
|
export(preset)
|
||||||
export(ranking)
|
export(ranking)
|
||||||
export(result)
|
export(result)
|
||||||
|
export(species_adjacency)
|
||||||
export(validate)
|
export(validate)
|
||||||
import(data.table)
|
import(data.table)
|
||||||
|
|
|
||||||
|
|
@ -24,16 +24,23 @@ densest <- function(data) {
|
||||||
|
|
||||||
#' Score genes based on their proximity to the reference genes.
|
#' Score genes based on their proximity to the reference genes.
|
||||||
#'
|
#'
|
||||||
#' @param estimate A function that will be used to summarize the distance
|
#' In this case, the distance data that is available for one gene is first
|
||||||
#' values for each gene. See [densest()] for the default implementation.
|
#' combined. The resulting value is compared to the reference genes and
|
||||||
#' @param combination A function that will be used to combine the different
|
#' determines the gene's score in relation to other genes.
|
||||||
|
#'
|
||||||
|
#' @param distance_estimate A function that will be used to summarize the
|
||||||
|
#' distance values for each gene. See [densest()] for the default
|
||||||
|
#' implementation.
|
||||||
|
#' @param summarize A function that will be used to combine the different
|
||||||
#' distances to the reference genes. By default [min()] is used. That means
|
#' distances to the reference genes. By default [min()] is used. That means
|
||||||
#' the distance to the nearest reference gene will be scored.
|
#' the distance to the nearest reference gene will be scored.
|
||||||
#'
|
#'
|
||||||
#' @return An object of class `geposan_method`.
|
#' @return An object of class `geposan_method`.
|
||||||
#'
|
#'
|
||||||
|
#' @seealso [species_adjacency()]
|
||||||
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
adjacency <- function(estimate = densest, combination = min) {
|
adjacency <- function(distance_estimate = densest, summarize = min) {
|
||||||
method(
|
method(
|
||||||
id = "adjacency",
|
id = "adjacency",
|
||||||
name = "Adjacency",
|
name = "Adjacency",
|
||||||
|
|
@ -49,15 +56,15 @@ adjacency <- function(estimate = densest, combination = min) {
|
||||||
species_ids,
|
species_ids,
|
||||||
gene_ids,
|
gene_ids,
|
||||||
reference_gene_ids,
|
reference_gene_ids,
|
||||||
estimate,
|
distance_estimate,
|
||||||
combination
|
summarize
|
||||||
),
|
),
|
||||||
{ # nolint
|
{ # nolint
|
||||||
# Filter distances by species and gene and summarize each
|
# Filter distances by species and gene and summarize each
|
||||||
# gene's distance values using the estimation function.
|
# gene's distance values using the estimation function.
|
||||||
data <- geposan::distances[
|
data <- geposan::distances[
|
||||||
species %chin% species_ids & gene %chin% gene_ids,
|
species %chin% species_ids & gene %chin% gene_ids,
|
||||||
.(distance = as.numeric(estimate(distance))),
|
.(distance = as.numeric(distance_estimate(distance))),
|
||||||
by = gene
|
by = gene
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -70,7 +77,7 @@ adjacency <- function(estimate = densest, combination = min) {
|
||||||
.(difference = abs(distance_value - distance))
|
.(difference = abs(distance_value - distance))
|
||||||
]
|
]
|
||||||
|
|
||||||
combination(differences$difference)
|
summarize(differences$difference)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Compute the differences to the reference genes.
|
# Compute the differences to the reference genes.
|
||||||
|
|
|
||||||
|
|
@ -36,7 +36,8 @@ all_methods <- function() {
|
||||||
clustering(),
|
clustering(),
|
||||||
correlation(),
|
correlation(),
|
||||||
neural(),
|
neural(),
|
||||||
adjacency()
|
adjacency(),
|
||||||
|
species_adjacency()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
148
R/species_adjacency.R
Normal file
148
R/species_adjacency.R
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
#' Score genes based on their adjacency to the reference genes within species.
#'
#' For each gene and species, the method will first combine the gene's distances
#' to the reference genes within that species. Afterwards, the results are
#' summarized across species and determine the gene's score.
#'
#' If no distance difference is available for a gene within a species (e.g.
#' because the species has none of the reference genes, or because the gene is
#' the only reference gene), that species contributes no value for the gene.
#' Neither `distance_estimate` nor `summarize` is ever called with an empty
#' vector.
#'
#' @param distance_estimate Function for combining the distance differences
#'   within one species. It receives a non-empty numeric vector without missing
#'   values.
#' @param summarize Function for summarizing the distance values across species.
#'   It receives a non-empty numeric vector without missing values.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [adjacency()]
#'
#' @export
species_adjacency <- function(distance_estimate = min,
                              summarize = stats::median) {
    method(
        id = "species_adjacency",
        name = "Species adj.",
        description = "Species adjacency",
        function(preset, progress) {
            species_ids <- preset$species_ids
            gene_ids <- preset$gene_ids
            reference_gene_ids <- preset$reference_gene_ids

            # Apply `fn` to the non-missing entries of `values`. An empty
            # input yields NA instead of being passed on, because functions
            # like `min` return `Inf` (with a warning) for empty vectors and
            # such values would later break the score normalization.
            apply_to_available <- function(fn, values) {
                values <- values[!is.na(values)]

                if (length(values) == 0) {
                    NA_real_
                } else {
                    as.numeric(fn(values))
                }
            }

            cached(
                "species_adjacency",
                c(
                    species_ids,
                    gene_ids,
                    reference_gene_ids,
                    distance_estimate,
                    summarize
                ),
                { # nolint
                    # Prefilter distances. All later steps operate on this
                    # subset, so genes don't have to be filtered again.
                    data <- geposan::distances[
                        species %chin% species_ids & gene %chin% gene_ids
                    ]

                    progress_state <- 0.0
                    progress_step <- 0.9 / length(species_ids)

                    # Iterate through all species and find the distance
                    # estimates within that species.
                    for (species_id in species_ids) {
                        # For all genes, compute the distance to one reference
                        # gene at a time in one go. The result is stored in a
                        # column named after the reference gene.
                        for (reference_gene_id in reference_gene_ids) {
                            comparison_distance <- data[
                                species == species_id &
                                    gene == reference_gene_id,
                                distance
                            ]

                            if (length(comparison_distance) != 1) {
                                # If we don't have a comparison distance, we
                                # can't compute a difference. This happens, if
                                # the species doesn't have the reference gene.
                                data[
                                    species == species_id,
                                    (reference_gene_id) := NA_integer_
                                ]
                            } else {
                                data[
                                    species == species_id,
                                    (reference_gene_id) :=
                                        abs(distance - comparison_distance)
                                ]
                            }
                        }

                        # Combine the distances to the different reference genes
                        # into one value using the provided function.
                        data[
                            species == species_id,
                            combined_distance := apply_to_available(
                                distance_estimate,
                                # Convert the data.table subset into a plain
                                # vector holding the values of the first row.
                                as.matrix(.SD)[1, ]
                            ),
                            .SDcols = reference_gene_ids,
                            by = gene
                        ]

                        progress_state <- progress_state + progress_step
                        progress(progress_state)
                    }

                    progress(0.9)

                    # Remove the distances between the reference genes and
                    # themselves (they are always zero).
                    for (reference_gene_id in reference_gene_ids) {
                        data[
                            gene == reference_gene_id,
                            (reference_gene_id) := NA
                        ]
                    }

                    # Recompute the combined distance for the reference genes.
                    data[
                        gene %chin% reference_gene_ids,
                        combined_distance := apply_to_available(
                            distance_estimate,
                            as.matrix(.SD)[1, ]
                        ),
                        .SDcols = reference_gene_ids,
                        by = list(species, gene)
                    ]

                    # Combine the distances into one value per gene. Genes
                    # without any available value get NA.
                    results <- data[,
                        .(
                            summarized_distances = apply_to_available(
                                summarize,
                                combined_distance
                            )
                        ),
                        by = gene
                    ]

                    # Compute the final score by normalizing the difference.
                    # Only finite values take part in finding the maximum, so
                    # that a single missing or infinite value can't invalidate
                    # the scores of all other genes.
                    finite_distances <- results[
                        is.finite(summarized_distances),
                        summarized_distances
                    ]

                    max_distance <- if (length(finite_distances) > 0) {
                        max(finite_distances)
                    } else {
                        NA_real_
                    }

                    # NOTE(review): genes without any usable data keep an NA
                    # score here; confirm that `result()` accepts that.
                    results[
                        ,
                        score := 1 - summarized_distances / max_distance
                    ]

                    progress(1.0)

                    result(
                        method = "species_adjacency",
                        scores = results[, .(gene, score)],
                        details = list(
                            data = data,
                            results = results
                        )
                    )
                }
            )
        }
    )
}
|
||||||
|
|
@ -4,13 +4,14 @@
|
||||||
\alias{adjacency}
|
\alias{adjacency}
|
||||||
\title{Score genes based on their proximity to the reference genes.}
|
\title{Score genes based on their proximity to the reference genes.}
|
||||||
\usage{
|
\usage{
|
||||||
adjacency(estimate = densest, combination = min)
|
adjacency(distance_estimate = densest, summarize = min)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{estimate}{A function that will be used to summarize the distance
|
\item{distance_estimate}{A function that will be used to summarize the
|
||||||
values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
|
distance values for each gene. See \code{\link[=densest]{densest()}} for the default
|
||||||
|
implementation.}
|
||||||
|
|
||||||
\item{combination}{A function that will be used to combine the different
|
\item{summarize}{A function that will be used to combine the different
|
||||||
distances to the reference genes. By default \code{\link[=min]{min()}} is used. That means
|
distances to the reference genes. By default \code{\link[=min]{min()}} is used. That means
|
||||||
the distance to the nearest reference gene will be scored.}
|
the distance to the nearest reference gene will be scored.}
|
||||||
}
|
}
|
||||||
|
|
@ -18,5 +19,10 @@ the distance to the nearest reference gene will be scored.}
|
||||||
An object of class \code{geposan_method}.
|
An object of class \code{geposan_method}.
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
Score genes based on their proximity to the reference genes.
|
In this case, the distance data that is available for one gene is first
|
||||||
|
combined. The resulting value is compared to the reference genes and
|
||||||
|
determines the gene's score in relation to other genes.
|
||||||
|
}
|
||||||
|
\seealso{
|
||||||
|
\code{\link[=species_adjacency]{species_adjacency()}}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
25
man/species_adjacency.Rd
Normal file
25
man/species_adjacency.Rd
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/species_adjacency.R
|
||||||
|
\name{species_adjacency}
|
||||||
|
\alias{species_adjacency}
|
||||||
|
\title{Score genes based on their adjacency to the reference genes within species.}
|
||||||
|
\usage{
|
||||||
|
species_adjacency(distance_estimate = min, summarize = stats::median)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{distance_estimate}{Function for combining the distance differences
|
||||||
|
within one species.}
|
||||||
|
|
||||||
|
\item{summarize}{Function for summarizing the distance values across species.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
An object of class \code{geposan_method}.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
For each gene and species, the method will first combine the gene's distances
|
||||||
|
to the reference genes within that species. Afterwards, the results are
|
||||||
|
summarized across species and determine the gene's score.
|
||||||
|
}
|
||||||
|
\seealso{
|
||||||
|
\code{\link[=adjacency]{adjacency()}}
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue