2022-01-13 18:35:02 +01:00
|
|
|
#' Find the densest value in the data.
|
|
|
|
|
#'
|
|
|
|
|
#' This function assumes that data represents a continuous variable and finds
|
|
|
|
|
#' a single value with the highest estimated density. This can be used to
|
|
|
|
|
#' estimate the mode of the data. If there is only one value that value is
|
|
|
|
|
#' returned. If multiple density maxima with the same density exist, their mean
|
|
|
|
|
#' is returned.
|
|
|
|
|
#'
|
|
|
|
|
#' @param data The input data.
|
|
|
|
|
#'
|
|
|
|
|
#' @return The densest value of data.
|
|
|
|
|
#'
|
|
|
|
|
#' @export
|
|
|
|
|
densest <- function(data) {
|
|
|
|
|
as.numeric(if (length(data) <= 0) {
|
|
|
|
|
NULL
|
|
|
|
|
} else if (length(data) == 1) {
|
|
|
|
|
data
|
|
|
|
|
} else {
|
|
|
|
|
density <- stats::density(data)
|
|
|
|
|
mean(density$x[density$y == max(density$y)])
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-16 13:01:44 +01:00
|
|
|
#' Score genes based on their proximity to the reference genes.
|
|
|
|
|
#'
|
2022-01-17 20:11:07 +01:00
|
|
|
#' In this case, the distance data that is available for one gene is first
|
|
|
|
|
#' combined. The resulting value is compared to the reference genes and
|
|
|
|
|
#' determines the gene's score in relation to other genes.
|
|
|
|
|
#'
|
|
|
|
|
#' @param distance_estimate A function that will be used to summarize the
|
|
|
|
|
#' distance values for each gene. See [densest()] for the default
|
|
|
|
|
#' implementation.
|
|
|
|
|
#' @param summarize A function that will be used to combine the different
|
2022-01-09 20:26:42 +01:00
|
|
|
#' distances to the reference genes. By default [min()] is used. That means
|
|
|
|
|
#' the distance to the nearest reference gene will be scored.
|
2021-12-16 13:01:44 +01:00
|
|
|
#'
|
|
|
|
|
#' @return An object of class `geposan_method`.
|
|
|
|
|
#'
|
2022-01-17 20:11:07 +01:00
|
|
|
#' @seealso [species_adjacency()]
|
|
|
|
|
#'
|
2021-12-16 13:01:44 +01:00
|
|
|
#' @export
|
2022-01-17 20:11:07 +01:00
|
|
|
adjacency <- function(distance_estimate = densest, summarize = min) {
|
2021-12-16 13:01:44 +01:00
|
|
|
method(
|
|
|
|
|
id = "adjacency",
|
|
|
|
|
name = "Adjacency",
|
|
|
|
|
description = "Adjacency to reference genes",
|
|
|
|
|
function(preset, progress) {
|
|
|
|
|
species_ids <- preset$species_ids
|
|
|
|
|
gene_ids <- preset$gene_ids
|
|
|
|
|
reference_gene_ids <- preset$reference_gene_ids
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-09 20:21:27 +01:00
|
|
|
cached(
|
|
|
|
|
"adjacency",
|
2022-01-09 20:26:42 +01:00
|
|
|
c(
|
|
|
|
|
species_ids,
|
|
|
|
|
gene_ids,
|
|
|
|
|
reference_gene_ids,
|
2022-01-17 20:11:07 +01:00
|
|
|
distance_estimate,
|
|
|
|
|
summarize
|
2022-01-09 20:26:42 +01:00
|
|
|
),
|
2022-01-09 20:21:27 +01:00
|
|
|
{ # nolint
|
|
|
|
|
# Filter distances by species and gene and summarize each
|
|
|
|
|
# gene's distance values using the estimation function.
|
|
|
|
|
data <- geposan::distances[
|
|
|
|
|
species %chin% species_ids & gene %chin% gene_ids,
|
2022-01-17 20:11:07 +01:00
|
|
|
.(distance = as.numeric(distance_estimate(distance))),
|
2022-01-09 20:21:27 +01:00
|
|
|
by = gene
|
2021-12-16 13:01:44 +01:00
|
|
|
]
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-09 20:21:27 +01:00
|
|
|
# Compute the absolute value of the difference between the
|
|
|
|
|
# estimated distances of each gene to the reference genes.
|
2022-01-09 20:26:42 +01:00
|
|
|
compute_difference <- function(distance_value,
|
2022-01-09 20:21:27 +01:00
|
|
|
comparison_ids) {
|
2022-01-09 20:26:42 +01:00
|
|
|
differences <- data[
|
2022-01-09 20:21:27 +01:00
|
|
|
gene %chin% comparison_ids,
|
2022-01-09 20:26:42 +01:00
|
|
|
.(difference = abs(distance_value - distance))
|
2022-01-09 20:21:27 +01:00
|
|
|
]
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-17 20:11:07 +01:00
|
|
|
summarize(differences$difference)
|
2022-01-09 20:21:27 +01:00
|
|
|
}
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-09 20:21:27 +01:00
|
|
|
# Compute the differences to the reference genes.
|
|
|
|
|
data[
|
|
|
|
|
!gene %chin% reference_gene_ids,
|
|
|
|
|
difference := compute_difference(
|
|
|
|
|
distance,
|
|
|
|
|
reference_gene_ids
|
2022-01-09 20:26:42 +01:00
|
|
|
),
|
|
|
|
|
by = gene
|
2022-01-09 20:21:27 +01:00
|
|
|
]
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-09 20:21:27 +01:00
|
|
|
progress(0.5)
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-09 20:21:27 +01:00
|
|
|
# Exclude the reference gene itself when computing its
|
|
|
|
|
# difference.
|
|
|
|
|
data[
|
|
|
|
|
gene %chin% reference_gene_ids,
|
|
|
|
|
difference := compute_difference(
|
|
|
|
|
distance,
|
|
|
|
|
reference_gene_ids[reference_gene_ids != gene]
|
|
|
|
|
),
|
|
|
|
|
by = gene
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
# Compute the final score by normalizing the difference.
|
|
|
|
|
data[, score := 1 - difference / max(difference)]
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-09 20:21:27 +01:00
|
|
|
progress(1.0)
|
2021-11-25 20:55:11 +01:00
|
|
|
|
2022-01-09 20:21:27 +01:00
|
|
|
result(
|
|
|
|
|
method = "adjacency",
|
|
|
|
|
scores = data[, .(gene, score)],
|
|
|
|
|
details = list(data = data)
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
)
|
2021-12-16 13:01:44 +01:00
|
|
|
}
|
|
|
|
|
)
|
2021-11-25 20:55:11 +01:00
|
|
|
}
|