geposan/R/method_adjacency.R

100 lines
3.1 KiB
R
Raw Permalink Normal View History

#' Score genes based on their proximity to the reference genes.
#'
#' In this case, the distance data that is available for one gene is first
#' combined. The resulting value is compared to the reference genes and
#' determines the gene's score in relation to other genes.
#'
#' @details
#' The computation proceeds as follows:
#'
#' 1. The rows of `geposan::distances` are restricted to the species and genes
#'    of the preset, and `distance_estimate` condenses the distance values of
#'    each gene into a single number.
#' 2. For every gene, the absolute difference between its own estimate and
#'    the `distance_estimate` of the reference genes' estimates is computed.
#'    When the gene is itself a reference gene, it is left out of the
#'    comparison set so that it does not influence its own score.
#' 3. The score is `1 - difference / max(difference)`, i.e. the gene with the
#'    largest difference receives a score of 0.
#'
#' The computation is wrapped in `cached()`, keyed by `id` together with the
#' preset's species, genes, reference genes and the estimation function.
#'
#' @param id Unique ID for the method and its results.
#' @param name Human readable name for the method.
#' @param description Method description.
#' @param distance_estimate A function that will be used to summarize the
#'   distance values for each gene. See [densest()] for the default
#'   implementation.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
adjacency <- function(id = "adjacency",
                      name = "Adjacency",
                      description = "Adjacency to reference genes",
                      distance_estimate = densest) {
  method(
    id = id,
    name = name,
    description = description,
    help = paste0(
      "Adjacency to the reference genes across species. This method penalizes ",
      "genes that do not occur in the region typical for the reference genes, ",
      "without artificially defining a fixed boundary."
    ),
    function(preset, progress) {
      species_ids <- preset$species_ids
      gene_ids <- preset$gene_ids
      reference_gene_ids <- preset$reference_gene_ids

      cached(
        id,
        c(
          species_ids,
          gene_ids,
          reference_gene_ids,
          distance_estimate
        ),
        { # nolint
          # Filter distances by species and gene and summarize each
          # gene's distance values using the estimation function.
          data <- geposan::distances[
            species %chin% species_ids & gene %chin% gene_ids,
            .(distance = as.numeric(distance_estimate(distance))),
            by = gene
          ]

          # Compute the absolute value of the difference between the
          # estimated distances of each gene to the reference genes.
          # `comparison_ids` selects the genes whose per-gene estimates are
          # condensed (again via `distance_estimate`) into the value that
          # `distance_values` is compared against.
          compute_difference <- function(distance_values,
                                         comparison_ids) {
            comparison_distance <- data[
              gene %chin% comparison_ids,
              distance_estimate(distance)
            ]

            abs(distance_values - comparison_distance)
          }

          # Compute the differences to the reference genes. All
          # non-reference genes share the same comparison set, so this is
          # done in one vectorized step.
          data[
            !gene %chin% reference_gene_ids,
            difference := compute_difference(
              distance,
              reference_gene_ids
            )
          ]

          progress(0.5)

          # Exclude the reference gene itself when computing its
          # difference. Grouping by gene is essential here: within each
          # group `gene` is a single value, so the comparison set is
          # "all reference genes except this one". Without the grouping,
          # `gene` would be the whole vector of reference genes and the
          # `!=` would be an element-wise (recycled) comparison.
          # NOTE(review): with a single reference gene the comparison set
          # is empty; the outcome then depends on how `distance_estimate`
          # treats empty input.
          data[
            gene %chin% reference_gene_ids,
            difference := compute_difference(
              distance,
              reference_gene_ids[reference_gene_ids != gene]
            ),
            by = gene
          ]

          # Compute the final score by normalizing the difference.
          # NOTE(review): `max()` is taken without `na.rm`, so a missing
          # difference propagates to every score, and a maximum of zero
          # leads to a division by zero.
          data[, score := 1 - difference / max(difference)]

          progress(1.0)

          # Report the results under the configured method ID instead of a
          # hard-coded one, so that custom IDs are respected.
          result(
            method = id,
            scores = data[, .(gene, score)],
            details = list(data = data)
          )
        }
      )
    }
  )
}