2022-01-17 20:11:07 +01:00
|
|
|
#' Score genes based on their adjacency to the reference genes within species.
|
|
|
|
|
#'
|
|
|
|
|
#' For each gene and species, the method will first combine the gene's distances
|
|
|
|
|
#' to the reference genes within that species. Afterwards, the results are
|
|
|
|
|
#' summarized across species and determine the gene's score.
|
|
|
|
|
#'
|
|
|
|
|
#' @param distance_estimate Function for combining the distance differences
|
|
|
|
|
#' within one species.
|
|
|
|
|
#' @param summarize Function for summarizing the distance values across species.
|
|
|
|
|
#'
|
|
|
|
|
#' @return An object of class `geposan_method`.
|
|
|
|
|
#'
|
|
|
|
|
#' @seealso [adjacency()]
|
|
|
|
|
#'
|
|
|
|
|
#' @export
|
2022-02-24 15:36:07 +01:00
|
|
|
species_adjacency <- function(distance_estimate = stats::median,
|
2022-01-17 20:11:07 +01:00
|
|
|
summarize = stats::median) {
|
2022-05-26 12:42:19 +02:00
|
|
|
method(
|
|
|
|
|
id = "species_adjacency",
|
|
|
|
|
name = "Species adj.",
|
|
|
|
|
description = "Species adjacency",
|
|
|
|
|
function(preset, progress) {
|
|
|
|
|
species_ids <- preset$species_ids
|
|
|
|
|
gene_ids <- preset$gene_ids
|
|
|
|
|
reference_gene_ids <- preset$reference_gene_ids
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
cached(
|
|
|
|
|
"species_adjacency",
|
|
|
|
|
c(
|
|
|
|
|
species_ids,
|
|
|
|
|
gene_ids,
|
|
|
|
|
reference_gene_ids,
|
|
|
|
|
distance_estimate,
|
|
|
|
|
summarize
|
|
|
|
|
),
|
|
|
|
|
{ # nolint
|
|
|
|
|
# Prefilter distances.
|
|
|
|
|
data <- geposan::distances[
|
|
|
|
|
species %chin% species_ids & gene %chin% gene_ids
|
|
|
|
|
]
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
progress_state <- 0.0
|
|
|
|
|
progress_step <- 0.9 / length(species_ids)
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
# Iterate through all species and find the distance
|
|
|
|
|
# estimates within that species.
|
|
|
|
|
for (species_id in species_ids) {
|
|
|
|
|
# For all genes, compute the distance to one reference
|
|
|
|
|
# gene at a time in one go.
|
|
|
|
|
for (reference_gene_id in reference_gene_ids) {
|
|
|
|
|
comparison_distance <- data[
|
|
|
|
|
species == species_id &
|
|
|
|
|
gene == reference_gene_id,
|
|
|
|
|
distance
|
|
|
|
|
]
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
column <- quote(reference_gene_id)
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
if (length(comparison_distance) != 1) {
|
|
|
|
|
# If we don't have a comparison distance, we
|
|
|
|
|
# can't compute a difference. This happens, if
|
|
|
|
|
# the species doesn't have the reference gene.
|
|
|
|
|
data[
|
|
|
|
|
species == species_id &
|
|
|
|
|
gene %chin% gene_ids,
|
|
|
|
|
eval(column) := NA_integer_
|
|
|
|
|
]
|
|
|
|
|
} else {
|
|
|
|
|
data[
|
|
|
|
|
species == species_id &
|
|
|
|
|
gene %chin% gene_ids,
|
|
|
|
|
eval(column) :=
|
|
|
|
|
abs(distance - comparison_distance)
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
# Combine the distances to the different reference genes
|
|
|
|
|
# into one value using the provided function.
|
|
|
|
|
data[
|
|
|
|
|
species == species_id &
|
|
|
|
|
gene %chin% gene_ids,
|
|
|
|
|
combined_distance := as.numeric(
|
|
|
|
|
distance_estimate(stats::na.omit(
|
|
|
|
|
# Convert the data.table subset into a
|
|
|
|
|
# vector to get the correct na.omit
|
|
|
|
|
# behavior.
|
|
|
|
|
as.matrix(.SD)[1, ]
|
|
|
|
|
))
|
|
|
|
|
),
|
|
|
|
|
.SDcols = reference_gene_ids,
|
|
|
|
|
by = gene
|
|
|
|
|
]
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
progress_state <- progress_state + progress_step
|
|
|
|
|
progress(progress_state)
|
|
|
|
|
}
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
progress(0.9)
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
# Remove the distances between the reference genes.
|
|
|
|
|
for (reference_gene_id in reference_gene_ids) {
|
|
|
|
|
column <- quote(reference_gene_id)
|
|
|
|
|
data[gene == reference_gene_id, eval(column) := NA]
|
|
|
|
|
}
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
# Recompute the combined distance for the reference genes.
|
|
|
|
|
data[
|
|
|
|
|
gene %chin% reference_gene_ids,
|
|
|
|
|
combined_distance := as.numeric(
|
|
|
|
|
distance_estimate(stats::na.omit(
|
|
|
|
|
as.matrix(.SD)[1, ]
|
|
|
|
|
))
|
|
|
|
|
),
|
|
|
|
|
.SDcols = reference_gene_ids,
|
|
|
|
|
by = list(species, gene)
|
|
|
|
|
]
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
# Combine the distances into one value.
|
|
|
|
|
results <- data[,
|
|
|
|
|
.(
|
|
|
|
|
summarized_distances = as.numeric(
|
|
|
|
|
summarize(stats::na.omit(combined_distance))
|
|
|
|
|
)
|
|
|
|
|
),
|
|
|
|
|
by = gene
|
|
|
|
|
]
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
# Compute the final score by normalizing the difference.
|
|
|
|
|
results[
|
|
|
|
|
,
|
|
|
|
|
score := 1 - summarized_distances /
|
|
|
|
|
max(summarized_distances)
|
|
|
|
|
]
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
progress(1.0)
|
2022-01-17 20:11:07 +01:00
|
|
|
|
2022-05-26 12:42:19 +02:00
|
|
|
result(
|
|
|
|
|
method = "species_adjacency",
|
|
|
|
|
scores = results[, .(gene, score)],
|
|
|
|
|
details = list(
|
|
|
|
|
data = data,
|
|
|
|
|
results = results
|
2022-01-17 20:11:07 +01:00
|
|
|
)
|
2022-05-26 12:42:19 +02:00
|
|
|
)
|
2022-01-17 20:11:07 +01:00
|
|
|
}
|
2022-05-26 12:42:19 +02:00
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
)
|
2022-01-17 20:11:07 +01:00
|
|
|
}
|