Add species adjacency method

2025-10-26 10:47:25 +01:00 · 2022-01-17 20:11:07 +01:00 · 2022-01-17 20:11:07 +01:00 · 53f955f3da
commit 53f955f3da
parent c8f1e522f9
6 changed files with 202 additions and 14 deletions
--- a/1
+++ b/1
@ -24,5 +24,6 @@ export(plot_scores)
 export(preset)
 export(ranking)
 export(result)
 export(species_adjacency)
 export(validate)
 import(data.table)
--- a/R/adjacency.R
+++ b/R/adjacency.R
@ -24,16 +24,23 @@ densest <- function(data) {
 #' Score genes based on their proximity to the reference genes.
 #'
-#' @param estimate A function that will be used to summarize the distance
+#' In this case, the distance data that is available for one gene is first
-#'   values for each gene. See [densest()] for the default implementation.
+#' combined. The resulting value is compared to the reference genes and
-#' @param combination A function that will be used to combine the different
+#' determines the gene's score in relation to other genes.
 #'
 #' @param distance_estimate A function that will be used to summarize the
 #'   distance values for each gene. See [densest()] for the default
 #'   implementation.
 #' @param summarize A function that will be used to combine the different
 #'   distances to the reference genes. By default [min()] is used. That means
 #'   the distance to the nearest reference gene will be scored.
 #'
 #' @return An object of class `geposan_method`.
 #'
 #' @seealso [species_adjacency()]
 #'
 #' @export
-adjacency <- function(estimate = densest, combination = min) {
+adjacency <- function(distance_estimate = densest, summarize = min) {
    method(
        id = "adjacency",
        name = "Adjacency",
@ -49,15 +56,15 @@ adjacency <- function(estimate = densest, combination = min) {
                    species_ids,
                    gene_ids,
                    reference_gene_ids,
-                    estimate,
+                    distance_estimate,
-                    combination
+                    summarize
                ),
                { # nolint
                    # Filter distances by species and gene and summarize each
                    # gene's distance values using the estimation function.
                    data <- geposan::distances[
                        species %chin% species_ids & gene %chin% gene_ids,
-                        .(distance = as.numeric(estimate(distance))),
+                        .(distance = as.numeric(distance_estimate(distance))),
                        by = gene
                    ]
@ -70,7 +77,7 @@ adjacency <- function(estimate = densest, combination = min) {
                            .(difference = abs(distance_value - distance))
                        ]
-                        combination(differences$difference)
+                        summarize(differences$difference)
                    }
                    # Compute the differences to the reference genes.
--- a/R/method.R
+++ b/R/method.R
@ -36,7 +36,8 @@ all_methods <- function() {
        clustering(),
        correlation(),
        neural(),
-        adjacency()
+        adjacency(),
        species_adjacency()
    )
 }
--- a/R/species_adjacency.R
+++ b/R/species_adjacency.R
@ -0,0 +1,148 @@
 #' Score genes based on their adjacency to the reference genes within species.
 #'
 #' For each gene and species, the method will first combine the gene's distances
 #' to the reference genes within that species. Afterwards, the results are
 #' summarized across species and determine the gene's score.
 #'
 #' @param distance_estimate Function for combining the distance differences
 #'   within one species.
 #' @param summarize Function for summarizing the distance values across species.
 #'
 #' @return An object of class `geposan_method`.
 #'
 #' @seealso [adjacency()]
 #'
 #' @export
 species_adjacency <- function(distance_estimate = min,
                               summarize = stats::median) {
     method(
         id = "species_adjacency",
         name = "Species adj.",
         description = "Species adjacency",
         function(preset, progress) {
             species_ids <- preset$species_ids
             gene_ids <- preset$gene_ids
             reference_gene_ids <- preset$reference_gene_ids
             # NOTE(review): the cache key below is unchanged, so results that
             # were cached by an earlier implementation may still be served.
             # Confirm how `cached()` invalidates entries.
             cached(
                 "species_adjacency",
                 c(
                     species_ids,
                     gene_ids,
                     reference_gene_ids,
                     distance_estimate,
                     summarize
                 ),
                 { # nolint
                     # Combine the distance differences of a single row (one
                     # gene within one species) into one value. If no
                     # difference is available, the result is NA instead of
                     # whatever `distance_estimate` returns for empty input
                     # (`min()` would yield `Inf` and a warning). That way,
                     # the value is dropped by the `na.omit()` call when
                     # summarizing across species.
                     combine_differences <- function(differences) {
                         # Convert the data.table subset into a vector to get
                         # the correct na.omit behavior.
                         values <- stats::na.omit(as.matrix(differences)[1, ])
                         if (length(values) == 0) {
                             NA_real_
                         } else {
                             as.numeric(distance_estimate(values))
                         }
                     }
                     # Prefilter distances. All later subsets operate on this
                     # table, so the gene filter doesn't have to be repeated.
                     data <- geposan::distances[
                         species %chin% species_ids & gene %chin% gene_ids
                     ]
                     progress_state <- 0.0
                     progress_step <- 0.9 / length(species_ids)
                     # Iterate through all species and find the distance
                     # estimates within that species.
                     for (species_id in species_ids) {
                         # For all genes, compute the distance to one reference
                         # gene at a time in one go. The result is stored in a
                         # column that is named after the reference gene.
                         for (reference_gene_id in reference_gene_ids) {
                             comparison_distance <- data[
                                 species == species_id &
                                     gene == reference_gene_id,
                                 distance
                             ]
                             if (length(comparison_distance) != 1) {
                                 # If we don't have a comparison distance, we
                                 # can't compute a difference. This happens if
                                 # the species doesn't have the reference gene.
                                 data[
                                     species == species_id,
                                     (reference_gene_id) := NA_integer_
                                 ]
                             } else {
                                 data[
                                     species == species_id,
                                     (reference_gene_id) :=
                                         abs(distance - comparison_distance)
                                 ]
                             }
                         }
                         # Combine the distances to the different reference genes
                         # into one value using the provided function.
                         data[
                             species == species_id,
                             combined_distance := combine_differences(.SD),
                             .SDcols = reference_gene_ids,
                             by = gene
                         ]
                         progress_state <- progress_state + progress_step
                         progress(progress_state)
                     }
                     progress(0.9)
                     # Remove the distances between the reference genes.
                     for (reference_gene_id in reference_gene_ids) {
                         data[
                             gene == reference_gene_id,
                             (reference_gene_id) := NA
                         ]
                     }
                     # Recompute the combined distance for the reference genes.
                     data[
                         gene %chin% reference_gene_ids,
                         combined_distance := combine_differences(.SD),
                         .SDcols = reference_gene_ids,
                         by = list(species, gene)
                     ]
                     # Combine the distances into one value.
                     results <- data[,
                         .(
                             summarized_distances = as.numeric(
                                 summarize(stats::na.omit(combined_distance))
                             )
                         ),
                         by = gene
                     ]
                     # Find the largest known summarized distance. Genes without
                     # any usable distance are ignored here, so that they don't
                     # turn the scores of all other genes into NA.
                     known_distances <- results[
                         !is.na(summarized_distances),
                         summarized_distances
                     ]
                     max_distance <- if (length(known_distances) > 0) {
                         max(known_distances)
                     } else {
                         NA_real_
                     }
                     # Compute the final score by normalizing the difference.
                     # NOTE(review): genes without any usable distance get an
                     # NA score. Confirm that consumers of the result handle
                     # that.
                     results[
                         ,
                         score := 1 - summarized_distances / max_distance
                     ]
                     progress(1.0)
                     result(
                         method = "species_adjacency",
                         scores = results[, .(gene, score)],
                         details = list(
                             data = data,
                             results = results
                         )
                     )
                 }
             )
         }
     )
 }
--- a/man/adjacency.Rd
+++ b/man/adjacency.Rd
@ -4,13 +4,14 @@
 \alias{adjacency}
 \title{Score genes based on their proximity to the reference genes.}
 \usage{
-adjacency(estimate = densest, combination = min)
+adjacency(distance_estimate = densest, summarize = min)
 }
 \arguments{
-\item{estimate}{A function that will be used to summarize the distance
+\item{distance_estimate}{A function that will be used to summarize the
-values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
+distance values for each gene. See \code{\link[=densest]{densest()}} for the default
 implementation.}
-\item{combination}{A function that will be used to combine the different
+\item{summarize}{A function that will be used to combine the different
 distances to the reference genes. By default \code{\link[=min]{min()}} is used. That means
 the distance to the nearest reference gene will be scored.}
 }
@ -18,5 +19,10 @@ the distance to the nearest reference gene will be scored.}
 An object of class \code{geposan_method}.
 }
 \description{
-Score genes based on their proximity to the reference genes.
+In this case, the distance data that is available for one gene is first
 combined. The resulting value is compared to the reference genes and
 determines the gene's score in relation to other genes.
 }
 \seealso{
 \code{\link[=species_adjacency]{species_adjacency()}}
 }
--- a/man/species_adjacency.Rd
+++ b/man/species_adjacency.Rd
@ -0,0 +1,25 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/species_adjacency.R
 \name{species_adjacency}
 \alias{species_adjacency}
 \title{Score genes based on their adjacency to the reference genes within species.}
 \usage{
 species_adjacency(distance_estimate = min, summarize = stats::median)
 }
 \arguments{
 \item{distance_estimate}{Function for combining the distance differences
 within one species.}
 \item{summarize}{Function for summarizing the distance values across species.}
 }
 \value{
 An object of class \code{geposan_method}.
 }
 \description{
 For each gene and species, the method will first combine the gene's distances
 to the reference genes within that species. Afterwards, the results are
 summarized across species and determine the gene's score.
 }
 \seealso{
 \code{\link[=adjacency]{adjacency()}}
 }