adjacency: Make distance estimation customizable

2025-10-26 18:57:25 +01:00 · 2022-01-09 20:21:27 +01:00 · 2022-01-09 20:21:27 +01:00 · 2ceda0691b
commit 2ceda0691b
parent ac9894e988
4 changed files with 109 additions and 71 deletions
--- a/1
+++ b/1
@ -12,6 +12,7 @@ export(analyze)
 export(clustering)
 export(compare)
 export(correlation)
 export(densest)
 export(method)
 export(neural)
 export(optimal_weights)
--- a/R/adjacency.R
+++ b/R/adjacency.R
@ -1,13 +1,36 @@
 #' Find the densest value in the data.
 #'
 #' This function assumes that data represents a continuous variable and finds
 #' a single value with the highest estimated density. This can be used to
 #' estimate the mode of the data. If there is only one value that value is
 #' returned. If multiple density maxima with the same density exist, their mean
 #' is returned.
 #'
 #' @param data The input data.
 #'
 #' @return The densest value of data.
 #'
 #' @export
 densest <- function(data) {
    n_values <- length(data)

    # Nothing to summarize: yield an empty numeric vector.
    if (n_values <= 0) {
        return(numeric(0))
    }

    # A single observation is its own densest value.
    if (n_values == 1) {
        return(as.numeric(data))
    }

    # Estimate the density and locate every grid point that attains the
    # maximum; ties are resolved by averaging their positions.
    estimate <- stats::density(data)
    peak_height <- max(estimate$y)
    peak_positions <- estimate$x[estimate$y == peak_height]

    as.numeric(mean(peak_positions))
 }
 #' Score genes based on their proximity to the reference genes.
 #'
-#' This method finds the distance value with the maximum density for each gene
+#' @param estimate A function that will be used to summarize the distance
-#' (i.e. the mode of its estimated distribution). Genes are scored by comparing
+#'   values for each gene. See [densest()] for the default implementation.
 #' those distance values with the values of the reference genes.
 #'
 #' @return An object of class `geposan_method`.
 #'
 #' @export
-adjacency <- function() {
+adjacency <- function(estimate = densest) {
    method(
        id = "adjacency",
        name = "Adjacency",
@ -17,73 +40,64 @@ adjacency <- function() {
            gene_ids <- preset$gene_ids
            reference_gene_ids <- preset$reference_gene_ids
-            cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), {
+            cached(
-                # Get the virtual distance value with the highest density.
+                "adjacency",
-                compute_densest_distance <- function(distances) {
+                c(species_ids, gene_ids, reference_gene_ids, estimate),
-                    if (length(distances) <= 2) {
+                { # nolint
-                        mean(distances)
+                    # Filter distances by species and gene and summarize each
-                    } else {
+                    # gene's distance values using the estimation function.
-                        d <- stats::density(distances)
+                    data <- geposan::distances[
-                        d$x[which.max(d$y)]
+                        species %chin% species_ids & gene %chin% gene_ids,
-                    }
+                        .(distance = estimate(distance)),
-                }
+                        by = gene
                # Filter distances by species and gene and find the distance
                # with the highest density of values for each gene.
                data <- geposan::distances[
                    species %chin% species_ids & gene %chin% gene_ids,
                    .(densest_distance = compute_densest_distance(distance)),
                    by = gene
                ]
                # Compute the absolute value of the difference between the
                # provided densest distance value in comparison to the mean of
                # the densest distances of the comparison genes.
                compute_difference <- function(densest_distance,
                                               comparison_ids) {
                    # Get the mean of the densest distances of the reference
                    # genes.
                    mean_densest_distance <- data[
                        gene %chin% comparison_ids,
                        mean(densest_distance)
                    ]
-                    abs(densest_distance - mean_densest_distance)
+                    # Compute the absolute value of the difference between the
-                }
+                    # estimated distances of each gene to the reference genes.
                    compute_difference <- function(distance,
                                                   comparison_ids) {
                        reference_distance <- data[
                            gene %chin% comparison_ids,
                            mean(distance)
                        ]
-                # Compute the differences to the reference genes.
+                        abs(distance - reference_distance)
-                data[
+                    }
-                    !gene %chin% reference_gene_ids,
+
-                    difference := compute_difference(
+                    # Compute the differences to the reference genes.
-                        densest_distance,
+                    data[
-                        reference_gene_ids
+                        !gene %chin% reference_gene_ids,
                        difference := compute_difference(
                            distance,
                            reference_gene_ids
                        )
                    ]
                    progress(0.5)
                    # Exclude the reference gene itself when computing its
                    # difference.
                    data[
                        gene %chin% reference_gene_ids,
                        difference := compute_difference(
                            distance,
                            reference_gene_ids[reference_gene_ids != gene]
                        ),
                        by = gene
                    ]
                    # Compute the final score by normalizing the difference.
                    data[, score := 1 - difference / max(difference)]
                    progress(1.0)
                    result(
                        method = "adjacency",
                        scores = data[, .(gene, score)],
                        details = list(data = data)
                    )
-                ]
+                }
-
+            )
                progress(0.5)
                # Exclude the reference gene itself when computing its
                # difference.
                data[
                    gene %chin% reference_gene_ids,
                    difference := compute_difference(
                        densest_distance,
                        reference_gene_ids[reference_gene_ids != gene]
                    ),
                    by = gene
                ]
                # Compute the final score by normalizing the difference.
                data[, score := 1 - difference / max(difference)]
                progress(1.0)
                result(
                    method = "adjacency",
                    scores = data[, .(gene, score)],
                    details = list(data = data)
                )
            })
        }
    )
 }
--- a/man/adjacency.Rd
+++ b/man/adjacency.Rd
@ -4,13 +4,15 @@
 \alias{adjacency}
 \title{Score genes based on their proximity to the reference genes.}
 \usage{
-adjacency()
+adjacency(estimate = densest)
 }
 \arguments{
 \item{estimate}{A function that will be used to summarize the distance
 values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
 }
 \value{
 An object of class \code{geposan_method}.
 }
 \description{
-This method finds the distance value with the maximum density for each gene
+Score genes based on their proximity to the reference genes.
 (i.e. the mode of its estimated distribution). Genes are scored by comparing
 those distance values with the values of the reference genes.
 }
--- a/man/densest.Rd
+++ b/man/densest.Rd
@ -0,0 +1,21 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/adjacency.R
 \name{densest}
 \alias{densest}
 \title{Find the densest value in the data.}
 \usage{
 densest(data)
 }
 \arguments{
 \item{data}{The input data.}
 }
 \value{
 The densest value of data.
 }
 \description{
 This function assumes that data represents a continuous variable and finds
 a single value with the highest estimated density. This can be used to
 estimate the mode of the data. If there is only one value that value is
 returned. If multiple density maxima with the same density exist, their mean
 is returned.
 }