adjacency: Make distance estimation customizable

2025-10-26 10:47:25 +01:00 · 2022-01-09 20:21:27 +01:00 · 2022-01-09 20:21:27 +01:00 · 2ceda0691b
commit 2ceda0691b
parent ac9894e988
4 changed files with 109 additions and 71 deletions
--- a/R/adjacency.R
+++ b/R/adjacency.R
@ -1,13 +1,36 @@
+#' Find the densest value in the data.
+#'
+#' This function assumes that data represents a continuous variable and finds
+#' a single value with the highest estimated density. This can be used to
+#' estimate the mode of the data. If there is only one value, that value is
+#' returned. If multiple density maxima with the same density exist, their mean
+#' is returned.
+#'
+#' @param data The input data, a numeric vector (see [stats::density()]).
+#'
+#' @return The densest value of data, or `numeric(0)` if data is empty.
+#'
+#' @export
+densest <- function(data) {
+    as.numeric(if (length(data) <= 0) {
+        NULL # as.numeric(NULL) yields numeric(0)
+    } else if (length(data) == 1) {
+        data
+    } else {
+        density <- stats::density(data) # kernel density estimate
+        mean(density$x[density$y == max(density$y)]) # average tied maxima
+    })
+}
+
 #' Score genes based on their proximity to the reference genes.
 #'
-#' This method finds the distance value with the maximum density for each gene
-#' (i.e. the mode of its estimated distribution). Genes are scored by comparing
-#' those distance values with the values of the reference genes.
+#' @param estimate A function that will be used to summarize the distance
+#'   values for each gene. See [densest()] for the default implementation.
 #'
 #' @return An object of class `geposan_method`.
 #'
 #' @export
-adjacency <- function() {
+adjacency <- function(estimate = densest) {
    method(
        id = "adjacency",
        name = "Adjacency",
@ -17,73 +40,64 @@ adjacency <- function() {
            gene_ids <- preset$gene_ids
            reference_gene_ids <- preset$reference_gene_ids

-            cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), {
-                # Get the virtual distance value with the highest density.
-                compute_densest_distance <- function(distances) {
-                    if (length(distances) <= 2) {
-                        mean(distances)
-                    } else {
-                        d <- stats::density(distances)
-                        d$x[which.max(d$y)]
-                    }
-                }
-
-                # Filter distances by species and gene and find the distance
-                # with the highest density of values for each gene.
-                data <- geposan::distances[
-                    species %chin% species_ids & gene %chin% gene_ids,
-                    .(densest_distance = compute_densest_distance(distance)),
-                    by = gene
-                ]
-
-                # Compute the absolute value of the difference between the
-                # provided densest distance value in comparison to the mean of
-                # the densest distances of the comparison genes.
-                compute_difference <- function(densest_distance,
-                                               comparison_ids) {
-                    # Get the mean of the densest distances of the reference
-                    # genes.
-                    mean_densest_distance <- data[
-                        gene %chin% comparison_ids,
-                        mean(densest_distance)
+            cached(
+                "adjacency",
+                c(species_ids, gene_ids, reference_gene_ids, estimate),
+                { # nolint
+                    # Filter distances by species and gene and summarize each
+                    # gene's distance values using the estimation function.
+                    data <- geposan::distances[
+                        species %chin% species_ids & gene %chin% gene_ids,
+                        .(distance = estimate(distance)),
+                        by = gene
                    ]

-                    abs(densest_distance - mean_densest_distance)
-                }
+                    # Compute the absolute difference between each gene's
+                    # estimate and the mean estimate of the comparison genes.
+                    compute_difference <- function(distance,
+                                                   comparison_ids) {
+                        reference_distance <- data[
+                            gene %chin% comparison_ids,
+                            mean(distance)
+                        ]

-                # Compute the differences to the reference genes.
-                data[
-                    !gene %chin% reference_gene_ids,
-                    difference := compute_difference(
-                        densest_distance,
-                        reference_gene_ids
+                        abs(distance - reference_distance)
+                    }
+
+                    # Compute the differences to the reference genes.
+                    data[
+                        !gene %chin% reference_gene_ids,
+                        difference := compute_difference(
+                            distance,
+                            reference_gene_ids
+                        )
+                    ]
+
+                    progress(0.5)
+
+                    # Exclude the reference gene itself when computing its
+                    # difference.
+                    data[
+                        gene %chin% reference_gene_ids,
+                        difference := compute_difference(
+                            distance,
+                            reference_gene_ids[reference_gene_ids != gene]
+                        ),
+                        by = gene
+                    ]
+
+                    # Compute the final score by normalizing the difference.
+                    data[, score := 1 - difference / max(difference)]
+
+                    progress(1.0)
+
+                    result(
+                        method = "adjacency",
+                        scores = data[, .(gene, score)],
+                        details = list(data = data)
                    )
-                ]
-
-                progress(0.5)
-
-                # Exclude the reference gene itself when computing its
-                # difference.
-                data[
-                    gene %chin% reference_gene_ids,
-                    difference := compute_difference(
-                        densest_distance,
-                        reference_gene_ids[reference_gene_ids != gene]
-                    ),
-                    by = gene
-                ]
-
-                # Compute the final score by normalizing the difference.
-                data[, score := 1 - difference / max(difference)]
-
-                progress(1.0)
-
-                result(
-                    method = "adjacency",
-                    scores = data[, .(gene, score)],
-                    details = list(data = data)
-                )
-            })
+                }
+            )
        }
    )
 }