From 53f955f3dac4a81988c499135f17a975fa03abd4 Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Mon, 17 Jan 2022 20:11:07 +0100
Subject: [PATCH] Add species adjacency method

---
 NAMESPACE                |   1 +
 R/adjacency.R            |  23 +++---
 R/method.R               |   3 +-
 R/species_adjacency.R    | 148 +++++++++++++++++++++++++++++++++++++++
 man/adjacency.Rd         |  16 +++--
 man/species_adjacency.Rd |  25 +++++++
 6 files changed, 202 insertions(+), 14 deletions(-)
 create mode 100644 R/species_adjacency.R
 create mode 100644 man/species_adjacency.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 685d468..22449fb 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -24,5 +24,6 @@ export(plot_scores)
 export(preset)
 export(ranking)
 export(result)
+export(species_adjacency)
 export(validate)
 import(data.table)
diff --git a/R/adjacency.R b/R/adjacency.R
index 6d2a1e6..aceffa8 100644
--- a/R/adjacency.R
+++ b/R/adjacency.R
@@ -24,16 +24,23 @@ densest <- function(data) {
 
 #' Score genes based on their proximity to the reference genes.
 #'
-#' @param estimate A function that will be used to summarize the distance
-#'   values for each gene. See [densest()] for the default implementation.
-#' @param combination A function that will be used to combine the different
+#' In this case, the distance data that is available for one gene is first
+#' combined. The resulting value is compared to the reference genes and
+#' determines the gene's score in relation to other genes.
+#'
+#' @param distance_estimate A function that will be used to summarize the
+#'   distance values for each gene. See [densest()] for the default
+#'   implementation.
+#' @param summarize A function that will be used to combine the different
 #'   distances to the reference genes. By default [min()] is used. That means
 #'   the distance to the nearest reference gene will be scored.
 #'
 #' @return An object of class `geposan_method`.
 #'
+#' @seealso [species_adjacency()]
+#'
 #' @export
-adjacency <- function(estimate = densest, combination = min) {
+adjacency <- function(distance_estimate = densest, summarize = min) {
     method(
         id = "adjacency",
         name = "Adjacency",
@@ -49,15 +56,15 @@ adjacency <- function(estimate = densest, combination = min) {
                     species_ids,
                     gene_ids,
                     reference_gene_ids,
-                    estimate,
-                    combination
+                    distance_estimate,
+                    summarize
                 ),
                 { # nolint
                     # Filter distances by species and gene and summarize each
                     # gene's distance values using the estimation function.
                     data <- geposan::distances[
                         species %chin% species_ids & gene %chin% gene_ids,
-                        .(distance = as.numeric(estimate(distance))),
+                        .(distance = as.numeric(distance_estimate(distance))),
                         by = gene
                     ]
 
@@ -70,7 +77,7 @@ adjacency <- function(estimate = densest, combination = min) {
                             .(difference = abs(distance_value - distance))
                         ]
 
-                        combination(differences$difference)
+                        summarize(differences$difference)
                     }
 
                     # Compute the differences to the reference genes.
diff --git a/R/method.R b/R/method.R
index c0be5ad..d1a0a84 100644
--- a/R/method.R
+++ b/R/method.R
@@ -36,7 +36,8 @@ all_methods <- function() {
         clustering(),
         correlation(),
         neural(),
-        adjacency()
+        adjacency(),
+        species_adjacency()
     )
 }
 
diff --git a/R/species_adjacency.R b/R/species_adjacency.R
new file mode 100644
index 0000000..587ffa6
--- /dev/null
+++ b/R/species_adjacency.R
@@ -0,0 +1,148 @@
+#' Score genes based on their adjacency to the reference genes within species.
+#'
+#' For each gene and species, the method will first combine the gene's distances
+#' to the reference genes within that species. Afterwards, the results are
+#' summarized across species and determine the gene's score.
+#'
+#' @param distance_estimate Function for combining the distance differences
+#'   within one species.
+#' @param summarize Function for summarizing the distance values across species.
+#'
+#' @return An object of class `geposan_method`.
+#'
+#' @seealso [adjacency()]
+#'
+#' @export
+species_adjacency <- function(distance_estimate = min,
+                              summarize = stats::median) {
+    # Apply `fn` to the non-missing entries of `values`. If there are none,
+    # return NA without calling `fn`: functions like `min()` would warn and
+    # return `Inf` for empty input, which would distort the final scores.
+    apply_to_known <- function(fn, values) {
+        known <- values[!is.na(values)]
+        if (length(known) == 0) NA_real_ else as.numeric(fn(known))
+    }
+
+    method(
+        id = "species_adjacency",
+        name = "Species adj.",
+        description = "Species adjacency",
+        function(preset, progress) {
+            species_ids <- preset$species_ids
+            gene_ids <- preset$gene_ids
+            reference_gene_ids <- preset$reference_gene_ids
+
+            cached(
+                "species_adjacency",
+                c(
+                    species_ids,
+                    gene_ids,
+                    reference_gene_ids,
+                    distance_estimate,
+                    summarize
+                ),
+                { # nolint
+                    # Prefilter distances.
+                    data <- geposan::distances[
+                        species %chin% species_ids & gene %chin% gene_ids
+                    ]
+
+                    progress_state <- 0.0
+                    progress_step <- 0.9 / length(species_ids)
+
+                    # Iterate through all species and find the distance
+                    # estimates within that species.
+                    for (species_id in species_ids) {
+                        # For all genes, compute the distance to one reference
+                        # gene at a time in one go.
+                        for (reference_gene_id in reference_gene_ids) {
+                            comparison_distance <- data[
+                                species == species_id &
+                                    gene == reference_gene_id,
+                                distance
+                            ]
+
+                            # The difference column is always numeric, so its
+                            # type doesn't depend on which species comes first.
+                            if (length(comparison_distance) != 1) {
+                                # If we don't have a comparison distance, we
+                                # can't compute a difference. This happens if
+                                # the species doesn't have the reference gene.
+                                data[
+                                    species == species_id,
+                                    (reference_gene_id) := NA_real_
+                                ]
+                            } else {
+                                data[
+                                    species == species_id,
+                                    (reference_gene_id) := as.numeric(
+                                        abs(distance - comparison_distance)
+                                    )
+                                ]
+                            }
+                        }
+
+                        # Combine the distances to the different reference genes
+                        # into one value using the provided function.
+                        data[
+                            species == species_id,
+                            combined_distance := apply_to_known(
+                                distance_estimate, as.matrix(.SD)[1, ]
+                            ),
+                            .SDcols = reference_gene_ids,
+                            by = gene
+                        ]
+
+                        progress_state <- progress_state + progress_step
+                        progress(progress_state)
+                    }
+
+                    progress(0.9)
+
+                    # Remove each reference gene's distance to itself.
+                    for (reference_id in reference_gene_ids) {
+                        data[gene == reference_id, (reference_id) := NA_real_]
+                    }
+
+                    # Recompute the combined distance for the reference genes.
+                    data[
+                        gene %chin% reference_gene_ids,
+                        combined_distance := apply_to_known(
+                            distance_estimate, as.matrix(.SD)[1, ]
+                        ),
+                        .SDcols = reference_gene_ids,
+                        by = list(species, gene)
+                    ]
+
+                    # Combine the distances into one value.
+                    results <- data[,
+                        .(
+                            summarized_distances = apply_to_known(
+                                summarize, combined_distance
+                            )
+                        ),
+                        by = gene
+                    ]
+
+                    # Normalize to scores. Genes without any distance stay NA.
+                    results[
+                        ,
+                        score := 1 - summarized_distances /
+                            max(summarized_distances, na.rm = TRUE)
+                    ]
+
+                    progress(1.0)
+
+                    result(
+                        method = "species_adjacency",
+                        scores = results[, .(gene, score)],
+                        details = list(
+                            data = data,
+                            results = results
+                        )
+                    )
+                }
+            )
+        }
+    )
+}
diff --git a/man/adjacency.Rd b/man/adjacency.Rd
index f68d759..4eac7b9 100644
--- a/man/adjacency.Rd
+++ b/man/adjacency.Rd
@@ -4,13 +4,14 @@
 \alias{adjacency}
 \title{Score genes based on their proximity to the reference genes.}
 \usage{
-adjacency(estimate = densest, combination = min)
+adjacency(distance_estimate = densest, summarize = min)
 }
 \arguments{
-\item{estimate}{A function that will be used to summarize the distance
-values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
+\item{distance_estimate}{A function that will be used to summarize the
+distance values for each gene. See \code{\link[=densest]{densest()}} for the default
+implementation.}
 
-\item{combination}{A function that will be used to combine the different
+\item{summarize}{A function that will be used to combine the different
 distances to the reference genes. By default \code{\link[=min]{min()}} is used. That means
 the distance to the nearest reference gene will be scored.}
 }
@@ -18,5 +19,10 @@ the distance to the nearest reference gene will be scored.}
 An object of class \code{geposan_method}.
 }
 \description{
-Score genes based on their proximity to the reference genes.
+In this case, the distance data that is available for one gene is first
+combined. The resulting value is compared to the reference genes and
+determines the gene's score in relation to other genes.
+}
+\seealso{
+\code{\link[=species_adjacency]{species_adjacency()}}
 }
diff --git a/man/species_adjacency.Rd b/man/species_adjacency.Rd
new file mode 100644
index 0000000..0607559
--- /dev/null
+++ b/man/species_adjacency.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/species_adjacency.R
+\name{species_adjacency}
+\alias{species_adjacency}
+\title{Score genes based on their adjacency to the reference genes within species.}
+\usage{
+species_adjacency(distance_estimate = min, summarize = stats::median)
+}
+\arguments{
+\item{distance_estimate}{Function for combining the distance differences
+within one species.}
+
+\item{summarize}{Function for summarizing the distance values across species.}
+}
+\value{
+An object of class \code{geposan_method}.
+}
+\description{
+For each gene and species, the method will first combine the gene's distances
+to the reference genes within that species. Afterwards, the results are
+summarized across species and determine the gene's score.
+}
+\seealso{
+\code{\link[=adjacency]{adjacency()}}
+}