Implement all methods using positions additionally

2025-10-26 18:57:25 +01:00 · 2021-11-05 19:49:54 +01:00 · 2021-11-05 19:49:54 +01:00 · cfc5e7a6bf
commit cfc5e7a6bf
parent 9cbc127177
5 changed files with 222 additions and 167 deletions
--- a/R/correlation.R
+++ b/R/correlation.R
@ -1,79 +1,90 @@
 # Compute the mean correlation coefficient comparing gene distances with a set
 # of reference genes.
-correlation <- function(preset, progress = NULL) {
+correlation <- function(preset, use_positions = FALSE, progress = NULL) {
    species_ids <- preset$species_ids
    gene_ids <- preset$gene_ids
    reference_gene_ids <- preset$reference_gene_ids

-    cached("correlation", c(species_ids, gene_ids, reference_gene_ids), {
-        # Prefilter distances by species.
-        distances <- geposan::distances[species %chin% species_ids]
+    cached(
+        "correlation",
+        c(species_ids, gene_ids, reference_gene_ids, use_positions),
+        { # nolint
+            # Prefilter distances by species.
+            distances <- geposan::distances[species %chin% species_ids]

-        # Tranform data to get species as rows and genes as columns. We
-        # construct columns per species, because it requires fewer iterations,
-        # and transpose the table afterwards.
+            # Tranform data to get species as rows and genes as columns. We
+            # construct columns per species, because it requires fewer
+            # iterations, and transpose the table afterwards.

-        data <- data.table(gene = gene_ids)
+            data <- data.table(gene = gene_ids)

-        # Make a column containing distance data for each species.
-        for (species_id in species_ids) {
-            species_distances <- distances[
-                species == species_id,
-                .(gene, distance)
-            ]
-
-            data <- merge(data, species_distances, all.x = TRUE)
-            setnames(data, "distance", species_id)
-        }
-
-        # Transpose to the desired format.
-        data <- transpose(data, make.names = "gene")
-
-        if (!is.null(progress)) progress(0.33)
-
-        # Take the reference data.
-        reference_data <- data[, ..reference_gene_ids]
-
-        # Perform the correlation between all possible pairs.
-        results <- stats::cor(
-            data[, ..gene_ids],
-            reference_data,
-            use = "pairwise.complete.obs",
-            method = "spearman"
-        )
-
-        results <- data.table(results, keep.rownames = TRUE)
-        setnames(results, "rn", "gene")
-
-        # Remove correlations between the reference genes themselves.
-        for (reference_gene_id in reference_gene_ids) {
-            column <- quote(reference_gene_id)
-            results[gene == reference_gene_id, eval(column) := NA]
-        }
-
-        if (!is.null(progress)) progress(0.66)
-
-        # Compute the final score as the mean of known correlation scores.
-        # Negative correlations will correctly lessen the score, which will be
-        # clamped to zero as its lower bound. Genes with no possible
-        # correlations at all will be assumed to have a score of 0.0.
-
-        compute_score <- function(scores) {
-            score <- mean(scores, na.rm = TRUE)
-
-            if (is.na(score) | score < 0.0) {
-                score <- 0.0
+            # Make a column containing distance data for each species.
+            for (species_id in species_ids) {
+                species_data <- if (use_positions) {
+                    setnames(distances[
+                        species == species_id,
+                        .(gene, position)
+                    ], "position", "distance")
+                } else {
+                    distances[
+                        species == species_id,
+                        .(gene, distance)
+                    ]
+                }
            }

-            score
+            data <- merge(data, species_data, all.x = TRUE)
+            setnames(data, "distance", species_id)
+
+            # Transpose to the desired format.
+            data <- transpose(data, make.names = "gene")
+
+            if (!is.null(progress)) progress(0.33)
+
+            # Take the reference data.
+            reference_data <- data[, ..reference_gene_ids]
+
+            # Perform the correlation between all possible pairs.
+            results <- stats::cor(
+                data[, ..gene_ids],
+                reference_data,
+                use = "pairwise.complete.obs",
+                method = "spearman"
+            )
+
+            results <- data.table(results, keep.rownames = TRUE)
+            setnames(results, "rn", "gene")
+
+            # Remove correlations between the reference genes themselves.
+            for (reference_gene_id in reference_gene_ids) {
+                column <- quote(reference_gene_id)
+                results[gene == reference_gene_id, eval(column) := NA]
+            }
+
+            if (!is.null(progress)) progress(0.66)
+
+            # Compute the final score as the mean of known correlation scores.
+            # Negative correlations will correctly lessen the score, which will
+            # be clamped to zero as its lower bound. Genes with no possible
+            # correlations at all will be assumed to have a score of 0.0.
+
+            compute_score <- function(scores) {
+                score <- mean(scores, na.rm = TRUE)
+
+                if (is.na(score) | score < 0.0) {
+                    score <- 0.0
+                }
+
+                score
+            }
+
+            results[,
+                score := compute_score(as.matrix(.SD)),
+                .SDcols = reference_gene_ids,
+                by = gene
+            ]
+
+            results[, .(gene, score)]
        }
-
-        results[,
-            score := compute_score(as.matrix(.SD)),
-            .SDcols = reference_gene_ids,
-            by = gene
-        ]
-
-        results[, .(gene, score)]
-    })
+    )
 }