From 7b9a42215e1c32a7ee0acd755c5734412a8df731 Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Fri, 15 Oct 2021 09:27:35 +0200
Subject: [PATCH] Add new method proximity

---
 methods.R   |  7 +++++++
 proximity.R | 29 +++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 proximity.R

diff --git a/methods.R b/methods.R
index 8d91478..a15d255 100644
--- a/methods.R
+++ b/methods.R
@@ -1,6 +1,7 @@
 source("clusteriness.R")
 source("correlation.R")
 source("neural.R")
+source("proximity.R")
 
 #' Construct a new method.
 #'
@@ -47,6 +48,12 @@ methods <- list(
         "Correlation with known genes",
         process_correlation
     ),
+    method(
+        "proximity",
+        "Proximity",
+        "Proximity to telomeres",
+        process_proximity
+    ),
     method(
         "neural",
         "Neural",
diff --git a/proximity.R b/proximity.R
new file mode 100644
index 0000000..7da4363
--- /dev/null
+++ b/proximity.R
@@ -0,0 +1,29 @@
+library(data.table)
+
+#' Score the mean distance of genes to the telomeres across species.
+#'
+#' A score will be given to each gene such that 0.0 corresponds to the maximal
+#' mean distance across all genes and 1.0 corresponds to a distance of 0. If
+#' that maximum is 0 itself, every gene receives a score of 1.0.
+#'
+#' @param distances data.table with columns `species`, `gene` and `distance`.
+#' @param species_ids Species, whose data should be included.
+#' @param gene_ids Genes to process. NOTE(review): currently unused; confirm.
+#' @param ... Further arguments; not used by this method.
+#' @return A data.table with the columns `gene` (gene ID) and `score`.
+process_proximity <- function(distances, species_ids, gene_ids, ...) {
+    # Prefilter distances by species.
+    distances <- distances[species %chin% species_ids]
+
+    # Average each gene's distance over the selected species.
+    distances <- distances[, .(mean_distance = mean(distance)), by = "gene"]
+
+    # Guard the normalization: no rows or a maximum of exactly 0 would
+    # otherwise yield -Inf (with a warning) or NaN scores.
+    max_distance <- 0
+    if (nrow(distances) > 0) max_distance <- distances[, max(mean_distance)]
+    if (!is.na(max_distance) && max_distance == 0) max_distance <- 1
+    distances[, score := 1 - mean_distance / max_distance]
+
+    distances[, .(gene, score)]
+}