From 68354bf80844a429f039e31390a9f83af4b4659d Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Sat, 16 Oct 2021 21:46:59 +0200
Subject: [PATCH] Reorganize source files and generalize presets

---
 init.R                                   | 82 ------------------------
 main.R                                   |  6 --
 clusteriness.R => process/clusteriness.R | 13 +---
 correlation.R => process/correlation.R   | 16 +----
 input.R => process/input.R               |  0
 methods.R => process/methods.R           | 11 ++--
 neural.R => process/neural.R             | 15 +----
 process/presets.R                        | 29 +++++++++
 process/process.R                        | 58 +++++++++++++++++
 proximity.R => process/proximity.R       | 15 +----
 util.R => process/util.R                 |  0
 shiny/main.R                             |  7 ++
 server.R => shiny/server.R               | 12 +++-
 ui.R => shiny/ui.R                       |  2 -
 14 files changed, 119 insertions(+), 147 deletions(-)
 delete mode 100644 init.R
 delete mode 100644 main.R
 rename clusteriness.R => process/clusteriness.R (76%)
 rename correlation.R => process/correlation.R (78%)
 rename input.R => process/input.R (100%)
 rename methods.R => process/methods.R (87%)
 rename neural.R => process/neural.R (86%)
 create mode 100644 process/presets.R
 create mode 100644 process/process.R
 rename proximity.R => process/proximity.R (54%)
 rename util.R => process/util.R (100%)
 create mode 100644 shiny/main.R
 rename server.R => shiny/server.R (96%)
 rename ui.R => shiny/ui.R (99%)

diff --git a/init.R b/init.R
deleted file mode 100644
index f630d79..0000000
--- a/init.R
+++ /dev/null
@@ -1,82 +0,0 @@
-source("input.R")
-source("methods.R")
-source("util.R")
-
-# Load input data
-
-species <- run_cached("input_species", retrieve_species)
-genes <- run_cached("input_genes", retrieve_genes)
-
-distances <- run_cached(
-    "input_distances",
-    retrieve_distances,
-    species[, id],
-    genes[, id]
-)
-
-all_species <- species[, id]
-replicative_species <- species[replicative == TRUE, id]
-all_genes <- genes[, id]
-tpe_old_genes <- genes[suggested | verified == TRUE, id]
-
-# Apply all methods for all species
-
-results_all <- merge(
-    genes,
-    distances[, .(n_species = .N), by = "gene"],
-    by.x = "id",
-    by.y = "gene"
-)
-
-setnames(results_all, "id", "gene")
-
-for (method in methods) {
-    method_results <- run_cached(
-        sprintf("%s_all", method$id),
-        method$fn,
-        distances,
-        all_species,
-        all_genes,
-        tpe_old_genes
-    )
-
-    setnames(method_results, "score", method$id)
-
-    results_all <- merge(
-        results_all,
-        method_results,
-    )
-}
-
-# Apply all methods for replicatively aging species
-
-results_replicative <- merge(
-    genes,
-    distances[
-        species %chin% species_ids_replicative,
-        .(n_species = .N),
-        by = gene
-    ],
-    by.x = "id",
-    by.y = "gene"
-)
-
-setnames(results_replicative, "id", "gene")
-
-for (method in methods) {
-    method_results <- run_cached(
-        sprintf("%s_replicative", method$id),
-        method$fn,
-        distances,
-        replicative_species,
-        all_genes,
-        tpe_old_genes
-    )
-
-    setnames(method_results, "score", method$id)
-
-    results_replicative <- merge(
-        results_replicative,
-        method_results,
-    )
-}
\ No newline at end of file
diff --git a/main.R b/main.R
deleted file mode 100644
index 6b70e73..0000000
--- a/main.R
+++ /dev/null
@@ -1,6 +0,0 @@
-library(shiny)
-
-source("server.R")
-source("ui.R")
-
-runApp(shinyApp(ui, server))
\ No newline at end of file
diff --git a/clusteriness.R b/process/clusteriness.R
similarity index 76%
rename from clusteriness.R
rename to process/clusteriness.R
index 6af743e..d860bb8 100644
--- a/clusteriness.R
+++ b/process/clusteriness.R
@@ -38,20 +38,11 @@ clusteriness <- function(data, height = 1000000) {
 }
 
 #' Process genes clustering their distance to telomeres.
-#'
-#' The return value will be a data.table with the following columns:
-#'
-#'  - `gene` Gene ID of the processed gene.
-#'  - `score` Score quantidying the gene's clusters.
-#'
-#' @param distances Gene distance data to use.
-#' @param species_ids IDs of species to include in the analysis.
-#' @param gene_ids Genes to include in the computation.
-process_clusteriness <- function(distances, species_ids, gene_ids, ...) {
+process_clusteriness <- function(distances, gene_ids, preset) {
     results <- data.table(gene = gene_ids)
 
     # Prefilter the input data by species.
-    distances <- distances[species %chin% species_ids]
+    distances <- distances[species %chin% preset$species_ids]
 
     # Add an index for quickly accessing data per gene.
     setkey(distances, gene)
diff --git a/correlation.R b/process/correlation.R
similarity index 78%
rename from correlation.R
rename to process/correlation.R
index cb69db1..5cce8f7 100644
--- a/correlation.R
+++ b/process/correlation.R
@@ -2,23 +2,13 @@ library(data.table)
 
 #' Compute the mean correlation coefficient comparing gene distances with a set
 #' of reference genes.
-#'
-#' The result will be a data.table with the following columns:
-#'
-#'  - `gene` Gene ID of the processed gene.
-#'  - `score` Mean correlation coefficient.
-#'
-#' @param distances Distance data to use.
-#' @param species_ids Species, whose data should be included.
-#' @param gene_ids Genes to process.
-#' @param reference_gene_ids Genes to compare to.
-process_correlation <- function(distances, species_ids, gene_ids,
-                                reference_gene_ids) {
+process_correlation <- function(distances, gene_ids, preset) {
     results <- data.table(gene = gene_ids)
+    reference_gene_ids <- preset$reference_gene_ids
     reference_count <- length(reference_gene_ids)
 
     # Prefilter distances by species.
-    distances <- distances[species %chin% species_ids]
+    distances <- distances[species %chin% preset$species_ids]
 
     # Add an index for quickly accessing data per gene.
     setkey(distances, gene)
diff --git a/input.R b/process/input.R
similarity index 100%
rename from input.R
rename to process/input.R
diff --git a/methods.R b/process/methods.R
similarity index 87%
rename from methods.R
rename to process/methods.R
index a15d255..a060710 100644
--- a/methods.R
+++ b/process/methods.R
@@ -1,7 +1,7 @@
-source("clusteriness.R")
-source("correlation.R")
-source("neural.R")
-source("proximity.R")
+source("process/clusteriness.R")
+source("process/correlation.R")
+source("process/neural.R")
+source("process/proximity.R")
 
 #' Construct a new method.
 #'
@@ -10,9 +10,8 @@ source("proximity.R")
 #' parameters in this order:
 #'
 #'  - `distances` Distance data to use.
-#'  - `species_ids` Species, whose data should be included.
 #'  - `gene_ids` Genes to process.
-#'  - `reference_gene_ids` Genes to compare to.
+#'  - `preset` Preset to apply.
 #'
 #' The function should return a `data.table` with the following columns:
 #'
diff --git a/neural.R b/process/neural.R
similarity index 86%
rename from neural.R
rename to process/neural.R
index f84f7ab..ef137d5 100644
--- a/neural.R
+++ b/process/neural.R
@@ -2,19 +2,10 @@ library(data.table)
 library(neuralnet)
 
 #' Find genes by training a neural network on reference position data.
-#'
-#' The result will be a data.table with the following columns:
-#'
-#'  - `gene` Gene ID of the processed gene.
-#'  - `score` Output score given by the neural network.
-#'
-#' @param distances Distance data to use.
-#' @param species_ids Species, whose data should be included.
-#' @param gene_ids Genes to process. This should include the reference genes.
-#' @param reference_gene_ids Genes to compare to.
 #' @param seed A seed to get reproducible results.
-process_neural <- function(distances, species_ids, gene_ids,
-                           reference_gene_ids, seed = 726839) {
+process_neural <- function(distances, gene_ids, preset, seed = 726839) {
+    species_ids <- preset$species_ids
+    reference_gene_ids <- preset$reference_gene_ids
     set.seed(seed)
     gene_count <- length(gene_ids)
 
diff --git a/process/presets.R b/process/presets.R
new file mode 100644
index 0000000..c0efc00
--- /dev/null
+++ b/process/presets.R
@@ -0,0 +1,29 @@
+library(data.table)
+
+#' Create a new preset.
+#'
+#' A preset is a combination of input values to all processing methods. The
+#' preset's hash will be used to cache the results of applying those methods.
+#'
+#' @param species_ids IDs of species to include.
+#' @param reference_gene_ids Reference genes to use.
+#'
+#' @return A named list containing the arguments.
+preset <- function(species_ids, reference_gene_ids) {
+    list(
+        species_ids = species_ids,
+        reference_gene_ids = reference_gene_ids
+    )
+}
+
+#' A default preset including only replicatively aging species.
+preset_replicative_species <- preset(
+    species[replicative == TRUE, id],
+    genes[suggested | verified == TRUE, id]
+)
+
+#' A default preset including all species.
+preset_all_species <-  preset(
+    species[, id],
+    genes[suggested | verified == TRUE, id]
+)
\ No newline at end of file
diff --git a/process/process.R b/process/process.R
new file mode 100644
index 0000000..0e3a2db
--- /dev/null
+++ b/process/process.R
@@ -0,0 +1,58 @@
+library(data.table)
+
+source("process/util.R")
+
+# Load input data
+
+source("process/input.R")
+
+species <- run_cached("inputs/species", retrieve_species)
+genes <- run_cached("inputs/genes", retrieve_genes)
+
+distances <- run_cached(
+    "inputs/distances",
+    retrieve_distances,
+    species[, id],
+    genes[, id]
+)
+
+genes <- merge(
+    genes,
+    distances[, .(n_species = .N), by = "gene"],
+    by.x = "id",
+    by.y = "gene"
+)
+
+source("process/methods.R")
+source("process/presets.R")
+
+#' Apply all methods with the specified preset without caching.
+process_priv <- function(preset) {
+    results <- data.table(gene = genes[, id])
+
+    for (method in methods) {
+        method_results <- method$fn(distances, genes[, id], preset)
+        setnames(method_results, "score", method$id)
+
+        results <- merge(
+            results,
+            method_results
+        )
+    }
+
+    results
+}
+
+#' Apply all methods with the specified preset.
+#'
+#' The result will be cached by the preset's hash and restored from cache, if
+#' possible. The return value is a `data.table` with one row for each gene
+#' identified by its ID (`gene` column). The additional columns contain the
+#' resulting scores per method and are named after the method IDs.
+process <- function(preset) {
+    run_cached(
+        sprintf("results/%s", rlang::hash(preset)),
+        process_priv,
+        preset
+    )
+}
\ No newline at end of file
diff --git a/proximity.R b/process/proximity.R
similarity index 54%
rename from proximity.R
rename to process/proximity.R
index 7da4363..cef008c 100644
--- a/proximity.R
+++ b/process/proximity.R
@@ -4,20 +4,11 @@ library(data.table)
 #'
 #' A score will be given to each gene such that 0.0 corresponds to the maximal
 #' mean distance across all genes and 1.0 corresponds to a distance of 0.
-#'
-#' The result will be a data.table with the following columns:
-#'
-#'  - `gene` Gene ID of the processed gene.
-#'  - `score` Score for the proximity.
-#'
-#' @param distances Distance data to use.
-#' @param species_ids Species, whose data should be included.
-#' @param gene_ids Genes to process.
-process_proximity <- function(distances, species_ids, gene_ids, ...) {
-    species_count <- length(species_ids)
+process_proximity <- function(distances, gene_ids, preset) {
+    species_count <- length(preset$species_ids)
 
     # Prefilter distances by species.
-    distances <- distances[species %chin% species_ids]
+    distances <- distances[species %chin% preset$species_ids]
 
     # Compute the score as described above.
 
diff --git a/util.R b/process/util.R
similarity index 100%
rename from util.R
rename to process/util.R
diff --git a/shiny/main.R b/shiny/main.R
new file mode 100644
index 0000000..39228ac
--- /dev/null
+++ b/shiny/main.R
@@ -0,0 +1,7 @@
+library(shiny)
+
+source("process/process.R")
+source("shiny/server.R")
+source("shiny/ui.R")
+
+runApp(shinyApp(ui, server))
\ No newline at end of file
diff --git a/server.R b/shiny/server.R
similarity index 96%
rename from server.R
rename to shiny/server.R
index ea2930b..79a5acd 100644
--- a/server.R
+++ b/shiny/server.R
@@ -5,7 +5,6 @@ library(plotly)
 library(rclipboard)
 library(shiny)
 
-source("init.R")
 source("optimize.R")
 source("rank_plot.R")
 source("scatter_plot.R")
@@ -71,11 +70,18 @@ server <- function(input, output, session) {
         # Select the species preset.
 
         results <- if (input$species == "all") {
-            results_all
+            process(preset_all_species)
         } else {
-            results_replicative
+            process(preset_replicative_species)
         }
 
+        results <- merge(
+            results,
+            genes,
+            by.x = "gene",
+            by.y = "id"
+        )
+
         # Compute scoring factors and the weighted score.
 
         total_weight <- 0.0
diff --git a/ui.R b/shiny/ui.R
similarity index 99%
rename from ui.R
rename to shiny/ui.R
index 0401597..17e519f 100644
--- a/ui.R
+++ b/shiny/ui.R
@@ -3,8 +3,6 @@ library(plotly)
 library(rclipboard)
 library(shiny)
 
-source("methods.R")
-
 ui <- fluidPage(
     shinyjs::useShinyjs(),
     rclipboardSetup(),