Reorganize source files and generalize presets

2025-10-26 19:27:24 +01:00 · 2021-10-16 21:46:59 +02:00 · 2021-10-16 21:46:59 +02:00 · 68354bf808
commit 68354bf808
parent 8104e9bd8a
14 changed files with 119 additions and 147 deletions
--- a/init.R
+++ b/init.R
@ -1,82 +0,0 @@
 source("input.R")
 source("methods.R")
 source("util.R")
 # Load input data
 species <- run_cached("input_species", retrieve_species)
 genes <- run_cached("input_genes", retrieve_genes)
 distances <- run_cached(
    "input_distances",
    retrieve_distances,
    species[, id],
    genes[, id]
 )
 all_species <- species[, id]
 replicative_species <- species[replicative == TRUE, id]
 all_genes <- genes[, id]
 tpe_old_genes <- genes[suggested | verified == TRUE, id]
 # Apply all methods for all species
 results_all <- merge(
    genes,
    distances[, .(n_species = .N), by = "gene"],
    by.x = "id",
    by.y = "gene"
 )
 setnames(results_all, "id", "gene")
 for (method in methods) {
    method_results <- run_cached(
        sprintf("%s_all", method$id),
        method$fn,
        distances,
        all_species,
        all_genes,
        tpe_old_genes
    )
    setnames(method_results, "score", method$id)
    results_all <- merge(
        results_all,
        method_results,
    )
 }
 # Apply all methods for replicatively aging species
 results_replicative <- merge(
    genes,
    distances[
        species %chin% species_ids_replicative,
        .(n_species = .N),
        by = gene
    ],
    by.x = "id",
    by.y = "gene"
 )
 setnames(results_replicative, "id", "gene")
 for (method in methods) {
    method_results <- run_cached(
        sprintf("%s_replicative", method$id),
        method$fn,
        distances,
        replicative_species,
        all_genes,
        tpe_old_genes
    )
    setnames(method_results, "score", method$id)
    results_replicative <- merge(
        results_replicative,
        method_results,
    )
 }
--- a/main.R
+++ b/main.R
@ -1,6 +0,0 @@
 library(shiny)
 source("server.R")
 source("ui.R")
 runApp(shinyApp(ui, server))
--- a/process/clusteriness.R
+++ b/process/clusteriness.R
@ -38,20 +38,11 @@ clusteriness <- function(data, height = 1000000) {
 }
 #' Process genes clustering their distance to telomeres.
-#'
+process_clusteriness <- function(distances, gene_ids, preset) {
 #' The return value will be a data.table with the following columns:
 #'
 #'  - `gene` Gene ID of the processed gene.
 #'  - `score` Score quantifying the gene's clusters.
 #'
 #' @param distances Gene distance data to use.
 #' @param species_ids IDs of species to include in the analysis.
 #' @param gene_ids Genes to include in the computation.
 process_clusteriness <- function(distances, species_ids, gene_ids, ...) {
    results <- data.table(gene = gene_ids)
    # Prefilter the input data by species.
-    distances <- distances[species %chin% species_ids]
+    distances <- distances[species %chin% preset$species_ids]
    # Add an index for quickly accessing data per gene.
    setkey(distances, gene)
--- a/process/correlation.R
+++ b/process/correlation.R
@ -2,23 +2,13 @@ library(data.table)
 #' Compute the mean correlation coefficient comparing gene distances with a set
 #' of reference genes.
-#'
+process_correlation <- function(distances, gene_ids, preset) {
 #' The result will be a data.table with the following columns:
 #'
 #'  - `gene` Gene ID of the processed gene.
 #'  - `score` Mean correlation coefficient.
 #'
 #' @param distances Distance data to use.
 #' @param species_ids Species, whose data should be included.
 #' @param gene_ids Genes to process.
 #' @param reference_gene_ids Genes to compare to.
 process_correlation <- function(distances, species_ids, gene_ids,
                                reference_gene_ids) {
    results <- data.table(gene = gene_ids)
    reference_gene_ids <- preset$reference_gene_ids
    reference_count <- length(reference_gene_ids)
    # Prefilter distances by species.
-    distances <- distances[species %chin% species_ids]
+    distances <- distances[species %chin% preset$species_ids]
    # Add an index for quickly accessing data per gene.
    setkey(distances, gene)
--- a/process/input.R
+++ b/process/input.R
--- a/process/methods.R
+++ b/process/methods.R
@ -1,7 +1,7 @@
-source("clusteriness.R")
+source("process/clusteriness.R")
-source("correlation.R")
+source("process/correlation.R")
-source("neural.R")
+source("process/neural.R")
-source("proximity.R")
+source("process/proximity.R")
 #' Construct a new method.
 #'
@ -10,9 +10,8 @@ source("proximity.R")
 #' parameters in this order:
 #'
 #'  - `distances` Distance data to use.
 #'  - `species_ids` Species, whose data should be included.
 #'  - `gene_ids` Genes to process.
-#'  - `reference_gene_ids` Genes to compare to.
+#'  - `preset` Preset to apply.
 #'
 #' The function should return a `data.table` with the following columns:
 #'
--- a/process/neural.R
+++ b/process/neural.R
@ -2,19 +2,10 @@ library(data.table)
 library(neuralnet)
 #' Find genes by training a neural network on reference position data.
 #'
 #' The result will be a data.table with the following columns:
 #'
 #'  - `gene` Gene ID of the processed gene.
 #'  - `score` Output score given by the neural network.
 #'
 #' @param distances Distance data to use.
 #' @param species_ids Species, whose data should be included.
 #' @param gene_ids Genes to process. This should include the reference genes.
 #' @param reference_gene_ids Genes to compare to.
 #' @param seed A seed to get reproducible results.
-process_neural <- function(distances, species_ids, gene_ids,
+process_neural <- function(distances, gene_ids, preset, seed = 726839) {
-                           reference_gene_ids, seed = 726839) {
+    species_ids <- preset$species_ids
    reference_gene_ids <- preset$reference_gene_ids
    set.seed(seed)
    gene_count <- length(gene_ids)
--- a/process/presets.R
+++ b/process/presets.R
@ -0,0 +1,29 @@
 library(data.table)
 #' Build a preset.
 #'
 #' A preset bundles the input values that are shared by all processing
 #' methods. Its hash serves as the cache key for the results obtained by
 #' applying those methods.
 #'
 #' @param species_ids IDs of the species to include.
 #' @param reference_gene_ids IDs of the reference genes to use.
 #'
 #' @return A named list with the elements `species_ids` and
 #'   `reference_gene_ids`, holding the arguments unchanged.
 preset <- function(species_ids, reference_gene_ids) {
    fields <- list(species_ids, reference_gene_ids)
    names(fields) <- c("species_ids", "reference_gene_ids")
    fields
 }
 #' A default preset including only replicatively aging species.
 #'
 #' Both default presets read the global `species` and `genes` tables, so those
 #' have to exist before this file is sourced.
 #'
 #' Note on the gene filter: `==` binds tighter than `|`, so the expression
 #' parses as `suggested | (verified == TRUE)`.
 #' NOTE(review): presumably `suggested` and `verified` are logical columns, in
 #' which case this selects genes that are suggested or verified; confirm.
 preset_replicative_species <- preset(
    species[replicative == TRUE, id],
    genes[suggested | verified == TRUE, id]
 )
 #' A default preset including all species.
 #'
 #' Uses the same reference gene filter as the replicative preset, but passes
 #' every species ID.
 preset_all_species <-  preset(
    species[, id],
    genes[suggested | verified == TRUE, id]
 )
--- a/process/process.R
+++ b/process/process.R
@ -0,0 +1,58 @@
 library(data.table)
 source("process/util.R")
 # Load input data
 #
 # Each input is obtained through `run_cached`, which is called with a cache
 # name, the retrieval function and that function's arguments.
 # NOTE(review): `run_cached` presumably comes from process/util.R and the
 # `retrieve_*` functions from process/input.R; confirm.
 source("process/input.R")
 species <- run_cached("inputs/species", retrieve_species)
 genes <- run_cached("inputs/genes", retrieve_genes)
 # Distances are retrieved for all known species and gene IDs.
 distances <- run_cached(
    "inputs/distances",
    retrieve_distances,
    species[, id],
    genes[, id]
 )
 # Annotate each gene with `n_species`, the number of rows in `distances` for
 # that gene. `merge` defaults to an inner join, so genes without any distance
 # data are dropped from `genes` here.
 genes <- merge(
    genes,
    distances[, .(n_species = .N), by = "gene"],
    by.x = "id",
    by.y = "gene"
 )
 # The presets read the global `species` and `genes` tables when sourced, so
 # these files are loaded only after the input data is available.
 source("process/methods.R")
 source("process/presets.R")
 #' Run every method with the given preset, bypassing the cache.
 #'
 #' Each method's `score` column is renamed to the method's ID and merged into
 #' a table that starts out with one `gene` row per known gene.
 #'
 #' @param preset Preset to hand to each method.
 #'
 #' @return A `data.table` with a `gene` column and one column per method.
 process_priv <- function(preset) {
    gene_ids <- genes[, id]
    combined <- data.table(gene = gene_ids)
    for (method in methods) {
        scores <- method$fn(distances, gene_ids, preset)
        # Name the score column after the method so that columns stay distinct.
        setnames(scores, "score", method$id)
        combined <- merge(combined, scores)
    }
    combined
 }
 #' Run every method with the given preset, using the cache.
 #'
 #' The result is cached under the preset's hash and restored from the cache
 #' if possible. The return value is a `data.table` with one row per gene,
 #' identified by its ID in the `gene` column. The remaining columns hold the
 #' results of the individual methods and are named after the method IDs.
 #'
 #' @param preset Preset to apply.
 process <- function(preset) {
    cache_id <- sprintf("results/%s", rlang::hash(preset))
    run_cached(cache_id, process_priv, preset)
 }
--- a/process/proximity.R
+++ b/process/proximity.R
@ -4,20 +4,11 @@ library(data.table)
 #'
 #' A score will be given to each gene such that 0.0 corresponds to the maximal
 #' mean distance across all genes and 1.0 corresponds to a distance of 0.
-#'
+process_proximity <- function(distances, gene_ids, preset) {
-#' The result will be a data.table with the following columns:
+    species_count <- length(preset$species_ids)
 #'
 #'  - `gene` Gene ID of the processed gene.
 #'  - `score` Score for the proximity.
 #'
 #' @param distances Distance data to use.
 #' @param species_ids Species, whose data should be included.
 #' @param gene_ids Genes to process.
 process_proximity <- function(distances, species_ids, gene_ids, ...) {
    species_count <- length(species_ids)
    # Prefilter distances by species.
-    distances <- distances[species %chin% species_ids]
+    distances <- distances[species %chin% preset$species_ids]
    # Compute the score as described above.
--- a/process/util.R
+++ b/process/util.R
--- a/shiny/main.R
+++ b/shiny/main.R
@ -0,0 +1,7 @@
 library(shiny)
 source("process/process.R")
 source("shiny/server.R")
 source("shiny/ui.R")
 runApp(shinyApp(ui, server))
--- a/shiny/server.R
+++ b/shiny/server.R
@ -5,7 +5,6 @@ library(plotly)
 library(rclipboard)
 library(shiny)
 source("init.R")
 source("optimize.R")
 source("rank_plot.R")
 source("scatter_plot.R")
@ -71,11 +70,18 @@ server <- function(input, output, session) {
        # Select the species preset.
        results <- if (input$species == "all") {
-            results_all
+            process(preset_all_species)
        } else {
-            results_replicative
+            process(preset_replicative_species)
        }
        results <- merge(
            results,
            genes,
            by.x = "gene",
            by.y = "id"
        )
        # Compute scoring factors and the weighted score.
        total_weight <- 0.0
--- a/shiny/ui.R
+++ b/shiny/ui.R
@ -3,8 +3,6 @@ library(plotly)
 library(rclipboard)
 library(shiny)
 source("methods.R")
 ui <- fluidPage(
    shinyjs::useShinyjs(),
    rclipboardSetup(),