Generalize method definitions

2025-10-26 11:17:24 +01:00 · 2021-10-15 09:26:57 +02:00 · 2021-10-15 09:26:57 +02:00 · 9b0b3c13f5
commit 9b0b3c13f5
parent d3edeefbe2
7 changed files with 137 additions and 169 deletions
--- a/clusteriness.R
+++ b/clusteriness.R
@ -42,12 +42,12 @@ clusteriness <- function(data, height = 1000000) {
 #' The return value will be a data.table with the following columns:
 #'
 #'  - `gene` Gene ID of the processed gene.
-#'  - `clusteriness` Score quantidying the gene's clusters.
+#'  - `score` Score quantidying the gene's clusters.
 #'
 #' @param distances Gene distance data to use.
 #' @param species_ids IDs of species to include in the analysis.
 #' @param gene_ids Genes to include in the computation.
-process_clustering <- function(distances, species_ids, gene_ids) {
+process_clusteriness <- function(distances, species_ids, gene_ids, ...) {
    results <- data.table(gene = gene_ids)

    # Prefilter the input data by species.
@ -61,5 +61,5 @@ process_clustering <- function(distances, species_ids, gene_ids) {
        clusteriness(distances[gene_id, distance])
    }

-    results[, clusteriness := compute(gene), by = 1:nrow(results)]
+    results[, score := compute(gene), by = 1:nrow(results)]
 }
--- a/correlation.R
+++ b/correlation.R
@ -6,7 +6,7 @@ library(data.table)
 #' The result will be a data.table with the following columns:
 #'
 #'  - `gene` Gene ID of the processed gene.
-#'  - `correlation` Mean correlation coefficient.
+#'  - `score` Mean correlation coefficient.
 #'
 #' @param distances Distance data to use.
 #' @param species_ids Species, whose data should be included.
@ -69,5 +69,5 @@ process_correlation <- function(distances, species_ids, gene_ids,
        score <- correlation_sum / reference_count
    }

-    results[, correlation := compute(gene), by = 1:nrow(results)]
+    results[, score := compute(gene), by = 1:nrow(results)]
 }
--- a/init.R
+++ b/init.R
@ -1,7 +1,5 @@
-source("clustering.R")
-source("correlation.R")
 source("input.R")
-source("neural.R")
+source("methods.R")
 source("util.R")

 # Load input data
@ -16,66 +14,12 @@ distances <- run_cached(
    genes[, id]
 )

-# Load processed data
-
 all_species <- species[, id]
 replicative_species <- species[replicative == TRUE, id]
 all_genes <- genes[, id]
 tpe_old_genes <- genes[suggested | verified == TRUE, id]

-clustering_all <- run_cached(
-    "clustering_all",
-    process_clustering,
-    distances,
-    all_species,
-    all_genes
-)
-
-clustering_replicative <- run_cached(
-    "clustering_replicative",
-    process_clustering,
-    distances,
-    replicative_species,
-    all_genes
-)
-
-correlation_all <- run_cached(
-    "correlation_all",
-    process_correlation,
-    distances,
-    all_species,
-    all_genes,
-    tpe_old_genes
-)
-
-correlation_replicative <- run_cached(
-    "correlation_replicative",
-    process_correlation,
-    distances,
-    replicative_species,
-    all_genes,
-    tpe_old_genes
-)
-
-neural_all <- run_cached(
-    "neural_all",
-    process_neural,
-    distances,
-    all_species,
-    all_genes,
-    tpe_old_genes
-)
-
-neural_replicative <- run_cached(
-    "neural_replicative",
-    process_neural,
-    distances,
-    replicative_species,
-    all_genes,
-    tpe_old_genes
-)
-
-# Merge processed data as well as gene information.
+# Apply all methods for all species

 results_all <- merge(
    genes,
@ -84,26 +28,27 @@ results_all <- merge(
    by.y = "gene"
 )

-results_all <- merge(
-    results_all,
-    clustering_all,
-    by.x = "id",
-    by.y = "gene"
+setnames(results_all, "id", "gene")
+
+for (method in methods) {
+    method_results <- run_cached(
+        sprintf("%s_all", method$id),
+        method$fn,
+        distances,
+        all_species,
+        all_genes,
+        tpe_old_genes
    )

+    setnames(method_results, "score", method$id)
+
    results_all <- merge(
        results_all,
-    correlation_all,
-    by.x = "id",
-    by.y = "gene"
+        method_results,
    )
+}

-results_all <- merge(
-    results_all,
-    neural_all,
-    by.x = "id",
-    by.y = "gene"
-)
+# Apply all methods for replicatively aging species

 results_replicative <- merge(
    genes,
@ -116,28 +61,22 @@ results_replicative <- merge(
    by.y = "gene"
 )

-results_replicative <- merge(
-    results_replicative,
-    clustering_replicative,
-    by.x = "id",
-    by.y = "gene"
-)
-
-results_replicative <- merge(
-    results_replicative,
-    correlation_replicative,
-    by.x = "id",
-    by.y = "gene"
-)
-
-results_replicative <- merge(
-    results_replicative,
-    neural_replicative,
-    by.x = "id",
-    by.y = "gene"
-)
-
-# Rename `id` columns to `gene`.
-
-setnames(results_all, "id", "gene")
 setnames(results_replicative, "id", "gene")
+
+for (method in methods) {
+    method_results <- run_cached(
+        sprintf("%s_replicative", method$id),
+        method$fn,
+        distances,
+        replicative_species,
+        all_genes,
+        tpe_old_genes
+    )
+
+    setnames(method_results, "score", method$id)
+
+    results_replicative <- merge(
+        results_replicative,
+        method_results,
+    )
+}
--- a/methods.R
+++ b/methods.R
@ -0,0 +1,56 @@
+source("clusteriness.R")
+source("correlation.R")
+source("neural.R")
+
+#' Construct a new method.
+#'
+#' A method describes a way to perform a computation on gene distance data that
+#' results in a single score per gene. The function should accept the following
+#' parameters in this order:
+#'
+#'  - `distances` Distance data to use.
+#'  - `species_ids` Species, whose data should be included.
+#'  - `gene_ids` Genes to process.
+#'  - `reference_gene_ids` Genes to compare to.
+#'
+#' The function should return a `data.table` with the following columns:
+#'
+#'  - `gene` Gene ID of the processed gene.
+#'  - `score` Score for the gene between 0.0 and 1.0.
+#'
+#' @param id Internal identifier for the method.
+#' @param name Human readable name for the method.
+#' @param description Short human readable description.
+#' @param fn Function to perform the computation.
+#'
+#' @return A named list containing the arguments.
+method <- function(id, name, description, fn) {
+    list(
+        id = id,
+        name = name,
+        description = description,
+        fn = fn
+    )
+}
+
+#' All methods to be included in the analysis.
+methods <- list(
+    method(
+        "clusteriness",
+        "Clustering",
+        "Clustering of genes",
+        process_clusteriness
+    ),
+    method(
+        "correlation",
+        "Correlation",
+        "Correlation with known genes",
+        process_correlation
+    ),
+    method(
+        "neural",
+        "Neural",
+        "Assessment by neural network",
+        process_neural
+    )
+)
--- a/neural.R
+++ b/neural.R
@ -6,7 +6,7 @@ library(neuralnet)
 #' The result will be a data.table with the following columns:
 #'
 #'  - `gene` Gene ID of the processed gene.
-#'  - `neural` Output score given by the neural network.
+#'  - `score` Output score given by the neural network.
 #'
 #' @param distances Distance data to use.
 #' @param species_ids Species, whose data should be included.
@ -105,6 +105,6 @@ process_neural <- function(distances, species_ids, gene_ids,

    # Return the resulting scores given by applying the neural network.

-    data[, neural := compute(nn, data)$net.result]
-    data[, .(gene, neural)]
+    data[, score := compute(nn, data)$net.result]
+    data[, .(gene, score)]
 }
--- a/server.R
+++ b/server.R
@ -47,16 +47,18 @@ server <- function(input, output) {

        # Compute scoring factors and the weighted score.

-        clusteriness_weight <- input$clusteriness / 100
-        correlation_weight <- input$correlation / 100
-        neural_weight <- input$neural / 100
-        total_weight <- clusteriness_weight + correlation_weight + neural_weight
-        clusteriness_factor <- clusteriness_weight / total_weight
-        correlation_factor <- correlation_weight / total_weight
-        neural_factor <- neural_weight / total_weight
+        total_weight <- 0.0
+        results[, score := 0.0]

-        results[, score := clusteriness_factor * clusteriness +
-            correlation_factor * correlation + neural_factor * neural]
+        for (method in methods) {
+            weight <- input[[method$id]]
+            total_weight <- total_weight + weight
+            column <- method$id
+            weighted <- weight * results[, ..column]
+            results[, score := score + weighted]
+        }
+
+        results[, score := score / total_weight]

        # Exclude genes with too few species.
        results <- results[n_species >= input$n_species]
@ -75,33 +77,22 @@ server <- function(input, output) {
        # Apply the cut-off score.
        results <- results[score >= input$cutoff / 100]

-        # Order the results based on their score. The resulting index will be
-        # used as the "rank".
+        # Order the results based on their score.

        setorder(results, -score, na.last = TRUE)
+        results[, rank := .I]
    })

    output$genes <- renderDT({
+        method_ids <- sapply(methods, function(method) method$id)
+        method_names <- sapply(methods, function(method) method$name)
+        columns <- c("rank", "gene", "name", method_ids, "score")
+        column_names <- c("", "Gene", "", method_names, "Score")
+
        dt <- datatable(
-            results()[, .(
-                .I,
-                gene,
-                name,
-                clusteriness,
-                correlation,
-                neural,
-                score
-            )],
+            results()[, ..columns],
            rownames = FALSE,
-            colnames = c(
-                "",
-                "Gene",
-                "",
-                "Clusters",
-                "Correlation",
-                "Neural",
-                "Score"
-            ),
+            colnames = column_names,
            style = "bootstrap",
            options = list(
                rowCallback = js_link,
@ -109,11 +100,7 @@ server <- function(input, output) {
            )
        )

-        formatPercentage(
-            dt,
-            c("clusteriness", "correlation", "neural", "score"),
-            digits = 1
-        )
+        formatPercentage(dt, c(method_ids, "score"), digits = 1)
    })

    output$synposis <- renderText({
--- a/ui.R
+++ b/ui.R
@ -3,6 +3,8 @@ library(plotly)
 library(rclipboard)
 library(shiny)

+source("methods.R")
+
 ui <- fluidPage(
    rclipboardSetup(),
    titlePanel("TPE-OLD candidates"),
@ -22,33 +24,17 @@ ui <- fluidPage(
        ),
        wellPanel(
            h3("Ranking"),
+            lapply(methods, function(method) {
                sliderInput(
-                "clusteriness",
-                "Clustering of genes",
+                    method$id,
+                    method$description,
                    post = "%",
                    min = 0,
                    max = 100,
                    step = 1,
-                value = 58
-            ),
-            sliderInput(
-                "correlation",
-                "Correlation with known genes",
-                post = "%",
-                min = 0,
-                max = 100,
-                step = 1,
-                value = 36
-            ),
-            sliderInput(
-                "neural",
-                "Assessment by neural network",
-                post = "%",
-                min = 0,
-                max = 100,
-                step = 1,
-                value = 6
-            ),
+                    value = 100
+                )
+            }),
            sliderInput(
                "cutoff",
                "Cut-off score",