From dc0265a13f0d7fef9d09bd079e81e726b092df6c Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Tue, 19 Oct 2021 14:15:28 +0200
Subject: [PATCH] Separate out geposan package

---
 data.R                     | 115 +++++++++++++++++
 main.R                     |   9 ++
 optimize.R                 |  34 -----
 process/clusteriness.R     |  56 --------
 process/correlation.R      |  63 ---------
 process/input.R            | 254 -------------------------------------
 process/methods.R          |  62 ---------
 process/neural.R           | 101 ---------------
 process/presets.R          |  29 -----
 process/process.R          |  58 ---------
 process/proximity.R        |  20 ---
 rank_plot.R                |   2 +-
 shiny/server.R => server.R |  65 ++++------
 shiny/main.R               |   7 -
 shiny/ui.R => ui.R         |   8 +-
 process/util.R => utils.R  |   2 +-
 16 files changed, 154 insertions(+), 731 deletions(-)
 create mode 100644 data.R
 create mode 100644 main.R
 delete mode 100644 optimize.R
 delete mode 100644 process/clusteriness.R
 delete mode 100644 process/correlation.R
 delete mode 100644 process/input.R
 delete mode 100644 process/methods.R
 delete mode 100644 process/neural.R
 delete mode 100644 process/presets.R
 delete mode 100644 process/process.R
 delete mode 100644 process/proximity.R
 rename shiny/server.R => server.R (81%)
 delete mode 100644 shiny/main.R
 rename shiny/ui.R => ui.R (96%)
 rename process/util.R => utils.R (99%)

diff --git a/data.R b/data.R
new file mode 100644
index 0000000..4b47863
--- /dev/null
+++ b/data.R
@@ -0,0 +1,115 @@
+library(data.table)
+
+#' Species IDs of known replicatively aging species.
+species_ids_replicative <- c(
+    "bihybrid",
+    "btaurus",
+    "bthybrid",
+    "cfamiliaris",
+    "chircus",
+    "cjacchus",
+    "clfamiliaris",
+    "csabaeus",
+    "ecaballus",
+    "fcatus",
+    "ggorilla",
+    "hsapiens",
+    "lafricana",
+    "mfascicularis",
+    "mmulatta",
+    "mmurinus",
+    "mnemestrina",
+    "nleucogenys",
+    "oaries",
+    "pabelii",
+    "panubis",
+    "ppaniscus",
+    "ptroglodytes",
+    "sscrofa",
+    "tgelada"
+)
+
+#' Species from [geposan] and their aging status.
+species <- geposan::species[, .(
+    id,
+    name,
+    replicative = id %chin% species_ids_replicative
+)]
+
+#' Gene names of genes for verified TPE-OLD genes.
+genes_verified_tpe_old <- c(
+    "C1S",
+    "DSP",
+    "ISG15",
+    "SORBS2",
+    "TERT"
+)
+
+#' Gene names of genes with a suggested TPE-OLD.
+genes_suggested_tpe_old <- c(
+    "AKAP3",
+    "ANO2",
+    "CCND2",
+    "CD163L1",
+    "CD9",
+    "FOXM1",
+    "GALNT8",
+    "NDUFA9",
+    "TEAD4",
+    "TIGAR",
+    "TSPAN9"
+)
+
+#' Genes from [geposan] and their TPE-OLD status.
+genes <- geposan::genes[, .(
+    id,
+    name,
+    chromosome,
+    n_species,
+    suggested = name %chin% genes_suggested_tpe_old,
+    verified = name %chin% genes_verified_tpe_old
+)]
+
+#' All available methods from [geposan] and additional information on them.
+methods <- list(
+    list(
+        id = "clusteriness",
+        name = "Clustering",
+        description = "Clustering of genes"
+    ),
+    list(
+        id = "correlation",
+        name = "Correlation",
+        description = "Correlation with known genes"
+    ),
+    list(
+        id = "proximity",
+        name = "Proximity",
+        description = "Proximity to telomeres"
+    ),
+    list(
+        id = "neural",
+        name = "Neural",
+        description = "Assessment by neural network"
+    )
+)
+
+#' Gene IDs of known or suggested TPE-OLD genes.
+genes_tpe_old <- genes[suggested | verified == TRUE, id]
+
+#' Preset for [geposan] including all species and TPE-OLD genes for reference.
+preset_all_species <- geposan::preset(
+    methods = c("clusteriness", "correlation", "proximity", "neural"),
+    species = species$id,
+    genes = genes$id,
+    reference_genes = genes_tpe_old
+)
+
+#' Preset for [geposan] including only replicatively aging species as well as
+#' TPE-OLD genes for reference.
+preset_replicative_species <- geposan::preset(
+    methods = c("clusteriness", "correlation", "proximity", "neural"),
+    species = species_ids_replicative,
+    genes = genes$id,
+    reference_genes = genes_tpe_old
+)
diff --git a/main.R b/main.R
new file mode 100644
index 0000000..9e01819
--- /dev/null
+++ b/main.R
@@ -0,0 +1,9 @@
+library(shiny)
+
+# Initialize data first.
+source("data.R")
+
+source("server.R")
+source("ui.R")
+
+runApp(shinyApp(ui, server))
diff --git a/optimize.R b/optimize.R
deleted file mode 100644
index f20fec1..0000000
--- a/optimize.R
+++ /dev/null
@@ -1,34 +0,0 @@
-#' Find the best weights to rank the data.
-#'
-#' This function ranks the provided data table based on a weighted score
-#' computed from the specified `columns`. It tries to find the optimal weights
-#' that result in a ranking, where the mean rank of the given reference genes
-#' is as high as possible.
-#'
-#' @param data Input data including the columns.
-#' @param colums Columns containing the separate scores between 0.0 and 1.0.
-#' @param reference_gene_ids IDs of the reference genes within the input data.
-#'
-#' @returns Vector of optimal column weights adding up to 1.0.
-optimize_weights <- function(data, columns, reference_gene_ids) {
-    #' Compute the mean rank of the reference genes when applying the weights.
-    mean_rank <- function(weights) {
-        data <- copy(data)
-        data[, score := 0.0]
-
-        for (i in seq_along(columns)) {
-            column <- columns[i]
-            weighted <- weights[i] * data[, ..column]
-            data[, score := score + weighted]
-        }
-
-        setorder(data, -score)
-        data[, rank := .I]
-
-        data[gene %chin% reference_gene_ids, mean(rank)]
-    }
-
-    weights <- optim(rep(1.0, length(columns)), mean_rank)$par
-    total_weight <- sum(weights)
-    weights / total_weight
-}
\ No newline at end of file
diff --git a/process/clusteriness.R b/process/clusteriness.R
deleted file mode 100644
index d860bb8..0000000
--- a/process/clusteriness.R
+++ /dev/null
@@ -1,56 +0,0 @@
-library(data.table)
-
-#' Perform a cluster analysis.
-#'
-#' This function will cluster the data using `hclust` and `cutree` (with the
-#' specified height). Every cluster with at least two members qualifies for
-#' further analysis. Clusters are then ranked based on their size in relation
-#' to the number of values. The return value is a final score between zero and
-#' one. Lower ranking clusters contribute less to this score.
-clusteriness <- function(data, height = 1000000) {
-    n <- length(data)
-
-    # Return a score of 0.0 if there is just one or no value at all.
-    if (n < 2) {
-        return(0.0)
-    }
-
-    # Cluster the data and compute the cluster sizes.
-
-    tree <- hclust(dist(data))
-    clusters <- cutree(tree, h = height)
-    cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
-
-    # Compute the "clusteriness" score.
-
-    score <- 0.0
-
-    for (i in seq_along(cluster_sizes)) {
-        cluster_size <- cluster_sizes[i]
-
-        if (cluster_size >= 2) {
-            cluster_score <- cluster_size / n
-            score <- score + cluster_score / i
-        }
-    }
-
-    score
-}
-
-#' Process genes clustering their distance to telomeres.
-process_clusteriness <- function(distances, gene_ids, preset) {
-    results <- data.table(gene = gene_ids)
-
-    # Prefilter the input data by species.
-    distances <- distances[species %chin% preset$species_ids]
-
-    # Add an index for quickly accessing data per gene.
-    setkey(distances, gene)
-
-    #' Perform the cluster analysis for one gene.
-    compute <- function(gene_id) {
-        clusteriness(distances[gene_id, distance])
-    }
-
-    results[, score := compute(gene), by = 1:nrow(results)]
-}
\ No newline at end of file
diff --git a/process/correlation.R b/process/correlation.R
deleted file mode 100644
index 5cce8f7..0000000
--- a/process/correlation.R
+++ /dev/null
@@ -1,63 +0,0 @@
-library(data.table)
-
-#' Compute the mean correlation coefficient comparing gene distances with a set
-#' of reference genes.
-process_correlation <- function(distances, gene_ids, preset) {
-    results <- data.table(gene = gene_ids)
-    reference_gene_ids <- preset$reference_gene_ids
-    reference_count <- length(reference_gene_ids)
-
-    # Prefilter distances by species.
-    distances <- distances[species %chin% preset$species_ids]
-
-    # Add an index for quickly accessing data per gene.
-    setkey(distances, gene)
-
-    # Prepare the reference genes' data.
-    reference_distances <- distances[gene %chin% reference_gene_ids]
-
-    #' Perform the correlation for one gene.
-    compute <- function(gene_id) {
-        gene_distances <- distances[gene_id]
-        gene_species_count <- nrow(gene_distances)
-
-        # Return a score of 0.0 if there is just one or no value at all.
-        if (gene_species_count <= 1) {
-            return(0.0)
-        }
-
-        #' Buffer for the sum of correlation coefficients.
-        correlation_sum <- 0
-
-        # Correlate with all reference genes but not with the gene itself.
-        for (reference_gene_id in
-            reference_gene_ids[reference_gene_ids != gene_id]) {
-            data <- merge(
-                gene_distances,
-                reference_distances[reference_gene_id],
-                by = "species"
-            )
-
-            # Skip this reference gene, if there are not enough value pairs.
-            # This will lessen the final score, because it effectively
-            # represents a correlation coefficient of 0.0.
-            if (nrow(data) <= 1) {
-                next
-            }
-
-            # Order data by the reference gene's distance to get a monotonic
-            # relation.
-            setorder(data, distance.y)
-
-            correlation_sum <- correlation_sum + abs(cor(
-                data[, distance.x], data[, distance.y],
-                method = "spearman"
-            ))
-        }
-
-        # Compute the score as the mean correlation coefficient.
-        score <- correlation_sum / reference_count
-    }
-
-    results[, score := compute(gene), by = 1:nrow(results)]
-}
\ No newline at end of file
diff --git a/process/input.R b/process/input.R
deleted file mode 100644
index bb32260..0000000
--- a/process/input.R
+++ /dev/null
@@ -1,254 +0,0 @@
-library(biomaRt)
-library(data.table)
-library(progress)
-library(rlog)
-library(stringr)
-
-#' Species IDs of known replicatively aging species.
-species_ids_replicative <- c(
-    "bihybrid",
-    "btaurus",
-    "bthybrid",
-    "cfamiliaris",
-    "chircus",
-    "cjacchus",
-    "clfamiliaris",
-    "csabaeus",
-    "ecaballus",
-    "fcatus",
-    "ggorilla",
-    "hsapiens",
-    "lafricana",
-    "mfascicularis",
-    "mmulatta",
-    "mmurinus",
-    "mnemestrina",
-    "nleucogenys",
-    "oaries",
-    "pabelii",
-    "panubis",
-    "ppaniscus",
-    "ptroglodytes",
-    "sscrofa",
-    "tgelada"
-)
-
-#' Gene names of genes for verified TPE-OLD genes.
-genes_verified_tpe_old <- c(
-    "C1S",
-    "DSP",
-    "ISG15",
-    "SORBS2",
-    "TERT"
-)
-
-#' Gene names of genes with a suggested TPE-OLD.
-genes_suggested_tpe_old <- c(
-    "AKAP3",
-    "ANO2",
-    "CCND2",
-    "CD163L1",
-    "CD9",
-    "FOXM1",
-    "GALNT8",
-    "NDUFA9",
-    "TEAD4",
-    "TIGAR",
-    "TSPAN9"
-)
-
-#' Shared accessor for the Ensembl API.
-ensembl <- NULL
-
-#' Get the ensembl accessor and initialize it if necessary.
-get_ensembl <- function() {
-    if (is.null(ensembl)) {
-        ensembl <<- useEnsembl("ensembl", version = 104)
-    }
-
-    ensembl
-}
-
-#' Get all chromosome names for a Ensembl dataset.
-#'
-#' Valid chromosome names include decimal numbers as well as 'X' and 'Y'.
-get_chromosome_names <- function(dataset) {
-    chromosome_names <- listFilterOptions(dataset, "chromosome_name")
-    chromosome_names[str_which(chromosome_names, "^[0-9]+|[XY]$")]
-}
-
-#' Retrieve information on species.
-#'
-#' The result will be a `data.table` with the following columns:
-#'
-#'  - `id` Species ID as presented by Ensembl.
-#'  - `name` Human readable species name.
-#'  - `replicative` Whether the species is likely to be aging replicatively.
-retrieve_species <- function() {
-    # Ensembl datasets correspond to distinct species.
-    ensembl_datasets <- data.table(listDatasets(get_ensembl()))
-
-    # Filter out species ID and name from the result.
-    species <- ensembl_datasets[, .(
-        id = str_match(dataset, "(.*)_gene_ensembl")[, 2],
-        name = str_match(description, "(.*) genes \\(.*\\)")[, 2]
-    )]
-
-    species[, replicative := id %chin% species_ids_replicative]
-}
-
-#' Retrieve information on human genes.
-#'
-#' This will only include genes on assembled chromosomes. Chromosomes are
-#' filtered based on their name being either a decimal number, 'X' or 'Y'.
-#'
-#' The result will be a `data.table` with the following columns:
-#'
-#'  - `id` Ensembl gene ID.
-#'  - `ǹame` HGNC name of the gene.
-#'  - `chromosome` Human chromosome on which the gene is located.
-retrieve_genes <- function() {
-    dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
-
-    genes <- data.table(getBM(
-        attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
-        filters = "chromosome_name",
-        values = get_chromosome_names(dataset),
-        mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
-    ))
-
-    genes[, .(
-        id = ensembl_gene_id,
-        name = hgnc_symbol,
-        chromosome = chromosome_name,
-        verified = hgnc_symbol %chin% genes_verified_tpe_old,
-        suggested = hgnc_symbol %chin% genes_suggested_tpe_old
-    )]
-}
-
-#' Retrieve gene distance data.
-#'
-#' The data will include all available values for the given species and genes
-#' that are located on assembled chromosomes.
-#'
-#' The result will be a `data.table` with the following columns:
-#'
-#'  - `species` Species ID.
-#'  - `gene` Ensembl gene ID.
-#'  - `distance` Distance to nearest telomere in base pairs.
-retrieve_distances <- function(species_ids, gene_ids) {
-    ensembl <- get_ensembl()
-
-    # Exclude the human from the species, in case it is present there.
-    species_ids <- species_ids[species_ids != "hsapiens"]
-
-    species_count <- length(species_ids)
-    gene_count <- length(gene_ids)
-
-    log_info(sprintf(
-        "Retrieving distance data for %i genes from %i species",
-        gene_count,
-        species_count
-    ))
-
-    progress <- progress_bar$new(
-        total = gene_count,
-        format = "Retrieving distance data [:bar] :percent (ETA :eta)"
-    )
-
-    # Special case the human species and retrieve all available distance
-    # information.
-
-    dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
-
-    human_distances <- data.table(getBM(
-        attributes = c(
-            "ensembl_gene_id",
-            "chromosome_name",
-            "start_position",
-            "end_position"
-        ),
-        filters = "chromosome_name",
-        values = get_chromosome_names(dataset),
-        mart = dataset
-    ))
-
-    # Compute the nearest distance to telomeres.
-
-    human_distances[,
-        chromosome_length := max(end_position),
-        by = chromosome_name
-    ]
-
-    distances <- human_distances[, .(
-        species = "hsapiens",
-        gene = ensembl_gene_id,
-        distance = pmin(
-            start_position,
-            chromosome_length - end_position
-        )
-    )]
-
-    for (i in 1:species_count) {
-        species_id <- species_ids[i]
-
-        progress$tick()
-
-        dataset <- useDataset(
-            sprintf("%s_gene_ensembl", species_id),
-            mart = ensembl
-        )
-
-        # Besides the attributes that are always present, we need to check for
-        # human orthologs. Some species don't have that information and will be
-        # skipped.
-        if (!"hsapiens_homolog_ensembl_gene" %chin%
-            listAttributes(dataset, what = "name")) {
-            next
-        }
-
-        chromosome_names <- get_chromosome_names(dataset)
-
-        # Skip the species, if there are no assembled chromosomes.
-        if (length(chromosome_names) <= 0) {
-            next
-        }
-
-        # Retrieve information on all genes of the current species, that have
-        # human orthologs. This is called "homolog" in the Ensembl schema.
-        ensembl_distances <- data.table(getBM(
-            attributes = c(
-                "hsapiens_homolog_ensembl_gene",
-                "chromosome_name",
-                "start_position",
-                "end_position"
-            ),
-            filters = c("with_hsapiens_homolog", "chromosome_name"),
-            values = list(TRUE, chromosome_names),
-            mart = dataset
-        ))
-
-        # Precompute the genes' distance to the nearest telomere.
-
-        ensembl_distances[,
-            chromosome_length := max(end_position),
-            by = chromosome_name
-        ]
-
-        species_distances <- ensembl_distances[, .(
-            species = species_id,
-            gene = hsapiens_homolog_ensembl_gene,
-            distance = pmin(
-                start_position,
-                chromosome_length - end_position
-            )
-        )]
-
-        distances <- rbindlist(list(distances, species_distances))
-    }
-
-    # Arbitrarily exclude duplicated genes.
-    # TODO: Consider a refined approach or work out how to include all
-    # duplicates.
-    unique(distances, by = c("species", "gene"))
-}
\ No newline at end of file
diff --git a/process/methods.R b/process/methods.R
deleted file mode 100644
index a060710..0000000
--- a/process/methods.R
+++ /dev/null
@@ -1,62 +0,0 @@
-source("process/clusteriness.R")
-source("process/correlation.R")
-source("process/neural.R")
-source("process/proximity.R")
-
-#' Construct a new method.
-#'
-#' A method describes a way to perform a computation on gene distance data that
-#' results in a single score per gene. The function should accept the following
-#' parameters in this order:
-#'
-#'  - `distances` Distance data to use.
-#'  - `gene_ids` Genes to process.
-#'  - `preset` Preset to apply.
-#'
-#' The function should return a `data.table` with the following columns:
-#'
-#'  - `gene` Gene ID of the processed gene.
-#'  - `score` Score for the gene between 0.0 and 1.0.
-#'
-#' @param id Internal identifier for the method.
-#' @param name Human readable name for the method.
-#' @param description Short human readable description.
-#' @param fn Function to perform the computation.
-#'
-#' @return A named list containing the arguments.
-method <- function(id, name, description, fn) {
-    list(
-        id = id,
-        name = name,
-        description = description,
-        fn = fn
-    )
-}
-
-#' All methods to be included in the analysis.
-methods <- list(
-    method(
-        "clusteriness",
-        "Clustering",
-        "Clustering of genes",
-        process_clusteriness
-    ),
-    method(
-        "correlation",
-        "Correlation",
-        "Correlation with known genes",
-        process_correlation
-    ),
-    method(
-        "proximity",
-        "Proximity",
-        "Proximity to telomeres",
-        process_proximity
-    ),
-    method(
-        "neural",
-        "Neural",
-        "Assessment by neural network",
-        process_neural
-    )
-)
\ No newline at end of file
diff --git a/process/neural.R b/process/neural.R
deleted file mode 100644
index ef137d5..0000000
--- a/process/neural.R
+++ /dev/null
@@ -1,101 +0,0 @@
-library(data.table)
-library(neuralnet)
-
-#' Find genes by training a neural network on reference position data.
-#' @param seed A seed to get reproducible results.
-process_neural <- function(distances, gene_ids, preset, seed = 726839) {
-    species_ids <- preset$species_ids
-    reference_gene_ids <- preset$reference_gene_ids
-    set.seed(seed)
-    gene_count <- length(gene_ids)
-
-    # Prefilter distances by species.
-    distances <- distances[species %chin% species_ids]
-
-    #' Input data for the network. This contains the gene ID as an identifier
-    #' as well as the per-species gene distances as input variables.
-    data <- data.table(gene = gene_ids)
-
-    #' Buffer to keep track of species included in the computation. Species
-    #' from `species_ids` may be excluded if they don't have enough data.
-    species_ids_included <- NULL
-
-    for (species_id in species_ids) {
-        # Make a column specific to this species.
-
-        species_distances <- distances[species == species_id, .(gene, distance)]
-        setnames(species_distances, "distance", species_id)
-
-        # Only include species with at least 25% known values.
-
-        species_distances <- na.omit(species_distances)
-
-        if (nrow(species_distances) >= 0.25 * gene_count) {
-            species_ids_included <- append(species_ids_included, species_id)
-            data <- merge(data, species_distances, all = TRUE)
-        }
-    }
-
-    # Replace missing data with mean values. The neural network can't handle
-    # NAs in a meaningful way. Choosing extreme values here would result in
-    # heavily biased results. Therefore, the mean value is chosen as a
-    # compromise. However, this will of course lessen the significance of the
-    # results.
-    for (species_id in species_ids_included) {
-        mean_value <- data[, mean(get(species_id), na.rm = TRUE)]
-
-        data[
-            is.na(get(species_id)),
-            eval(quote(species_id)) := round(mean_value)
-        ]
-    }
-
-    # Extract the reference genes.
-
-    reference_data <- data[gene %chin% reference_gene_ids]
-    reference_data[, neural := 1.0]
-
-    # Take out random samples from the remaining genes. This is another
-    # compromise with a negative impact on significance. Because there is no
-    # information on genes with are explicitely *not* TPE-OLD genes, we have to
-    # assume that a random sample of genes has a low probability of including
-    # TPE-OLD genes.
-
-    without_reference_data <- data[!gene %chin% reference_gene_ids]
-
-    reference_samples <- without_reference_data[
-        sample(
-            nrow(without_reference_data),
-            nrow(reference_data)
-        )
-    ]
-
-    reference_samples[, neural := 0.0]
-
-    # Merge training data. The training data includes all reference genes as
-    # well as an equal number of random sample genes.
-    training_data <- rbindlist(list(reference_data, reference_samples))
-
-    # Construct and train the neural network.
-
-    nn_formula <- as.formula(sprintf(
-        "neural~%s",
-        paste(species_ids_included, collapse = "+")
-    ))
-
-    layer1 <- length(species_ids_included) * 0.66
-    layer2 <- layer1 * 0.66
-    layer3 <- layer2 * 0.66
-
-    nn <- neuralnet(
-        nn_formula,
-        training_data,
-        hidden = c(layer1, layer2, layer3),
-        linear.output = FALSE
-    )
-
-    # Return the resulting scores given by applying the neural network.
-
-    data[, score := compute(nn, data)$net.result]
-    data[, .(gene, score)]
-}
\ No newline at end of file
diff --git a/process/presets.R b/process/presets.R
deleted file mode 100644
index c0efc00..0000000
--- a/process/presets.R
+++ /dev/null
@@ -1,29 +0,0 @@
-library(data.table)
-
-#' Create a new preset.
-#'
-#' A preset is a combination of input values to all processing methods. The
-#' preset's hash will be used to cache the results of applying those.
-#'
-#' @param species_ids IDs of species to include.
-#' @param reference_gene_ids Reference genes to use.
-#'
-#' @return A named list containing the arguments.
-preset <- function(species_ids, reference_gene_ids) {
-    list(
-        species_ids = species_ids,
-        reference_gene_ids = reference_gene_ids
-    )
-}
-
-#' A default preset including only replicatively aging species.
-preset_replicative_species <- preset(
-    species[replicative == TRUE, id],
-    genes[suggested | verified == TRUE, id]
-)
-
-#' A default preset including all species.
-preset_all_species <-  preset(
-    species[, id],
-    genes[suggested | verified == TRUE, id]
-)
\ No newline at end of file
diff --git a/process/process.R b/process/process.R
deleted file mode 100644
index 0e3a2db..0000000
--- a/process/process.R
+++ /dev/null
@@ -1,58 +0,0 @@
-library(data.table)
-
-source("process/util.R")
-
-# Load input data
-
-source("process/input.R")
-
-species <- run_cached("inputs/species", retrieve_species)
-genes <- run_cached("inputs/genes", retrieve_genes)
-
-distances <- run_cached(
-    "inputs/distances",
-    retrieve_distances,
-    species[, id],
-    genes[, id]
-)
-
-genes <- merge(
-    genes,
-    distances[, .(n_species = .N), by = "gene"],
-    by.x = "id",
-    by.y = "gene"
-)
-
-source("process/methods.R")
-source("process/presets.R")
-
-#' Apply all methods with the specified preset without caching.
-process_priv <- function(preset) {
-    results <- data.table(gene = genes[, id])
-
-    for (method in methods) {
-        method_results <- method$fn(distances, genes[, id], preset)
-        setnames(method_results, "score", method$id)
-
-        results <- merge(
-            results,
-            method_results
-        )
-    }
-
-    results
-}
-
-#' Apply all methods with the specified preset.
-#'
-#' The result will be cached by the preset's hash and restored from cache, if
-#' possible. The return value is a `data.table` with one row for each gene
-#' identified by it's ID (`gene` column). The additional columns contain the
-#' resulting per method and are named after the method IDs.
-process <- function(preset) {
-    run_cached(
-        sprintf("results/%s", rlang::hash(preset)),
-        process_priv,
-        preset
-    )
-}
\ No newline at end of file
diff --git a/process/proximity.R b/process/proximity.R
deleted file mode 100644
index cef008c..0000000
--- a/process/proximity.R
+++ /dev/null
@@ -1,20 +0,0 @@
-library(data.table)
-
-#' Score the mean distance of genes to the telomeres across species.
-#'
-#' A score will be given to each gene such that 0.0 corresponds to the maximal
-#' mean distance across all genes and 1.0 corresponds to a distance of 0.
-process_proximity <- function(distances, gene_ids, preset) {
-    species_count <- length(preset$species_ids)
-
-    # Prefilter distances by species.
-    distances <- distances[species %chin% preset$species_ids]
-
-    # Compute the score as described above.
-
-    distances <- distances[, .(mean_distance = mean(distance)), by = "gene"]
-    max_distance <- distances[, max(mean_distance)]
-    distances[, score := 1 - mean_distance / max_distance]
-
-    distances[, .(gene, score)]
-}
diff --git a/rank_plot.R b/rank_plot.R
index 3a15c7a..22a29e0 100644
--- a/rank_plot.R
+++ b/rank_plot.R
@@ -53,4 +53,4 @@ rank_plot <- function(results, reference_gene_ids, cutoff) {
     }
 
     plot
-}
\ No newline at end of file
+}
diff --git a/shiny/server.R b/server.R
similarity index 81%
rename from shiny/server.R
rename to server.R
index 79a5acd..d1d0fbf 100644
--- a/shiny/server.R
+++ b/server.R
@@ -1,13 +1,14 @@
 library(data.table)
 library(DT)
+library(geposan)
 library(gprofiler2)
 library(plotly)
 library(rclipboard)
 library(shiny)
 
-source("optimize.R")
 source("rank_plot.R")
 source("scatter_plot.R")
+source("utils.R")
 
 #' Java script function to replace gene IDs with Ensembl gene links.
 js_link <- JS("function(row, data) {
@@ -45,16 +46,19 @@ server <- function(input, output, session) {
             }
         }
 
-        reference_gene_ids <- genes[suggested | verified == TRUE, id]
-        weights <- optimize_weights(results, method_ids, reference_gene_ids)
+        weights <- geposan::optimize_weights(
+            results,
+            method_ids,
+            genes_tpe_old
+        )
 
-        mapply(function(method_id, weight) {
+        for (method_id in method_ids) {
             updateSliderInput(
                 session,
                 sprintf("%s_weight", method_id),
-                value = weight * 100
+                value = weights[[method_id]] * 100
             )
-        }, method_ids, weights)
+        }
     })
 
     # Observe each method's enable button.
@@ -67,14 +71,17 @@ server <- function(input, output, session) {
     #' Rank the results based on the specified weights. Filter out genes with
     #' too few species but don't apply the cut-off score.
     results <- reactive({
-        # Select the species preset.
-
-        results <- if (input$species == "all") {
-            process(preset_all_species)
+        # Select the preset.
+        preset <- if (input$species == "all") {
+            preset_all_species
         } else {
-            process(preset_replicative_species)
+            preset_replicative_species
         }
 
+        # Perform the analysis cached based on the preset's hash.
+        results <- run_cached(rlang::hash(preset), geposan::analyze, preset)
+
+        # Add all gene information to the results.
         results <- merge(
             results,
             genes,
@@ -82,41 +89,21 @@ server <- function(input, output, session) {
             by.y = "id"
         )
 
-        # Compute scoring factors and the weighted score.
+        # Exclude genes with too few species.
+        results <- results[n_species >= input$n_species]
 
-        total_weight <- 0.0
-        results[, score := 0.0]
+        # Rank the results based on the weights.
+
+        weights <- NULL
 
         for (method in methods) {
             if (input[[method$id]]) {
                 weight <- input[[sprintf("%s_weight", method$id)]]
-                total_weight <- total_weight + weight
-                column <- method$id
-                weighted <- weight * results[, ..column]
-                results[, score := score + weighted]
+                weights[[method$id]] <- weight
             }
         }
 
-        results[, score := score / total_weight]
-
-        # Exclude genes with too few species.
-        results <- results[n_species >= input$n_species]
-
-        # Penalize missing species.
-        if (input$penalize) {
-            species_count <- if (input$species == "all") {
-                nrow(species)
-            } else {
-                length(species_ids_replicative)
-            }
-
-            results <- results[, score := score * n_species / species_count]
-        }
-
-        # Order the results based on their score.
-
-        setorder(results, -score, na.last = TRUE)
-        results[, rank := .I]
+        geposan::ranking(results, weights)
     })
 
     #' Apply the cut-off score to the ranked results.
@@ -227,4 +214,4 @@ server <- function(input, output, session) {
             NULL
         }
     })
-}
\ No newline at end of file
+}
diff --git a/shiny/main.R b/shiny/main.R
deleted file mode 100644
index 39228ac..0000000
--- a/shiny/main.R
+++ /dev/null
@@ -1,7 +0,0 @@
-library(shiny)
-
-source("process/process.R")
-source("shiny/server.R")
-source("shiny/ui.R")
-
-runApp(shinyApp(ui, server))
\ No newline at end of file
diff --git a/shiny/ui.R b/ui.R
similarity index 96%
rename from shiny/ui.R
rename to ui.R
index 17e519f..690937c 100644
--- a/shiny/ui.R
+++ b/ui.R
@@ -56,11 +56,7 @@ ui <- fluidPage(
                         value = 100
                     )
                 )
-            }),
-            checkboxInput(
-                "penalize",
-                "Penalize missing values"
-            ),
+            })
         ),
         mainPanel(
             tabsetPanel(
@@ -113,4 +109,4 @@ ui <- fluidPage(
             )
         )
     )
-)
\ No newline at end of file
+)
diff --git a/process/util.R b/utils.R
similarity index 99%
rename from process/util.R
rename to utils.R
index e37ece2..94b56db 100644
--- a/process/util.R
+++ b/utils.R
@@ -26,4 +26,4 @@ run_cached <- function(id, func, ...) {
 
         data
     }
-}
\ No newline at end of file
+}