From dc0265a13f0d7fef9d09bd079e81e726b092df6c Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Tue, 19 Oct 2021 14:15:28 +0200 Subject: [PATCH] Separate out geposan package --- data.R | 115 +++++++++++++++++ main.R | 9 ++ optimize.R | 34 ----- process/clusteriness.R | 56 -------- process/correlation.R | 63 --------- process/input.R | 254 ------------------------------------- process/methods.R | 62 --------- process/neural.R | 101 --------------- process/presets.R | 29 ----- process/process.R | 58 --------- process/proximity.R | 20 --- rank_plot.R | 2 +- shiny/server.R => server.R | 65 ++++------ shiny/main.R | 7 - shiny/ui.R => ui.R | 8 +- process/util.R => utils.R | 2 +- 16 files changed, 154 insertions(+), 731 deletions(-) create mode 100644 data.R create mode 100644 main.R delete mode 100644 optimize.R delete mode 100644 process/clusteriness.R delete mode 100644 process/correlation.R delete mode 100644 process/input.R delete mode 100644 process/methods.R delete mode 100644 process/neural.R delete mode 100644 process/presets.R delete mode 100644 process/process.R delete mode 100644 process/proximity.R rename shiny/server.R => server.R (81%) delete mode 100644 shiny/main.R rename shiny/ui.R => ui.R (96%) rename process/util.R => utils.R (99%) diff --git a/data.R b/data.R new file mode 100644 index 0000000..4b47863 --- /dev/null +++ b/data.R @@ -0,0 +1,115 @@ +library(data.table) + +#' Species IDs of known replicatively aging species. +species_ids_replicative <- c( + "bihybrid", + "btaurus", + "bthybrid", + "cfamiliaris", + "chircus", + "cjacchus", + "clfamiliaris", + "csabaeus", + "ecaballus", + "fcatus", + "ggorilla", + "hsapiens", + "lafricana", + "mfascicularis", + "mmulatta", + "mmurinus", + "mnemestrina", + "nleucogenys", + "oaries", + "pabelii", + "panubis", + "ppaniscus", + "ptroglodytes", + "sscrofa", + "tgelada" +) + +#' Species from [geposan] and their aging status. +species <- geposan::species[, .( + id, + name, + replicative = id %chin% species_ids_replicative +)] + +#' Gene names of genes for verified TPE-OLD genes. +genes_verified_tpe_old <- c( + "C1S", + "DSP", + "ISG15", + "SORBS2", + "TERT" +) + +#' Gene names of genes with a suggested TPE-OLD. +genes_suggested_tpe_old <- c( + "AKAP3", + "ANO2", + "CCND2", + "CD163L1", + "CD9", + "FOXM1", + "GALNT8", + "NDUFA9", + "TEAD4", + "TIGAR", + "TSPAN9" +) + +#' Genes from [geposan] and their TPE-OLD status. +genes <- geposan::genes[, .( + id, + name, + chromosome, + n_species, + suggested = name %chin% genes_suggested_tpe_old, + verified = name %chin% genes_verified_tpe_old +)] + +#' All available methods from [geposan] and additional information on them. +methods <- list( + list( + id = "clusteriness", + name = "Clustering", + description = "Clustering of genes" + ), + list( + id = "correlation", + name = "Correlation", + description = "Correlation with known genes" + ), + list( + id = "proximity", + name = "Proximity", + description = "Proximity to telomeres" + ), + list( + id = "neural", + name = "Neural", + description = "Assessment by neural network" + ) +) + +#' Gene IDs of known or suggested TPE-OLD genes. +genes_tpe_old <- genes[suggested | verified == TRUE, id] + +#' Preset for [geposan] including all species and TPE-OLD genes for reference. +preset_all_species <- geposan::preset( + methods = c("clusteriness", "correlation", "proximity", "neural"), + species = species$id, + genes = genes$id, + reference_genes = genes_tpe_old +) + +#' Preset for [geposan] including only replicatively aging species as well as +#' TPE-OLD genes for reference. +preset_replicative_species <- geposan::preset( + methods = c("clusteriness", "correlation", "proximity", "neural"), + species = species_ids_replicative, + genes = genes$id, + reference_genes = genes_tpe_old +) diff --git a/main.R b/main.R new file mode 100644 index 0000000..9e01819 --- /dev/null +++ b/main.R @@ -0,0 +1,9 @@ +library(shiny) + +# Initialize data first. +source("data.R") + +source("server.R") +source("ui.R") + +runApp(shinyApp(ui, server)) diff --git a/optimize.R b/optimize.R deleted file mode 100644 index f20fec1..0000000 --- a/optimize.R +++ /dev/null @@ -1,34 +0,0 @@ -#' Find the best weights to rank the data. -#' -#' This function ranks the provided data table based on a weighted score -#' computed from the specified `columns`. It tries to find the optimal weights -#' that result in a ranking, where the mean rank of the given reference genes -#' is as high as possible. -#' -#' @param data Input data including the columns. -#' @param colums Columns containing the separate scores between 0.0 and 1.0. -#' @param reference_gene_ids IDs of the reference genes within the input data. -#' -#' @returns Vector of optimal column weights adding up to 1.0. -optimize_weights <- function(data, columns, reference_gene_ids) { - #' Compute the mean rank of the reference genes when applying the weights. - mean_rank <- function(weights) { - data <- copy(data) - data[, score := 0.0] - - for (i in seq_along(columns)) { - column <- columns[i] - weighted <- weights[i] * data[, ..column] - data[, score := score + weighted] - } - - setorder(data, -score) - data[, rank := .I] - - data[gene %chin% reference_gene_ids, mean(rank)] - } - - weights <- optim(rep(1.0, length(columns)), mean_rank)$par - total_weight <- sum(weights) - weights / total_weight -} \ No newline at end of file diff --git a/process/clusteriness.R b/process/clusteriness.R deleted file mode 100644 index d860bb8..0000000 --- a/process/clusteriness.R +++ /dev/null @@ -1,56 +0,0 @@ -library(data.table) - -#' Perform a cluster analysis. -#' -#' This function will cluster the data using `hclust` and `cutree` (with the -#' specified height). Every cluster with at least two members qualifies for -#' further analysis. Clusters are then ranked based on their size in relation -#' to the number of values. The return value is a final score between zero and -#' one. Lower ranking clusters contribute less to this score. -clusteriness <- function(data, height = 1000000) { - n <- length(data) - - # Return a score of 0.0 if there is just one or no value at all. - if (n < 2) { - return(0.0) - } - - # Cluster the data and compute the cluster sizes. - - tree <- hclust(dist(data)) - clusters <- cutree(tree, h = height) - cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) - - # Compute the "clusteriness" score. - - score <- 0.0 - - for (i in seq_along(cluster_sizes)) { - cluster_size <- cluster_sizes[i] - - if (cluster_size >= 2) { - cluster_score <- cluster_size / n - score <- score + cluster_score / i - } - } - - score -} - -#' Process genes clustering their distance to telomeres. -process_clusteriness <- function(distances, gene_ids, preset) { - results <- data.table(gene = gene_ids) - - # Prefilter the input data by species. - distances <- distances[species %chin% preset$species_ids] - - # Add an index for quickly accessing data per gene. - setkey(distances, gene) - - #' Perform the cluster analysis for one gene. - compute <- function(gene_id) { - clusteriness(distances[gene_id, distance]) - } - - results[, score := compute(gene), by = 1:nrow(results)] -} \ No newline at end of file diff --git a/process/correlation.R b/process/correlation.R deleted file mode 100644 index 5cce8f7..0000000 --- a/process/correlation.R +++ /dev/null @@ -1,63 +0,0 @@ -library(data.table) - -#' Compute the mean correlation coefficient comparing gene distances with a set -#' of reference genes. -process_correlation <- function(distances, gene_ids, preset) { - results <- data.table(gene = gene_ids) - reference_gene_ids <- preset$reference_gene_ids - reference_count <- length(reference_gene_ids) - - # Prefilter distances by species. - distances <- distances[species %chin% preset$species_ids] - - # Add an index for quickly accessing data per gene. - setkey(distances, gene) - - # Prepare the reference genes' data. - reference_distances <- distances[gene %chin% reference_gene_ids] - - #' Perform the correlation for one gene. - compute <- function(gene_id) { - gene_distances <- distances[gene_id] - gene_species_count <- nrow(gene_distances) - - # Return a score of 0.0 if there is just one or no value at all. - if (gene_species_count <= 1) { - return(0.0) - } - - #' Buffer for the sum of correlation coefficients. - correlation_sum <- 0 - - # Correlate with all reference genes but not with the gene itself. - for (reference_gene_id in - reference_gene_ids[reference_gene_ids != gene_id]) { - data <- merge( - gene_distances, - reference_distances[reference_gene_id], - by = "species" - ) - - # Skip this reference gene, if there are not enough value pairs. - # This will lessen the final score, because it effectively - # represents a correlation coefficient of 0.0. - if (nrow(data) <= 1) { - next - } - - # Order data by the reference gene's distance to get a monotonic - # relation. - setorder(data, distance.y) - - correlation_sum <- correlation_sum + abs(cor( - data[, distance.x], data[, distance.y], - method = "spearman" - )) - } - - # Compute the score as the mean correlation coefficient. - score <- correlation_sum / reference_count - } - - results[, score := compute(gene), by = 1:nrow(results)] -} \ No newline at end of file diff --git a/process/input.R b/process/input.R deleted file mode 100644 index bb32260..0000000 --- a/process/input.R +++ /dev/null @@ -1,254 +0,0 @@ -library(biomaRt) -library(data.table) -library(progress) -library(rlog) -library(stringr) - -#' Species IDs of known replicatively aging species. -species_ids_replicative <- c( - "bihybrid", - "btaurus", - "bthybrid", - "cfamiliaris", - "chircus", - "cjacchus", - "clfamiliaris", - "csabaeus", - "ecaballus", - "fcatus", - "ggorilla", - "hsapiens", - "lafricana", - "mfascicularis", - "mmulatta", - "mmurinus", - "mnemestrina", - "nleucogenys", - "oaries", - "pabelii", - "panubis", - "ppaniscus", - "ptroglodytes", - "sscrofa", - "tgelada" -) - -#' Gene names of genes for verified TPE-OLD genes. -genes_verified_tpe_old <- c( - "C1S", - "DSP", - "ISG15", - "SORBS2", - "TERT" -) - -#' Gene names of genes with a suggested TPE-OLD. -genes_suggested_tpe_old <- c( - "AKAP3", - "ANO2", - "CCND2", - "CD163L1", - "CD9", - "FOXM1", - "GALNT8", - "NDUFA9", - "TEAD4", - "TIGAR", - "TSPAN9" -) - -#' Shared accessor for the Ensembl API. -ensembl <- NULL - -#' Get the ensembl accessor and initialize it if necessary. -get_ensembl <- function() { - if (is.null(ensembl)) { - ensembl <<- useEnsembl("ensembl", version = 104) - } - - ensembl -} - -#' Get all chromosome names for a Ensembl dataset. -#' -#' Valid chromosome names include decimal numbers as well as 'X' and 'Y'. -get_chromosome_names <- function(dataset) { - chromosome_names <- listFilterOptions(dataset, "chromosome_name") - chromosome_names[str_which(chromosome_names, "^[0-9]+|[XY]$")] -} - -#' Retrieve information on species. -#' -#' The result will be a `data.table` with the following columns: -#' -#' - `id` Species ID as presented by Ensembl. -#' - `name` Human readable species name. -#' - `replicative` Whether the species is likely to be aging replicatively. -retrieve_species <- function() { - # Ensembl datasets correspond to distinct species. - ensembl_datasets <- data.table(listDatasets(get_ensembl())) - - # Filter out species ID and name from the result. - species <- ensembl_datasets[, .( - id = str_match(dataset, "(.*)_gene_ensembl")[, 2], - name = str_match(description, "(.*) genes \\(.*\\)")[, 2] - )] - - species[, replicative := id %chin% species_ids_replicative] -} - -#' Retrieve information on human genes. -#' -#' This will only include genes on assembled chromosomes. Chromosomes are -#' filtered based on their name being either a decimal number, 'X' or 'Y'. -#' -#' The result will be a `data.table` with the following columns: -#' -#' - `id` Ensembl gene ID. -#' - `Çıame` HGNC name of the gene. -#' - `chromosome` Human chromosome on which the gene is located. -retrieve_genes <- function() { - dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl()) - - genes <- data.table(getBM( - attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"), - filters = "chromosome_name", - values = get_chromosome_names(dataset), - mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl()) - )) - - genes[, .( - id = ensembl_gene_id, - name = hgnc_symbol, - chromosome = chromosome_name, - verified = hgnc_symbol %chin% genes_verified_tpe_old, - suggested = hgnc_symbol %chin% genes_suggested_tpe_old - )] -} - -#' Retrieve gene distance data. -#' -#' The data will include all available values for the given species and genes -#' that are located on assembled chromosomes. -#' -#' The result will be a `data.table` with the following columns: -#' -#' - `species` Species ID. -#' - `gene` Ensembl gene ID. -#' - `distance` Distance to nearest telomere in base pairs. -retrieve_distances <- function(species_ids, gene_ids) { - ensembl <- get_ensembl() - - # Exclude the human from the species, in case it is present there. - species_ids <- species_ids[species_ids != "hsapiens"] - - species_count <- length(species_ids) - gene_count <- length(gene_ids) - - log_info(sprintf( - "Retrieving distance data for %i genes from %i species", - gene_count, - species_count - )) - - progress <- progress_bar$new( - total = gene_count, - format = "Retrieving distance data [:bar] :percent (ETA :eta)" - ) - - # Special case the human species and retrieve all available distance - # information. - - dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl) - - human_distances <- data.table(getBM( - attributes = c( - "ensembl_gene_id", - "chromosome_name", - "start_position", - "end_position" - ), - filters = "chromosome_name", - values = get_chromosome_names(dataset), - mart = dataset - )) - - # Compute the nearest distance to telomeres. - - human_distances[, - chromosome_length := max(end_position), - by = chromosome_name - ] - - distances <- human_distances[, .( - species = "hsapiens", - gene = ensembl_gene_id, - distance = pmin( - start_position, - chromosome_length - end_position - ) - )] - - for (i in 1:species_count) { - species_id <- species_ids[i] - - progress$tick() - - dataset <- useDataset( - sprintf("%s_gene_ensembl", species_id), - mart = ensembl - ) - - # Besides the attributes that are always present, we need to check for - # human orthologs. Some species don't have that information and will be - # skipped. - if (!"hsapiens_homolog_ensembl_gene" %chin% - listAttributes(dataset, what = "name")) { - next - } - - chromosome_names <- get_chromosome_names(dataset) - - # Skip the species, if there are no assembled chromosomes. - if (length(chromosome_names) <= 0) { - next - } - - # Retrieve information on all genes of the current species, that have - # human orthologs. This is called "homolog" in the Ensembl schema. - ensembl_distances <- data.table(getBM( - attributes = c( - "hsapiens_homolog_ensembl_gene", - "chromosome_name", - "start_position", - "end_position" - ), - filters = c("with_hsapiens_homolog", "chromosome_name"), - values = list(TRUE, chromosome_names), - mart = dataset - )) - - # Precompute the genes' distance to the nearest telomere. - - ensembl_distances[, - chromosome_length := max(end_position), - by = chromosome_name - ] - - species_distances <- ensembl_distances[, .( - species = species_id, - gene = hsapiens_homolog_ensembl_gene, - distance = pmin( - start_position, - chromosome_length - end_position - ) - )] - - distances <- rbindlist(list(distances, species_distances)) - } - - # Arbitrarily exclude duplicated genes. - # TODO: Consider a refined approach or work out how to include all - # duplicates. - unique(distances, by = c("species", "gene")) -} \ No newline at end of file diff --git a/process/methods.R b/process/methods.R deleted file mode 100644 index a060710..0000000 --- a/process/methods.R +++ /dev/null @@ -1,62 +0,0 @@ -source("process/clusteriness.R") -source("process/correlation.R") -source("process/neural.R") -source("process/proximity.R") - -#' Construct a new method. -#' -#' A method describes a way to perform a computation on gene distance data that -#' results in a single score per gene. The function should accept the following -#' parameters in this order: -#' -#' - `distances` Distance data to use. -#' - `gene_ids` Genes to process. -#' - `preset` Preset to apply. -#' -#' The function should return a `data.table` with the following columns: -#' -#' - `gene` Gene ID of the processed gene. -#' - `score` Score for the gene between 0.0 and 1.0. -#' -#' @param id Internal identifier for the method. -#' @param name Human readable name for the method. -#' @param description Short human readable description. -#' @param fn Function to perform the computation. -#' -#' @return A named list containing the arguments. -method <- function(id, name, description, fn) { - list( - id = id, - name = name, - description = description, - fn = fn - ) -} - -#' All methods to be included in the analysis. -methods <- list( - method( - "clusteriness", - "Clustering", - "Clustering of genes", - process_clusteriness - ), - method( - "correlation", - "Correlation", - "Correlation with known genes", - process_correlation - ), - method( - "proximity", - "Proximity", - "Proximity to telomeres", - process_proximity - ), - method( - "neural", - "Neural", - "Assessment by neural network", - process_neural - ) -) \ No newline at end of file diff --git a/process/neural.R b/process/neural.R deleted file mode 100644 index ef137d5..0000000 --- a/process/neural.R +++ /dev/null @@ -1,101 +0,0 @@ -library(data.table) -library(neuralnet) - -#' Find genes by training a neural network on reference position data. -#' @param seed A seed to get reproducible results. -process_neural <- function(distances, gene_ids, preset, seed = 726839) { - species_ids <- preset$species_ids - reference_gene_ids <- preset$reference_gene_ids - set.seed(seed) - gene_count <- length(gene_ids) - - # Prefilter distances by species. - distances <- distances[species %chin% species_ids] - - #' Input data for the network. This contains the gene ID as an identifier - #' as well as the per-species gene distances as input variables. - data <- data.table(gene = gene_ids) - - #' Buffer to keep track of species included in the computation. Species - #' from `species_ids` may be excluded if they don't have enough data. - species_ids_included <- NULL - - for (species_id in species_ids) { - # Make a column specific to this species. - - species_distances <- distances[species == species_id, .(gene, distance)] - setnames(species_distances, "distance", species_id) - - # Only include species with at least 25% known values. - - species_distances <- na.omit(species_distances) - - if (nrow(species_distances) >= 0.25 * gene_count) { - species_ids_included <- append(species_ids_included, species_id) - data <- merge(data, species_distances, all = TRUE) - } - } - - # Replace missing data with mean values. The neural network can't handle - # NAs in a meaningful way. Choosing extreme values here would result in - # heavily biased results. Therefore, the mean value is chosen as a - # compromise. However, this will of course lessen the significance of the - # results. - for (species_id in species_ids_included) { - mean_value <- data[, mean(get(species_id), na.rm = TRUE)] - - data[ - is.na(get(species_id)), - eval(quote(species_id)) := round(mean_value) - ] - } - - # Extract the reference genes. - - reference_data <- data[gene %chin% reference_gene_ids] - reference_data[, neural := 1.0] - - # Take out random samples from the remaining genes. This is another - # compromise with a negative impact on significance. Because there is no - # information on genes with are explicitely *not* TPE-OLD genes, we have to - # assume that a random sample of genes has a low probability of including - # TPE-OLD genes. - - without_reference_data <- data[!gene %chin% reference_gene_ids] - - reference_samples <- without_reference_data[ - sample( - nrow(without_reference_data), - nrow(reference_data) - ) - ] - - reference_samples[, neural := 0.0] - - # Merge training data. The training data includes all reference genes as - # well as an equal number of random sample genes. - training_data <- rbindlist(list(reference_data, reference_samples)) - - # Construct and train the neural network. - - nn_formula <- as.formula(sprintf( - "neural~%s", - paste(species_ids_included, collapse = "+") - )) - - layer1 <- length(species_ids_included) * 0.66 - layer2 <- layer1 * 0.66 - layer3 <- layer2 * 0.66 - - nn <- neuralnet( - nn_formula, - training_data, - hidden = c(layer1, layer2, layer3), - linear.output = FALSE - ) - - # Return the resulting scores given by applying the neural network. - - data[, score := compute(nn, data)$net.result] - data[, .(gene, score)] -} \ No newline at end of file diff --git a/process/presets.R b/process/presets.R deleted file mode 100644 index c0efc00..0000000 --- a/process/presets.R +++ /dev/null @@ -1,29 +0,0 @@ -library(data.table) - -#' Create a new preset. -#' -#' A preset is a combination of input values to all processing methods. The -#' preset's hash will be used to cache the results of applying those. -#' -#' @param species_ids IDs of species to include. -#' @param reference_gene_ids Reference genes to use. -#' -#' @return A named list containing the arguments. -preset <- function(species_ids, reference_gene_ids) { - list( - species_ids = species_ids, - reference_gene_ids = reference_gene_ids - ) -} - -#' A default preset including only replicatively aging species. -preset_replicative_species <- preset( - species[replicative == TRUE, id], - genes[suggested | verified == TRUE, id] -) - -#' A default preset including all species. -preset_all_species <- preset( - species[, id], - genes[suggested | verified == TRUE, id] -) \ No newline at end of file diff --git a/process/process.R b/process/process.R deleted file mode 100644 index 0e3a2db..0000000 --- a/process/process.R +++ /dev/null @@ -1,58 +0,0 @@ -library(data.table) - -source("process/util.R") - -# Load input data - -source("process/input.R") - -species <- run_cached("inputs/species", retrieve_species) -genes <- run_cached("inputs/genes", retrieve_genes) - -distances <- run_cached( - "inputs/distances", - retrieve_distances, - species[, id], - genes[, id] -) - -genes <- merge( - genes, - distances[, .(n_species = .N), by = "gene"], - by.x = "id", - by.y = "gene" -) - -source("process/methods.R") -source("process/presets.R") - -#' Apply all methods with the specified preset without caching. -process_priv <- function(preset) { - results <- data.table(gene = genes[, id]) - - for (method in methods) { - method_results <- method$fn(distances, genes[, id], preset) - setnames(method_results, "score", method$id) - - results <- merge( - results, - method_results - ) - } - - results -} - -#' Apply all methods with the specified preset. -#' -#' The result will be cached by the preset's hash and restored from cache, if -#' possible. The return value is a `data.table` with one row for each gene -#' identified by it's ID (`gene` column). The additional columns contain the -#' resulting per method and are named after the method IDs. -process <- function(preset) { - run_cached( - sprintf("results/%s", rlang::hash(preset)), - process_priv, - preset - ) -} \ No newline at end of file diff --git a/process/proximity.R b/process/proximity.R deleted file mode 100644 index cef008c..0000000 --- a/process/proximity.R +++ /dev/null @@ -1,20 +0,0 @@ -library(data.table) - -#' Score the mean distance of genes to the telomeres across species. -#' -#' A score will be given to each gene such that 0.0 corresponds to the maximal -#' mean distance across all genes and 1.0 corresponds to a distance of 0. -process_proximity <- function(distances, gene_ids, preset) { - species_count <- length(preset$species_ids) - - # Prefilter distances by species. - distances <- distances[species %chin% preset$species_ids] - - # Compute the score as described above. - - distances <- distances[, .(mean_distance = mean(distance)), by = "gene"] - max_distance <- distances[, max(mean_distance)] - distances[, score := 1 - mean_distance / max_distance] - - distances[, .(gene, score)] -} diff --git a/rank_plot.R b/rank_plot.R index 3a15c7a..22a29e0 100644 --- a/rank_plot.R +++ b/rank_plot.R @@ -53,4 +53,4 @@ rank_plot <- function(results, reference_gene_ids, cutoff) { } plot -} \ No newline at end of file +} diff --git a/shiny/server.R b/server.R similarity index 81% rename from shiny/server.R rename to server.R index 79a5acd..d1d0fbf 100644 --- a/shiny/server.R +++ b/server.R @@ -1,13 +1,14 @@ library(data.table) library(DT) +library(geposan) library(gprofiler2) library(plotly) library(rclipboard) library(shiny) -source("optimize.R") source("rank_plot.R") source("scatter_plot.R") +source("utils.R") #' Java script function to replace gene IDs with Ensembl gene links. js_link <- JS("function(row, data) { @@ -45,16 +46,19 @@ server <- function(input, output, session) { } } - reference_gene_ids <- genes[suggested | verified == TRUE, id] - weights <- optimize_weights(results, method_ids, reference_gene_ids) + weights <- geposan::optimize_weights( + results, + method_ids, + genes_tpe_old + ) - mapply(function(method_id, weight) { + for (method_id in method_ids) { updateSliderInput( session, sprintf("%s_weight", method_id), - value = weight * 100 + value = weights[[method_id]] * 100 ) - }, method_ids, weights) + } }) # Observe each method's enable button. @@ -67,14 +71,17 @@ server <- function(input, output, session) { #' Rank the results based on the specified weights. Filter out genes with #' too few species but don't apply the cut-off score. results <- reactive({ - # Select the species preset. - - results <- if (input$species == "all") { - process(preset_all_species) + # Select the preset. + preset <- if (input$species == "all") { + preset_all_species } else { - process(preset_replicative_species) + preset_replicative_species } + # Perform the analysis cached based on the preset's hash. + results <- run_cached(rlang::hash(preset), geposan::analyze, preset) + + # Add all gene information to the results. results <- merge( results, genes, @@ -82,41 +89,21 @@ server <- function(input, output, session) { by.y = "id" ) - # Compute scoring factors and the weighted score. + # Exclude genes with too few species. + results <- results[n_species >= input$n_species] - total_weight <- 0.0 - results[, score := 0.0] + # Rank the results based on the weights. + + weights <- NULL for (method in methods) { if (input[[method$id]]) { weight <- input[[sprintf("%s_weight", method$id)]] - total_weight <- total_weight + weight - column <- method$id - weighted <- weight * results[, ..column] - results[, score := score + weighted] + weights[[method$id]] <- weight } } - results[, score := score / total_weight] - - # Exclude genes with too few species. - results <- results[n_species >= input$n_species] - - # Penalize missing species. - if (input$penalize) { - species_count <- if (input$species == "all") { - nrow(species) - } else { - length(species_ids_replicative) - } - - results <- results[, score := score * n_species / species_count] - } - - # Order the results based on their score. - - setorder(results, -score, na.last = TRUE) - results[, rank := .I] + geposan::ranking(results, weights) }) #' Apply the cut-off score to the ranked results. @@ -227,4 +214,4 @@ server <- function(input, output, session) { NULL } }) -} \ No newline at end of file +} diff --git a/shiny/main.R b/shiny/main.R deleted file mode 100644 index 39228ac..0000000 --- a/shiny/main.R +++ /dev/null @@ -1,7 +0,0 @@ -library(shiny) - -source("process/process.R") -source("shiny/server.R") -source("shiny/ui.R") - -runApp(shinyApp(ui, server)) \ No newline at end of file diff --git a/shiny/ui.R b/ui.R similarity index 96% rename from shiny/ui.R rename to ui.R index 17e519f..690937c 100644 --- a/shiny/ui.R +++ b/ui.R @@ -56,11 +56,7 @@ ui <- fluidPage( value = 100 ) ) - }), - checkboxInput( - "penalize", - "Penalize missing values" - ), + }) ), mainPanel( tabsetPanel( @@ -113,4 +109,4 @@ ui <- fluidPage( ) ) ) -) \ No newline at end of file +) diff --git a/process/util.R b/utils.R similarity index 99% rename from process/util.R rename to utils.R index e37ece2..94b56db 100644 --- a/process/util.R +++ b/utils.R @@ -26,4 +26,4 @@ run_cached <- function(id, func, ...) { data } -} \ No newline at end of file +}