diff --git a/clustering.R b/clusteriness.R similarity index 90% rename from clustering.R rename to clusteriness.R index 6382aac..6af743e 100644 --- a/clustering.R +++ b/clusteriness.R @@ -42,12 +42,12 @@ clusteriness <- function(data, height = 1000000) { #' The return value will be a data.table with the following columns: #' #' - `gene` Gene ID of the processed gene. -#' - `clusteriness` Score quantidying the gene's clusters. +#' - `score` Score quantifying the gene's clusters. #' #' @param distances Gene distance data to use. #' @param species_ids IDs of species to include in the analysis. #' @param gene_ids Genes to include in the computation. -process_clustering <- function(distances, species_ids, gene_ids) { +process_clusteriness <- function(distances, species_ids, gene_ids, ...) { results <- data.table(gene = gene_ids) # Prefilter the input data by species. @@ -61,5 +61,5 @@ process_clustering <- function(distances, species_ids, gene_ids) { clusteriness(distances[gene_id, distance]) } - results[, clusteriness := compute(gene), by = 1:nrow(results)] + results[, score := compute(gene), by = 1:nrow(results)] } \ No newline at end of file diff --git a/correlation.R b/correlation.R index 18ad4da..cb69db1 100644 --- a/correlation.R +++ b/correlation.R @@ -6,7 +6,7 @@ library(data.table) #' The result will be a data.table with the following columns: #' #' - `gene` Gene ID of the processed gene. -#' - `correlation` Mean correlation coefficient. +#' - `score` Mean correlation coefficient. #' #' @param distances Distance data to use. #' @param species_ids Species, whose data should be included. 
@@ -69,5 +69,5 @@ process_correlation <- function(distances, species_ids, gene_ids, score <- correlation_sum / reference_count } - results[, correlation := compute(gene), by = 1:nrow(results)] + results[, score := compute(gene), by = 1:nrow(results)] } \ No newline at end of file diff --git a/init.R b/init.R index f806a73..f630d79 100644 --- a/init.R +++ b/init.R @@ -1,7 +1,5 @@ -source("clustering.R") -source("correlation.R") source("input.R") -source("neural.R") +source("methods.R") source("util.R") # Load input data @@ -16,66 +14,12 @@ distances <- run_cached( genes[, id] ) -# Load processed data - all_species <- species[, id] replicative_species <- species[replicative == TRUE, id] all_genes <- genes[, id] tpe_old_genes <- genes[suggested | verified == TRUE, id] -clustering_all <- run_cached( - "clustering_all", - process_clustering, - distances, - all_species, - all_genes -) - -clustering_replicative <- run_cached( - "clustering_replicative", - process_clustering, - distances, - replicative_species, - all_genes -) - -correlation_all <- run_cached( - "correlation_all", - process_correlation, - distances, - all_species, - all_genes, - tpe_old_genes -) - -correlation_replicative <- run_cached( - "correlation_replicative", - process_correlation, - distances, - replicative_species, - all_genes, - tpe_old_genes -) - -neural_all <- run_cached( - "neural_all", - process_neural, - distances, - all_species, - all_genes, - tpe_old_genes -) - -neural_replicative <- run_cached( - "neural_replicative", - process_neural, - distances, - replicative_species, - all_genes, - tpe_old_genes -) - -# Merge processed data as well as gene information. 
+# Apply all methods for all species results_all <- merge( genes, @@ -84,26 +28,27 @@ results_all <- merge( by.y = "gene" ) -results_all <- merge( - results_all, - clustering_all, - by.x = "id", - by.y = "gene" -) +setnames(results_all, "id", "gene") -results_all <- merge( - results_all, - correlation_all, - by.x = "id", - by.y = "gene" -) +for (method in methods) { + method_results <- run_cached( + sprintf("%s_all", method$id), + method$fn, + distances, + all_species, + all_genes, + tpe_old_genes + ) -results_all <- merge( - results_all, - neural_all, - by.x = "id", - by.y = "gene" -) + setnames(method_results, "score", method$id) + + results_all <- merge( + results_all, + method_results, + ) +} + +# Apply all methods for replicatively aging species results_replicative <- merge( genes, @@ -116,28 +61,22 @@ results_replicative <- merge( by.y = "gene" ) -results_replicative <- merge( - results_replicative, - clustering_replicative, - by.x = "id", - by.y = "gene" -) - -results_replicative <- merge( - results_replicative, - correlation_replicative, - by.x = "id", - by.y = "gene" -) - -results_replicative <- merge( - results_replicative, - neural_replicative, - by.x = "id", - by.y = "gene" -) - -# Rename `id` columns to `gene`. - -setnames(results_all, "id", "gene") setnames(results_replicative, "id", "gene") + +for (method in methods) { + method_results <- run_cached( + sprintf("%s_replicative", method$id), + method$fn, + distances, + replicative_species, + all_genes, + tpe_old_genes + ) + + setnames(method_results, "score", method$id) + + results_replicative <- merge( + results_replicative, + method_results, + ) +} \ No newline at end of file diff --git a/methods.R b/methods.R new file mode 100644 index 0000000..8d91478 --- /dev/null +++ b/methods.R @@ -0,0 +1,56 @@ +source("clusteriness.R") +source("correlation.R") +source("neural.R") + +#' Construct a new method. 
+#' +#' A method describes a way to perform a computation on gene distance data that +#' results in a single score per gene. The function should accept the following +#' parameters in this order: +#' +#' - `distances` Distance data to use. +#' - `species_ids` Species whose data should be included. +#' - `gene_ids` Genes to process. +#' - `reference_gene_ids` Genes to compare to. +#' +#' The function should return a `data.table` with the following columns: +#' +#' - `gene` Gene ID of the processed gene. +#' - `score` Score for the gene between 0.0 and 1.0. +#' +#' @param id Internal identifier; names the score column and the UI input. +#' @param name Human-readable name, shown as the table column header. +#' @param description Short human-readable text, shown as the slider label. +#' @param fn Function to perform the computation (contract described above). +#' +#' @return A named list containing the arguments. +method <- function(id, name, description, fn) { + list( + id = id, + name = name, + description = description, + fn = fn + ) +} + +#' All methods to be included in the analysis. +methods <- list( + method( + "clusteriness", + "Clustering", + "Clustering of genes", + process_clusteriness + ), + method( + "correlation", + "Correlation", + "Correlation with known genes", + process_correlation + ), + method( + "neural", + "Neural", + "Assessment by neural network", + process_neural + ) +) \ No newline at end of file diff --git a/neural.R b/neural.R index 1a1611a..f84f7ab 100644 --- a/neural.R +++ b/neural.R @@ -6,7 +6,7 @@ library(neuralnet) #' The result will be a data.table with the following columns: #' #' - `gene` Gene ID of the processed gene. -#' - `neural` Output score given by the neural network. +#' - `score` Output score given by the neural network. #' #' @param distances Distance data to use. #' @param species_ids Species, whose data should be included. @@ -105,6 +105,6 @@ process_neural <- function(distances, species_ids, gene_ids, # Return the resulting scores given by applying the neural network. 
- data[, neural := compute(nn, data)$net.result] - data[, .(gene, neural)] + data[, score := compute(nn, data)$net.result] + data[, .(gene, score)] } \ No newline at end of file diff --git a/server.R b/server.R index 3e924cc..f5f426b 100644 --- a/server.R +++ b/server.R @@ -47,16 +47,18 @@ server <- function(input, output) { # Compute scoring factors and the weighted score. - clusteriness_weight <- input$clusteriness / 100 - correlation_weight <- input$correlation / 100 - neural_weight <- input$neural / 100 - total_weight <- clusteriness_weight + correlation_weight + neural_weight - clusteriness_factor <- clusteriness_weight / total_weight - correlation_factor <- correlation_weight / total_weight - neural_factor <- neural_weight / total_weight + total_weight <- 0.0 + results[, score := 0.0] - results[, score := clusteriness_factor * clusteriness + - correlation_factor * correlation + neural_factor * neural] + for (method in methods) { + weight <- input[[method$id]] + total_weight <- total_weight + weight + column <- method$id + weighted <- weight * results[, ..column] + results[, score := score + weighted] + } + + results[, score := score / total_weight] # Exclude genes with too few species. results <- results[n_species >= input$n_species] @@ -75,33 +77,22 @@ server <- function(input, output) { # Apply the cut-off score. results <- results[score >= input$cutoff / 100] - # Order the results based on their score. The resulting index will be - # used as the "rank". + # Order the results based on their score. 
setorder(results, -score, na.last = TRUE) + results[, rank := .I] }) output$genes <- renderDT({ + method_ids <- sapply(methods, function(method) method$id) + method_names <- sapply(methods, function(method) method$name) + columns <- c("rank", "gene", "name", method_ids, "score") + column_names <- c("", "Gene", "", method_names, "Score") + dt <- datatable( - results()[, .( - .I, - gene, - name, - clusteriness, - correlation, - neural, - score - )], + results()[, ..columns], rownames = FALSE, - colnames = c( - "", - "Gene", - "", - "Clusters", - "Correlation", - "Neural", - "Score" - ), + colnames = column_names, style = "bootstrap", options = list( rowCallback = js_link, @@ -109,11 +100,7 @@ server <- function(input, output) { ) ) - formatPercentage( - dt, - c("clusteriness", "correlation", "neural", "score"), - digits = 1 - ) + formatPercentage(dt, c(method_ids, "score"), digits = 1) }) output$synposis <- renderText({ diff --git a/ui.R b/ui.R index ca5f818..46c948b 100644 --- a/ui.R +++ b/ui.R @@ -3,6 +3,8 @@ library(plotly) library(rclipboard) library(shiny) +source("methods.R") + ui <- fluidPage( rclipboardSetup(), titlePanel("TPE-OLD candidates"), @@ -22,33 +24,17 @@ ui <- fluidPage( ), wellPanel( h3("Ranking"), - sliderInput( - "clusteriness", - "Clustering of genes", - post = "%", - min = 0, - max = 100, - step = 1, - value = 58 - ), - sliderInput( - "correlation", - "Correlation with known genes", - post = "%", - min = 0, - max = 100, - step = 1, - value = 36 - ), - sliderInput( - "neural", - "Assessment by neural network", - post = "%", - min = 0, - max = 100, - step = 1, - value = 6 - ), + lapply(methods, function(method) { + sliderInput( + method$id, + method$description, + post = "%", + min = 0, + max = 100, + step = 1, + value = 100 + ) + }), sliderInput( "cutoff", "Cut-off score",