Generalize method definitions

This commit is contained in:
Elias Projahn 2021-10-15 09:26:57 +02:00
parent d3edeefbe2
commit 9b0b3c13f5
7 changed files with 137 additions and 169 deletions

View file

@ -42,12 +42,12 @@ clusteriness <- function(data, height = 1000000) {
#' The return value will be a data.table with the following columns:
#'
#' - `gene` Gene ID of the processed gene.
#' - `clusteriness` Score quantifying the gene's clusters.
#' - `score` Score quantifying the gene's clusters.
#'
#' @param distances Gene distance data to use.
#' @param species_ids IDs of species to include in the analysis.
#' @param gene_ids Genes to include in the computation.
process_clustering <- function(distances, species_ids, gene_ids) {
process_clusteriness <- function(distances, species_ids, gene_ids, ...) {
results <- data.table(gene = gene_ids)
# Prefilter the input data by species.
@ -61,5 +61,5 @@ process_clustering <- function(distances, species_ids, gene_ids) {
clusteriness(distances[gene_id, distance])
}
results[, clusteriness := compute(gene), by = 1:nrow(results)]
results[, score := compute(gene), by = 1:nrow(results)]
}

View file

@ -6,7 +6,7 @@ library(data.table)
#' The result will be a data.table with the following columns:
#'
#' - `gene` Gene ID of the processed gene.
#' - `correlation` Mean correlation coefficient.
#' - `score` Mean correlation coefficient.
#'
#' @param distances Distance data to use.
#' @param species_ids Species whose data should be included.
@ -69,5 +69,5 @@ process_correlation <- function(distances, species_ids, gene_ids,
score <- correlation_sum / reference_count
}
results[, correlation := compute(gene), by = 1:nrow(results)]
results[, score := compute(gene), by = 1:nrow(results)]
}

131
init.R
View file

@ -1,7 +1,5 @@
source("clustering.R")
source("correlation.R")
source("input.R")
source("neural.R")
source("methods.R")
source("util.R")
# Load input data
@ -16,66 +14,12 @@ distances <- run_cached(
genes[, id]
)
# Load processed data
all_species <- species[, id]
replicative_species <- species[replicative == TRUE, id]
all_genes <- genes[, id]
tpe_old_genes <- genes[suggested | verified == TRUE, id]
clustering_all <- run_cached(
"clustering_all",
process_clustering,
distances,
all_species,
all_genes
)
clustering_replicative <- run_cached(
"clustering_replicative",
process_clustering,
distances,
replicative_species,
all_genes
)
correlation_all <- run_cached(
"correlation_all",
process_correlation,
distances,
all_species,
all_genes,
tpe_old_genes
)
correlation_replicative <- run_cached(
"correlation_replicative",
process_correlation,
distances,
replicative_species,
all_genes,
tpe_old_genes
)
neural_all <- run_cached(
"neural_all",
process_neural,
distances,
all_species,
all_genes,
tpe_old_genes
)
neural_replicative <- run_cached(
"neural_replicative",
process_neural,
distances,
replicative_species,
all_genes,
tpe_old_genes
)
# Merge processed data as well as gene information.
# Apply all methods for all species
results_all <- merge(
genes,
@ -84,26 +28,27 @@ results_all <- merge(
by.y = "gene"
)
results_all <- merge(
results_all,
clustering_all,
by.x = "id",
by.y = "gene"
setnames(results_all, "id", "gene")
for (method in methods) {
method_results <- run_cached(
sprintf("%s_all", method$id),
method$fn,
distances,
all_species,
all_genes,
tpe_old_genes
)
setnames(method_results, "score", method$id)
results_all <- merge(
results_all,
correlation_all,
by.x = "id",
by.y = "gene"
method_results,
)
}
results_all <- merge(
results_all,
neural_all,
by.x = "id",
by.y = "gene"
)
# Apply all methods for replicatively aging species
results_replicative <- merge(
genes,
@ -116,28 +61,22 @@ results_replicative <- merge(
by.y = "gene"
)
results_replicative <- merge(
results_replicative,
clustering_replicative,
by.x = "id",
by.y = "gene"
)
results_replicative <- merge(
results_replicative,
correlation_replicative,
by.x = "id",
by.y = "gene"
)
results_replicative <- merge(
results_replicative,
neural_replicative,
by.x = "id",
by.y = "gene"
)
# Rename `id` columns to `gene`.
setnames(results_all, "id", "gene")
setnames(results_replicative, "id", "gene")
for (method in methods) {
method_results <- run_cached(
sprintf("%s_replicative", method$id),
method$fn,
distances,
replicative_species,
all_genes,
tpe_old_genes
)
setnames(method_results, "score", method$id)
results_replicative <- merge(
results_replicative,
method_results,
)
}

56
methods.R Normal file
View file

@ -0,0 +1,56 @@
source("clusteriness.R")
source("correlation.R")
source("neural.R")
#' Construct a new method.
#'
#' A method describes a way to perform a computation on gene distance data that
#' results in a single score per gene. The function should accept the following
#' parameters in this order:
#'
#' - `distances` Distance data to use.
#' - `species_ids` Species whose data should be included.
#' - `gene_ids` Genes to process.
#' - `reference_gene_ids` Genes to compare to.
#'
#' The function should return a `data.table` with the following columns:
#'
#' - `gene` Gene ID of the processed gene.
#' - `score` Score for the gene between 0.0 and 1.0.
#'
#' @param id Internal identifier for the method. Must be a single, non-missing,
#'   non-empty string; anything else raises an error.
#' @param name Human readable name for the method.
#' @param description Short human readable description.
#' @param fn Function to perform the computation. Must be a function; anything
#'   else raises an error.
#'
#' @return A named list containing the arguments (`id`, `name`, `description`
#'   and `fn`).
method <- function(id, name, description, fn) {
  # Fail at construction time instead of much later, when the identifier is
  # used as a key or the function is finally called.
  stopifnot(
    is.character(id),
    length(id) == 1L,
    !is.na(id),
    nzchar(id),
    is.function(fn)
  )

  list(
    id = id,
    name = name,
    description = description,
    fn = fn
  )
}
#' Registry of every method that takes part in the analysis.
#'
#' Each entry is built with `method()` and pairs an identifier, a display name
#' and a description with the function that performs the computation.
methods <- list(
  method(
    id = "clusteriness",
    name = "Clustering",
    description = "Clustering of genes",
    fn = process_clusteriness
  ),
  method(
    id = "correlation",
    name = "Correlation",
    description = "Correlation with known genes",
    fn = process_correlation
  ),
  method(
    id = "neural",
    name = "Neural",
    description = "Assessment by neural network",
    fn = process_neural
  )
)

View file

@ -6,7 +6,7 @@ library(neuralnet)
#' The result will be a data.table with the following columns:
#'
#' - `gene` Gene ID of the processed gene.
#' - `neural` Output score given by the neural network.
#' - `score` Output score given by the neural network.
#'
#' @param distances Distance data to use.
#' @param species_ids Species whose data should be included.
@ -105,6 +105,6 @@ process_neural <- function(distances, species_ids, gene_ids,
# Return the resulting scores given by applying the neural network.
data[, neural := compute(nn, data)$net.result]
data[, .(gene, neural)]
data[, score := compute(nn, data)$net.result]
data[, .(gene, score)]
}

View file

@ -47,16 +47,18 @@ server <- function(input, output) {
# Compute scoring factors and the weighted score.
clusteriness_weight <- input$clusteriness / 100
correlation_weight <- input$correlation / 100
neural_weight <- input$neural / 100
total_weight <- clusteriness_weight + correlation_weight + neural_weight
clusteriness_factor <- clusteriness_weight / total_weight
correlation_factor <- correlation_weight / total_weight
neural_factor <- neural_weight / total_weight
total_weight <- 0.0
results[, score := 0.0]
results[, score := clusteriness_factor * clusteriness +
correlation_factor * correlation + neural_factor * neural]
for (method in methods) {
weight <- input[[method$id]]
total_weight <- total_weight + weight
column <- method$id
weighted <- weight * results[, ..column]
results[, score := score + weighted]
}
results[, score := score / total_weight]
# Exclude genes with too few species.
results <- results[n_species >= input$n_species]
@ -75,33 +77,22 @@ server <- function(input, output) {
# Apply the cut-off score.
results <- results[score >= input$cutoff / 100]
# Order the results based on their score. The resulting index will be
# used as the "rank".
# Order the results based on their score.
setorder(results, -score, na.last = TRUE)
results[, rank := .I]
})
output$genes <- renderDT({
method_ids <- sapply(methods, function(method) method$id)
method_names <- sapply(methods, function(method) method$name)
columns <- c("rank", "gene", "name", method_ids, "score")
column_names <- c("", "Gene", "", method_names, "Score")
dt <- datatable(
results()[, .(
.I,
gene,
name,
clusteriness,
correlation,
neural,
score
)],
results()[, ..columns],
rownames = FALSE,
colnames = c(
"",
"Gene",
"",
"Clusters",
"Correlation",
"Neural",
"Score"
),
colnames = column_names,
style = "bootstrap",
options = list(
rowCallback = js_link,
@ -109,11 +100,7 @@ server <- function(input, output) {
)
)
formatPercentage(
dt,
c("clusteriness", "correlation", "neural", "score"),
digits = 1
)
formatPercentage(dt, c(method_ids, "score"), digits = 1)
})
output$synposis <- renderText({

30
ui.R
View file

@ -3,6 +3,8 @@ library(plotly)
library(rclipboard)
library(shiny)
source("methods.R")
ui <- fluidPage(
rclipboardSetup(),
titlePanel("TPE-OLD candidates"),
@ -22,33 +24,17 @@ ui <- fluidPage(
),
wellPanel(
h3("Ranking"),
lapply(methods, function(method) {
sliderInput(
"clusteriness",
"Clustering of genes",
method$id,
method$description,
post = "%",
min = 0,
max = 100,
step = 1,
value = 58
),
sliderInput(
"correlation",
"Correlation with known genes",
post = "%",
min = 0,
max = 100,
step = 1,
value = 36
),
sliderInput(
"neural",
"Assessment by neural network",
post = "%",
min = 0,
max = 100,
step = 1,
value = 6
),
value = 100
)
}),
sliderInput(
"cutoff",
"Cut-off score",