From d3151957a354ff44bca721ade2ecc4f51a0d79cb Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Wed, 25 Aug 2021 15:01:18 +0200 Subject: [PATCH] Add initial gene processing --- input.R | 2 ++ process.R | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ server.R | 17 +++++++++++++++-- 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 process.R diff --git a/input.R b/input.R index 7f7ded1..abb39cd 100644 --- a/input.R +++ b/input.R @@ -15,6 +15,7 @@ load_input <- function(path) { species <- data.table( id = character(), + group = character(), label = character(), median_distance = numeric() ) @@ -44,6 +45,7 @@ load_input <- function(path) { # add it to the species table along other static data. species <- rbindlist(list(species, data.table( id = species_id, + group = original_species[id == species_id, group], label = original_species[id == species_id, label], median_distance = median(species_distances[, dist]) ))) diff --git a/process.R b/process.R new file mode 100644 index 0000000..317a54d --- /dev/null +++ b/process.R @@ -0,0 +1,51 @@ +library(data.table) +library(rlog) + +#' Process genes screening for a likely TPE-OLD. +#' +#' The return value will be a table containing genes and data to take in +#' account when regarding them as TPE-OLD candidates. +#' +#' @param input Data from [`load_input()`]. +process_input <- function(input) { + results <- data.table(gene = input$genes$id) + + # Exclude species with naturally or artificially short chromosomes as well + # as non-replicatively aging species. + species_ids <- input$species[ + median_distance >= 7500000 & group == "replicative", + id + ] + + gene_ids <- input$genes[, id] + gene_count <- length(gene_ids) + + for (i in seq_along(gene_ids)) { + gene_id <- gene_ids[i] + log_info(sprintf("Processing gene %i/%i (%i)", i, gene_count, gene_id)) + + distances <- input$distances[ + species %chin% species_ids & gene == gene_id, + .(species, distance) + ] + + if (distances[, .N] < 12) { + next + } + + clusters <- hclust(dist(distances[, distance])) + clusters_cut <- cutree(clusters, h = 1000000) + cluster <- distances[which(clusters_cut == 1)] + + results[ + gene == gene_id, + `:=`( + cluster_length = cluster[, .N], + cluster_mean = mean(cluster[, distance]), + cluster_species = list(cluster[, species]) + ) + ] + } + + results +} \ No newline at end of file diff --git a/server.R b/server.R index d183c28..64e90af 100644 --- a/server.R +++ b/server.R @@ -3,22 +3,35 @@ library(DT) library(shiny) source("input.R") +source("process.R") source("scatter_plot.R") source("util.R") data <- run_cached("input", load_input, "input") +results <- run_cached("results", process_input, data) server <- function(input, output) { + filtered <- results[cluster_length >= 10] + merged <- merge.data.table(filtered, data$genes, by.x = "gene", by.y = "id") + setorder(merged, -cluster_length) + output$genes <- renderDT({ datatable( - data$genes[, c("name", "chromosome")], + merged[, .(.I, name, chromosome, cluster_length, cluster_mean)], rownames = FALSE, + colnames = c( + "Rank", + "Gene", + "Chromosome", + "Cluster length", + "Cluster mean" + ), style = "bootstrap" ) }) output$scatter <- renderPlot({ - gene_ids <- data$genes[input$genes_rows_selected, id] + gene_ids <- merged[input$genes_rows_selected, gene] scatter_plot(gene_ids, data) }) } \ No newline at end of file