Add initial gene processing

2025-10-26 11:17:24 +01:00 · 2021-08-25 15:01:18 +02:00 · 2021-08-25 15:01:18 +02:00 · d3151957a3
commit d3151957a3
parent fd72b3426c
3 changed files with 68 additions and 2 deletions
--- a/input.R
+++ b/input.R
@ -15,6 +15,7 @@ load_input <- function(path) {
    species <- data.table(
        id = character(),
        group = character(),
        label = character(),
        median_distance = numeric()
    )
@ -44,6 +45,7 @@ load_input <- function(path) {
        # add it to the species table along other static data.
        species <- rbindlist(list(species, data.table(
            id = species_id,
            group = original_species[id == species_id, group],
            label = original_species[id == species_id, label],
            median_distance = median(species_distances[, dist])
        )))
--- a/process.R
+++ b/process.R
@ -0,0 +1,51 @@
 library(data.table)
 library(rlog)
 #' Process genes screening for a likely TPE-OLD.
 #'
 #' The return value will be a table containing genes and data to take in
 #' account when regarding them as TPE-OLD candidates.
 #'
 #' @param input Data from [`load_input()`].
 process_input <- function(input) {
    results <- data.table(gene = input$genes$id)
    # Exclude species with naturally or artificially short chromosomes as well
    # as non-replicatively aging species.
    species_ids <- input$species[
        median_distance >= 7500000 & group == "replicative",
        id
    ]
    gene_ids <- input$genes[, id]
    gene_count <- length(gene_ids)
    for (i in seq_along(gene_ids)) {
        gene_id <- gene_ids[i]
        log_info(sprintf("Processing gene %i/%i (%i)", i, gene_count, gene_id))
        distances <- input$distances[
            species %chin% species_ids & gene == gene_id,
            .(species, distance)
        ]
        if (distances[, .N] < 12) {
            next
        }
        clusters <- hclust(dist(distances[, distance]))
        clusters_cut <- cutree(clusters, h = 1000000)
        cluster <- distances[which(clusters_cut == 1)]
        results[
            gene == gene_id,
            `:=`(
                cluster_length = cluster[, .N],
                cluster_mean = mean(cluster[, distance]),
                cluster_species = list(cluster[, species])
            )
        ]
    }
    results
 }
--- a/server.R
+++ b/server.R
@ -3,22 +3,35 @@ library(DT)
 library(shiny)
 source("input.R")
 source("process.R")
 source("scatter_plot.R")
 source("util.R")
 data <- run_cached("input", load_input, "input")
 results <- run_cached("results", process_input, data)
 server <- function(input, output) {
    filtered <- results[cluster_length >= 10]
    merged <- merge.data.table(filtered, data$genes, by.x = "gene", by.y = "id")
    setorder(merged, -cluster_length)
    output$genes <- renderDT({
        datatable(
-            data$genes[, c("name", "chromosome")],
+            merged[, .(.I, name, chromosome, cluster_length, cluster_mean)],
            rownames = FALSE,
            colnames = c(
                "Rank",
                "Gene",
                "Chromosome",
                "Cluster length",
                "Cluster mean"
            ),
            style = "bootstrap"
        )
    })
    output$scatter <- renderPlot({
-        gene_ids <- data$genes[input$genes_rows_selected, id]
+        gene_ids <- merged[input$genes_rows_selected, gene]
        scatter_plot(gene_ids, data)
    })
 }