From 91c15045cf0d6f3cbf8d736a2a668f3bc00f4154 Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Thu, 30 Sep 2021 12:54:40 +0200 Subject: [PATCH] Add new clusteriness score --- clustering.R | 48 +++++++++++++++++++++++++++++++++--------------- init.R | 5 ----- scatter_plot.R | 13 +------------ server.R | 19 ++++--------------- ui.R | 25 +++++-------------------- 5 files changed, 43 insertions(+), 67 deletions(-) diff --git a/clustering.R b/clustering.R index 1b5502c..ace9146 100644 --- a/clustering.R +++ b/clustering.R @@ -2,6 +2,37 @@ library(data.table) library(progress) library(rlog) +#' Perform a cluster analysis. +#' +#' This function will cluster the data using `hclust` and `cutree` (with the +#' specified height). Every cluster with at least two members qualifies for +#' further analysis. Clusters are then ranked based on their size in relation +#' to the total number of values. The return value is a final score between +#' zero and one. Lower ranking clusters contribute less to this score. +clusteriness <- function(data, height = 1000000) { + # Cluster the data and compute the cluster sizes. + + tree <- hclust(dist(data)) + clusters <- cutree(tree, h = height) + cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) + + # Compute the "cluteriness" score. + + score <- 0.0 + n <- length(data) + + for (i in seq_along(cluster_sizes)) { + cluster_size <- cluster_sizes[i] + + if (cluster_size >= 2) { + cluster_score <- cluster_size / n + score <- score + cluster_score / i + } + } + + score +} + #' Process genes clustering their distance to telomeres. #' #' The return value will be a data.table with the following columns: @@ -43,24 +74,11 @@ process_clustering <- function(distances, species_ids, gene_ids) { next } - clusters <- hclust(dist(data[, distance])) - clusters_cut <- cutree(clusters, h = 1000000) - - # Find the largest cluster - cluster_indices <- unique(clusters_cut) - cluster_index <- cluster_indices[ - which.max(tabulate(match(clusters_cut, cluster_indices))) - ] - - cluster <- data[which(clusters_cut == cluster_index)] + score <- clusteriness(data[, distance]) results[ gene == gene_id, - `:=`( - cluster_length = cluster[, .N], - cluster_mean = mean(cluster[, distance]), - cluster_species = list(cluster[, species]) - ) + clusteriness := score ] } diff --git a/init.R b/init.R index 17ba01c..e5d7235 100644 --- a/init.R +++ b/init.R @@ -90,8 +90,3 @@ results_replicative <- merge( setnames(results_all, "id", "gene") setnames(results_replicative, "id", "gene") - -# Order results by cluster length descendingly as a start. - -setorder(results_all, -cluster_length) -setorder(results_replicative, -cluster_length) \ No newline at end of file diff --git a/scatter_plot.R b/scatter_plot.R index 541b1ad..ee8f28c 100644 --- a/scatter_plot.R +++ b/scatter_plot.R @@ -20,11 +20,6 @@ scatter_plot <- function(results, species, genes, distances) { by.x = "id", by.y = "gene" ) - for (gene_id in genes[, id]) { - cluster_species <- unlist(results[gene == gene_id, cluster_species]) - data[id == gene_id, in_cluster := species %in% cluster_species] - } - ggplot(data) + scale_x_discrete( name = "Species", @@ -42,17 +37,11 @@ scatter_plot <- function(results, species, genes, distances) { } ) + scale_color_discrete(name = "Gene") + - scale_shape_discrete( - name = "Part of cluster", - breaks = c(TRUE, FALSE), - labels = c("Yes", "No") - ) + geom_point( mapping = aes( x = species, y = distance / 1000000, - color = name, - shape = in_cluster + color = name ), size = 5 ) + diff --git a/server.R b/server.R index 251fae7..949a1b7 100644 --- a/server.R +++ b/server.R @@ -17,36 +17,25 @@ server <- function(input, output) { results_replicative } - # Apply user defined filters. - - results <- results[ - cluster_length >= input$length & - cluster_mean >= input$range[1] * 1000000 & - cluster_mean <= input$range[2] * 1000000 - ] - # Compute scoring factors and the weighted score. - cluster_max <- results[, max(cluster_length)] - results[, cluster_score := cluster_length / cluster_max] - - results[, score := input$clustering / 100 * cluster_score + + results[, score := input$clusteriness / 100 * clusteriness + input$correlation / 100 * r_mean] # Order the results based on their score. The resulting index will be # used as the "rank". - setorder(results, -score) + setorder(results, -score, na.last = TRUE) }) output$genes <- renderDT({ datatable( - results()[, .(.I, name, cluster_length, r_mean)], + results()[, .(.I, name, clusteriness, r_mean)], rownames = FALSE, colnames = c( "Rank", "Gene", - "Cluster length", + "Clusteriness", "Correlation" ), style = "bootstrap" diff --git a/ui.R b/ui.R index 625a1b6..9f1b4a2 100644 --- a/ui.R +++ b/ui.R @@ -11,36 +11,21 @@ ui <- fluidPage( "species", "Species to include", choices = list( - "All qualified" = "all", - "Replicatively aging" = "replicative" + "Replicatively aging" = "replicative", + "All qualified" = "all" ) - ), - sliderInput( - "range", - "Gene position (Mbp)", - min = 0, - max = 50, - value = c(0, 15), - step = 0.1 - ), - sliderInput( - "length", - "Minimum cluster size", - min = 0, - max = 30, - value = 10 ) ), wellPanel( h3("Ranking"), sliderInput( - "clustering", - "Size of largest cluster", + "clusteriness", + "Clustering of genes", post = "%", min = 0, max = 100, step = 1, - value = 100 + value = 50 ), sliderInput( "correlation",