Add new clusteriness score

This commit is contained in:
Elias Projahn 2021-09-30 12:54:40 +02:00
parent ba7c624705
commit 91c15045cf
5 changed files with 43 additions and 67 deletions

View file

@ -2,6 +2,37 @@ library(data.table)
library(progress) library(progress)
library(rlog) library(rlog)
#' Perform a cluster analysis.
#'
#' This function will cluster the data using `hclust` and `cutree` (with the
#' specified height). Every cluster with at least two members qualifies for
#' further analysis. Clusters are then ranked based on their size in relation
#' to the total number of values. The return value is a final score between
#' zero and one. Lower ranking clusters contribute less to this score.
clusteriness <- function(data, height = 1000000) {
# Cluster the data and compute the cluster sizes.
tree <- hclust(dist(data))
clusters <- cutree(tree, h = height)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "cluteriness" score.
score <- 0.0
n <- length(data)
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + cluster_score / i
}
}
score
}
#' Process genes clustering their distance to telomeres. #' Process genes clustering their distance to telomeres.
#' #'
#' The return value will be a data.table with the following columns: #' The return value will be a data.table with the following columns:
@ -43,24 +74,11 @@ process_clustering <- function(distances, species_ids, gene_ids) {
next next
} }
clusters <- hclust(dist(data[, distance])) score <- clusteriness(data[, distance])
clusters_cut <- cutree(clusters, h = 1000000)
# Find the largest cluster
cluster_indices <- unique(clusters_cut)
cluster_index <- cluster_indices[
which.max(tabulate(match(clusters_cut, cluster_indices)))
]
cluster <- data[which(clusters_cut == cluster_index)]
results[ results[
gene == gene_id, gene == gene_id,
`:=`( clusteriness := score
cluster_length = cluster[, .N],
cluster_mean = mean(cluster[, distance]),
cluster_species = list(cluster[, species])
)
] ]
} }

5
init.R
View file

@ -90,8 +90,3 @@ results_replicative <- merge(
setnames(results_all, "id", "gene") setnames(results_all, "id", "gene")
setnames(results_replicative, "id", "gene") setnames(results_replicative, "id", "gene")
# Order results by cluster length descendingly as a start.
setorder(results_all, -cluster_length)
setorder(results_replicative, -cluster_length)

View file

@ -20,11 +20,6 @@ scatter_plot <- function(results, species, genes, distances) {
by.x = "id", by.y = "gene" by.x = "id", by.y = "gene"
) )
for (gene_id in genes[, id]) {
cluster_species <- unlist(results[gene == gene_id, cluster_species])
data[id == gene_id, in_cluster := species %in% cluster_species]
}
ggplot(data) + ggplot(data) +
scale_x_discrete( scale_x_discrete(
name = "Species", name = "Species",
@ -42,17 +37,11 @@ scatter_plot <- function(results, species, genes, distances) {
} }
) + ) +
scale_color_discrete(name = "Gene") + scale_color_discrete(name = "Gene") +
scale_shape_discrete(
name = "Part of cluster",
breaks = c(TRUE, FALSE),
labels = c("Yes", "No")
) +
geom_point( geom_point(
mapping = aes( mapping = aes(
x = species, x = species,
y = distance / 1000000, y = distance / 1000000,
color = name, color = name
shape = in_cluster
), ),
size = 5 size = 5
) + ) +

View file

@ -17,36 +17,25 @@ server <- function(input, output) {
results_replicative results_replicative
} }
# Apply user defined filters.
results <- results[
cluster_length >= input$length &
cluster_mean >= input$range[1] * 1000000 &
cluster_mean <= input$range[2] * 1000000
]
# Compute scoring factors and the weighted score. # Compute scoring factors and the weighted score.
cluster_max <- results[, max(cluster_length)] results[, score := input$clusteriness / 100 * clusteriness +
results[, cluster_score := cluster_length / cluster_max]
results[, score := input$clustering / 100 * cluster_score +
input$correlation / 100 * r_mean] input$correlation / 100 * r_mean]
# Order the results based on their score. The resulting index will be # Order the results based on their score. The resulting index will be
# used as the "rank". # used as the "rank".
setorder(results, -score) setorder(results, -score, na.last = TRUE)
}) })
output$genes <- renderDT({ output$genes <- renderDT({
datatable( datatable(
results()[, .(.I, name, cluster_length, r_mean)], results()[, .(.I, name, clusteriness, r_mean)],
rownames = FALSE, rownames = FALSE,
colnames = c( colnames = c(
"Rank", "Rank",
"Gene", "Gene",
"Cluster length", "Clusteriness",
"Correlation" "Correlation"
), ),
style = "bootstrap" style = "bootstrap"

25
ui.R
View file

@ -11,36 +11,21 @@ ui <- fluidPage(
"species", "species",
"Species to include", "Species to include",
choices = list( choices = list(
"All qualified" = "all", "Replicatively aging" = "replicative",
"Replicatively aging" = "replicative" "All qualified" = "all"
) )
),
sliderInput(
"range",
"Gene position (Mbp)",
min = 0,
max = 50,
value = c(0, 15),
step = 0.1
),
sliderInput(
"length",
"Minimum cluster size",
min = 0,
max = 30,
value = 10
) )
), ),
wellPanel( wellPanel(
h3("Ranking"), h3("Ranking"),
sliderInput( sliderInput(
"clustering", "clusteriness",
"Size of largest cluster", "Clustering of genes",
post = "%", post = "%",
min = 0, min = 0,
max = 100, max = 100,
step = 1, step = 1,
value = 100 value = 50
), ),
sliderInput( sliderInput(
"correlation", "correlation",