Add new clusteriness score

This commit is contained in:
Elias Projahn 2021-09-30 12:54:40 +02:00
parent ba7c624705
commit 91c15045cf
5 changed files with 43 additions and 67 deletions

View file

@ -2,6 +2,37 @@ library(data.table)
library(progress)
library(rlog)
#' Perform a cluster analysis.
#'
#' This function will cluster the data using `hclust` and `cutree` (with the
#' specified height). Every cluster with at least two members qualifies for
#' further analysis. Clusters are then ranked based on their size in relation
#' to the total number of values. The return value is a final score between
#' zero and one. Lower ranking clusters contribute less to this score.
clusteriness <- function(data, height = 1000000) {
# Cluster the data and compute the cluster sizes.
tree <- hclust(dist(data))
clusters <- cutree(tree, h = height)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "cluteriness" score.
score <- 0.0
n <- length(data)
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + cluster_score / i
}
}
score
}
#' Process genes clustering their distance to telomeres.
#'
#' The return value will be a data.table with the following columns:
@ -43,24 +74,11 @@ process_clustering <- function(distances, species_ids, gene_ids) {
next
}
clusters <- hclust(dist(data[, distance]))
clusters_cut <- cutree(clusters, h = 1000000)
# Find the largest cluster
cluster_indices <- unique(clusters_cut)
cluster_index <- cluster_indices[
which.max(tabulate(match(clusters_cut, cluster_indices)))
]
cluster <- data[which(clusters_cut == cluster_index)]
score <- clusteriness(data[, distance])
results[
gene == gene_id,
`:=`(
cluster_length = cluster[, .N],
cluster_mean = mean(cluster[, distance]),
cluster_species = list(cluster[, species])
)
clusteriness := score
]
}

5
init.R
View file

@ -90,8 +90,3 @@ results_replicative <- merge(
setnames(results_all, "id", "gene")
setnames(results_replicative, "id", "gene")
# Order results by cluster length descendingly as a start.
setorder(results_all, -cluster_length)
setorder(results_replicative, -cluster_length)

View file

@ -20,11 +20,6 @@ scatter_plot <- function(results, species, genes, distances) {
by.x = "id", by.y = "gene"
)
for (gene_id in genes[, id]) {
cluster_species <- unlist(results[gene == gene_id, cluster_species])
data[id == gene_id, in_cluster := species %in% cluster_species]
}
ggplot(data) +
scale_x_discrete(
name = "Species",
@ -42,17 +37,11 @@ scatter_plot <- function(results, species, genes, distances) {
}
) +
scale_color_discrete(name = "Gene") +
scale_shape_discrete(
name = "Part of cluster",
breaks = c(TRUE, FALSE),
labels = c("Yes", "No")
) +
geom_point(
mapping = aes(
x = species,
y = distance / 1000000,
color = name,
shape = in_cluster
color = name
),
size = 5
) +

View file

@ -17,36 +17,25 @@ server <- function(input, output) {
results_replicative
}
# Apply user defined filters.
results <- results[
cluster_length >= input$length &
cluster_mean >= input$range[1] * 1000000 &
cluster_mean <= input$range[2] * 1000000
]
# Compute scoring factors and the weighted score.
cluster_max <- results[, max(cluster_length)]
results[, cluster_score := cluster_length / cluster_max]
results[, score := input$clustering / 100 * cluster_score +
results[, score := input$clusteriness / 100 * clusteriness +
input$correlation / 100 * r_mean]
# Order the results based on their score. The resulting index will be
# used as the "rank".
setorder(results, -score)
setorder(results, -score, na.last = TRUE)
})
output$genes <- renderDT({
datatable(
results()[, .(.I, name, cluster_length, r_mean)],
results()[, .(.I, name, clusteriness, r_mean)],
rownames = FALSE,
colnames = c(
"Rank",
"Gene",
"Cluster length",
"Clusteriness",
"Correlation"
),
style = "bootstrap"

25
ui.R
View file

@ -11,36 +11,21 @@ ui <- fluidPage(
"species",
"Species to include",
choices = list(
"All qualified" = "all",
"Replicatively aging" = "replicative"
"Replicatively aging" = "replicative",
"All qualified" = "all"
)
),
sliderInput(
"range",
"Gene position (Mbp)",
min = 0,
max = 50,
value = c(0, 15),
step = 0.1
),
sliderInput(
"length",
"Minimum cluster size",
min = 0,
max = 30,
value = 10
)
),
wellPanel(
h3("Ranking"),
sliderInput(
"clustering",
"Size of largest cluster",
"clusteriness",
"Clustering of genes",
post = "%",
min = 0,
max = 100,
step = 1,
value = 100
value = 50
),
sliderInput(
"correlation",