Base clusteriness on species count

2025-10-26 19:27:24 +01:00 · 2021-10-04 09:19:38 +02:00 · 2021-10-04 09:19:38 +02:00 · 397b8d0ba2
commit 397b8d0ba2
parent 8b727a0329
2 changed files with 7 additions and 7 deletions
--- a/clustering.R
+++ b/clustering.R
@@ -7,9 +7,10 @@ library(rlog)
 #' This function will cluster the data using `hclust` and `cutree` (with the
 #' specified height). Every cluster with at least two members qualifies for
 #' further analysis. Clusters are then ranked based on their size in relation
-#' to the total number of values. The return value is a final score between
+#' to the total number of possible values (`n`). The return value is a final
-#' zero and one. Lower ranking clusters contribute less to this score.
+#' score between zero and one. Lower ranking clusters contribute less to this
-clusteriness <- function(data, height = 1000000) {
+#' score.
+clusteriness <- function(data, n, height = 1000000) {
    # Cluster the data and compute the cluster sizes.
    tree <- hclust(dist(data))
@@ -19,7 +20,6 @@ clusteriness <- function(data, height = 1000000) {
    # Compute the "cluteriness" score.
    score <- 0.0
-    n <- length(data)
    for (i in seq_along(cluster_sizes)) {
        cluster_size <- cluster_sizes[i]
@@ -70,11 +70,11 @@ process_clustering <- function(distances, species_ids, gene_ids) {
            .(species, distance)
        ]
-        if (data[, .N] < 12) {
+        if (data[, .N] < 10) {
            next
        }
-        score <- clusteriness(data[, distance])
+        score <- clusteriness(data[, distance], length(species_ids))
        results[
            gene == gene_id,
--- a/correlation.R
+++ b/correlation.R
@@ -41,7 +41,7 @@ process_correlation <- function(distances, species_ids, gene_ids,
        gene_id <- gene_ids[i]
        gene_distances <- distances[gene == gene_id]
-        if (nrow(gene_distances) < 12) {
+        if (nrow(gene_distances) < 10) {
            next
        }