library(data.table) library(progress) library(rlog) #' Process genes clustering their distance to telomeres. #' #' The return value will be a data.table with the following columns: #' #' - `gene` Gene ID of the processed gene. #' - `cluster_length` Length of the largest cluster. #' - `cluster_mean` Mean value of the largest cluster. #' - `cluster_species` List of species contributing to the largest cluster. #' #' @param distances Gene distance data to use. #' @param species_ids IDs of species to include in the analysis. #' @param gene_ids Genes to include in the computation. process_clustering <- function(distances, species_ids, gene_ids) { results <- data.table(gene = gene_ids) gene_count <- length(gene_ids) log_info(sprintf( "Clustering %i genes from %i species", gene_count, length(species_ids) )) progress <- progress_bar$new( total = gene_count, format = "Clustering genes [:bar] :percent (ETA :eta)" ) for (i in 1:gene_count) { progress$tick() gene_id <- gene_ids[i] data <- distances[ species %chin% species_ids & gene == gene_id, .(species, distance) ] if (data[, .N] < 12) { next } clusters <- hclust(dist(data[, distance])) clusters_cut <- cutree(clusters, h = 1000000) # Find the largest cluster cluster_indices <- unique(clusters_cut) cluster_index <- cluster_indices[ which.max(tabulate(match(clusters_cut, cluster_indices))) ] cluster <- data[which(clusters_cut == cluster_index)] results[ gene == gene_id, `:=`( cluster_length = cluster[, .N], cluster_mean = mean(cluster[, distance]), cluster_species = list(cluster[, species]) ) ] } results }