| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | library(data.table) | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  | library(progress) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | library(rlog) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 12:54:40 +02:00
										 |  |  | #' Perform a cluster analysis. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' This function will cluster the data using `hclust` and `cutree` (with the | 
					
						
							|  |  |  | #' specified height). Every cluster with at least two members qualifies for | 
					
						
							|  |  |  | #' further analysis. Clusters are then ranked based on their size in relation | 
					
						
							| 
									
										
										
										
											2021-10-04 09:19:38 +02:00
										 |  |  | #' to the total number of possible values (`n`). The return value is a final | 
					
						
							|  |  |  | #' score between zero and one. Lower ranking clusters contribute less to this | 
					
						
							|  |  |  | #' score. | 
					
						
							|  |  |  | clusteriness <- function(data, n, height = 1000000) { | 
					
						
							| 
									
										
										
										
											2021-09-30 12:54:40 +02:00
										 |  |  |     # Cluster the data and compute the cluster sizes. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     tree <- hclust(dist(data)) | 
					
						
							|  |  |  |     clusters <- cutree(tree, h = height) | 
					
						
							|  |  |  |     cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Compute the "cluteriness" score. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     score <- 0.0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for (i in seq_along(cluster_sizes)) { | 
					
						
							|  |  |  |         cluster_size <- cluster_sizes[i] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if (cluster_size >= 2) { | 
					
						
							|  |  |  |             cluster_score <- cluster_size / n | 
					
						
							|  |  |  |             score <- score + cluster_score / i | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     score | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | #' Process genes clustering their distance to telomeres. | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | #' The return value will be a data.table with the following columns: | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #'  - `gene` Gene ID of the processed gene. | 
					
						
							|  |  |  | #'  - `cluster_length` Length of the largest cluster. | 
					
						
							|  |  |  | #'  - `cluster_mean` Mean value of the largest cluster. | 
					
						
							|  |  |  | #'  - `cluster_species` List of species contributing to the largest cluster. | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' @param distances Gene distance data to use. | 
					
						
							| 
									
										
										
										
											2021-08-29 13:25:12 +02:00
										 |  |  | #' @param species_ids IDs of species to include in the analysis. | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' @param gene_ids Genes to include in the computation. | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | process_clustering <- function(distances, species_ids, gene_ids) { | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     results <- data.table(gene = gene_ids) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |     gene_count <- length(gene_ids) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  |     log_info(sprintf( | 
					
						
							|  |  |  |         "Clustering %i genes from %i species", | 
					
						
							|  |  |  |         gene_count, | 
					
						
							|  |  |  |         length(species_ids) | 
					
						
							|  |  |  |     )) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     progress <- progress_bar$new( | 
					
						
							|  |  |  |         total = gene_count, | 
					
						
							|  |  |  |         format = "Clustering genes [:bar] :percent (ETA :eta)" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  |     for (i in 1:gene_count) { | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  |         progress$tick() | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  |         gene_id <- gene_ids[i] | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         data <- distances[ | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |             species %chin% species_ids & gene == gene_id, | 
					
						
							|  |  |  |             .(species, distance) | 
					
						
							|  |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-04 09:19:38 +02:00
										 |  |  |         if (data[, .N] < 10) { | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |             next | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-04 09:19:38 +02:00
										 |  |  |         score <- clusteriness(data[, distance], length(species_ids)) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         results[ | 
					
						
							|  |  |  |             gene == gene_id, | 
					
						
							| 
									
										
										
										
											2021-09-30 12:54:40 +02:00
										 |  |  |             clusteriness := score | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |         ] | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     results | 
					
						
							|  |  |  | } |