| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | library(data.table) | 
					
						
							|  |  |  | library(rlog) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | #' Process genes clustering their distance to telomeres. | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | #' The return value will be a data.table with the following columns: | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #'  - `gene` Gene ID of the processed gene. | 
					
						
							|  |  |  | #'  - `cluster_length` Length of the largest cluster. | 
					
						
							|  |  |  | #'  - `cluster_mean` Mean value of the largest cluster. | 
					
						
							|  |  |  | #'  - `cluster_species` List of species contributing to the largest cluster. | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' @param distances Gene distance data to use. | 
					
						
							| 
									
										
										
										
											2021-08-29 13:25:12 +02:00
										 |  |  | #' @param species_ids IDs of species to include in the analysis. | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' @param gene_ids Genes to include in the computation. | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | process_clustering <- function(distances, species_ids, gene_ids) { | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     results <- data.table(gene = gene_ids) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |     gene_count <- length(gene_ids) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  |     for (i in 1:gene_count) { | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |         gene_id <- gene_ids[i] | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         log_info(sprintf( | 
					
						
							|  |  |  |             "[%3i%%] Processing gene \"%s\"", | 
					
						
							|  |  |  |             round(i / gene_count * 100), | 
					
						
							|  |  |  |             gene_id | 
					
						
							|  |  |  |         )) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         data <- distances[ | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |             species %chin% species_ids & gene == gene_id, | 
					
						
							|  |  |  |             .(species, distance) | 
					
						
							|  |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         if (data[, .N] < 12) { | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |             next | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         clusters <- hclust(dist(data[, distance])) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |         clusters_cut <- cutree(clusters, h = 1000000) | 
					
						
							| 
									
										
										
										
											2021-08-26 14:37:17 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # Find the largest cluster | 
					
						
							|  |  |  |         cluster_indices <- unique(clusters_cut) | 
					
						
							|  |  |  |         cluster_index <- cluster_indices[ | 
					
						
							|  |  |  |             which.max(tabulate(match(clusters_cut, cluster_indices))) | 
					
						
							|  |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         cluster <- data[which(clusters_cut == cluster_index)] | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         results[ | 
					
						
							|  |  |  |             gene == gene_id, | 
					
						
							|  |  |  |             `:=`( | 
					
						
							|  |  |  |                 cluster_length = cluster[, .N], | 
					
						
							|  |  |  |                 cluster_mean = mean(cluster[, distance]), | 
					
						
							|  |  |  |                 cluster_species = list(cluster[, species]) | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         ] | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     results | 
					
						
							|  |  |  | } |