| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | library(data.table) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 12:54:40 +02:00
										 |  |  | #' Perform a cluster analysis. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' This function will cluster the data using `hclust` and `cutree` (with the | 
					
						
							|  |  |  | #' specified height). Every cluster with at least two members qualifies for | 
					
						
							|  |  |  | #' further analysis. Clusters are then ranked based on their size in relation | 
					
						
							| 
									
										
										
										
											2021-10-11 09:48:48 +02:00
										 |  |  | #' to the number of values. The return value is a final score between zero and | 
					
						
							|  |  |  | #' one. Lower ranking clusters contribute less to this score. | 
					
						
							|  |  |  | clusteriness <- function(data, height = 1000000) { | 
					
						
							|  |  |  |     n <- length(data) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-09 14:22:28 +02:00
										 |  |  |     # Return a score of 0.0 if there is just one or no value at all. | 
					
						
							| 
									
										
										
										
											2021-10-11 09:48:48 +02:00
										 |  |  |     if (n < 2) { | 
					
						
							| 
									
										
										
										
											2021-10-09 14:22:28 +02:00
										 |  |  |         return(0.0) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-30 12:54:40 +02:00
										 |  |  |     # Cluster the data and compute the cluster sizes. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     tree <- hclust(dist(data)) | 
					
						
							|  |  |  |     clusters <- cutree(tree, h = height) | 
					
						
							|  |  |  |     cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-09 14:22:28 +02:00
										 |  |  |     # Compute the "clusteriness" score. | 
					
						
							| 
									
										
										
										
											2021-09-30 12:54:40 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     score <- 0.0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for (i in seq_along(cluster_sizes)) { | 
					
						
							|  |  |  |         cluster_size <- cluster_sizes[i] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if (cluster_size >= 2) { | 
					
						
							|  |  |  |             cluster_score <- cluster_size / n | 
					
						
							|  |  |  |             score <- score + cluster_score / i | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     score | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | #' Process genes clustering their distance to telomeres. | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-18 23:10:52 +02:00
										 |  |  | #' The return value will be a data.table with the following columns: | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #'  - `gene` Gene ID of the processed gene. | 
					
						
							| 
									
										
										
										
											2021-10-15 09:26:57 +02:00
										 |  |  | #'  - `score` Score quantidying the gene's clusters. | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' @param distances Gene distance data to use. | 
					
						
							| 
									
										
										
										
											2021-08-29 13:25:12 +02:00
										 |  |  | #' @param species_ids IDs of species to include in the analysis. | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' @param gene_ids Genes to include in the computation. | 
					
						
							| 
									
										
										
										
											2021-10-15 09:26:57 +02:00
										 |  |  | process_clusteriness <- function(distances, species_ids, gene_ids, ...) { | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     results <- data.table(gene = gene_ids) | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-09 14:22:28 +02:00
										 |  |  |     # Prefilter the input data by species. | 
					
						
							|  |  |  |     distances <- distances[species %chin% species_ids] | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-09 14:22:28 +02:00
										 |  |  |     # Add an index for quickly accessing data per gene. | 
					
						
							|  |  |  |     setkey(distances, gene) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-09 14:22:28 +02:00
										 |  |  |     #' Perform the cluster analysis for one gene. | 
					
						
							|  |  |  |     compute <- function(gene_id) { | 
					
						
							| 
									
										
										
										
											2021-10-11 09:48:48 +02:00
										 |  |  |         clusteriness(distances[gene_id, distance]) | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-15 09:26:57 +02:00
										 |  |  |     results[, score := compute(gene), by = 1:nrow(results)] | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  | } |