| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | #' Score genes based on their adjacency to the reference genes within species. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' For each gene and species, the method will first combine the gene's distances | 
					
						
							|  |  |  | #' to the reference genes within that species. Afterwards, the results are | 
					
						
							|  |  |  | #' summarized across species and determine the gene's score. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @param distance_estimate Function for combining the distance differences | 
					
						
							|  |  |  | #'   within one species. | 
					
						
							|  |  |  | #' @param summarize Function for summarizing the distance values across species. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @return An object of class `geposan_method`. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @seealso [adjacency()] | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @export | 
					
						
							| 
									
										
										
										
											2022-02-24 15:36:07 +01:00
										 |  |  | species_adjacency <- function(distance_estimate = stats::median, | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  |                               summarize = stats::median) { | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |   method( | 
					
						
							|  |  |  |     id = "species_adjacency", | 
					
						
							|  |  |  |     name = "Species adj.", | 
					
						
							|  |  |  |     description = "Species adjacency", | 
					
						
							|  |  |  |     function(preset, progress) { | 
					
						
							|  |  |  |       species_ids <- preset$species_ids | 
					
						
							|  |  |  |       gene_ids <- preset$gene_ids | 
					
						
							|  |  |  |       reference_gene_ids <- preset$reference_gene_ids | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |       cached( | 
					
						
							|  |  |  |         "species_adjacency", | 
					
						
							|  |  |  |         c( | 
					
						
							|  |  |  |           species_ids, | 
					
						
							|  |  |  |           gene_ids, | 
					
						
							|  |  |  |           reference_gene_ids, | 
					
						
							|  |  |  |           distance_estimate, | 
					
						
							|  |  |  |           summarize | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         { # nolint | 
					
						
							|  |  |  |           # Prefilter distances. | 
					
						
							|  |  |  |           data <- geposan::distances[ | 
					
						
							|  |  |  |             species %chin% species_ids & gene %chin% gene_ids | 
					
						
							|  |  |  |           ] | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           progress_state <- 0.0 | 
					
						
							|  |  |  |           progress_step <- 0.9 / length(species_ids) | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           # Iterate through all species and find the distance | 
					
						
							|  |  |  |           # estimates within that species. | 
					
						
							|  |  |  |           for (species_id in species_ids) { | 
					
						
							|  |  |  |             # For all genes, compute the distance to one reference | 
					
						
							|  |  |  |             # gene at a time in one go. | 
					
						
							|  |  |  |             for (reference_gene_id in reference_gene_ids) { | 
					
						
							|  |  |  |               comparison_distance <- data[ | 
					
						
							|  |  |  |                 species == species_id & | 
					
						
							|  |  |  |                   gene == reference_gene_id, | 
					
						
							|  |  |  |                 distance | 
					
						
							|  |  |  |               ] | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |               column <- quote(reference_gene_id) | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |               if (length(comparison_distance) != 1) { | 
					
						
							|  |  |  |                 # If we don't have a comparison distance, we | 
					
						
							|  |  |  |                 # can't compute a difference. This happens, if | 
					
						
							|  |  |  |                 # the species doesn't have the reference gene. | 
					
						
							|  |  |  |                 data[ | 
					
						
							|  |  |  |                   species == species_id & | 
					
						
							|  |  |  |                     gene %chin% gene_ids, | 
					
						
							|  |  |  |                   eval(column) := NA_integer_ | 
					
						
							|  |  |  |                 ] | 
					
						
							|  |  |  |               } else { | 
					
						
							|  |  |  |                 data[ | 
					
						
							|  |  |  |                   species == species_id & | 
					
						
							|  |  |  |                     gene %chin% gene_ids, | 
					
						
							|  |  |  |                   eval(column) := | 
					
						
							|  |  |  |                     abs(distance - comparison_distance) | 
					
						
							|  |  |  |                 ] | 
					
						
							|  |  |  |               } | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |             # Combine the distances to the different reference genes | 
					
						
							|  |  |  |             # into one value using the provided function. | 
					
						
							|  |  |  |             data[ | 
					
						
							|  |  |  |               species == species_id & | 
					
						
							|  |  |  |                 gene %chin% gene_ids, | 
					
						
							|  |  |  |               combined_distance := as.numeric( | 
					
						
							|  |  |  |                 distance_estimate(stats::na.omit( | 
					
						
							|  |  |  |                   # Convert the data.table subset into a | 
					
						
							|  |  |  |                   # vector to get the correct na.omit | 
					
						
							|  |  |  |                   # behavior. | 
					
						
							|  |  |  |                   as.matrix(.SD)[1, ] | 
					
						
							|  |  |  |                 )) | 
					
						
							|  |  |  |               ), | 
					
						
							|  |  |  |               .SDcols = reference_gene_ids, | 
					
						
							|  |  |  |               by = gene | 
					
						
							|  |  |  |             ] | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |             progress_state <- progress_state + progress_step | 
					
						
							|  |  |  |             progress(progress_state) | 
					
						
							|  |  |  |           } | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           progress(0.9) | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           # Remove the distances between the reference genes. | 
					
						
							|  |  |  |           for (reference_gene_id in reference_gene_ids) { | 
					
						
							|  |  |  |             column <- quote(reference_gene_id) | 
					
						
							|  |  |  |             data[gene == reference_gene_id, eval(column) := NA] | 
					
						
							|  |  |  |           } | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           # Recompute the combined distance for the reference genes. | 
					
						
							|  |  |  |           data[ | 
					
						
							|  |  |  |             gene %chin% reference_gene_ids, | 
					
						
							|  |  |  |             combined_distance := as.numeric( | 
					
						
							|  |  |  |               distance_estimate(stats::na.omit( | 
					
						
							|  |  |  |                 as.matrix(.SD)[1, ] | 
					
						
							|  |  |  |               )) | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             .SDcols = reference_gene_ids, | 
					
						
							|  |  |  |             by = list(species, gene) | 
					
						
							|  |  |  |           ] | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           # Combine the distances into one value. | 
					
						
							|  |  |  |           results <- data[, | 
					
						
							|  |  |  |             .( | 
					
						
							|  |  |  |               summarized_distances = as.numeric( | 
					
						
							|  |  |  |                 summarize(stats::na.omit(combined_distance)) | 
					
						
							|  |  |  |               ) | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             by = gene | 
					
						
							|  |  |  |           ] | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           # Compute the final score by normalizing the difference. | 
					
						
							|  |  |  |           results[ | 
					
						
							|  |  |  |             , | 
					
						
							|  |  |  |             score := 1 - summarized_distances / | 
					
						
							|  |  |  |               max(summarized_distances) | 
					
						
							|  |  |  |           ] | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           progress(1.0) | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           result( | 
					
						
							|  |  |  |             method = "species_adjacency", | 
					
						
							|  |  |  |             scores = results[, .(gene, score)], | 
					
						
							|  |  |  |             details = list( | 
					
						
							|  |  |  |               data = data, | 
					
						
							|  |  |  |               results = results | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           ) | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |       ) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   ) | 
					
						
							| 
									
										
										
										
											2022-01-17 20:11:07 +01:00
										 |  |  | } |