mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	
		
			
	
	
		
			149 lines
		
	
	
	
		
			5.9 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
		
		
			
		
	
	
			149 lines
		
	
	
	
		
			5.9 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
|  | #' Score genes based on their adjacency to the reference genes within species. | ||
|  | #' | ||
|  | #' For each gene and species, the method will first combine the gene's distances | ||
|  | #' to the reference genes within that species. Afterwards, the results are | ||
|  | #' summarized across species and determine the gene's score. | ||
|  | #' | ||
|  | #' @param distance_estimate Function for combining the distance differences | ||
|  | #'   within one species. | ||
|  | #' @param summarize Function for summarizing the distance values across species. | ||
|  | #' | ||
|  | #' @return An object of class `geposan_method`. | ||
|  | #' | ||
|  | #' @seealso [adjacency()] | ||
|  | #' | ||
|  | #' @export | ||
|  | species_adjacency <- function(distance_estimate = min, | ||
|  |                               summarize = stats::median) { | ||
|  |     method( | ||
|  |         id = "species_adjacency", | ||
|  |         name = "Species adj.", | ||
|  |         description = "Species adjacency", | ||
|  |         function(preset, progress) { | ||
|  |             species_ids <- preset$species_ids | ||
|  |             gene_ids <- preset$gene_ids | ||
|  |             reference_gene_ids <- preset$reference_gene_ids | ||
|  | 
 | ||
|  |             cached( | ||
|  |                 "species_adjacency", | ||
|  |                 c( | ||
|  |                     species_ids, | ||
|  |                     gene_ids, | ||
|  |                     reference_gene_ids, | ||
|  |                     distance_estimate, | ||
|  |                     summarize | ||
|  |                 ), | ||
|  |                 { # nolint | ||
|  |                     # Prefilter distances. | ||
|  |                     data <- geposan::distances[ | ||
|  |                         species %chin% species_ids & gene %chin% gene_ids | ||
|  |                     ] | ||
|  | 
 | ||
|  |                     progress_state <- 0.0 | ||
|  |                     progress_step <- 0.9 / length(species_ids) | ||
|  | 
 | ||
|  |                     # Iterate through all species and find the distance | ||
|  |                     # estimates within that species. | ||
|  |                     for (species_id in species_ids) { | ||
|  |                         # For all genes, compute the distance to one reference | ||
|  |                         # gene at a time in one go. | ||
|  |                         for (reference_gene_id in reference_gene_ids) { | ||
|  |                             comparison_distance <- data[ | ||
|  |                                 species == species_id & | ||
|  |                                     gene == reference_gene_id, | ||
|  |                                 distance | ||
|  |                             ] | ||
|  | 
 | ||
|  |                             column <- quote(reference_gene_id) | ||
|  | 
 | ||
|  |                             if (length(comparison_distance) != 1) { | ||
|  |                                 # If we don't have a comparison distance, we | ||
|  |                                 # can't compute a difference. This happens, if | ||
|  |                                 # the species doesn't have the reference gene. | ||
|  |                                 data[ | ||
|  |                                     species == species_id & | ||
|  |                                         gene %chin% gene_ids, | ||
|  |                                     eval(column) := NA_integer_ | ||
|  |                                 ] | ||
|  |                             } else { | ||
|  |                                 data[ | ||
|  |                                     species == species_id & | ||
|  |                                         gene %chin% gene_ids, | ||
|  |                                     eval(column) := | ||
|  |                                         abs(distance - comparison_distance) | ||
|  |                                 ] | ||
|  |                             } | ||
|  |                         } | ||
|  | 
 | ||
|  |                         # Combine the distances to the different reference genes | ||
|  |                         # into one value using the provided function. | ||
|  |                         data[ | ||
|  |                             species == species_id & | ||
|  |                                 gene %chin% gene_ids, | ||
|  |                             combined_distance := as.numeric( | ||
|  |                                 distance_estimate(na.omit( | ||
|  |                                     # Convert the data.table subset into a | ||
|  |                                     # vector to get the correct na.omit | ||
|  |                                     # behavior. | ||
|  |                                     as.matrix(.SD)[1, ] | ||
|  |                                 )) | ||
|  |                             ), | ||
|  |                             .SDcols = reference_gene_ids, | ||
|  |                             by = gene | ||
|  |                         ] | ||
|  | 
 | ||
|  |                         progress_state <- progress_state + progress_step | ||
|  |                         progress(progress_state) | ||
|  |                     } | ||
|  | 
 | ||
|  |                     progress(0.9) | ||
|  | 
 | ||
|  |                     # Remove the distances between the reference genes. | ||
|  |                     for (reference_gene_id in reference_gene_ids) { | ||
|  |                         column <- quote(reference_gene_id) | ||
|  |                         data[gene == reference_gene_id, eval(column) := NA] | ||
|  |                     } | ||
|  | 
 | ||
|  |                     # Recompute the combined distance for the reference genes. | ||
|  |                     data[ | ||
|  |                         gene %chin% reference_gene_ids, | ||
|  |                         combined_distance := as.numeric( | ||
|  |                             distance_estimate(na.omit(as.matrix(.SD)[1, ])) | ||
|  |                         ), | ||
|  |                         .SDcols = reference_gene_ids, | ||
|  |                         by = list(species, gene) | ||
|  |                     ] | ||
|  | 
 | ||
|  |                     # Combine the distances into one value. | ||
|  |                     results <- data[, | ||
|  |                         .( | ||
|  |                             summarized_distances = as.numeric( | ||
|  |                                 summarize(na.omit(combined_distance)) | ||
|  |                             ) | ||
|  |                         ), | ||
|  |                         by = gene | ||
|  |                     ] | ||
|  | 
 | ||
|  |                     # Compute the final score by normalizing the difference. | ||
|  |                     results[ | ||
|  |                         , | ||
|  |                         score := 1 - summarized_distances / | ||
|  |                             max(summarized_distances) | ||
|  |                     ] | ||
|  | 
 | ||
|  |                     progress(1.0) | ||
|  | 
 | ||
|  |                     result( | ||
|  |                         method = "species_adjacency", | ||
|  |                         scores = results[, .(gene, score)], | ||
|  |                         details = list( | ||
|  |                             data = data, | ||
|  |                             results = results | ||
|  |                         ) | ||
|  |                     ) | ||
|  |                 } | ||
|  |             ) | ||
|  |         } | ||
|  |     ) | ||
|  | } |