mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	Add species adjacency method
This commit is contained in:
		
							parent
							
								
									c8f1e522f9
								
							
						
					
					
						commit
						53f955f3da
					
				
					 6 changed files with 202 additions and 14 deletions
				
			
		|  | @ -24,16 +24,23 @@ densest <- function(data) { | |||
| 
 | ||||
| #' Score genes based on their proximity to the reference genes. | ||||
| #' | ||||
| #' @param estimate A function that will be used to summarize the distance | ||||
| #'   values for each gene. See [densest()] for the default implementation. | ||||
| #' @param combination A function that will be used to combine the different | ||||
| #' In this case, the distance data that is available for one gene is first | ||||
| #' combined. The resulting value is compared to the reference genes and | ||||
| #' determines the gene's score in relation to other genes. | ||||
| #' | ||||
| #' @param distance_estimate A function that will be used to summarize the | ||||
| #'   distance values for each gene. See [densest()] for the default | ||||
| #'   implementation. | ||||
| #' @param summarize A function that will be used to combine the different | ||||
| #'   distances to the reference genes. By default [min()] is used. That means | ||||
| #'   the distance to the nearest reference gene will be scored. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @seealso [species_adjacency()] | ||||
| #' | ||||
| #' @export | ||||
| adjacency <- function(estimate = densest, combination = min) { | ||||
| adjacency <- function(distance_estimate = densest, summarize = min) { | ||||
|     method( | ||||
|         id = "adjacency", | ||||
|         name = "Adjacency", | ||||
|  | @ -49,15 +56,15 @@ adjacency <- function(estimate = densest, combination = min) { | |||
|                     species_ids, | ||||
|                     gene_ids, | ||||
|                     reference_gene_ids, | ||||
|                     estimate, | ||||
|                     combination | ||||
|                     distance_estimate, | ||||
|                     summarize | ||||
|                 ), | ||||
|                 { # nolint | ||||
|                     # Filter distances by species and gene and summarize each | ||||
|                     # gene's distance values using the estimation function. | ||||
|                     data <- geposan::distances[ | ||||
|                         species %chin% species_ids & gene %chin% gene_ids, | ||||
|                         .(distance = as.numeric(estimate(distance))), | ||||
|                         .(distance = as.numeric(distance_estimate(distance))), | ||||
|                         by = gene | ||||
|                     ] | ||||
| 
 | ||||
|  | @ -70,7 +77,7 @@ adjacency <- function(estimate = densest, combination = min) { | |||
|                             .(difference = abs(distance_value - distance)) | ||||
|                         ] | ||||
| 
 | ||||
|                         combination(differences$difference) | ||||
|                         summarize(differences$difference) | ||||
|                     } | ||||
| 
 | ||||
|                     # Compute the differences to the reference genes. | ||||
|  |  | |||
|  | @ -36,7 +36,8 @@ all_methods <- function() { | |||
|         clustering(), | ||||
|         correlation(), | ||||
|         neural(), | ||||
|         adjacency() | ||||
|         adjacency(), | ||||
|         species_adjacency() | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										148
									
								
								R/species_adjacency.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										148
									
								
								R/species_adjacency.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,148 @@ | |||
#' Score genes based on their adjacency to the reference genes within species.
#'
#' For each gene and species, the method first combines the gene's distance
#' differences to the reference genes within that species using
#' `distance_estimate`. Afterwards, the per-species values are summarized
#' across species using `summarize`. A gene's score is one minus its summarized
#' value divided by the largest summarized value of all genes.
#'
#' Differences that are unavailable (the species has no unique distance value
#' for the reference gene, or the gene is the reference gene itself) are
#' ignored. If nothing is left to combine or summarize, or if the provided
#' function returns a single non-finite value, the value is treated as missing.
#' Genes without any usable value receive a score of `NA`.
#'
#' @param distance_estimate Function for combining the distance differences
#'   within one species. It is called with a vector without missing values and
#'   should return a single number. By default [min()] is used.
#' @param summarize Function for summarizing the combined values across
#'   species. It is called with a vector without missing values and should
#'   return a single number. By default [stats::median()] is used.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [adjacency()]
#'
#' @export
species_adjacency <- function(distance_estimate = min,
                              summarize = stats::median) {
    # Apply `fn` to the non-missing `values` and return the result as numeric.
    #
    # `NA` is returned if no values are available or if `fn` returns a single
    # non-finite value. Without this, `min()` would return `Inf` (with a
    # warning) for empty input. `Inf` is not a missing value, so it would
    # survive later `NA` filtering and end up as the normalization maximum,
    # which turns the score of every other gene into exactly 1.
    estimate_available <- function(fn, values) {
        values <- values[!is.na(values)]

        if (length(values) == 0) {
            return(NA_real_)
        }

        estimate <- as.numeric(fn(values))

        if (length(estimate) == 1 && !is.finite(estimate)) {
            NA_real_
        } else {
            estimate
        }
    }

    method(
        id = "species_adjacency",
        name = "Species adj.",
        description = "Species adjacency",
        function(preset, progress) {
            species_ids <- preset$species_ids
            gene_ids <- preset$gene_ids
            reference_gene_ids <- preset$reference_gene_ids

            # NOTE(review): results stored under this cache id by an earlier
            # version of this method may have to be invalidated; this depends
            # on how `cached()` builds its keys. Please confirm.
            cached(
                "species_adjacency",
                c(
                    species_ids,
                    gene_ids,
                    reference_gene_ids,
                    distance_estimate,
                    summarize
                ),
                { # nolint
                    # Prefilter distances. Subsetting creates a new table, so
                    # the assignments by reference below don't touch the
                    # package data.
                    data <- geposan::distances[
                        species %chin% species_ids & gene %chin% gene_ids
                    ]

                    progress_state <- 0.0
                    progress_step <- 0.9 / length(species_ids)

                    # Iterate through all species and find the distance
                    # estimates within that species.
                    for (species_id in species_ids) {
                        # For all genes, compute the difference to one
                        # reference gene at a time in one go. The result is
                        # stored in a column named after the reference gene.
                        for (reference_gene_id in reference_gene_ids) {
                            comparison_distance <- data[
                                species == species_id &
                                    gene == reference_gene_id,
                                distance
                            ]

                            if (length(comparison_distance) != 1) {
                                # Without exactly one comparison distance we
                                # can't compute a difference. This happens if
                                # the species doesn't have the reference gene.
                                data[
                                    species == species_id &
                                        gene %chin% gene_ids,
                                    (reference_gene_id) := NA_integer_
                                ]
                            } else {
                                data[
                                    species == species_id &
                                        gene %chin% gene_ids,
                                    (reference_gene_id) :=
                                        abs(distance - comparison_distance)
                                ]
                            }
                        }

                        # Combine the differences to the different reference
                        # genes into one value using the provided function.
                        data[
                            species == species_id &
                                gene %chin% gene_ids,
                            combined_distance := estimate_available(
                                distance_estimate,
                                # Convert the first row of the data.table
                                # subset into a plain vector.
                                as.matrix(.SD)[1, ]
                            ),
                            .SDcols = reference_gene_ids,
                            by = gene
                        ]

                        progress_state <- progress_state + progress_step
                        progress(progress_state)
                    }

                    progress(0.9)

                    # Remove the differences of the reference genes to
                    # themselves. They are always zero and would otherwise
                    # dominate estimates like the minimum.
                    for (reference_gene_id in reference_gene_ids) {
                        data[
                            gene == reference_gene_id,
                            (reference_gene_id) := NA
                        ]
                    }

                    # Recompute the combined distance for the reference genes.
                    data[
                        gene %chin% reference_gene_ids,
                        combined_distance := estimate_available(
                            distance_estimate,
                            as.matrix(.SD)[1, ]
                        ),
                        .SDcols = reference_gene_ids,
                        by = list(species, gene)
                    ]

                    # Summarize the per-species values into one value per gene.
                    results <- data[,
                        .(
                            summarized_distances = estimate_available(
                                summarize,
                                combined_distance
                            )
                        ),
                        by = gene
                    ]

                    # Normalize by the largest available value. Genes without
                    # a summarized distance keep a missing score.
                    # NOTE(review): confirm that downstream ranking handles
                    # missing scores as intended.
                    available <- results[["summarized_distances"]]
                    available <- available[!is.na(available)]

                    largest_distance <- if (length(available) > 0) {
                        max(available)
                    } else {
                        NA_real_
                    }

                    results[
                        ,
                        score := 1 - summarized_distances / largest_distance
                    ]

                    progress(1.0)

                    result(
                        method = "species_adjacency",
                        scores = results[, .(gene, score)],
                        details = list(
                            data = data,
                            results = results
                        )
                    )
                }
            )
        }
    )
}
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue