mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	adjacency: Make distance estimation customizable
This commit is contained in:
		
							parent
							
								
									ac9894e988
								
							
						
					
					
						commit
						2ceda0691b
					
				
					 4 changed files with 109 additions and 71 deletions
				
			
		
							
								
								
									
										148
									
								
								R/adjacency.R
									
										
									
									
									
								
							
							
						
						
									
										148
									
								
								R/adjacency.R
									
										
									
									
									
								
							|  | @ -1,13 +1,36 @@ | |||
| #' Find the densest value in the data. | ||||
| #' | ||||
| #' This function assumes that data represents a continuous variable and finds | ||||
| #' a single value with the highest estimated density. This can be used to | ||||
| #' estimate the mode of the data. If there is only one value that value is | ||||
| #' returned. If multiple density maxima with the same density exist, their mean | ||||
| #' is returned. | ||||
| #' | ||||
| #' @param data The input data. | ||||
| #' | ||||
| #' @return The densest value of data. | ||||
| #' | ||||
| #' @export | ||||
| densest <- function(data) { | ||||
|     # The whole branch result is coerced with as.numeric(), so the empty case | ||||
|     # (NULL) comes back as a zero-length numeric vector. | ||||
|     as.numeric(if (length(data) <= 0) { | ||||
|         NULL | ||||
|     } else if (length(data) == 1) { | ||||
|         # A single value is returned as is; no density is estimated. | ||||
|         data | ||||
|     } else { | ||||
|         # Estimate the density with the default settings of stats::density() | ||||
|         # and average the x positions at which the estimate is exactly equal | ||||
|         # to its maximum. | ||||
|         density <- stats::density(data) | ||||
|         mean(density$x[density$y == max(density$y)]) | ||||
|     }) | ||||
| } | ||||
| 
 | ||||
| #' Score genes based on their proximity to the reference genes. | ||||
| #' | ||||
| #' This method summarizes the distance values of each gene using the | ||||
| #' `estimate` function. By default, this is the distance value with the | ||||
| #' maximum density (i.e. the mode of its estimated distribution). Genes are | ||||
| #' scored by comparing those values with the values of the reference genes. | ||||
| #' | ||||
| #' @param estimate A function that will be used to summarize the distance | ||||
| #'   values for each gene. See [densest()] for the default implementation. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @export | ||||
| adjacency <- function() { | ||||
| adjacency <- function(estimate = densest) { | ||||
|     method( | ||||
|         id = "adjacency", | ||||
|         name = "Adjacency", | ||||
|  | @ -17,73 +40,64 @@ adjacency <- function() { | |||
|             gene_ids <- preset$gene_ids | ||||
|             reference_gene_ids <- preset$reference_gene_ids | ||||
| 
 | ||||
|             cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), { | ||||
|                 # Get the virtual distance value with the highest density. | ||||
|                 compute_densest_distance <- function(distances) { | ||||
|                     if (length(distances) <= 2) { | ||||
|                         mean(distances) | ||||
|                     } else { | ||||
|                         d <- stats::density(distances) | ||||
|                         d$x[which.max(d$y)] | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 # Filter distances by species and gene and find the distance | ||||
|                 # with the highest density of values for each gene. | ||||
|                 data <- geposan::distances[ | ||||
|                     species %chin% species_ids & gene %chin% gene_ids, | ||||
|                     .(densest_distance = compute_densest_distance(distance)), | ||||
|                     by = gene | ||||
|                 ] | ||||
| 
 | ||||
|                 # Compute the absolute value of the difference between the | ||||
|                 # provided densest distance value in comparison to the mean of | ||||
|                 # the densest distances of the comparison genes. | ||||
|                 compute_difference <- function(densest_distance, | ||||
|                                                comparison_ids) { | ||||
|                     # Get the mean of the densest distances of the reference | ||||
|                     # genes. | ||||
|                     mean_densest_distance <- data[ | ||||
|                         gene %chin% comparison_ids, | ||||
|                         mean(densest_distance) | ||||
|             cached( | ||||
|                 "adjacency", | ||||
|                 c(species_ids, gene_ids, reference_gene_ids, estimate), | ||||
|                 { # nolint | ||||
|                     # Filter distances by species and gene and summarize each | ||||
|                     # gene's distance values using the estimation function. | ||||
|                     data <- geposan::distances[ | ||||
|                         species %chin% species_ids & gene %chin% gene_ids, | ||||
|                         .(distance = estimate(distance)), | ||||
|                         by = gene | ||||
|                     ] | ||||
| 
 | ||||
|                     abs(densest_distance - mean_densest_distance) | ||||
|                 } | ||||
|                     # Compute the absolute value of the difference between the | ||||
|                     # estimated distances of each gene to the reference genes. | ||||
|                     compute_difference <- function(distance, | ||||
|                                                    comparison_ids) { | ||||
|                         reference_distance <- data[ | ||||
|                             gene %chin% comparison_ids, | ||||
|                             mean(distance) | ||||
|                         ] | ||||
| 
 | ||||
|                 # Compute the differences to the reference genes. | ||||
|                 data[ | ||||
|                     !gene %chin% reference_gene_ids, | ||||
|                     difference := compute_difference( | ||||
|                         densest_distance, | ||||
|                         reference_gene_ids | ||||
|                         abs(distance - reference_distance) | ||||
|                     } | ||||
| 
 | ||||
|                     # Compute the differences to the reference genes. | ||||
|                     data[ | ||||
|                         !gene %chin% reference_gene_ids, | ||||
|                         difference := compute_difference( | ||||
|                             distance, | ||||
|                             reference_gene_ids | ||||
|                         ) | ||||
|                     ] | ||||
| 
 | ||||
|                     progress(0.5) | ||||
| 
 | ||||
|                     # Exclude the reference gene itself when computing its | ||||
|                     # difference. | ||||
|                     data[ | ||||
|                         gene %chin% reference_gene_ids, | ||||
|                         difference := compute_difference( | ||||
|                             distance, | ||||
|                             reference_gene_ids[reference_gene_ids != gene] | ||||
|                         ), | ||||
|                         by = gene | ||||
|                     ] | ||||
| 
 | ||||
|                     # Compute the final score by normalizing the difference. | ||||
|                     data[, score := 1 - difference / max(difference)] | ||||
| 
 | ||||
|                     progress(1.0) | ||||
| 
 | ||||
|                     result( | ||||
|                         method = "adjacency", | ||||
|                         scores = data[, .(gene, score)], | ||||
|                         details = list(data = data) | ||||
|                     ) | ||||
|                 ] | ||||
| 
 | ||||
|                 progress(0.5) | ||||
| 
 | ||||
|                 # Exclude the reference gene itself when computing its | ||||
|                 # difference. | ||||
|                 data[ | ||||
|                     gene %chin% reference_gene_ids, | ||||
|                     difference := compute_difference( | ||||
|                         densest_distance, | ||||
|                         reference_gene_ids[reference_gene_ids != gene] | ||||
|                     ), | ||||
|                     by = gene | ||||
|                 ] | ||||
| 
 | ||||
|                 # Compute the final score by normalizing the difference. | ||||
|                 data[, score := 1 - difference / max(difference)] | ||||
| 
 | ||||
|                 progress(1.0) | ||||
| 
 | ||||
|                 result( | ||||
|                     method = "adjacency", | ||||
|                     scores = data[, .(gene, score)], | ||||
|                     details = list(data = data) | ||||
|                 ) | ||||
|             }) | ||||
|                 } | ||||
|             ) | ||||
|         } | ||||
|     ) | ||||
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue