| 
									
										
										
										
											2022-01-09 21:32:37 +01:00
										 |  |  | #' Score genes based on their correlation with the reference genes. | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2022-06-22 11:20:39 +02:00
										 |  |  | #' @param id Unique ID for the method and its results. | 
					
						
							|  |  |  | #' @param name Human readable name for the method. | 
					
						
							|  |  |  | #' @param description Method description. | 
					
						
							| 
									
										
										
										
											2022-02-24 14:34:18 +01:00
										 |  |  | #' @param summarize A function for combining the different correlation | 
					
						
							|  |  |  | #'   coefficients into one metric. By default, [stats::median()] is used. Other | 
					
						
							|  |  |  | #'   suggested options include [max()] and [mean()]. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | #' @return An object of class `geposan_method`. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @export | 
					
						
							| 
									
										
										
										
											2022-06-22 11:20:39 +02:00
										 |  |  | correlation <- function(id = "correlation", | 
					
						
							|  |  |  |                         name = "Correlation", | 
					
						
							|  |  |  |                         description = "Correlation with reference genes", | 
					
						
							|  |  |  |                         summarize = stats::median) { | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |   method( | 
					
						
							| 
									
										
										
										
											2022-06-22 11:20:39 +02:00
										 |  |  |     id = id, | 
					
						
							|  |  |  |     name = name, | 
					
						
							|  |  |  |     description = description, | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |     function(preset, progress) { | 
					
						
							|  |  |  |       species_ids <- preset$species_ids | 
					
						
							|  |  |  |       gene_ids <- preset$gene_ids | 
					
						
							|  |  |  |       reference_gene_ids <- preset$reference_gene_ids | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       cached( | 
					
						
							| 
									
										
										
										
											2022-08-12 12:41:56 +02:00
										 |  |  |         id, | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |         c(species_ids, gene_ids, reference_gene_ids, summarize), | 
					
						
							|  |  |  |         { # nolint | 
					
						
							|  |  |  |           # Prefilter distances by species. | 
					
						
							|  |  |  |           distances <- geposan::distances[species %chin% species_ids] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Tranform data to get species as rows and genes as columns. | 
					
						
							|  |  |  |           # We construct columns per species, because it requires | 
					
						
							|  |  |  |           # fewer iterations, and transpose the table afterwards. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           data <- data.table(gene = gene_ids) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Make a column containing distance data for each species. | 
					
						
							|  |  |  |           for (species_id in species_ids) { | 
					
						
							|  |  |  |             species_data <- distances[ | 
					
						
							|  |  |  |               species == species_id, | 
					
						
							|  |  |  |               .(gene, distance) | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             data <- merge(data, species_data, all.x = TRUE) | 
					
						
							|  |  |  |             setnames(data, "distance", species_id) | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Transpose to the desired format. | 
					
						
							|  |  |  |           data <- transpose(data, make.names = "gene") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           progress(0.33) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Take the reference data. | 
					
						
							|  |  |  |           reference_data <- data[, ..reference_gene_ids] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Perform the correlation between all possible pairs. | 
					
						
							|  |  |  |           results <- stats::cor( | 
					
						
							|  |  |  |             data[, ..gene_ids], | 
					
						
							|  |  |  |             reference_data, | 
					
						
							|  |  |  |             use = "pairwise.complete.obs", | 
					
						
							|  |  |  |             method = "spearman" | 
					
						
							|  |  |  |           ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           results <- data.table(results, keep.rownames = TRUE) | 
					
						
							|  |  |  |           setnames(results, "rn", "gene") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Remove correlations between the reference genes | 
					
						
							|  |  |  |           # themselves. | 
					
						
							|  |  |  |           for (reference_gene_id in reference_gene_ids) { | 
					
						
							|  |  |  |             column <- quote(reference_gene_id) | 
					
						
							|  |  |  |             results[gene == reference_gene_id, eval(column) := NA] | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           progress(0.66) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Combine the correlation coefficients. | 
					
						
							|  |  |  |           results[, | 
					
						
							|  |  |  |             max_correlation := as.double(summarize(stats::na.omit( | 
					
						
							|  |  |  |               # Convert the data.table subset into a | 
					
						
							|  |  |  |               # vector to get the correct na.omit | 
					
						
							|  |  |  |               # behavior. | 
					
						
							|  |  |  |               as.matrix(.SD)[1, ] | 
					
						
							|  |  |  |             ))), | 
					
						
							|  |  |  |             .SDcols = reference_gene_ids, | 
					
						
							|  |  |  |             by = gene | 
					
						
							|  |  |  |           ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Normalize scores. | 
					
						
							|  |  |  |           results[ | 
					
						
							|  |  |  |             , | 
					
						
							|  |  |  |             score := (max_correlation - min(max_correlation)) / | 
					
						
							|  |  |  |               (max(max_correlation) - min(max_correlation)) | 
					
						
							|  |  |  |           ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Normalize scores. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           results[, .(gene, score)] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           result( | 
					
						
							|  |  |  |             method = "correlation", | 
					
						
							|  |  |  |             scores = results[, .(gene, score)], | 
					
						
							|  |  |  |             details = list(all_correlations = results) | 
					
						
							|  |  |  |           ) | 
					
						
							| 
									
										
										
										
											2021-11-05 19:49:54 +01:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |       ) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   ) | 
					
						
							| 
									
										
										
										
											2021-10-19 13:39:55 +02:00
										 |  |  | } |