mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			116 lines
		
	
	
	
		
			3.7 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			116 lines
		
	
	
	
		
			3.7 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
| #' Score genes based on their correlation with the reference genes.
 | |
| #'
 | |
| #' @param id Unique ID for the method and its results.
 | |
| #' @param name Human readable name for the method.
 | |
| #' @param description Method description.
 | |
| #' @param summarize A function for combining the different correlation
 | |
| #'   coefficients into one metric. By default, [stats::median()] is used. Other
 | |
| #'   suggested options include [max()] and [mean()].
 | |
| #'
 | |
| #' @return An object of class `geposan_method`.
 | |
| #'
 | |
| #' @export
 | |
| correlation <- function(id = "correlation",
 | |
|                         name = "Correlation",
 | |
|                         description = "Correlation with reference genes",
 | |
|                         summarize = stats::median) {
 | |
|   method(
 | |
|     id = id,
 | |
|     name = name,
 | |
|     description = description,
 | |
|     help = paste0(
 | |
|       "Spearman's rank correlation of the telomeric distances of the gene in ",
 | |
|       "comparison with the reference gene set. This helps to assess if the ",
 | |
|       "gene is as close or distant to telomeres as the reference genes within ",
 | |
|       "each species."
 | |
|     ),
 | |
|     function(preset, progress) {
 | |
|       species_ids <- preset$species_ids
 | |
|       gene_ids <- preset$gene_ids
 | |
|       reference_gene_ids <- preset$reference_gene_ids
 | |
| 
 | |
|       cached(
 | |
|         id,
 | |
|         c(species_ids, gene_ids, reference_gene_ids, summarize),
 | |
|         { # nolint
 | |
|           # Prefilter distances by species.
 | |
|           distances <- geposan::distances[species %chin% species_ids]
 | |
| 
 | |
|           # Tranform data to get species as rows and genes as columns.
 | |
|           # We construct columns per species, because it requires
 | |
|           # fewer iterations, and transpose the table afterwards.
 | |
| 
 | |
|           data <- data.table(gene = gene_ids)
 | |
| 
 | |
|           # Make a column containing distance data for each species.
 | |
|           for (species_id in species_ids) {
 | |
|             species_data <- distances[
 | |
|               species == species_id,
 | |
|               .(gene, distance)
 | |
|             ]
 | |
| 
 | |
|             data <- merge(data, species_data, all.x = TRUE)
 | |
|             setnames(data, "distance", species_id)
 | |
|           }
 | |
| 
 | |
|           # Transpose to the desired format.
 | |
|           data <- transpose(data, make.names = "gene")
 | |
| 
 | |
|           progress(0.33)
 | |
| 
 | |
|           # Take the reference data.
 | |
|           reference_data <- data[, ..reference_gene_ids]
 | |
| 
 | |
|           # Perform the correlation between all possible pairs.
 | |
|           results <- stats::cor(
 | |
|             data[, ..gene_ids],
 | |
|             reference_data,
 | |
|             use = "pairwise.complete.obs",
 | |
|             method = "spearman"
 | |
|           )
 | |
| 
 | |
|           results <- data.table(results, keep.rownames = TRUE)
 | |
|           setnames(results, "rn", "gene")
 | |
| 
 | |
|           # Remove correlations between the reference genes
 | |
|           # themselves.
 | |
|           for (reference_gene_id in reference_gene_ids) {
 | |
|             column <- quote(reference_gene_id)
 | |
|             results[gene == reference_gene_id, eval(column) := NA]
 | |
|           }
 | |
| 
 | |
|           progress(0.66)
 | |
| 
 | |
|           # Combine the correlation coefficients.
 | |
|           results[,
 | |
|             max_correlation := as.double(summarize(stats::na.omit(
 | |
|               # Convert the data.table subset into a
 | |
|               # vector to get the correct na.omit
 | |
|               # behavior.
 | |
|               as.matrix(.SD)[1, ]
 | |
|             ))),
 | |
|             .SDcols = reference_gene_ids,
 | |
|             by = gene
 | |
|           ]
 | |
| 
 | |
|           # Normalize scores.
 | |
|           results[
 | |
|             ,
 | |
|             score := (max_correlation - min(max_correlation)) /
 | |
|               (max(max_correlation) - min(max_correlation))
 | |
|           ]
 | |
| 
 | |
|           # Normalize scores.
 | |
| 
 | |
|           results[, .(gene, score)]
 | |
| 
 | |
|           result(
 | |
|             method = "correlation",
 | |
|             scores = results[, .(gene, score)],
 | |
|             details = list(all_correlations = results)
 | |
|           )
 | |
|         }
 | |
|       )
 | |
|     }
 | |
|   )
 | |
| }
 |