mirror of
				https://github.com/johrpan/geposanui.git
				synced 2025-10-26 19:27:24 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			61 lines
		
	
	
		
			No EOL
		
	
	
		
			1.8 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			61 lines
		
	
	
		
			No EOL
		
	
	
		
			1.8 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
| library(data.table)
 | |
| library(rlog)
 | |
| 
 | |
| #' Compute the mean correlation coefficient comparing gene distances with a set
 | |
| #' of reference genes.
 | |
| #'
 | |
| #' The result will be a data.table with the following columns:
 | |
| #'
 | |
| #'  - `gene` Gene ID of the processed gene.
 | |
| #'  - `r_mean` Mean correlation coefficient.
 | |
| #'
 | |
| #' @param distances Distance data to use.
 | |
| #' @param species_ids Species, whose data should be included.
 | |
| #' @param gene_ids Genes to process.
 | |
| #' @param reference_gene_ids Genes to compare to.
 | |
| process_correlation <- function(distances, species_ids, gene_ids,
 | |
|                                 reference_gene_ids) {
 | |
|     log_info("Processing genes for correlation")
 | |
| 
 | |
|     results <- data.table(gene = gene_ids)
 | |
|     gene_count <- length(gene_ids)
 | |
|     reference_count <- length(reference_gene_ids)
 | |
| 
 | |
|     # Prefilter distances by species.
 | |
|     distances <- distances[species %chin% species_ids]
 | |
| 
 | |
|     for (i in 1:gene_count) {
 | |
|         gene_id <- gene_ids[i]
 | |
| 
 | |
|         log_info(sprintf(
 | |
|             "[%3i%%] Processing gene \"%s\"",
 | |
|             round(i / gene_count * 100),
 | |
|             gene_id
 | |
|         ))
 | |
| 
 | |
|         gene_distances <- distances[gene == gene_id]
 | |
| 
 | |
|         if (nrow(gene_distances) < 12) {
 | |
|             next
 | |
|         }
 | |
| 
 | |
|         #' Buffer for the sum of correlation coefficients.
 | |
|         r_sum <- 0
 | |
| 
 | |
|         # Correlate with all reference genes but not with the gene itself.
 | |
|         for (reference_gene_id in
 | |
|             reference_gene_ids[reference_gene_ids != gene_id]) {
 | |
|             data <- merge(
 | |
|                 gene_distances,
 | |
|                 distances[gene == reference_gene_id],
 | |
|                 by = "species"
 | |
|             )
 | |
| 
 | |
|             r_sum <- r_sum + abs(cor(data[, distance.x], data[, distance.y]))
 | |
|         }
 | |
| 
 | |
|         results[gene == gene_id, r_mean := r_sum / reference_count]
 | |
|     }
 | |
| 
 | |
|     results
 | |
| } |