geposanui/correlation.R

71 lines
2 KiB
R
Raw Normal View History

2021-09-18 23:10:52 +02:00
library(data.table)
2021-09-21 16:47:13 +02:00
library(progress)
2021-09-18 23:10:52 +02:00
library(rlog)
#' Compute the mean correlation coefficient comparing gene distances with a set
#' of reference genes.
#'
#' The result will be a data.table with the following columns:
#'
#' - `gene` Gene ID of the processed gene.
#' - `r_mean` Mean correlation coefficient.
#'
#' @param distances Distance data to use.
#' @param species_ids Species, whose data should be included.
#' @param gene_ids Genes to process.
#' @param reference_gene_ids Genes to compare to.
process_correlation <- function(distances, species_ids, gene_ids,
reference_gene_ids) {
results <- data.table(gene = gene_ids)
gene_count <- length(gene_ids)
reference_count <- length(reference_gene_ids)
2021-09-21 16:47:13 +02:00
log_info(sprintf(
"Correlating %i genes from %i species with %i reference genes",
gene_count,
length(species_ids),
reference_count
))
progress <- progress_bar$new(
total = gene_count,
format = "Correlating genes [:bar] :percent (ETA :eta)"
)
2021-09-18 23:10:52 +02:00
# Prefilter distances by species.
distances <- distances[species %chin% species_ids]
for (i in 1:gene_count) {
2021-09-21 16:47:13 +02:00
progress$tick()
2021-09-18 23:10:52 +02:00
2021-09-21 16:47:13 +02:00
gene_id <- gene_ids[i]
2021-09-18 23:10:52 +02:00
gene_distances <- distances[gene == gene_id]
2021-10-04 09:19:38 +02:00
if (nrow(gene_distances) < 10) {
2021-09-18 23:10:52 +02:00
next
}
#' Buffer for the sum of correlation coefficients.
r_sum <- 0
# Correlate with all reference genes but not with the gene itself.
for (reference_gene_id in
reference_gene_ids[reference_gene_ids != gene_id]) {
data <- merge(
gene_distances,
distances[gene == reference_gene_id],
by = "species"
)
2021-09-19 20:19:20 +02:00
# Order data by the reference gene's distance to get a monotonic
# relation.
setorder(data, distance.y)
2021-09-18 23:10:52 +02:00
r_sum <- r_sum + abs(cor(data[, distance.x], data[, distance.y]))
}
results[gene == gene_id, r_mean := r_sum / reference_count]
}
results
}