diff --git a/clustering.R b/clustering.R index e0b5a3e..1b5502c 100644 --- a/clustering.R +++ b/clustering.R @@ -1,4 +1,5 @@ library(data.table) +library(progress) library(rlog) #' Process genes clustering their distance to telomeres. @@ -17,14 +18,21 @@ process_clustering <- function(distances, species_ids, gene_ids) { results <- data.table(gene = gene_ids) gene_count <- length(gene_ids) - for (i in 1:gene_count) { - gene_id <- gene_ids[i] + log_info(sprintf( + "Clustering %i genes from %i species", + gene_count, + length(species_ids) + )) - log_info(sprintf( - "[%3i%%] Processing gene \"%s\"", - round(i / gene_count * 100), - gene_id - )) + progress <- progress_bar$new( + total = gene_count, + format = "Clustering genes [:bar] :percent (ETA :eta)" + ) + + for (i in 1:gene_count) { + progress$tick() + + gene_id <- gene_ids[i] data <- distances[ species %chin% species_ids & gene == gene_id, diff --git a/correlation.R b/correlation.R index 6814570..8f52cfa 100644 --- a/correlation.R +++ b/correlation.R @@ -1,4 +1,5 @@ library(data.table) +library(progress) library(rlog) #' Compute the mean correlation coefficient comparing gene distances with a set @@ -15,24 +16,29 @@ library(rlog) #' @param reference_gene_ids Genes to compare to. process_correlation <- function(distances, species_ids, gene_ids, reference_gene_ids) { - log_info("Processing genes for correlation") - results <- data.table(gene = gene_ids) gene_count <- length(gene_ids) reference_count <- length(reference_gene_ids) + log_info(sprintf( + "Correlating %i genes from %i species with %i reference genes", + gene_count, + length(species_ids), + reference_count + )) + + progress <- progress_bar$new( + total = gene_count, + format = "Correlating genes [:bar] :percent (ETA :eta)" + ) + # Prefilter distances by species. distances <- distances[species %chin% species_ids] for (i in 1:gene_count) { + progress$tick() + gene_id <- gene_ids[i] - - log_info(sprintf( - "[%3i%%] Processing gene \"%s\"", - round(i / gene_count * 100), - gene_id - )) - gene_distances <- distances[gene == gene_id] if (nrow(gene_distances) < 12) { diff --git a/input.R b/input.R index c6eb9e7..f2ad815 100644 --- a/input.R +++ b/input.R @@ -1,5 +1,6 @@ library(biomaRt) library(data.table) +library(progress) library(rlog) library(stringr) @@ -115,6 +116,23 @@ retrieve_genes <- function() { #' - `gene` Ensembl gene ID. #' - `distance` Distance to nearest telomere in base pairs. retrieve_distances <- function(species_ids, gene_ids) { + # Exclude the human from the species, in case it is present there. + species_ids <- species_ids[species_ids != "hsapiens"] + + species_count <- length(species_ids) + gene_count <- length(gene_ids) + + log_info(sprintf( + "Retrieving distance data for %i genes from %i species", + gene_count, + species_count + )) + + progress <- progress_bar$new( + total = gene_count, + format = "Retrieving distance data [:bar] :percent (ETA :eta)" + ) + # Special case the human species and retrieve all available distance # information. @@ -148,19 +166,10 @@ retrieve_distances <- function(species_ids, gene_ids) { ) ] - # Exclude the human from the species, in case it is present there. - species_ids <- species_ids[species_ids != "hsapiens"] - - species_count <- length(species_ids) - for (i in 1:species_count) { species_id <- species_ids[i] - log_info(sprintf( - "[%3i%%] Loading species \"%s\"", - round(i / species_count * 100), - species_id - )) + progress$tick() ensembl <- useDataset( sprintf("%s_gene_ensembl", species_id),