Enhance progress information

This commit is contained in:
Elias Projahn 2021-09-21 16:47:13 +02:00
parent 8e54dacd3d
commit ba7c624705
3 changed files with 49 additions and 26 deletions

View file

@ -1,4 +1,5 @@
library(data.table) library(data.table)
library(progress)
library(rlog) library(rlog)
#' Process genes clustering their distance to telomeres. #' Process genes clustering their distance to telomeres.
@ -17,15 +18,22 @@ process_clustering <- function(distances, species_ids, gene_ids) {
results <- data.table(gene = gene_ids) results <- data.table(gene = gene_ids)
gene_count <- length(gene_ids) gene_count <- length(gene_ids)
for (i in 1:gene_count) {
gene_id <- gene_ids[i]
log_info(sprintf( log_info(sprintf(
"[%3i%%] Processing gene \"%s\"", "Clustering %i genes from %i species",
round(i / gene_count * 100), gene_count,
gene_id length(species_ids)
)) ))
progress <- progress_bar$new(
total = gene_count,
format = "Clustering genes [:bar] :percent (ETA :eta)"
)
for (i in 1:gene_count) {
progress$tick()
gene_id <- gene_ids[i]
data <- distances[ data <- distances[
species %chin% species_ids & gene == gene_id, species %chin% species_ids & gene == gene_id,
.(species, distance) .(species, distance)

View file

@ -1,4 +1,5 @@
library(data.table) library(data.table)
library(progress)
library(rlog) library(rlog)
#' Compute the mean correlation coefficient comparing gene distances with a set #' Compute the mean correlation coefficient comparing gene distances with a set
@ -15,24 +16,29 @@ library(rlog)
#' @param reference_gene_ids Genes to compare to. #' @param reference_gene_ids Genes to compare to.
process_correlation <- function(distances, species_ids, gene_ids, process_correlation <- function(distances, species_ids, gene_ids,
reference_gene_ids) { reference_gene_ids) {
log_info("Processing genes for correlation")
results <- data.table(gene = gene_ids) results <- data.table(gene = gene_ids)
gene_count <- length(gene_ids) gene_count <- length(gene_ids)
reference_count <- length(reference_gene_ids) reference_count <- length(reference_gene_ids)
log_info(sprintf(
"Correlating %i genes from %i species with %i reference genes",
gene_count,
length(species_ids),
reference_count
))
progress <- progress_bar$new(
total = gene_count,
format = "Correlating genes [:bar] :percent (ETA :eta)"
)
# Prefilter distances by species. # Prefilter distances by species.
distances <- distances[species %chin% species_ids] distances <- distances[species %chin% species_ids]
for (i in 1:gene_count) { for (i in 1:gene_count) {
progress$tick()
gene_id <- gene_ids[i] gene_id <- gene_ids[i]
log_info(sprintf(
"[%3i%%] Processing gene \"%s\"",
round(i / gene_count * 100),
gene_id
))
gene_distances <- distances[gene == gene_id] gene_distances <- distances[gene == gene_id]
if (nrow(gene_distances) < 12) { if (nrow(gene_distances) < 12) {

29
input.R
View file

@ -1,5 +1,6 @@
library(biomaRt) library(biomaRt)
library(data.table) library(data.table)
library(progress)
library(rlog) library(rlog)
library(stringr) library(stringr)
@ -115,6 +116,23 @@ retrieve_genes <- function() {
#' - `gene` Ensembl gene ID. #' - `gene` Ensembl gene ID.
#' - `distance` Distance to nearest telomere in base pairs. #' - `distance` Distance to nearest telomere in base pairs.
retrieve_distances <- function(species_ids, gene_ids) { retrieve_distances <- function(species_ids, gene_ids) {
# Exclude the human from the species, in case it is present there.
species_ids <- species_ids[species_ids != "hsapiens"]
species_count <- length(species_ids)
gene_count <- length(gene_ids)
log_info(sprintf(
"Retrieving distance data for %i genes from %i species",
gene_count,
species_count
))
progress <- progress_bar$new(
total = gene_count,
format = "Retrieving distance data [:bar] :percent (ETA :eta)"
)
# Special case the human species and retrieve all available distance # Special case the human species and retrieve all available distance
# information. # information.
@ -148,19 +166,10 @@ retrieve_distances <- function(species_ids, gene_ids) {
) )
] ]
# Exclude the human from the species, in case it is present there.
species_ids <- species_ids[species_ids != "hsapiens"]
species_count <- length(species_ids)
for (i in 1:species_count) { for (i in 1:species_count) {
species_id <- species_ids[i] species_id <- species_ids[i]
log_info(sprintf( progress$tick()
"[%3i%%] Loading species \"%s\"",
round(i / species_count * 100),
species_id
))
ensembl <- useDataset( ensembl <- useDataset(
sprintf("%s_gene_ensembl", species_id), sprintf("%s_gene_ensembl", species_id),