Enhance progress information

2025-10-26 19:27:24 +01:00 · 2021-09-21 16:47:13 +02:00 · 2021-09-21 16:47:13 +02:00 · ba7c624705
commit ba7c624705
parent 8e54dacd3d
3 changed files with 49 additions and 26 deletions
--- a/clustering.R
+++ b/clustering.R
@@ -1,4 +1,5 @@
 library(data.table)
 library(progress)
 library(rlog)
 #' Process genes clustering their distance to telomeres.
@@ -17,15 +18,22 @@ process_clustering <- function(distances, species_ids, gene_ids) {
    results <- data.table(gene = gene_ids)
    gene_count <- length(gene_ids)
    for (i in 1:gene_count) {
        gene_id <- gene_ids[i]
    log_info(sprintf(
-            "[%3i%%] Processing gene \"%s\"",
+        "Clustering %i genes from %i species",
-            round(i / gene_count * 100),
+        gene_count,
-            gene_id
+        length(species_ids)
    ))
    progress <- progress_bar$new(
        total = gene_count,
        format = "Clustering genes [:bar] :percent (ETA :eta)"
    )
    for (i in 1:gene_count) {
        progress$tick()
        gene_id <- gene_ids[i]
        data <- distances[
            species %chin% species_ids & gene == gene_id,
            .(species, distance)
--- a/correlation.R
+++ b/correlation.R
@@ -1,4 +1,5 @@
 library(data.table)
 library(progress)
 library(rlog)
 #' Compute the mean correlation coefficient comparing gene distances with a set
@@ -15,24 +16,29 @@ library(rlog)
 #' @param reference_gene_ids Genes to compare to.
 process_correlation <- function(distances, species_ids, gene_ids,
                                reference_gene_ids) {
    log_info("Processing genes for correlation")
    results <- data.table(gene = gene_ids)
    gene_count <- length(gene_ids)
    reference_count <- length(reference_gene_ids)
    log_info(sprintf(
        "Correlating %i genes from %i species with %i reference genes",
        gene_count,
        length(species_ids),
        reference_count
    ))
    progress <- progress_bar$new(
        total = gene_count,
        format = "Correlating genes [:bar] :percent (ETA :eta)"
    )
    # Prefilter distances by species.
    distances <- distances[species %chin% species_ids]
    for (i in 1:gene_count) {
        progress$tick()
        gene_id <- gene_ids[i]
        log_info(sprintf(
            "[%3i%%] Processing gene \"%s\"",
            round(i / gene_count * 100),
            gene_id
        ))
        gene_distances <- distances[gene == gene_id]
        if (nrow(gene_distances) < 12) {
--- a/input.R
+++ b/input.R
@@ -1,5 +1,6 @@
 library(biomaRt)
 library(data.table)
 library(progress)
 library(rlog)
 library(stringr)
@@ -115,6 +116,23 @@ retrieve_genes <- function() {
 #'  - `gene` Ensembl gene ID.
 #'  - `distance` Distance to nearest telomere in base pairs.
 retrieve_distances <- function(species_ids, gene_ids) {
    # Exclude the human from the species, in case it is present there.
    species_ids <- species_ids[species_ids != "hsapiens"]
    species_count <- length(species_ids)
    gene_count <- length(gene_ids)
    log_info(sprintf(
        "Retrieving distance data for %i genes from %i species",
        gene_count,
        species_count
    ))
    progress <- progress_bar$new(
        total = gene_count,
        format = "Retrieving distance data [:bar] :percent (ETA :eta)"
    )
    # Special case the human species and retrieve all available distance
    # information.
@@ -148,19 +166,10 @@ retrieve_distances <- function(species_ids, gene_ids) {
        )
    ]
    # Exclude the human from the species, in case it is present there.
    species_ids <- species_ids[species_ids != "hsapiens"]
    species_count <- length(species_ids)
    for (i in 1:species_count) {
        species_id <- species_ids[i]
-        log_info(sprintf(
+        progress$tick()
            "[%3i%%] Loading species \"%s\"",
            round(i / species_count * 100),
            species_id
        ))
        ensembl <- useDataset(
            sprintf("%s_gene_ensembl", species_id),