Re-include human in distance data

2025-10-26 11:17:24 +01:00 · 2021-09-19 12:07:59 +02:00 · 2021-09-19 12:07:59 +02:00 · e9c05fdbab
commit e9c05fdbab
parent 22b25c9b83
2 changed files with 43 additions and 16 deletions
--- a/init.R
+++ b/init.R
@ -5,11 +5,11 @@ source("util.R")

 # Load input data

-species <- run_cached("input/species", retrieve_species)
-genes <- run_cached("input/genes", retrieve_genes)
+species <- run_cached("input_species", retrieve_species)
+genes <- run_cached("input_genes", retrieve_genes)

 distances <- run_cached(
-    "input/distances",
+    "input_distances",
    retrieve_distances,
    species[, id],
    genes[, id]
@ -23,7 +23,7 @@ all_genes <- genes[, id]
 tpe_old_genes <- genes[suggested | verified == TRUE, id]

 clustering_all <- run_cached(
-    "all_species/clustering",
+    "clustering_all",
    process_clustering,
    distances,
    all_species,
@ -31,7 +31,7 @@ clustering_all <- run_cached(
 )

 clustering_replicative <- run_cached(
-    "replicative_species/clustering",
+    "clustering_replicative",
    process_clustering,
    distances,
    replicative_species,
@ -39,7 +39,7 @@ clustering_replicative <- run_cached(
 )

 correlation_all <- run_cached(
-    "all_species/correlation",
+    "correlation_all",
    process_correlation,
    distances,
    all_species,
@ -48,7 +48,7 @@ correlation_all <- run_cached(
 )

 correlation_replicative <- run_cached(
-    "replicative_species/correlation",
+    "correlation_replicative",
    process_correlation,
    distances,
    replicative_species,
--- a/input.R
+++ b/input.R
@ -115,11 +115,41 @@ retrieve_genes <- function() {
 #'  - `gene` Ensembl gene ID.
 #'  - `distance` Distance to nearest telomere in base pairs.
 retrieve_distances <- function(species_ids, gene_ids) {
-    distances <- data.table(
-        species = character(),
-        gene = character(),
-        distance = integer()
-    )
+    # Special-case the human species: retrieve all available distance
+    # information for it, not only for the requested genes.
+
+    ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
+
+    human_distances <- data.table(getBM(
+        attributes = c(
+            "ensembl_gene_id",
+            "chromosome_name",
+            "start_position",
+            "end_position"
+        ),
+        mart = ensembl
+    ))
+
+    human_distances[,
+        chromosome_length := max(end_position),
+        by = chromosome_name
+    ]
+
+    # Keep only the relevant information (see below).
+    distances <- human_distances[
+        chromosome_length > 15000000,
+        .(
+            species = "hsapiens",
+            gene = ensembl_gene_id,
+            distance = pmin(
+                start_position,
+                chromosome_length - end_position
+            )
+        )
+    ]
+
+    # Drop human from the species IDs (if present); it was handled above.
+    species_ids <- species_ids[species_ids != "hsapiens"]

    species_count <- length(species_ids)

@ -156,10 +186,7 @@ retrieve_distances <- function(species_ids, gene_ids) {
                "start_position",
                "end_position"
            ),
-            mart = useDataset(
-                sprintf("%s_gene_ensembl", species_id),
-                mart = ensembl
-            )
+            mart = ensembl
        ))

        ensembl_distances[,