data: Simplify data structure

This commit also adds the input data to the index.
2025-10-26 11:17:24 +01:00 · 2021-06-24 20:20:46 +02:00 · 2021-06-24 20:20:46 +02:00 · 998009b418
commit 998009b418
parent 914673c79c
205 changed files with 3296891 additions and 272 deletions
--- a/data.R
+++ b/data.R
@ -27,35 +27,34 @@ load_data_cached <- function(path) {

 #' Merge genome data from files in `path` into `tibble`s.
 #'
-#' The result will be a list with two named elements:
-#' - `genes` will be a table with one row per unique `geneid` and multiple
-#'   columns per species containing the data of interest.
-#' - `species` will contain additional information on each species.
+#' The result will be a list with named elements:
+#' - `genes` will be a table with metadata on human genes.
+#' - `species` will contain metadata on each species.
+#' - `distances` will contain each species' genes' distances to the telomere.
 #'
 #' @seealso [load_data_cached()]
 load_data <- function(path) {
-    # The resulting table for information by species.
+    genes <- read_tsv(paste(path, "genes.tsv", sep = "/"))
    species <- read_csv(paste(path, "species.csv", sep = "/"))
-
-    # The resulting table for information by gene. For each species, columns
-    # will be appended.
-    genes <- tibble(geneid = integer())
+    distances <- tibble(geneid = integer())

    # Each file will contain data on one species.
-    file_names <- list.files(path, "*_raw.txt")
+    file_names <- list.files(paste(path, "genomes", sep = "/"))

-    # Table containing additional columns to be added to the species table.
+    # Table containing additional columns to be added to the species table
+    # later.
    species_computed <- tibble(
        id = character(),
        median_distance = numeric()
    )

    for (file_name in file_names) {
-        species_id <- strsplit(file_name, split = "_")[[1]][1]
-        genes_for_species <- read_tsv(paste(path, file_name, sep = "/"))
+        species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
+        species_path <- paste(path, "genomes", file_name, sep = "/")
+        species_distances <- read_tsv(species_path)

        # Compute the median distance across all genes of this species.
-        median_distance <- genes_for_species %>%
+        median_distance <- species_distances %>%
            select(dist) %>%
            summarise(median_distance = median(dist)) %>%
            pull(median_distance)
@ -67,19 +66,19 @@ load_data <- function(path) {
        )

        # Column names have to be unique for each species.
-        genes_for_species <- rename_with(
-            genes_for_species,
-            ~ paste(species_id, .x, sep = "_"),
-            c(dist, name, chromosome)
-        )
+        # TODO: How to create a dynamic column name using `rename()`?
+        species_distances <- species_distances %>%
+            rename_with(function(x) species_id, dist)

-        genes <- full_join(genes, genes_for_species)
+        distances <- full_join(distances, species_distances)
    }

+    # Add additional columns to the original species table.
    species <- left_join(species, species_computed)

    list(
        genes = genes,
-        species = species
+        species = species,
+        distances = distances
    )
 }