library(data.table)

#' Merge genome data from files in `path` into `data.table`s.
#'
#' Reads `genes.tsv` and `species.csv` from `path` and one file per species
#' from the `genomes` subdirectory of `path`. The species ID of a genome file
#' is the part of its file name before the first dot. Only species whose
#' `group` in `species.csv` is `"replicative"` are included.
#'
#' The result will be a list with named elements:
#' - `genes` will be a table with metadata on human genes.
#' - `species` will contain metadata on each species (`id`, `label` and
#'   `median_distance`), ordered by `median_distance`.
#' - `distances` will contain each species' genes' distances to the telomere
#'   (`species`, `gene` and `distance`).
#'
#' Genome files without a matching `id` in `species.csv` are skipped with a
#' warning. An `id` that occurs more than once in `species.csv` is an error.
#'
#' @param path Directory containing `genes.tsv`, `species.csv` (with columns
#'   `id`, `label` and `group`) and the subdirectory `genomes` whose files
#'   provide the columns `geneid` and `dist`.
#' @return A named list with the elements `genes`, `species` and `distances`,
#'   each of them a `data.table`.
#'
#' @seealso [load_data_cached()]
load_data <- function(path) {
    genes <- fread(file.path(path, "genes.tsv"))
    original_species <- fread(file.path(path, "species.csv"))

    # The per-species tables are collected in lists and bound together once
    # after the loop, so that rows are not copied again on every iteration.
    # The empty templates come first to fix the column names and to make sure
    # the results have the expected columns even if no species is included.
    species_parts <- list(data.table(
        id = character(),
        label = character(),
        median_distance = numeric()
    ))

    distances_parts <- list(data.table(
        species = character(),
        gene = integer(),
        distance = integer()
    ))

    # Each file will contain data on one species.
    genomes_path <- file.path(path, "genomes")
    file_names <- list.files(genomes_path)

    for (file_name in file_names) {
        species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
        species_meta <- original_species[id == species_id]

        if (nrow(species_meta) == 0L) {
            warning(
                "Skipping genome file without species metadata: ", file_name,
                call. = FALSE
            )
            next
        }

        if (nrow(species_meta) > 1L) {
            stop(
                "Multiple entries in species.csv for species: ", species_id,
                call. = FALSE
            )
        }

        # Only continue for replicatively aging species. A missing group is
        # treated as "not replicative".
        # TODO: Which other species should be included?
        if (!isTRUE(species_meta[, group] == "replicative")) {
            next
        }

        species_distances <- fread(file.path(genomes_path, file_name))

        # Compute the median distance across all genes of this species and
        # add it to the species table along other static data.
        species_parts[[length(species_parts) + 1L]] <- data.table(
            id = species_id,
            label = species_meta[, label],
            median_distance = median(species_distances[, dist])
        )

        distances_parts[[length(distances_parts) + 1L]] <- data.table(
            species = species_id,
            gene = species_distances[, geneid],
            distance = species_distances[, dist]
        )
    }

    species <- rbindlist(species_parts)
    distances <- rbindlist(distances_parts)

    # Order species by their median distance.
    setorder(species, median_distance)

    list(
        genes = genes,
        species = species,
        distances = distances
    )
}