2021-06-24 22:36:02 +02:00
|
|
|
library(data.table)
|
2021-06-16 22:01:09 +02:00
|
|
|
|
|
|
|
|
#' Load and preprocess input data from `path`.
#'
#' A file named `cache.rds` will be created within that directory to reuse the
#' results for future runs. To forcefully recompute, delete that file.
#'
#' @param path Directory containing the input data. It is passed on to
#'   [load_data()] when no cache file exists, and it is also the directory in
#'   which `cache.rds` is looked up and created.
#'
#' @return The value returned by [load_data()], either freshly computed or
#'   restored from the cache file.
#'
#' @seealso [load_data()]
load_data_cached <- function(path) {
  cache_file <- file.path(path, "cache.rds")

  if (file.exists(cache_file)) {
    # If the cache file exists, we restore the data from it.
    return(readRDS(cache_file))
  }

  # If the cache file doesn't exist, we have to do the computation.
  data <- load_data(path)

  # The results are cached for the next run.
  saveRDS(data, cache_file)

  data
}
|
|
|
|
|
|
2021-06-24 22:36:02 +02:00
|
|
|
#' Merge genome data from files in `path` into `data.table`s.
#'
#' The following inputs are read from `path`:
#' - `genes.tsv` is read as is.
#' - `species.csv` has to provide the columns `id`, `label` and `group`.
#' - `genomes/` holds one file per species. The part of the file name before
#'   the first `.` is used as the species ID and each file has to provide the
#'   columns `geneid` and `dist`.
#'
#' Only species whose `group` is `"replicative"` are included. Genome files
#' whose species ID does not match exactly one entry with a non-missing `group`
#' in `species.csv` are skipped with a warning.
#'
#' The result will be a list with named elements:
#' - `genes` will be a table with metadata on human genes.
#' - `species` will contain metadata on each species.
#' - `distances` will contain each species' genes' distances to the telomere.
#'
#' @param path Directory containing `genes.tsv`, `species.csv` and the
#'   `genomes` subdirectory.
#'
#' @return A named list with the elements `genes`, `species` (columns `id`,
#'   `label` and `median_distance`, ordered by ascending `median_distance`) and
#'   `distances` (columns `species`, `gene` and `distance`).
#'
#' @seealso [load_data_cached()]
load_data <- function(path) {
  genes <- fread(file.path(path, "genes.tsv"))
  original_species <- fread(file.path(path, "species.csv"))

  # Each file will contain data on one species.
  genomes_dir <- file.path(path, "genomes")
  file_names <- list.files(genomes_dir)

  # The per-species tables are collected in lists and bound once after the
  # loop instead of growing the result tables on every iteration. The first
  # slot holds a typed, empty table so that the result has the expected
  # columns even if no species is included.
  species_parts <- vector("list", length(file_names) + 1L)
  distance_parts <- vector("list", length(file_names) + 1L)

  species_parts[[1L]] <- data.table(
    id = character(),
    label = character(),
    median_distance = numeric()
  )

  distance_parts[[1L]] <- data.table(
    species = character(),
    gene = integer(),
    distance = integer()
  )

  for (file_index in seq_along(file_names)) {
    file_name <- file_names[[file_index]]
    species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]

    # A genome file without a unique, usable metadata entry would otherwise
    # make the condition below fail with an unhelpful error.
    species_group <- original_species[id == species_id, group]

    if (length(species_group) != 1L || is.na(species_group)) {
      warning(
        sprintf(
          paste0(
            "Skipping genome file '%s': expected exactly one entry with a ",
            "known group for species '%s' in species.csv."
          ),
          file_name,
          species_id
        ),
        call. = FALSE
      )

      next
    }

    # Only continue for replicatively aging species.
    # TODO: Which other species should be included?
    if (species_group != "replicative") {
      next
    }

    raw_distances <- fread(file.path(genomes_dir, file_name))

    # Compute the median distance across all genes of this species and
    # add it to the species table along other static data.
    species_parts[[file_index + 1L]] <- data.table(
      id = species_id,
      label = original_species[id == species_id, label],
      median_distance = median(raw_distances[, dist])
    )

    distance_parts[[file_index + 1L]] <- data.table(
      species = species_id,
      gene = raw_distances[, geneid],
      distance = raw_distances[, dist]
    )
  }

  # Slots of skipped files are still NULL and are dropped before binding.
  species <- rbindlist(Filter(Negate(is.null), species_parts))
  distances <- rbindlist(Filter(Negate(is.null), distance_parts))

  # Order species by their median distance.
  setorder(species, median_distance)

  list(
    genes = genes,
    species = species,
    distances = distances
  )
}
|