# geposanui/data.R

library(dplyr)
library(readr)
library(tibble)
#' Load the preprocessed input data for `path`, reusing an on-disk cache.
#'
#' The first call computes the data via [load_data()] and stores the result
#' as `cache.rds` inside `path`. Later calls read that file back instead of
#' recomputing. Delete the file to force a fresh computation.
#'
#' @param path Directory holding the input data; the cache file is written
#'   into the same directory.
#' @return The value produced by [load_data()] for `path`, either freshly
#'   computed or restored from `cache.rds`.
#'
#' @seealso [load_data()]
load_data_cached <- function(path) {
  cache_path <- paste(path, "cache.rds", sep = "/")

  # A previous run already left its results behind: restore and return them.
  if (file.exists(cache_path)) {
    return(readRDS(cache_path))
  }

  # No cache yet, so compute from scratch and persist it for the next run.
  result <- load_data(path)
  saveRDS(result, cache_path)
  result
}
#' Merge genome data from files in `path` into `tibble`s.
#'
#' Reads `species.csv` as well as every file in `path` whose name ends in
#' `_raw.txt`. Each of those files holds the data for one species; the part of
#' the file name before the first underscore is used as the species ID.
#'
#' The result will be a list with two named elements:
#'  - `genes` will be a table with one row per unique `geneid` and multiple
#'    columns per species containing the data of interest (`dist`, `name` and
#'    `chromosome`, each prefixed with the species ID).
#'  - `species` will contain additional information on each species: the
#'    contents of `species.csv` plus a computed `median_distance` column.
#'
#' @param path Directory containing `species.csv` and the `*_raw.txt` files.
#' @return A named list with the elements `genes` and `species`.
#'
#' @seealso [load_data_cached()]
load_data <- function(path) {
  # The resulting table for information by species.
  # NOTE(review): the join below requires `species.csv` to have an `id` column
  # whose type is compatible with the character IDs taken from the file names.
  species <- read_csv(file.path(path, "species.csv"))

  # Each file will contain data on one species. `pattern` is a regular
  # expression (not a glob), so the dot is escaped and the suffix is anchored
  # to avoid picking up names such as `x_raw.txt.bak`.
  file_names <- list.files(path, pattern = "_raw\\.txt$")

  # The resulting table for information by gene. For each species, columns
  # will be appended.
  genes <- tibble(geneid = integer())

  # One single-row table per species, combined after the loop instead of
  # growing a tibble row by row.
  species_rows <- vector("list", length(file_names))

  for (i in seq_along(file_names)) {
    file_name <- file_names[[i]]
    species_id <- strsplit(file_name, split = "_", fixed = TRUE)[[1]][1]

    genes_for_species <- read_tsv(file.path(path, file_name))

    # Median distance across all genes of this species, to be added to the
    # species table later.
    species_rows[[i]] <- tibble(
      id = species_id,
      median_distance = median(pull(genes_for_species, dist))
    )

    # Column names have to be unique for each species.
    genes_for_species <- rename_with(
      genes_for_species,
      ~ paste(species_id, .x, sep = "_"),
      c(dist, name, chromosome)
    )

    # Join on the gene ID only; relying on all shared column names would
    # silently change the key if a file carried additional columns.
    genes <- full_join(genes, genes_for_species, by = "geneid")
  }

  # The empty template fixes the column types even when no files were found.
  species_computed <- bind_rows(
    tibble(id = character(), median_distance = numeric()),
    species_rows
  )

  species <- left_join(species, species_computed, by = "id")

  list(
    genes = genes,
    species = species
  )
}