data: Revert to using data.table

This commit is contained in:
Elias Projahn 2021-06-24 22:36:02 +02:00
parent 998009b418
commit a50feafcca

62
data.R
View file

@ -1,6 +1,4 @@
library(dplyr) library(data.table)
library(readr)
library(tibble)
#' Load and preprocess input data from `path`. #' Load and preprocess input data from `path`.
#' #'
@ -25,7 +23,7 @@ load_data_cached <- function(path) {
} }
} }
#' Merge genome data from files in `path` into `tibble`s. #' Merge genome data from files in `path` into `data.table`s.
#' #'
#' The result will be a list with named elements: #' The result will be a list with named elements:
#' - `genes` will be a table with metadata on human genes. #' - `genes` will be a table with metadata on human genes.
@ -34,48 +32,44 @@ load_data_cached <- function(path) {
#' #'
#' @seealso [load_data_cached()] #' @seealso [load_data_cached()]
load_data <- function(path) { load_data <- function(path) {
genes <- read_tsv(paste(path, "genes.tsv", sep = "/")) genes <- fread(paste(path, "genes.tsv", sep = "/"))
species <- read_csv(paste(path, "species.csv", sep = "/")) original_species <- fread(paste(path, "species.csv", sep = "/"))
distances <- tibble(geneid = integer())
species <- data.table(
id = character(),
label = character(),
median_distance = numeric()
)
distances <- data.table(geneid = integer())
# Each file will contain data on one species. # Each file will contain data on one species.
file_names <- list.files(paste(path, "genomes", sep = "/")) file_names <- list.files(paste(path, "genomes", sep = "/"))
# Table containing additional columns to be added to the species table
# later.
species_computed <- tibble(
id = character(),
median_distance = numeric()
)
for (file_name in file_names) { for (file_name in file_names) {
species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1] species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
species_path <- paste(path, "genomes", file_name, sep = "/")
species_distances <- read_tsv(species_path)
# Compute the median distance across all genes of this species. # Only continue for replicatively aging species.
median_distance <- species_distances %>% # TODO: Which other species should be included?
select(dist) %>% if (original_species[id == species_id, group] == "replicative") {
summarise(median_distance = median(dist)) %>% species_path <- paste(path, "genomes", file_name, sep = "/")
pull(median_distance) species_distances <- fread(species_path)
# Cache the values to be added to the species table. # Compute the median distance across all genes of this species and
species_computed <- species_computed %>% add_row( # add it to the species table along other static data.
id = species_id, species <- rbindlist(list(species, data.table(
median_distance = median_distance, id = species_id,
) label = original_species[id == species_id, label],
median_distance = median(species_distances[, dist])
)))
# Column names have to be unique for each species. # Column names have to be unique for each species.
# TODO: How to create a dynamic column name using `rename()`? setnames(species_distances, "dist", species_id)
species_distances <- species_distances %>%
rename_with(function(x) species_id, dist)
distances <- full_join(distances, species_distances) distances <- merge(distances, species_distances, all = TRUE)
}
} }
# Add additional columns to the original species table.
species <- left_join(species, species_computed)
list( list(
genes = genes, genes = genes,
species = species, species = species,