mirror of
https://github.com/johrpan/geposanui.git
synced 2025-10-26 11:17:24 +01:00
data: Revert to using data.table
This commit is contained in:
parent
998009b418
commit
a50feafcca
1 changed file with 28 additions and 34 deletions
62
data.R
62
data.R
|
|
@ -1,6 +1,4 @@
|
||||||
library(dplyr)
|
library(data.table)
|
||||||
library(readr)
|
|
||||||
library(tibble)
|
|
||||||
|
|
||||||
#' Load and preprocess input data from `path`.
|
#' Load and preprocess input data from `path`.
|
||||||
#'
|
#'
|
||||||
|
|
@ -25,7 +23,7 @@ load_data_cached <- function(path) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Merge genome data from files in `path` into `tibble`s.
|
#' Merge genome data from files in `path` into `data.table`s.
|
||||||
#'
|
#'
|
||||||
#' The result will be a list with named elements:
|
#' The result will be a list with named elements:
|
||||||
#' - `genes` will be a table with metadata on human genes.
|
#' - `genes` will be a table with metadata on human genes.
|
||||||
|
|
@ -34,48 +32,44 @@ load_data_cached <- function(path) {
|
||||||
#'
|
#'
|
||||||
#' @seealso [load_data_cached()]
|
#' @seealso [load_data_cached()]
|
||||||
load_data <- function(path) {
|
load_data <- function(path) {
|
||||||
genes <- read_tsv(paste(path, "genes.tsv", sep = "/"))
|
genes <- fread(paste(path, "genes.tsv", sep = "/"))
|
||||||
species <- read_csv(paste(path, "species.csv", sep = "/"))
|
original_species <- fread(paste(path, "species.csv", sep = "/"))
|
||||||
distances <- tibble(geneid = integer())
|
|
||||||
|
species <- data.table(
|
||||||
|
id = character(),
|
||||||
|
label = character(),
|
||||||
|
median_distance = numeric()
|
||||||
|
)
|
||||||
|
|
||||||
|
distances <- data.table(geneid = integer())
|
||||||
|
|
||||||
# Each file will contain data on one species.
|
# Each file will contain data on one species.
|
||||||
file_names <- list.files(paste(path, "genomes", sep = "/"))
|
file_names <- list.files(paste(path, "genomes", sep = "/"))
|
||||||
|
|
||||||
# Table containing additional columns to be added to the species table
|
|
||||||
# later.
|
|
||||||
species_computed <- tibble(
|
|
||||||
id = character(),
|
|
||||||
median_distance = numeric()
|
|
||||||
)
|
|
||||||
|
|
||||||
for (file_name in file_names) {
|
for (file_name in file_names) {
|
||||||
species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
|
species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
|
||||||
species_path <- paste(path, "genomes", file_name, sep = "/")
|
|
||||||
species_distances <- read_tsv(species_path)
|
|
||||||
|
|
||||||
# Compute the median distance across all genes of this species.
|
# Only continue for replicatively aging species.
|
||||||
median_distance <- species_distances %>%
|
# TODO: Which other species should be included?
|
||||||
select(dist) %>%
|
if (original_species[id == species_id, group] == "replicative") {
|
||||||
summarise(median_distance = median(dist)) %>%
|
species_path <- paste(path, "genomes", file_name, sep = "/")
|
||||||
pull(median_distance)
|
species_distances <- fread(species_path)
|
||||||
|
|
||||||
# Cache the values to be added to the species table.
|
# Compute the median distance across all genes of this species and
|
||||||
species_computed <- species_computed %>% add_row(
|
# add it to the species table along with other static data.
|
||||||
id = species_id,
|
species <- rbindlist(list(species, data.table(
|
||||||
median_distance = median_distance,
|
id = species_id,
|
||||||
)
|
label = original_species[id == species_id, label],
|
||||||
|
median_distance = median(species_distances[, dist])
|
||||||
|
)))
|
||||||
|
|
||||||
# Column names have to be unique for each species.
|
# Column names have to be unique for each species.
|
||||||
# TODO: How to create a dynamic column name using `rename()`?
|
setnames(species_distances, "dist", species_id)
|
||||||
species_distances <- species_distances %>%
|
|
||||||
rename_with(function(x) species_id, dist)
|
|
||||||
|
|
||||||
distances <- full_join(distances, species_distances)
|
distances <- merge(distances, species_distances, all = TRUE)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add additional columns to the original species table.
|
|
||||||
species <- left_join(species, species_computed)
|
|
||||||
|
|
||||||
list(
|
list(
|
||||||
genes = genes,
|
genes = genes,
|
||||||
species = species,
|
species = species,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue