mirror of
https://github.com/johrpan/geposanui.git
synced 2025-10-26 19:27:24 +01:00
data: Migrate to tidyverse
This commit is contained in:
parent
cbc444f20c
commit
914673c79c
1 changed file with 41 additions and 32 deletions
69
data.R
69
data.R
|
|
@ -1,6 +1,6 @@
|
||||||
library(data.table)
|
library(dplyr)
|
||||||
|
library(readr)
|
||||||
source("species.R")
|
library(tibble)
|
||||||
|
|
||||||
#' Load and preprocess input data from `path`.
|
#' Load and preprocess input data from `path`.
|
||||||
#'
|
#'
|
||||||
|
|
@ -13,7 +13,7 @@ load_data_cached <- function(path) {
|
||||||
|
|
||||||
if (!file.exists(cache_file)) {
|
if (!file.exists(cache_file)) {
|
||||||
# If the cache file doesn't exist, we have to do the computation.
|
# If the cache file doesn't exist, we have to do the computation.
|
||||||
data <- load_data("input")
|
data <- load_data(path)
|
||||||
|
|
||||||
# The results are cached for the next run.
|
# The results are cached for the next run.
|
||||||
saveRDS(data, cache_file)
|
saveRDS(data, cache_file)
|
||||||
|
|
@ -25,52 +25,61 @@ load_data_cached <- function(path) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Merge genome data from files in `path` into `data.table`s.
|
#' Merge genome data from files in `path` into `tibble`s.
|
||||||
#'
|
#'
|
||||||
#' The result will be a list with two items:
|
#' The result will be a list with two named elements:
|
||||||
#' - `genes` will be a table with one row per unique `geneid` and multiple
|
#' - `genes` will be a table with one row per unique `geneid` and multiple
|
||||||
#' columns per species containing the data of interest.
|
#' columns per species containing the data of interest.
|
||||||
#' - `species` will contain information that is useful to be accessed by
|
#' - `species` will contain additional information on each species.
|
||||||
#' species.
|
|
||||||
#'
|
#'
|
||||||
#' @seealso [load_data_cached()]
|
#' @seealso [load_data_cached()]
|
||||||
load_data <- function(path) {
|
load_data <- function(path) {
|
||||||
|
# The resulting table for information by species.
|
||||||
|
species <- read_csv(paste(path, "species.csv", sep = "/"))
|
||||||
|
|
||||||
# The resulting table for information by gene. For each species, columns
|
# The resulting table for information by gene. For each species, columns
|
||||||
# will be appended.
|
# will be appended.
|
||||||
genes_table <- data.table(geneid = integer())
|
genes <- tibble(geneid = integer())
|
||||||
|
|
||||||
# The resulting table for information by species. This will result in a
|
|
||||||
# warning, because all median_distance values will be filled with `NA`
|
|
||||||
# (correctly).
|
|
||||||
species_table <- data.table(species, median_distance = numeric())
|
|
||||||
|
|
||||||
|
# Each file will contain data on one species.
|
||||||
file_names <- list.files(path, "*_raw.txt")
|
file_names <- list.files(path, "*_raw.txt")
|
||||||
|
|
||||||
|
# Table containing additional columns to be added to the species table.
|
||||||
|
species_computed <- tibble(
|
||||||
|
id = character(),
|
||||||
|
median_distance = numeric()
|
||||||
|
)
|
||||||
|
|
||||||
for (file_name in file_names) {
|
for (file_name in file_names) {
|
||||||
species_id <- strsplit(file_name, split = "_")[[1]][1]
|
species_id <- strsplit(file_name, split = "_")[[1]][1]
|
||||||
genes_table_for_species <- fread(paste(path, file_name, sep = "/"))
|
genes_for_species <- read_tsv(paste(path, file_name, sep = "/"))
|
||||||
|
|
||||||
# Fill in the new column of the species table (`median_distance`).
|
# Compute the median distance across all genes of this species.
|
||||||
species_table[
|
median_distance <- genes_for_species %>%
|
||||||
id == species_id,
|
select(dist) %>%
|
||||||
median_distance := median(genes_table_for_species[, dist])
|
summarise(median_distance = median(dist)) %>%
|
||||||
]
|
pull(median_distance)
|
||||||
|
|
||||||
|
# Cache the values to be added to the species table.
|
||||||
|
species_computed <- species_computed %>% add_row(
|
||||||
|
id = species_id,
|
||||||
|
median_distance = median_distance,
|
||||||
|
)
|
||||||
|
|
||||||
# Column names have to be unique for each species.
|
# Column names have to be unique for each species.
|
||||||
colnames(genes_table_for_species)[c(2, 3, 4)] <- c(
|
genes_for_species <- rename_with(
|
||||||
paste(species_id, c("dist", "name", "chromosome"), sep = "_")
|
genes_for_species,
|
||||||
|
~ paste(species_id, .x, sep = "_"),
|
||||||
|
c(dist, name, chromosome)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add new genes as rows as well as new columns for this species.
|
genes <- full_join(genes, genes_for_species)
|
||||||
genes_table <- merge(
|
|
||||||
genes_table,
|
|
||||||
genes_table_for_species,
|
|
||||||
all = TRUE
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
species <- left_join(species, species_computed)
|
||||||
|
|
||||||
list(
|
list(
|
||||||
genes = genes_table,
|
genes = genes,
|
||||||
species = species_table
|
species = species
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue