mirror of
https://github.com/johrpan/geposanui.git
synced 2025-10-26 11:17:24 +01:00
data: Simplify data structure
This commit also adds the input data to the index.
This commit is contained in:
parent
914673c79c
commit
998009b418
205 changed files with 3296891 additions and 272 deletions
41
data.R
41
data.R
|
|
@ -27,35 +27,34 @@ load_data_cached <- function(path) {
|
|||
|
||||
#' Merge genome data from files in `path` into `tibble`s.
|
||||
#'
|
||||
#' The result will be a list with two named elements:
|
||||
#' - `genes` will be a table with one row per unique `geneid` and multiple
|
||||
#' columns per species containing the data of interest.
|
||||
#' - `species` will contain additional information on each species.
|
||||
#' The result will be a list with named elements:
|
||||
#' - `genes` will be a table with metadata on human genes.
|
||||
#' - `species` will contain metadata on each species.
|
||||
#' - `distances` will contain each species' genes' distances to the telomere.
|
||||
#'
|
||||
#' @seealso [load_data_cached()]
|
||||
load_data <- function(path) {
|
||||
# The resulting table for information by species.
|
||||
genes <- read_tsv(paste(path, "genes.tsv", sep = "/"))
|
||||
species <- read_csv(paste(path, "species.csv", sep = "/"))
|
||||
|
||||
# The resulting table for information by gene. For each species, columns
|
||||
# will be appended.
|
||||
genes <- tibble(geneid = integer())
|
||||
distances <- tibble(geneid = integer())
|
||||
|
||||
# Each file will contain data on one species.
|
||||
file_names <- list.files(path, "*_raw.txt")
|
||||
file_names <- list.files(paste(path, "genomes", sep = "/"))
|
||||
|
||||
# Table containing additional columns to be added to the species table.
|
||||
# Table containing additional columns to be added to the species table
|
||||
# later.
|
||||
species_computed <- tibble(
|
||||
id = character(),
|
||||
median_distance = numeric()
|
||||
)
|
||||
|
||||
for (file_name in file_names) {
|
||||
species_id <- strsplit(file_name, split = "_")[[1]][1]
|
||||
genes_for_species <- read_tsv(paste(path, file_name, sep = "/"))
|
||||
species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
|
||||
species_path <- paste(path, "genomes", file_name, sep = "/")
|
||||
species_distances <- read_tsv(species_path)
|
||||
|
||||
# Compute the median distance across all genes of this species.
|
||||
median_distance <- genes_for_species %>%
|
||||
median_distance <- species_distances %>%
|
||||
select(dist) %>%
|
||||
summarise(median_distance = median(dist)) %>%
|
||||
pull(median_distance)
|
||||
|
|
@ -67,19 +66,19 @@ load_data <- function(path) {
|
|||
)
|
||||
|
||||
# Column names have to be unique for each species.
|
||||
genes_for_species <- rename_with(
|
||||
genes_for_species,
|
||||
~ paste(species_id, .x, sep = "_"),
|
||||
c(dist, name, chromosome)
|
||||
)
|
||||
# TODO: How to create a dynamic column name using `rename()`?
|
||||
species_distances <- species_distances %>%
|
||||
rename_with(function(x) species_id, dist)
|
||||
|
||||
genes <- full_join(genes, genes_for_species)
|
||||
distances <- full_join(distances, species_distances)
|
||||
}
|
||||
|
||||
# Add additional columns to the original species table.
|
||||
species <- left_join(species, species_computed)
|
||||
|
||||
list(
|
||||
genes = genes,
|
||||
species = species
|
||||
species = species,
|
||||
distances = distances
|
||||
)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue