geposanui/input.R

68 lines
2.1 KiB
R
Raw Normal View History

2021-06-24 22:36:02 +02:00
library(data.table)
2021-08-25 12:00:22 +02:00
library(rlog)
2021-06-16 22:01:09 +02:00
2021-06-24 22:36:02 +02:00
#' Merge genome data from files in `path` into `data.table`s.
#'
#' Reads `genes.tsv` and `species.csv` from `path`, plus one file per species
#' from the `genomes` subdirectory. The part of each genome file name before
#' the first `.` is used as the species ID and is looked up in the `id` column
#' of `species.csv`.
#'
#' The result will be a list with named elements:
#' - `genes` will be a table with metadata on human genes.
#' - `species` will contain metadata on each species.
#' - `distances` will contain each species' genes' distances to the telomere.
#'
#' @param path Directory containing `genes.tsv`, `species.csv` and a
#'   `genomes` subdirectory. Each genome file must provide the columns
#'   `geneid` and `dist`; `species.csv` must provide `id`, `group` and
#'   `label`.
#' @return A named list with the `data.table`s `genes`, `species` (columns
#'   `id`, `group`, `label`, `median_distance`; ordered by `median_distance`)
#'   and `distances` (columns `species`, `gene`, `distance`).
load_input <- function(path) {
  genes <- fread(file.path(path, "genes.tsv"))
  original_species <- fread(file.path(path, "species.csv"))

  missing_columns <- setdiff(
    c("id", "group", "label"),
    names(original_species)
  )

  if (length(missing_columns) > 0) {
    stop(
      "species.csv is missing required column(s): ",
      paste(missing_columns, collapse = ", "),
      call. = FALSE
    )
  }

  # Each file will contain data on one species.
  genomes_path <- file.path(path, "genomes")
  file_names <- list.files(genomes_path)
  n_species <- length(file_names)

  # Collect the per-species tables in preallocated lists and bind them once
  # after the loop instead of re-copying the accumulated tables on every
  # iteration.
  species_parts <- vector("list", n_species)
  distances_parts <- vector("list", n_species)

  for (i in seq_along(file_names)) {
    file_name <- file_names[i]
    species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
    species_path <- file.path(genomes_path, file_name)

    log_info(sprintf(
      "Reading species %i/%i (%s)", i, n_species, species_id
    ))

    species_distances <- fread(species_path)

    missing_columns <- setdiff(c("geneid", "dist"), names(species_distances))

    if (length(missing_columns) > 0) {
      stop(
        "Genome file '", species_path, "' is missing required column(s): ",
        paste(missing_columns, collapse = ", "),
        call. = FALSE
      )
    }

    # `match()` yields exactly one index (the first hit) or NA, so every
    # genome file contributes exactly one row to the species table even if
    # its ID is missing from or duplicated in species.csv.
    species_row <- match(species_id, original_species[["id"]])

    if (is.na(species_row)) {
      log_warn(sprintf(
        "No metadata for species %s in species.csv", species_id
      ))
    }

    # Compute the median distance across all genes of this species and
    # add it to the species table along other static data.
    species_parts[[i]] <- data.table(
      id = species_id,
      group = original_species[["group"]][species_row],
      label = original_species[["label"]][species_row],
      median_distance = median(species_distances[["dist"]])
    )

    # Repeat the ID explicitly so that an empty genome file results in zero
    # rows instead of a single row filled with NA.
    distances_parts[[i]] <- data.table(
      species = rep(species_id, nrow(species_distances)),
      gene = species_distances[["geneid"]],
      distance = species_distances[["dist"]]
    )
  }

  # Prepending empty templates keeps the column layout intact even if there
  # are no genome files at all.
  species <- rbindlist(c(
    list(data.table(
      id = character(),
      group = character(),
      label = character(),
      median_distance = numeric()
    )),
    species_parts
  ))

  distances <- rbindlist(c(
    list(data.table(
      species = character(),
      gene = integer(),
      distance = integer()
    )),
    distances_parts
  ))

  # Order species by their median distance.
  setorder(species, median_distance)

  list(
    genes = genes,
    species = species,
    distances = distances
  )
}