data: Simplify data structure

This commit also adds the input data to the index.
This commit is contained in:
Elias Projahn 2021-06-24 20:20:46 +02:00
parent 914673c79c
commit 998009b418
205 changed files with 3296891 additions and 272 deletions

1
.gitignore vendored
View file

@@ -1 +0,0 @@
/input

41
data.R
View file

@@ -27,35 +27,34 @@ load_data_cached <- function(path) {
#' Merge genome data from files in `path` into `tibble`s.
#'
#' The result will be a list with two named elements:
#' - `genes` will be a table with one row per unique `geneid` and multiple
#' columns per species containing the data of interest.
#' - `species` will contain additional information on each species.
#' The result will be a list with named elements:
#' - `genes` will be a table with metadata on human genes.
#' - `species` will contain metadata on each species.
#' - `distances` will contain each species' genes' distances to the telomere.
#'
#' @seealso [load_data_cached()]
load_data <- function(path) {
# The resulting table for information by species.
genes <- read_tsv(paste(path, "genes.tsv", sep = "/"))
species <- read_csv(paste(path, "species.csv", sep = "/"))
# The resulting table for information by gene. For each species, columns
# will be appended.
genes <- tibble(geneid = integer())
distances <- tibble(geneid = integer())
# Each file will contain data on one species.
file_names <- list.files(path, "*_raw.txt")
file_names <- list.files(paste(path, "genomes", sep = "/"))
# Table containing additional columns to be added to the species table.
# Table containing additional columns to be added to the species table
# later.
species_computed <- tibble(
id = character(),
median_distance = numeric()
)
for (file_name in file_names) {
species_id <- strsplit(file_name, split = "_")[[1]][1]
genes_for_species <- read_tsv(paste(path, file_name, sep = "/"))
species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
species_path <- paste(path, "genomes", file_name, sep = "/")
species_distances <- read_tsv(species_path)
# Compute the median distance across all genes of this species.
median_distance <- genes_for_species %>%
median_distance <- species_distances %>%
select(dist) %>%
summarise(median_distance = median(dist)) %>%
pull(median_distance)
@@ -67,19 +66,19 @@ load_data <- function(path) {
)
# Column names have to be unique for each species.
genes_for_species <- rename_with(
genes_for_species,
~ paste(species_id, .x, sep = "_"),
c(dist, name, chromosome)
)
# TODO: How to create a dynamic column name using `rename()`?
species_distances <- species_distances %>%
rename_with(function(x) species_id, dist)
genes <- full_join(genes, genes_for_species)
distances <- full_join(distances, species_distances)
}
# Add additional columns to the original species table.
species <- left_join(species, species_computed)
list(
genes = genes,
species = species
species = species,
distances = distances
)
}

BIN
input/cache.rds Normal file

Binary file not shown.

67129
input/genes.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

14776
input/genomes/acalliptera.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

14792
input/genomes/acchrysaetos.tsv Normal file

File diff suppressed because it is too large Load diff

14243
input/genomes/acitrinellus.tsv Normal file

File diff suppressed because it is too large Load diff

18959
input/genomes/amelanoleuca.tsv Normal file

File diff suppressed because it is too large Load diff

14870
input/genomes/amexicanus.tsv Normal file

File diff suppressed because it is too large Load diff

17685
input/genomes/anancymaae.tsv Normal file

File diff suppressed because it is too large Load diff

14450
input/genomes/aocellaris.tsv Normal file

File diff suppressed because it is too large Load diff

14719
input/genomes/apercula.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

14424
input/genomes/atestudineus.tsv Normal file

File diff suppressed because it is too large Load diff

17371
input/genomes/bbbison.tsv Normal file

File diff suppressed because it is too large Load diff

18364
input/genomes/bgrunniens.tsv Normal file

File diff suppressed because it is too large Load diff

19142
input/genomes/bihybrid.tsv Normal file

File diff suppressed because it is too large Load diff

18616
input/genomes/bmusculus.tsv Normal file

File diff suppressed because it is too large Load diff

17773
input/genomes/bmutus.tsv Normal file

File diff suppressed because it is too large Load diff

14321
input/genomes/bsplendens.tsv Normal file

File diff suppressed because it is too large Load diff

19402
input/genomes/btaurus.tsv Normal file

File diff suppressed because it is too large Load diff

15731
input/genomes/cabingdonii.tsv Normal file

File diff suppressed because it is too large Load diff

18247
input/genomes/catys.tsv Normal file

File diff suppressed because it is too large Load diff

15122
input/genomes/cauratus.tsv Normal file

File diff suppressed because it is too large Load diff

14270
input/genomes/ccarpio.tsv Normal file

File diff suppressed because it is too large Load diff

19027
input/genomes/cdromedarius.tsv Normal file

File diff suppressed because it is too large Load diff

8235
input/genomes/celegans.tsv Normal file

File diff suppressed because it is too large Load diff

19183
input/genomes/cgchok1gshd.tsv Normal file

File diff suppressed because it is too large Load diff

13851
input/genomes/cgobio.tsv Normal file

File diff suppressed because it is too large Load diff

14151
input/genomes/charengus.tsv Normal file

File diff suppressed because it is too large Load diff

19133
input/genomes/chircus.tsv Normal file

File diff suppressed because it is too large Load diff

11381
input/genomes/choffmanni.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

21180
input/genomes/cjacchus.tsv Normal file

File diff suppressed because it is too large Load diff

14570
input/genomes/cjaponica.tsv Normal file

File diff suppressed because it is too large Load diff

17004
input/genomes/clanigera.tsv Normal file

File diff suppressed because it is too large Load diff

19332
input/genomes/cldingo.tsv Normal file

File diff suppressed because it is too large Load diff

18684
input/genomes/clfamiliaris.tsv Normal file

File diff suppressed because it is too large Load diff

14177
input/genomes/clumpus.tsv Normal file

File diff suppressed because it is too large Load diff

13897
input/genomes/cmilii.tsv Normal file

File diff suppressed because it is too large Load diff

16083
input/genomes/cpbellii.tsv Normal file

File diff suppressed because it is too large Load diff

18525
input/genomes/cporcellus.tsv Normal file

File diff suppressed because it is too large Load diff

13827
input/genomes/cporosus.tsv Normal file

File diff suppressed because it is too large Load diff

22241
input/genomes/csabaeus.tsv Normal file

File diff suppressed because it is too large Load diff

8411
input/genomes/csavignyi.tsv Normal file

File diff suppressed because it is too large Load diff

14222
input/genomes/csemilaevis.tsv Normal file

File diff suppressed because it is too large Load diff

16520
input/genomes/csyrichta.tsv Normal file

File diff suppressed because it is too large Load diff

14163
input/genomes/cvariegatus.tsv Normal file

File diff suppressed because it is too large Load diff

19044
input/genomes/cwagneri.tsv Normal file

File diff suppressed because it is too large Load diff

14514
input/genomes/dclupeoides.tsv Normal file

File diff suppressed because it is too large Load diff

14372
input/genomes/dlabrax.tsv Normal file

File diff suppressed because it is too large Load diff

18098
input/genomes/dleucas.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

15827
input/genomes/dordii.tsv Normal file

File diff suppressed because it is too large Load diff

14826
input/genomes/drerio.tsv Normal file

File diff suppressed because it is too large Load diff

17856
input/genomes/eaasinus.tsv Normal file

File diff suppressed because it is too large Load diff

10212
input/genomes/eburgeri.tsv Normal file

File diff suppressed because it is too large Load diff

19001
input/genomes/ecaballus.tsv Normal file

File diff suppressed because it is too large Load diff

15068
input/genomes/ecalabaricus.tsv Normal file

File diff suppressed because it is too large Load diff

14268
input/genomes/eelectricus.tsv Normal file

File diff suppressed because it is too large Load diff

13489
input/genomes/eeuropaeus.tsv Normal file

File diff suppressed because it is too large Load diff

14901
input/genomes/elucius.tsv Normal file

File diff suppressed because it is too large Load diff

14362
input/genomes/etelfairi.tsv Normal file

File diff suppressed because it is too large Load diff

13497
input/genomes/falbicollis.tsv Normal file

File diff suppressed because it is too large Load diff

19234
input/genomes/fcatus.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

13830
input/genomes/gaculeatus.tsv Normal file

File diff suppressed because it is too large Load diff

16198
input/genomes/gevgoodei.tsv Normal file

File diff suppressed because it is too large Load diff

12967
input/genomes/gfortis.tsv Normal file

File diff suppressed because it is too large Load diff

14845
input/genomes/ggallus.tsv Normal file

File diff suppressed because it is too large Load diff

23150
input/genomes/ggorilla.tsv Normal file

File diff suppressed because it is too large Load diff

13741
input/genomes/gmorhua.tsv Normal file

File diff suppressed because it is too large Load diff

14404
input/genomes/hburtoni.tsv Normal file

File diff suppressed because it is too large Load diff

13801
input/genomes/hcomes.tsv Normal file

File diff suppressed because it is too large Load diff

17112
input/genomes/hgfemale.tsv Normal file

File diff suppressed because it is too large Load diff

14942
input/genomes/hhucho.tsv Normal file

File diff suppressed because it is too large Load diff

67129
input/genomes/hsapiens.tsv Normal file

File diff suppressed because it is too large Load diff

14720
input/genomes/ipunctatus.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

15882
input/genomes/jjaculus.tsv Normal file

File diff suppressed because it is too large Load diff

14206
input/genomes/kmarmoratus.tsv Normal file

File diff suppressed because it is too large Load diff

18299
input/genomes/lafricana.tsv Normal file

File diff suppressed because it is too large Load diff

14277
input/genomes/lbergylta.tsv Normal file

File diff suppressed because it is too large Load diff

14415
input/genomes/lcalcarifer.tsv Normal file

File diff suppressed because it is too large Load diff

15064
input/genomes/lchalumnae.tsv Normal file

File diff suppressed because it is too large Load diff

14584
input/genomes/lcrocea.tsv Normal file

File diff suppressed because it is too large Load diff

14071
input/genomes/llaticaudata.tsv Normal file

File diff suppressed because it is too large Load diff

15038
input/genomes/lleishanense.tsv Normal file

File diff suppressed because it is too large Load diff

14935
input/genomes/loculatus.tsv Normal file

File diff suppressed because it is too large Load diff

14321
input/genomes/marmatus.tsv Normal file

File diff suppressed because it is too large Load diff

16590
input/genomes/mauratus.tsv Normal file

File diff suppressed because it is too large Load diff

18511
input/genomes/mcaroli.tsv Normal file

File diff suppressed because it is too large Load diff

17333
input/genomes/mdomestica.tsv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

13598
input/genomes/mgallopavo.tsv Normal file

File diff suppressed because it is too large Load diff

Some files were not shown because too many files have changed in this diff Show more