mirror of
https://github.com/johrpan/geposanui.git
synced 2025-10-26 11:17:24 +01:00
Retrieve input data using biomaRt
This commit is contained in:
parent
040aabc610
commit
1cea6c3631
205 changed files with 187 additions and 3296961 deletions
193
input.R
193
input.R
|
|
@ -1,5 +1,36 @@
|
|||
library(biomaRt)
|
||||
library(data.table)
|
||||
library(rlog)
|
||||
library(stringr)
|
||||
|
||||
#' Species IDs of known replicatively aging species.
|
||||
species_ids_replicative <- c(
|
||||
"bihybrid",
|
||||
"btaurus",
|
||||
"bthybrid",
|
||||
"cfamiliaris",
|
||||
"chircus",
|
||||
"cjacchus",
|
||||
"clfamiliaris",
|
||||
"csabaeus",
|
||||
"ecaballus",
|
||||
"fcatus",
|
||||
"ggorilla",
|
||||
"hsapiens",
|
||||
"lafricana",
|
||||
"mfascicularis",
|
||||
"mmulatta",
|
||||
"mmurinus",
|
||||
"mnemestrina",
|
||||
"nleucogenys",
|
||||
"oaries",
|
||||
"pabelii",
|
||||
"panubis",
|
||||
"ppaniscus",
|
||||
"ptroglodytes",
|
||||
"sscrofa",
|
||||
"tgelada"
|
||||
)
|
||||
|
||||
#' Gene names of genes for verified TPE-OLD genes.
|
||||
genes_verified_tpe_old <- c(
|
||||
|
|
@ -25,75 +56,137 @@ genes_suggested_tpe_old <- c(
|
|||
"TSPAN9"
|
||||
)
|
||||
|
||||
#' Merge genome data from files in `path` into `data.table`s.
|
||||
ensembl <- useEnsembl(
|
||||
biomart = "ensembl",
|
||||
version = 104
|
||||
)
|
||||
|
||||
#' Retrieve information on species.
|
||||
#'
|
||||
#' The result will be a list with named elements:
|
||||
#' - `genes` will be a table with metadata on human genes.
|
||||
#' - `species` will contain metadata on each species.
|
||||
#' - `distances` will contain each species' genes' distances to the telomere.
|
||||
load_input <- function(path) {
|
||||
# Include data on TPE-OLD status for genes.
|
||||
#' The result will be a `data.table` with the following columns:
|
||||
#'
|
||||
#' - `id` Species ID as presented by Ensembl.
|
||||
#' - `name` Human readable species name.
|
||||
#' - `replicative` Whether the species is likely to be aging replicatively.
|
||||
retrieve_species <- function() {
|
||||
# Ensembl datasets correspond to distinct species.
|
||||
ensembl_datasets <- data.table(listDatasets(ensembl))
|
||||
|
||||
genes <- fread(paste(path, "genes.tsv", sep = "/"))
|
||||
genes[name %chin% genes_verified_tpe_old, verified := TRUE]
|
||||
genes[name %chin% genes_suggested_tpe_old, suggested := TRUE]
|
||||
# Filter out species ID and name from the result.
|
||||
species <- ensembl_datasets[, .(
|
||||
id = str_match(dataset, "(.*)_gene_ensembl")[, 2],
|
||||
name = str_match(description, "(.*) genes \\(.*\\)")[, 2]
|
||||
)]
|
||||
|
||||
# Load and combine data on species and gene distances.
|
||||
species[, replicative := id %chin% species_ids_replicative]
|
||||
}
|
||||
|
||||
original_species <- fread(paste(path, "species.csv", sep = "/"))
|
||||
#' Retrieve information on human genes.
|
||||
#'
|
||||
#' The result will be a `data.table` with the following columns:
|
||||
#'
|
||||
#' - `id` Ensembl gene ID.
|
||||
#' - `ǹame` HGNC name of the gene.
|
||||
#' - `chromosome` Human chromosome on which the gene is located.
|
||||
retrieve_genes <- function() {
|
||||
genes <- data.table(getBM(
|
||||
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
|
||||
mart = useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
||||
))
|
||||
|
||||
species <- data.table(
|
||||
id = character(),
|
||||
group = character(),
|
||||
label = character(),
|
||||
median_distance = numeric()
|
||||
)
|
||||
genes[, .(
|
||||
id = ensembl_gene_id,
|
||||
name = hgnc_symbol,
|
||||
chromosome = chromosome_name,
|
||||
verified = hgnc_symbol %chin% genes_verified_tpe_old,
|
||||
suggested = hgnc_symbol %chin% genes_suggested_tpe_old
|
||||
)]
|
||||
}
|
||||
|
||||
#' Retrieve gene distance data.
|
||||
#'
|
||||
#' The data will include all available values for the given species and genes.
|
||||
#' Specific values on naturally or artificially (e.g. due to incomplete
|
||||
#' sequencing) short chromosomes will be excluded.
|
||||
#'
|
||||
#' The result will be a `data.table` with the following columns:
|
||||
#'
|
||||
#' - `species` Species ID.
|
||||
#' - `gene` Ensembl gene ID.
|
||||
#' - `distance` Distance to nearest telomere in base pairs.
|
||||
retrieve_distances <- function(species_ids, gene_ids) {
|
||||
distances <- data.table(
|
||||
species = character(),
|
||||
gene = integer(),
|
||||
gene = character(),
|
||||
distance = integer()
|
||||
)
|
||||
|
||||
# Each file will contain data on one species.
|
||||
file_names <- list.files(paste(path, "genomes", sep = "/"))
|
||||
n_species <- length(file_names)
|
||||
species_count <- length(species_ids)
|
||||
|
||||
for (i in seq_along(file_names)) {
|
||||
file_name <- file_names[i]
|
||||
species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
|
||||
species_path <- paste(path, "genomes", file_name, sep = "/")
|
||||
for (i in 1:species_count) {
|
||||
species_id <- species_ids[i]
|
||||
|
||||
log_info(sprintf(
|
||||
"Reading species %i/%i (%s)", i, n_species, species_id
|
||||
"[%3i%%] Loading species \"%s\"",
|
||||
round(i / species_count * 100),
|
||||
species_id
|
||||
))
|
||||
|
||||
species_distances <- fread(species_path)
|
||||
|
||||
# Compute the median distance across all genes of this species and
|
||||
# add it to the species table along other static data.
|
||||
species <- rbindlist(list(species, data.table(
|
||||
id = species_id,
|
||||
group = original_species[id == species_id, group],
|
||||
label = original_species[id == species_id, label],
|
||||
median_distance = median(species_distances[, dist])
|
||||
)))
|
||||
|
||||
species_distances <- data.table(
|
||||
species = species_id,
|
||||
gene = species_distances[, geneid],
|
||||
distance = species_distances[, dist]
|
||||
ensembl <- useDataset(
|
||||
sprintf("%s_gene_ensembl", species_id),
|
||||
mart = ensembl
|
||||
)
|
||||
|
||||
# Besides the attributes that are always present, we need to check for
|
||||
# human orthologs. Some species don't have that information and will be
|
||||
# skipped.
|
||||
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
||||
listAttributes(ensembl, what = "name")) {
|
||||
next
|
||||
}
|
||||
|
||||
# Retrieve information on all genes of the current species, that have
|
||||
# human orthologs. This is called "homolog" in the Ensembl schema.
|
||||
ensembl_distances <- data.table(getBM(
|
||||
filters = c("with_hsapiens_homolog"),
|
||||
values = c(TRUE),
|
||||
attributes = c(
|
||||
"hsapiens_homolog_ensembl_gene",
|
||||
"chromosome_name",
|
||||
"start_position",
|
||||
"end_position"
|
||||
),
|
||||
mart = useDataset(
|
||||
sprintf("%s_gene_ensembl", species_id),
|
||||
mart = ensembl
|
||||
)
|
||||
))
|
||||
|
||||
ensembl_distances[,
|
||||
chromosome_length := max(end_position),
|
||||
by = chromosome_name
|
||||
]
|
||||
|
||||
# Filter out relevant information and precompute the genes' distance to
|
||||
# the nearest telomere. Exclude genes on naturally or artificially
|
||||
# short chromosomes.
|
||||
species_distances <- ensembl_distances[
|
||||
chromosome_length > 15000000,
|
||||
.(
|
||||
species = species_id,
|
||||
gene = hsapiens_homolog_ensembl_gene,
|
||||
distance = pmin(
|
||||
start_position,
|
||||
chromosome_length - end_position
|
||||
)
|
||||
)
|
||||
]
|
||||
|
||||
distances <- rbindlist(list(distances, species_distances))
|
||||
}
|
||||
|
||||
# Order species by their median distance.
|
||||
setorder(species, median_distance)
|
||||
|
||||
list(
|
||||
genes = genes,
|
||||
species = species,
|
||||
distances = distances
|
||||
)
|
||||
# Arbitrarily exclude duplicated genes.
|
||||
# TODO: Consider a refined approach or work out how to include all
|
||||
# duplicates.
|
||||
unique(distances, by = c("species", "gene"))
|
||||
}
|
||||
67129
input/genes.tsv
67129
input/genes.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
17773
input/genomes/bmutus.tsv
17773
input/genomes/bmutus.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
18247
input/genomes/catys.tsv
18247
input/genomes/catys.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
13851
input/genomes/cgobio.tsv
13851
input/genomes/cgobio.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
13897
input/genomes/cmilii.tsv
13897
input/genomes/cmilii.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
15827
input/genomes/dordii.tsv
15827
input/genomes/dordii.tsv
File diff suppressed because it is too large
Load diff
14826
input/genomes/drerio.tsv
14826
input/genomes/drerio.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
19234
input/genomes/fcatus.tsv
19234
input/genomes/fcatus.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
13801
input/genomes/hcomes.tsv
13801
input/genomes/hcomes.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
14942
input/genomes/hhucho.tsv
14942
input/genomes/hhucho.tsv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue