Retrieve input data using biomaRt

This commit is contained in:
Elias Projahn 2021-09-16 00:06:54 +02:00
parent 040aabc610
commit 1cea6c3631
205 changed files with 187 additions and 3296961 deletions

193
input.R
View file

@ -1,5 +1,36 @@
library(biomaRt)
library(data.table) library(data.table)
library(rlog) library(rlog)
library(stringr)
#' Ensembl species IDs that are treated as replicatively aging.
#'
#' The IDs use the same form as the `id` column built by `retrieve_species()`,
#' i.e. the Ensembl dataset name without its `_gene_ensembl` suffix. A species
#' is flagged as `replicative` when its ID is a member of this vector.
species_ids_replicative <- c(
    "bihybrid", "btaurus", "bthybrid", "cfamiliaris", "chircus",
    "cjacchus", "clfamiliaris", "csabaeus", "ecaballus", "fcatus",
    "ggorilla", "hsapiens", "lafricana", "mfascicularis", "mmulatta",
    "mmurinus", "mnemestrina", "nleucogenys", "oaries", "pabelii",
    "panubis", "ppaniscus", "ptroglodytes", "sscrofa", "tgelada"
)
#' Gene names of genes for verified TPE-OLD genes. #' Gene names of genes for verified TPE-OLD genes.
genes_verified_tpe_old <- c( genes_verified_tpe_old <- c(
@ -25,75 +56,137 @@ genes_suggested_tpe_old <- c(
"TSPAN9" "TSPAN9"
) )
#' Merge genome data from files in `path` into `data.table`s. ensembl <- useEnsembl(
biomart = "ensembl",
version = 104
)
#' Retrieve information on species.
#' #'
#' The result will be a list with named elements: #' The result will be a `data.table` with the following columns:
#' - `genes` will be a table with metadata on human genes. #'
#' - `species` will contain metadata on each species. #' - `id` Species ID as presented by Ensembl.
#' - `distances` will contain each species' genes' distances to the telomere. #' - `name` Human readable species name.
load_input <- function(path) { #' - `replicative` Whether the species is likely to be aging replicatively.
# Include data on TPE-OLD status for genes. retrieve_species <- function() {
# Ensembl datasets correspond to distinct species.
ensembl_datasets <- data.table(listDatasets(ensembl))
genes <- fread(paste(path, "genes.tsv", sep = "/")) # Filter out species ID and name from the result.
genes[name %chin% genes_verified_tpe_old, verified := TRUE] species <- ensembl_datasets[, .(
genes[name %chin% genes_suggested_tpe_old, suggested := TRUE] id = str_match(dataset, "(.*)_gene_ensembl")[, 2],
name = str_match(description, "(.*) genes \\(.*\\)")[, 2]
)]
# Load and combine data on species and gene distances. species[, replicative := id %chin% species_ids_replicative]
}
original_species <- fread(paste(path, "species.csv", sep = "/")) #' Retrieve information on human genes.
#'
#' The result will be a `data.table` with the following columns:
#'
#' - `id` Ensembl gene ID.
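#' - `name` HGNC name of the gene.
#' - `chromosome` Human chromosome on which the gene is located.
#' - `verified` Whether the gene's HGNC name is listed in
#'   `genes_verified_tpe_old`.
#' - `suggested` Whether the gene's HGNC name is listed in
#'   `genes_suggested_tpe_old`.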
retrieve_genes <- function() {
genes <- data.table(getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
mart = useDataset("hsapiens_gene_ensembl", mart = ensembl)
))
species <- data.table( genes[, .(
id = character(), id = ensembl_gene_id,
group = character(), name = hgnc_symbol,
label = character(), chromosome = chromosome_name,
median_distance = numeric() verified = hgnc_symbol %chin% genes_verified_tpe_old,
) suggested = hgnc_symbol %chin% genes_suggested_tpe_old
)]
}
#' Retrieve gene distance data.
#'
#' The data will include all available values for the given species and genes.
#' Specific values on naturally or artificially (e.g. due to incomplete
#' sequencing) short chromosomes will be excluded.
#'
#' The result will be a `data.table` with the following columns:
#'
#' - `species` Species ID.
#' - `gene` Ensembl gene ID.
#' - `distance` Distance to nearest telomere in base pairs.
retrieve_distances <- function(species_ids, gene_ids) {
distances <- data.table( distances <- data.table(
species = character(), species = character(),
gene = integer(), gene = character(),
distance = integer() distance = integer()
) )
# Each file will contain data on one species. species_count <- length(species_ids)
file_names <- list.files(paste(path, "genomes", sep = "/"))
n_species <- length(file_names)
for (i in seq_along(file_names)) { for (i in 1:species_count) {
file_name <- file_names[i] species_id <- species_ids[i]
species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]
species_path <- paste(path, "genomes", file_name, sep = "/")
log_info(sprintf( log_info(sprintf(
"Reading species %i/%i (%s)", i, n_species, species_id "[%3i%%] Loading species \"%s\"",
round(i / species_count * 100),
species_id
)) ))
species_distances <- fread(species_path) ensembl <- useDataset(
sprintf("%s_gene_ensembl", species_id),
# Compute the median distance across all genes of this species and mart = ensembl
# add it to the species table along other static data.
species <- rbindlist(list(species, data.table(
id = species_id,
group = original_species[id == species_id, group],
label = original_species[id == species_id, label],
median_distance = median(species_distances[, dist])
)))
species_distances <- data.table(
species = species_id,
gene = species_distances[, geneid],
distance = species_distances[, dist]
) )
# Besides the attributes that are always present, we need to check for
# human orthologs. Some species don't have that information and will be
# skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin%
listAttributes(ensembl, what = "name")) {
next
}
# Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema.
ensembl_distances <- data.table(getBM(
filters = c("with_hsapiens_homolog"),
values = c(TRUE),
attributes = c(
"hsapiens_homolog_ensembl_gene",
"chromosome_name",
"start_position",
"end_position"
),
mart = useDataset(
sprintf("%s_gene_ensembl", species_id),
mart = ensembl
)
))
ensembl_distances[,
chromosome_length := max(end_position),
by = chromosome_name
]
# Filter out relevant information and precompute the genes' distance to
# the nearest telomere. Exclude genes on naturally or artificially
# short chromosomes.
species_distances <- ensembl_distances[
chromosome_length > 15000000,
.(
species = species_id,
gene = hsapiens_homolog_ensembl_gene,
distance = pmin(
start_position,
chromosome_length - end_position
)
)
]
distances <- rbindlist(list(distances, species_distances)) distances <- rbindlist(list(distances, species_distances))
} }
# Order species by their median distance. # Arbitrarily exclude duplicated genes.
setorder(species, median_distance) # TODO: Consider a refined approach or work out how to include all
# duplicates.
list( unique(distances, by = c("species", "gene"))
genes = genes,
species = species,
distances = distances
)
} }

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Some files were not shown because too many files have changed in this diff Show more