mirror of
https://github.com/johrpan/geposanui.git
synced 2025-10-26 19:27:24 +01:00
Reorganize source files and generalize presets
This commit is contained in:
parent
8104e9bd8a
commit
68354bf808
14 changed files with 119 additions and 147 deletions
254
process/input.R
Normal file
254
process/input.R
Normal file
|
|
@ -0,0 +1,254 @@
|
|||
library(biomaRt)
|
||||
library(data.table)
|
||||
library(progress)
|
||||
library(rlog)
|
||||
library(stringr)
|
||||
|
||||
#' Species IDs of known replicatively aging species.
#'
#' The IDs use the same form as the `id` column produced by
#' `retrieve_species()`, i.e. the Ensembl dataset name without its
#' `_gene_ensembl` suffix.
species_ids_replicative <- c(
  "bihybrid", "btaurus", "bthybrid", "cfamiliaris", "chircus",
  "cjacchus", "clfamiliaris", "csabaeus", "ecaballus", "fcatus",
  "ggorilla", "hsapiens", "lafricana", "mfascicularis", "mmulatta",
  "mmurinus", "mnemestrina", "nleucogenys", "oaries", "pabelii",
  "panubis", "ppaniscus", "ptroglodytes", "sscrofa", "tgelada"
)
|
||||
|
||||
#' Names of genes with a verified TPE-OLD.
#'
#' The names are compared against the `hgnc_symbol` attribute in
#' `retrieve_genes()`.
genes_verified_tpe_old <- c("C1S", "DSP", "ISG15", "SORBS2", "TERT")
|
||||
|
||||
#' Names of genes with a suggested TPE-OLD.
#'
#' The names are compared against the `hgnc_symbol` attribute in
#' `retrieve_genes()`.
genes_suggested_tpe_old <- c(
  "AKAP3", "ANO2", "CCND2", "CD163L1",
  "CD9", "FOXM1", "GALNT8", "NDUFA9",
  "TEAD4", "TIGAR", "TSPAN9"
)
|
||||
|
||||
#' Cached accessor for the Ensembl API.
#'
#' Stays `NULL` until `get_ensembl()` has been called for the first time.
ensembl <- NULL

#' Get the Ensembl accessor, creating it on first use.
#'
#' The accessor is created with `useEnsembl()` for the "ensembl" BioMart and
#' Ensembl version 104. It is stored in the file-level `ensembl` variable so
#' that later calls reuse the same object.
#'
#' @return The shared Ensembl accessor.
get_ensembl <- function() {
  # Fast path: the accessor has already been created.
  if (!is.null(ensembl)) {
    return(ensembl)
  }

  # `<<-` is intentional here: it fills the shared cache defined above.
  ensembl <<- useEnsembl("ensembl", version = 104)
  ensembl
}
|
||||
|
||||
#' Get all chromosome names for an Ensembl dataset.
#'
#' The available values of the dataset's `chromosome_name` filter are reduced
#' to those that look like assembled chromosomes: decimal numbers as well as
#' 'X' and 'Y'.
#'
#' NOTE(review): `|` separates the whole pattern into `^[0-9]+` OR `[XY]$`, so
#' any name that merely starts with a digit (e.g. "2A") or merely ends in 'X'
#' or 'Y' is kept as well. That is broader than the description above; confirm
#' which behavior is intended before tightening the pattern.
#'
#' @param dataset Ensembl dataset as accepted by `listFilterOptions()`.
#'
#' @return The matching chromosome names.
get_chromosome_names <- function(dataset) {
  candidates <- listFilterOptions(dataset, "chromosome_name")
  keep <- str_which(candidates, "^[0-9]+|[XY]$")
  candidates[keep]
}
|
||||
|
||||
#' Retrieve information on species.
#'
#' Each dataset listed by Ensembl is treated as one species.
#'
#' @return A `data.table` with the following columns:
#'   - `id` Species ID as presented by Ensembl (the dataset name without its
#'     `_gene_ensembl` suffix).
#'   - `name` Human readable species name (the dataset description up to
#'     " genes (...)").
#'   - `replicative` Whether the species is listed in
#'     `species_ids_replicative`, i.e. likely to be aging replicatively.
retrieve_species <- function() {
  datasets <- data.table(listDatasets(get_ensembl()))

  # Extract the first capture group of each pattern; rows that do not match
  # yield `NA`.
  species_ids <- str_match(
    datasets[["dataset"]],
    "(.*)_gene_ensembl"
  )[, 2]

  species_names <- str_match(
    datasets[["description"]],
    "(.*) genes \\(.*\\)"
  )[, 2]

  species <- data.table(id = species_ids, name = species_names)
  species[, replicative := id %chin% species_ids_replicative]
}
|
||||
|
||||
#' Retrieve information on human genes.
#'
#' This will only include genes on assembled chromosomes. Chromosomes are
#' selected by name using `get_chromosome_names()`.
#'
#' @return A `data.table` with the following columns:
#'   - `id` Ensembl gene ID.
#'   - `name` HGNC name of the gene.
#'   - `chromosome` Human chromosome on which the gene is located.
#'   - `verified` Whether the gene is listed in `genes_verified_tpe_old`.
#'   - `suggested` Whether the gene is listed in `genes_suggested_tpe_old`.
retrieve_genes <- function() {
  # Select the human dataset once and reuse it for both the chromosome lookup
  # and the actual query.
  dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl())

  genes <- data.table(getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
    filters = "chromosome_name",
    values = get_chromosome_names(dataset),
    mart = dataset
  ))

  genes[, .(
    id = ensembl_gene_id,
    name = hgnc_symbol,
    chromosome = chromosome_name,
    verified = hgnc_symbol %chin% genes_verified_tpe_old,
    suggested = hgnc_symbol %chin% genes_suggested_tpe_old
  )]
}
|
||||
|
||||
#' Retrieve gene distance data.
#'
#' Human data is always retrieved and covers all genes on assembled
#' chromosomes. For every other given species, all genes that are located on
#' assembled chromosomes and have a human ortholog are retrieved and reported
#' under the Ensembl ID of that human ortholog. Species without human ortholog
#' information or without assembled chromosomes are skipped.
#'
#' The length of a chromosome is approximated by the largest end position among
#' the genes retrieved for it.
#'
#' @param species_ids IDs of the species to retrieve data for. `"hsapiens"`
#'   may be included or omitted; human data is part of the result either way.
#' @param gene_ids Gene IDs of interest. NOTE(review): this is only used for
#'   the log message; the result is not filtered by it. Confirm whether
#'   filtering was intended.
#'
#' @return A `data.table` with the following columns:
#'   - `species` Species ID.
#'   - `gene` Ensembl gene ID (of the human gene or human ortholog).
#'   - `distance` Distance to nearest telomere in base pairs.
retrieve_distances <- function(species_ids, gene_ids) {
  ensembl <- get_ensembl()

  # Exclude the human from the species, in case it is present there.
  species_ids <- species_ids[species_ids != "hsapiens"]

  species_count <- length(species_ids)
  gene_count <- length(gene_ids)

  log_info(sprintf(
    "Retrieving distance data for %i genes from %i species",
    gene_count,
    species_count
  ))

  # Special case the human species and retrieve all available distance
  # information.

  dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)

  human_distances <- data.table(getBM(
    attributes = c(
      "ensembl_gene_id",
      "chromosome_name",
      "start_position",
      "end_position"
    ),
    filters = "chromosome_name",
    values = get_chromosome_names(dataset),
    mart = dataset
  ))

  # Compute the nearest distance to telomeres.

  human_distances[,
    chromosome_length := max(end_position),
    by = chromosome_name
  ]

  distances <- human_distances[, .(
    species = "hsapiens",
    gene = ensembl_gene_id,
    distance = pmin(
      start_position,
      chromosome_length - end_position
    )
  )]

  # Without any other species there is nothing left to retrieve.
  if (species_count == 0) {
    return(unique(distances, by = c("species", "gene")))
  }

  # The loop below advances the bar once per species.
  progress <- progress_bar$new(
    total = species_count,
    format = "Retrieving distance data [:bar] :percent (ETA :eta)"
  )

  # Collect one table per species and combine them once at the end instead of
  # growing the result inside the loop. Entries of skipped species stay `NULL`
  # and are ignored by `rbindlist()`.
  distance_tables <- vector("list", species_count + 1L)
  distance_tables[[1L]] <- distances

  for (i in seq_along(species_ids)) {
    species_id <- species_ids[i]

    progress$tick()

    dataset <- useDataset(
      sprintf("%s_gene_ensembl", species_id),
      mart = ensembl
    )

    # Besides the attributes that are always present, we need to check for
    # human orthologs. Some species don't have that information and will be
    # skipped.
    if (!("hsapiens_homolog_ensembl_gene" %chin%
      listAttributes(dataset, what = "name"))) {
      next
    }

    chromosome_names <- get_chromosome_names(dataset)

    # Skip the species, if there are no assembled chromosomes.
    if (length(chromosome_names) == 0) {
      next
    }

    # Retrieve information on all genes of the current species, that have
    # human orthologs. This is called "homolog" in the Ensembl schema.
    ensembl_distances <- data.table(getBM(
      attributes = c(
        "hsapiens_homolog_ensembl_gene",
        "chromosome_name",
        "start_position",
        "end_position"
      ),
      filters = c("with_hsapiens_homolog", "chromosome_name"),
      values = list(TRUE, chromosome_names),
      mart = dataset
    ))

    # Precompute the genes' distance to the nearest telomere.

    ensembl_distances[,
      chromosome_length := max(end_position),
      by = chromosome_name
    ]

    distance_tables[[i + 1L]] <- ensembl_distances[, .(
      species = species_id,
      gene = hsapiens_homolog_ensembl_gene,
      distance = pmin(
        start_position,
        chromosome_length - end_position
      )
    )]
  }

  distances <- rbindlist(distance_tables)

  # Arbitrarily exclude duplicated genes.
  # TODO: Consider a refined approach or work out how to include all
  # duplicates.
  unique(distances, by = c("species", "gene"))
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue