geposanui/input.R

219 lines
5.9 KiB
R
Raw Normal View History

2021-09-16 00:06:54 +02:00
library(biomaRt)
2021-06-24 22:36:02 +02:00
library(data.table)
2021-08-25 12:00:22 +02:00
library(rlog)
2021-09-16 00:06:54 +02:00
library(stringr)
#' Species IDs of known replicatively aging species.
species_ids_replicative <- c(
"bihybrid",
"btaurus",
"bthybrid",
"cfamiliaris",
"chircus",
"cjacchus",
"clfamiliaris",
"csabaeus",
"ecaballus",
"fcatus",
"ggorilla",
"hsapiens",
"lafricana",
"mfascicularis",
"mmulatta",
"mmurinus",
"mnemestrina",
"nleucogenys",
"oaries",
"pabelii",
"panubis",
"ppaniscus",
"ptroglodytes",
"sscrofa",
"tgelada"
)
2021-06-16 22:01:09 +02:00
#' Gene names of genes for verified TPE-OLD genes.
genes_verified_tpe_old <- c(
"C1S",
"DSP",
"ISG15",
"SORBS2",
"TERT"
)
#' Gene names of genes with a suggested TPE-OLD.
genes_suggested_tpe_old <- c(
"AKAP3",
"ANO2",
"CCND2",
"CD163L1",
"CD9",
"FOXM1",
"GALNT8",
"NDUFA9",
"TEAD4",
"TIGAR",
"TSPAN9"
)
2021-09-16 00:06:54 +02:00
ensembl <- useEnsembl(
biomart = "ensembl",
version = 104
)
#' Retrieve information on species.
2021-06-16 22:01:09 +02:00
#'
2021-09-16 00:06:54 +02:00
#' The result will be a `data.table` with the following columns:
#'
#' - `id` Species ID as presented by Ensembl.
#' - `name` Human readable species name.
#' - `replicative` Whether the species is likely to be aging replicatively.
retrieve_species <- function() {
# Ensembl datasets correspond to distinct species.
ensembl_datasets <- data.table(listDatasets(ensembl))
# Filter out species ID and name from the result.
species <- ensembl_datasets[, .(
id = str_match(dataset, "(.*)_gene_ensembl")[, 2],
name = str_match(description, "(.*) genes \\(.*\\)")[, 2]
)]
species[, replicative := id %chin% species_ids_replicative]
}
#' Retrieve information on human genes.
#'
#' The result will be a `data.table` with the following columns:
#'
#' - `id` Ensembl gene ID.
#' - `ǹame` HGNC name of the gene.
#' - `chromosome` Human chromosome on which the gene is located.
retrieve_genes <- function() {
genes <- data.table(getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
mart = useDataset("hsapiens_gene_ensembl", mart = ensembl)
))
2021-06-21 13:03:26 +02:00
2021-09-16 00:06:54 +02:00
genes[, .(
id = ensembl_gene_id,
name = hgnc_symbol,
chromosome = chromosome_name,
verified = hgnc_symbol %chin% genes_verified_tpe_old,
suggested = hgnc_symbol %chin% genes_suggested_tpe_old
)]
}
#' Retrieve gene distance data.
#'
#' The data will include all available values for the given species and genes.
#' Specific values on naturally or artificially (e.g. due to incomplete
#' sequencing) short chromosomes will be excluded.
#'
#' The result will be a `data.table` with the following columns:
#'
#' - `species` Species ID.
#' - `gene` Ensembl gene ID.
#' - `distance` Distance to nearest telomere in base pairs.
retrieve_distances <- function(species_ids, gene_ids) {
2021-09-19 12:07:59 +02:00
# Special case the human species and retrieve all available distance
# information.
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_distances <- data.table(getBM(
attributes = c(
"ensembl_gene_id",
"chromosome_name",
"start_position",
"end_position"
),
mart = ensembl
))
human_distances[,
chromosome_length := max(end_position),
by = chromosome_name
]
# Filter out relevant information (see below).
distances <- human_distances[
chromosome_length > 15000000,
.(
species = "hsapiens",
gene = ensembl_gene_id,
distance = pmin(
start_position,
chromosome_length - end_position
)
)
]
# Exclude the human from the species, in case it is present there.
species_ids <- species_ids[species_ids != "hsapiens"]
2021-06-24 22:36:02 +02:00
2021-09-16 00:06:54 +02:00
species_count <- length(species_ids)
2021-06-24 22:36:02 +02:00
2021-09-16 00:06:54 +02:00
for (i in 1:species_count) {
species_id <- species_ids[i]
2021-06-16 22:01:09 +02:00
2021-08-25 12:00:22 +02:00
log_info(sprintf(
2021-09-16 00:06:54 +02:00
"[%3i%%] Loading species \"%s\"",
round(i / species_count * 100),
species_id
2021-08-25 12:00:22 +02:00
))
2021-06-16 22:01:09 +02:00
2021-09-16 00:06:54 +02:00
ensembl <- useDataset(
sprintf("%s_gene_ensembl", species_id),
mart = ensembl
2021-08-25 12:00:22 +02:00
)
2021-09-16 00:06:54 +02:00
# Besides the attributes that are always present, we need to check for
# human orthologs. Some species don't have that information and will be
# skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin%
listAttributes(ensembl, what = "name")) {
next
}
# Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema.
ensembl_distances <- data.table(getBM(
filters = c("with_hsapiens_homolog"),
values = c(TRUE),
attributes = c(
"hsapiens_homolog_ensembl_gene",
"chromosome_name",
"start_position",
"end_position"
),
2021-09-19 12:07:59 +02:00
mart = ensembl
2021-09-16 00:06:54 +02:00
))
ensembl_distances[,
chromosome_length := max(end_position),
by = chromosome_name
]
# Filter out relevant information and precompute the genes' distance to
# the nearest telomere. Exclude genes on naturally or artificially
# short chromosomes.
species_distances <- ensembl_distances[
chromosome_length > 15000000,
.(
species = species_id,
gene = hsapiens_homolog_ensembl_gene,
distance = pmin(
start_position,
chromosome_length - end_position
)
)
]
2021-08-25 12:00:22 +02:00
distances <- rbindlist(list(distances, species_distances))
2021-06-16 22:01:09 +02:00
}
2021-09-16 00:06:54 +02:00
# Arbitrarily exclude duplicated genes.
# TODO: Consider a refined approach or work out how to include all
# duplicates.
unique(distances, by = c("species", "gene"))
2021-06-21 13:03:26 +02:00
}