geposan/scripts/ensembl.R
2021-10-19 13:39:55 +02:00

154 lines
4.6 KiB
R

library(data.table)
rlog::log_info("Connecting to Ensembl API")
#' Object to access the Ensembl API.
ensembl <- biomaRt::useEnsembl("ensembl")
# Retrieve species information.
rlog::log_info("Retrieving species information")
ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
# Filter out species ID and name from the result.
species <- ensembl_datasets[, .(
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
)]
#' Get all chromosome names for an Ensembl dataset.
#'
#' Valid chromosome names include decimal numbers as well as typical sex
#' chromosome names (X, Y, W and Z).
get_chromosome_names <- function(dataset) {
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
chromosome_names[stringr::str_which(chromosome_names, "^[0-9]+|[XYWZ]$")]
}
# Retrieve information on human genes. This will only include genes on
# assembled chromosomes. Chromosomes are filtered using get_chromosome_names().
rlog::log_info("Retrieving information on human genes")
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_data <- data.table(biomaRt::getBM(
attributes = c(
"ensembl_gene_id",
"hgnc_symbol",
"chromosome_name",
"start_position",
"end_position"
),
filters = "chromosome_name",
values = get_chromosome_names(dataset),
mart = dataset
))
# Remove duplicated gene IDs (at the time of writing, there are a handful).
human_data <- unique(human_data, by = "ensembl_gene_id")
# Only keep relevant information on genes.
genes <- human_data[, .(
id = ensembl_gene_id,
name = hgnc_symbol,
chromosome = chromosome_name
)]
# Retrieve gene distance data across species.
rlog::log_info("Retrieving distance data")
# Handle the human first, as we already retrieved the data and don't need to
# filter based on orthologies.
human_data[, chromosome_length := max(end_position), by = chromosome_name]
distances <- human_data[, .(
species = "hsapiens",
gene = ensembl_gene_id,
distance = pmin(
start_position,
chromosome_length - end_position
)
)]
# Iterate through all other species and retrieve their distance data.
for (species_id in species[!id == "hsapiens", id]) {
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
dataset <- biomaRt::useDataset(
sprintf("%s_gene_ensembl", species_id),
mart = ensembl
)
# Besides the attributes that are always present, we need to check for
# human orthologs. Some species don't have that information and will be
# skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin%
biomaRt::listAttributes(dataset, what = "name")) {
rlog::log_info("No data on human orthologs")
species <- species[id != species_id]
next
}
chromosome_names <- get_chromosome_names(dataset)
# Skip the species, if there are no assembled chromosomes.
if (length(chromosome_names) <= 0) {
rlog::log_info("No matching chromosome assemblies")
species <- species[id != species_id]
next
}
# Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema.
species_distances <- data.table(biomaRt::getBM(
attributes = c(
"hsapiens_homolog_ensembl_gene",
"chromosome_name",
"start_position",
"end_position"
),
filters = c("with_hsapiens_homolog", "chromosome_name"),
values = list(TRUE, chromosome_names),
mart = dataset
))
# Only include one ortholog per human gene.
species_distances <- unique(
species_distances,
by = "hsapiens_homolog_ensembl_gene"
)
# Precompute the genes' distance to the nearest telomere.
species_distances[,
chromosome_length := max(end_position),
by = chromosome_name
]
species_distances <- species_distances[, .(
species = species_id,
gene = hsapiens_homolog_ensembl_gene,
distance = pmin(
start_position,
chromosome_length - end_position
)
)]
distances <- rbindlist(list(distances, species_distances))
}
# Add information on number of species per gene.
genes_n_species <- distances[, .(n_species = .N), by = "gene"]
genes <- merge(genes, genes_n_species, by.x = "id", by.y = "gene")
# Save data in the appropriate place.
usethis::use_data(species, overwrite = TRUE)
usethis::use_data(genes, overwrite = TRUE)
usethis::use_data(distances, overwrite = TRUE)