Only retrieve assembled chromosomes

This commit is contained in:
Elias Projahn 2021-10-11 15:10:03 +02:00
parent c26e4ff4a3
commit d3edeefbe2

65
input.R
View file

@ -69,6 +69,14 @@ get_ensembl <- function() {
ensembl
}
#' Get all chromosome names for a Ensembl dataset.
#'
#' Valid chromosome names include decimal numbers as well as 'X' and 'Y'.
get_chromosome_names <- function(dataset) {
chromosome_names <- listFilterOptions(dataset, "chromosome_name")
chromosome_names[str_which(chromosome_names, "^[0-9]+|[XY]$")]
}
#' Retrieve information on species.
#'
#' The result will be a `data.table` with the following columns:
@ -91,14 +99,21 @@ retrieve_species <- function() {
#' Retrieve information on human genes.
#'
#' This will only include genes on assembled chromosomes. Chromosomes are
#' filtered based on their name being either a decimal number, 'X' or 'Y'.
#'
#' The result will be a `data.table` with the following columns:
#'
#' - `id` Ensembl gene ID.
#' - `ǹame` HGNC name of the gene.
#' - `chromosome` Human chromosome on which the gene is located.
retrieve_genes <- function() {
dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
genes <- data.table(getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
filters = "chromosome_name",
values = get_chromosome_names(dataset),
mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
))
@ -113,9 +128,8 @@ retrieve_genes <- function() {
#' Retrieve gene distance data.
#'
#' The data will include all available values for the given species and genes.
#' Specific values on naturally or artificially (e.g. due to incomplete
#' sequencing) short chromosomes will be excluded.
#' The data will include all available values for the given species and genes
#' that are located on assembled chromosomes.
#'
#' The result will be a `data.table` with the following columns:
#'
@ -145,7 +159,7 @@ retrieve_distances <- function(species_ids, gene_ids) {
# Special case the human species and retrieve all available distance
# information.
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_distances <- data.table(getBM(
attributes = c(
@ -154,33 +168,33 @@ retrieve_distances <- function(species_ids, gene_ids) {
"start_position",
"end_position"
),
mart = ensembl
filters = "chromosome_name",
values = get_chromosome_names(dataset),
mart = dataset
))
# Compute the nearest distance to telomeres.
human_distances[,
chromosome_length := max(end_position),
by = chromosome_name
]
# Filter out relevant information (see below).
distances <- human_distances[
chromosome_length > 15000000,
.(
distances <- human_distances[, .(
species = "hsapiens",
gene = ensembl_gene_id,
distance = pmin(
start_position,
chromosome_length - end_position
)
)
]
)]
for (i in 1:species_count) {
species_id <- species_ids[i]
progress$tick()
ensembl <- useDataset(
dataset <- useDataset(
sprintf("%s_gene_ensembl", species_id),
mart = ensembl
)
@ -189,43 +203,46 @@ retrieve_distances <- function(species_ids, gene_ids) {
# human orthologs. Some species don't have that information and will be
# skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin%
listAttributes(ensembl, what = "name")) {
listAttributes(dataset, what = "name")) {
next
}
chromosome_names <- get_chromosome_names(dataset)
# Skip the species, if there are no assembled chromosomes.
if (length(chromosome_names) <= 0) {
next
}
# Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema.
ensembl_distances <- data.table(getBM(
filters = c("with_hsapiens_homolog"),
values = c(TRUE),
attributes = c(
"hsapiens_homolog_ensembl_gene",
"chromosome_name",
"start_position",
"end_position"
),
mart = ensembl
filters = c("with_hsapiens_homolog", "chromosome_name"),
values = list(TRUE, chromosome_names),
mart = dataset
))
# Precompute the genes' distance to the nearest telomere.
ensembl_distances[,
chromosome_length := max(end_position),
by = chromosome_name
]
# Filter out relevant information and precompute the genes' distance to
# the nearest telomere. Exclude genes on naturally or artificially
# short chromosomes.
species_distances <- ensembl_distances[
chromosome_length > 15000000,
.(
species_distances <- ensembl_distances[, .(
species = species_id,
gene = hsapiens_homolog_ensembl_gene,
distance = pmin(
start_position,
chromosome_length - end_position
)
)
]
)]
distances <- rbindlist(list(distances, species_distances))
}