Only retrieve assembled chromosomes

This commit is contained in:
Elias Projahn 2021-10-11 15:10:03 +02:00
parent c26e4ff4a3
commit d3edeefbe2

65
input.R
View file

@ -69,6 +69,14 @@ get_ensembl <- function() {
ensembl ensembl
} }
#' Get all chromosome names for a Ensembl dataset.
#'
#' Valid chromosome names include decimal numbers as well as 'X' and 'Y'.
get_chromosome_names <- function(dataset) {
chromosome_names <- listFilterOptions(dataset, "chromosome_name")
chromosome_names[str_which(chromosome_names, "^[0-9]+|[XY]$")]
}
#' Retrieve information on species. #' Retrieve information on species.
#' #'
#' The result will be a `data.table` with the following columns: #' The result will be a `data.table` with the following columns:
@ -91,14 +99,21 @@ retrieve_species <- function() {
#' Retrieve information on human genes. #' Retrieve information on human genes.
#' #'
#' This will only include genes on assembled chromosomes. Chromosomes are
#' filtered based on their name being either a decimal number, 'X' or 'Y'.
#'
#' The result will be a `data.table` with the following columns: #' The result will be a `data.table` with the following columns:
#' #'
#' - `id` Ensembl gene ID. #' - `id` Ensembl gene ID.
#' - `ǹame` HGNC name of the gene. #' - `ǹame` HGNC name of the gene.
#' - `chromosome` Human chromosome on which the gene is located. #' - `chromosome` Human chromosome on which the gene is located.
retrieve_genes <- function() { retrieve_genes <- function() {
dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
genes <- data.table(getBM( genes <- data.table(getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"), attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
filters = "chromosome_name",
values = get_chromosome_names(dataset),
mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl()) mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
)) ))
@ -113,9 +128,8 @@ retrieve_genes <- function() {
#' Retrieve gene distance data. #' Retrieve gene distance data.
#' #'
#' The data will include all available values for the given species and genes. #' The data will include all available values for the given species and genes
#' Specific values on naturally or artificially (e.g. due to incomplete #' that are located on assembled chromosomes.
#' sequencing) short chromosomes will be excluded.
#' #'
#' The result will be a `data.table` with the following columns: #' The result will be a `data.table` with the following columns:
#' #'
@ -145,7 +159,7 @@ retrieve_distances <- function(species_ids, gene_ids) {
# Special case the human species and retrieve all available distance # Special case the human species and retrieve all available distance
# information. # information.
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl) dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_distances <- data.table(getBM( human_distances <- data.table(getBM(
attributes = c( attributes = c(
@ -154,33 +168,33 @@ retrieve_distances <- function(species_ids, gene_ids) {
"start_position", "start_position",
"end_position" "end_position"
), ),
mart = ensembl filters = "chromosome_name",
values = get_chromosome_names(dataset),
mart = dataset
)) ))
# Compute the nearest distance to telomeres.
human_distances[, human_distances[,
chromosome_length := max(end_position), chromosome_length := max(end_position),
by = chromosome_name by = chromosome_name
] ]
# Filter out relevant information (see below). distances <- human_distances[, .(
distances <- human_distances[
chromosome_length > 15000000,
.(
species = "hsapiens", species = "hsapiens",
gene = ensembl_gene_id, gene = ensembl_gene_id,
distance = pmin( distance = pmin(
start_position, start_position,
chromosome_length - end_position chromosome_length - end_position
) )
) )]
]
for (i in 1:species_count) { for (i in 1:species_count) {
species_id <- species_ids[i] species_id <- species_ids[i]
progress$tick() progress$tick()
ensembl <- useDataset( dataset <- useDataset(
sprintf("%s_gene_ensembl", species_id), sprintf("%s_gene_ensembl", species_id),
mart = ensembl mart = ensembl
) )
@ -189,43 +203,46 @@ retrieve_distances <- function(species_ids, gene_ids) {
# human orthologs. Some species don't have that information and will be # human orthologs. Some species don't have that information and will be
# skipped. # skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin% if (!"hsapiens_homolog_ensembl_gene" %chin%
listAttributes(ensembl, what = "name")) { listAttributes(dataset, what = "name")) {
next
}
chromosome_names <- get_chromosome_names(dataset)
# Skip the species, if there are no assembled chromosomes.
if (length(chromosome_names) <= 0) {
next next
} }
# Retrieve information on all genes of the current species, that have # Retrieve information on all genes of the current species, that have
# human orthologs. This is called "homolog" in the Ensembl schema. # human orthologs. This is called "homolog" in the Ensembl schema.
ensembl_distances <- data.table(getBM( ensembl_distances <- data.table(getBM(
filters = c("with_hsapiens_homolog"),
values = c(TRUE),
attributes = c( attributes = c(
"hsapiens_homolog_ensembl_gene", "hsapiens_homolog_ensembl_gene",
"chromosome_name", "chromosome_name",
"start_position", "start_position",
"end_position" "end_position"
), ),
mart = ensembl filters = c("with_hsapiens_homolog", "chromosome_name"),
values = list(TRUE, chromosome_names),
mart = dataset
)) ))
# Precompute the genes' distance to the nearest telomere.
ensembl_distances[, ensembl_distances[,
chromosome_length := max(end_position), chromosome_length := max(end_position),
by = chromosome_name by = chromosome_name
] ]
# Filter out relevant information and precompute the genes' distance to species_distances <- ensembl_distances[, .(
# the nearest telomere. Exclude genes on naturally or artificially
# short chromosomes.
species_distances <- ensembl_distances[
chromosome_length > 15000000,
.(
species = species_id, species = species_id,
gene = hsapiens_homolog_ensembl_gene, gene = hsapiens_homolog_ensembl_gene,
distance = pmin( distance = pmin(
start_position, start_position,
chromosome_length - end_position chromosome_length - end_position
) )
) )]
]
distances <- rbindlist(list(distances, species_distances)) distances <- rbindlist(list(distances, species_distances))
} }