data: Include more chromosomes

This commit is contained in:
Elias Projahn 2021-11-02 11:00:37 +01:00
parent df6e23d219
commit 6494ae8200
5 changed files with 27 additions and 8 deletions

View file

@ -1,6 +1,6 @@
#' Information on included species from the Ensembl database. #' Information on included species from the Ensembl database.
#' #'
#' @format A [data.table] with 91 rows and 2 variables: #' @format A [data.table] with 99 rows and 2 variables:
#' \describe{ #' \describe{
#' \item{id}{Unique species ID} #' \item{id}{Unique species ID}
#' \item{name}{Human readable species name} #' \item{name}{Human readable species name}
@ -25,10 +25,11 @@
#' This dataset contains each known value for a gene's distance to the telomeres #' This dataset contains each known value for a gene's distance to the telomeres
#' per species. The data is sourced from Ensembl. #' per species. The data is sourced from Ensembl.
#' #'
#' @format A [data.table] with 1390730 rows and 3 variables: #' @format A [data.table] with 1506182 rows and 4 variables:
#' \describe{ #' \describe{
#' \item{species}{Species ID} #' \item{species}{Species ID}
#' \item{gene}{Gene ID} #' \item{gene}{Gene ID}
#' \item{position}{Gene start position}
#' \item{distance}{Distance to nearest telomere} #' \item{distance}{Distance to nearest telomere}
#' } #' }
"distances" "distances"

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -2,8 +2,9 @@ library(data.table)
rlog::log_info("Connecting to Ensembl API") rlog::log_info("Connecting to Ensembl API")
#' Object to access the Ensembl API. # Object to access the Ensembl API. We use the US east mirror to circumvent
ensembl <- biomaRt::useEnsembl("ensembl") # current issues with the main server being temporarily unreliable.
ensembl <- biomaRt::useEnsembl("ensembl", host = "useast.ensembl.org")
# Retrieve species information. # Retrieve species information.
@ -18,11 +19,22 @@ species <- ensembl_datasets[, .(
#' Get all chromosome names for an Ensembl dataset.
#'
#' The following chromosome naming schemes will be recognized and have been
#' sourced from Ensembl by manually screening chromosome-level assemblies.
#'
#'  - a decimal number (most species' autosomes)
#'  - X, Y, W or Z (gonosomes)
#'  - LG followed by a decimal number (some fishes)
#'  - ssa/sgr followed by a number (Atlantic salmon/Turquoise killifish)
#'
#' The function filters those chromosome names out of the values that the
#' dataset offers for its `chromosome_name` filter. A name is only kept if it
#' matches one of the schemes above in full; names that merely start with a
#' digit or merely end in X, Y, W or Z are rejected.
#'
#' NOTE(review): names that combine a number with a letter suffix (such as
#' "2A") are not part of the schemes listed above and are therefore not
#' matched. Confirm that no included species relies on such names.
#'
#' @param dataset Ensembl dataset object that is passed on to
#'   `biomaRt::listFilterOptions()`.
#'
#' @return The subset of the dataset's `chromosome_name` filter options that
#'   follow one of the recognized naming schemes.
get_chromosome_names <- function(dataset) {
    chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")

    # The outer group is essential: alternation has the lowest precedence in
    # a regular expression, so without it "^" would only apply to the numeric
    # branch and "$" only to the gonosome branch.
    chromosome_names[stringr::str_which(
        chromosome_names,
        "^((LG|sgr|ssa)?[0-9]+|[XYWZ])$"
    )]
}
# Retrieve information on human genes. This will only include genes on # Retrieve information on human genes. This will only include genes on
@ -66,6 +78,7 @@ human_data[, chromosome_length := max(end_position), by = chromosome_name]
distances <- human_data[, .( distances <- human_data[, .(
species = "hsapiens", species = "hsapiens",
gene = ensembl_gene_id, gene = ensembl_gene_id,
position = start_position,
distance = pmin( distance = pmin(
start_position, start_position,
chromosome_length - end_position chromosome_length - end_position
@ -86,7 +99,6 @@ for (species_id in species[!id == "hsapiens", id]) {
# skipped. # skipped.
if (!"hsapiens_homolog_ensembl_gene" %chin% if (!"hsapiens_homolog_ensembl_gene" %chin%
biomaRt::listAttributes(dataset, what = "name")) { biomaRt::listAttributes(dataset, what = "name")) {
rlog::log_info("No data on human orthologs") rlog::log_info("No data on human orthologs")
species <- species[id != species_id] species <- species[id != species_id]
@ -117,6 +129,11 @@ for (species_id in species[!id == "hsapiens", id]) {
mart = dataset mart = dataset
)) ))
# Only include human genes that we have information on.
species_distances <- species_distances[
hsapiens_homolog_ensembl_gene %chin% genes$id
]
# Only include one ortholog per human gene. # Only include one ortholog per human gene.
species_distances <- unique( species_distances <- unique(
species_distances, species_distances,
@ -133,6 +150,7 @@ for (species_id in species[!id == "hsapiens", id]) {
species_distances <- species_distances[, .( species_distances <- species_distances[, .(
species = species_id, species = species_id,
gene = hsapiens_homolog_ensembl_gene, gene = hsapiens_homolog_ensembl_gene,
position = start_position,
distance = pmin( distance = pmin(
start_position, start_position,
chromosome_length - end_position chromosome_length - end_position