data: Add more chromosomes

This commit is contained in:
Elias Projahn 2021-12-05 13:23:00 +01:00
parent f940d7d9b0
commit 8ced026b79
6 changed files with 303 additions and 13 deletions

View file

@ -27,6 +27,7 @@ Imports:
tensorflow
Suggests:
biomaRt,
httr,
plotly,
rlog,
stringr,

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,34 @@
library(data.table)
library(httr)
# Base URL that every Ensembl REST API path is appended to.
ensembl_api_url <- "https://rest.ensembl.org"

#' Perform a request to the Ensembl REST API.
#'
#' @param api_path Path of the endpoint, appended verbatim to
#'   `ensembl_api_url` (e.g. "/info/species").
#' @return The response body as parsed by `httr::content()`. An error is
#'   raised if the request did not complete with a success status.
ensembl_request <- function(api_path) {
    request_url <- paste0(ensembl_api_url, api_path)
    response <- GET(request_url, content_type_json())

    # Turn HTTP error statuses into R errors before touching the body.
    stop_for_status(response)
    content(response)
}
#' Get IDs of all available vertebrates.
#'
#' @return A character vector holding the `name` field of every entry in the
#'   `species` element of the "/info/species" response. The vector is empty
#'   (`character(0)`) if the response lists no species.
get_species_ids <- function() {
    species_list <- ensembl_request("/info/species")$species

    # vapply() always yields a character vector, whereas sapply() would return
    # an empty list for an empty response. It also fails loudly if an entry
    # lacks a single string `name`.
    vapply(species_list, function(entry) entry$name, character(1))
}
#' Get all chromosome names for a species.
#'
#' @param species_id Species identifier that is appended to the
#'   "/info/assembly/" endpoint path.
#' @return The flattened `karyotype` element of the assembly information, or
#'   `NULL` if the response contains no such element.
get_species_chromosomes <- function(species_id) {
    assembly <- ensembl_request(paste0("/info/assembly/", species_id))

    # Return the value directly; ending on an assignment would make the
    # function return its result invisibly.
    unlist(assembly$karyotype)
}
#' Get a vector of all available unique chromosome names.
#'
#' There are multiple names for mitochondrial DNA which have to be removed
#' manually, unfortunately.
#'
#' @return A vector with each chromosome name reported for any species
#'   occurring exactly once, or `NULL` if no names were retrieved.
get_all_chromosomes <- function() {
    # lapply() always returns a list. sapply() would simplify to a matrix when
    # every species has the same number of chromosomes, and unique() would
    # then de-duplicate rows instead of individual names.
    chromosomes <- lapply(get_species_ids(), get_species_chromosomes)
    unique(unlist(chromosomes, use.names = FALSE))
}

View file

@ -17,24 +17,279 @@ species <- ensembl_datasets[, .(
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
)]
# List of assemblies that the Ensembl REST API advertises as chromosomes.
# Mitochondrial DNA has been manually removed. Unfortunately, species IDs from
# the Ensembl REST API don't map to dataset names in the BioMart interface.
# Because of that, we can't programmatically filter chromosome names.
#
# See get_all_chromosomes()
valid_chromosome_names <- local({
    # Roman numerals I to XXIV, shared by the plain and "group" schemes below.
    roman <- as.character(utils::as.roman(1:24))

    # The order of the entries is kept exactly as in the original listing.
    c(
        as.character(1:29),
        "Z", "1A", "4A",
        as.character(30:40),
        "X", "25LG1", "25LG2", "LGE22", "Y",
        as.character(41:50),
        "LG34", "LG35", "2A", "2B",
        paste0("LG", 1:23),
        "W",
        paste0("LG", 24:30),
        sprintf("LG%02d", 1:9),
        "A1", "A2", "A3",
        "B1", "B2", "B3", "B4",
        "C1", "C2",
        "D1", "D2", "D3", "D4",
        "E1", "E2", "E3",
        "F1", "F2",
        "LGE64", "LG7_11",
        "a", "b", "c", "d", "f", "g", "h",
        "LG28B", "LG30F", "LG36F", "LG37M", "LG42F",
        "LG44F", "LG45M", "LG48F", "LG49B",
        sprintf("ssa%02d", 1:29),
        "2a", "2b", "7a", "7b",
        # Roman "X" is left out here because "X" is already listed above.
        roman[1:9],
        roman[11:16],
        "LGE22C19W28_E50C23", "1a", "22a",
        sprintf("sgr%02d", 1:19),
        roman[17:24],
        paste0("group", roman[1:21]),
        "2L", "2R", "3L", "3R",
        paste0("MIC_", c(1L, 10L, 11L, 2:9)),
        paste0("X", 1:5)
    )
})
#' Get all chromosome names for an Ensembl dataset.
#'
#' The function keeps only those of the dataset's available assemblies whose
#' names are listed in `valid_chromosome_names`.
#'
#' @param dataset Passed on to `biomaRt::listFilterOptions()`; presumably a
#'   `Mart` object with a selected dataset (TODO confirm against the caller).
#' @return The options of the dataset's "chromosome_name" filter that are
#'   contained in `valid_chromosome_names`, in the order BioMart reports them.
get_chromosome_names <- function(dataset) {
    chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")

    # Whitelist lookup only; the former regex-based subsetting statement was
    # evaluated and discarded, so it has been dropped.
    chromosome_names[chromosome_names %chin% valid_chromosome_names]
}
# Retrieve information on human genes. This will only include genes on