diff --git a/DESCRIPTION b/DESCRIPTION index 12c2548..00a2214 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,7 @@ Imports: tensorflow Suggests: biomaRt, + httr, plotly, rlog, stringr, diff --git a/data/distances.rda b/data/distances.rda index 0a47bfa..52a94f2 100644 Binary files a/data/distances.rda and b/data/distances.rda differ diff --git a/data/genes.rda b/data/genes.rda index 731c316..a0b8d02 100644 Binary files a/data/genes.rda and b/data/genes.rda differ diff --git a/data/species.rda b/data/species.rda index 57875bb..5b96c12 100644 Binary files a/data/species.rda and b/data/species.rda differ diff --git a/scripts/chromosome_names.R b/scripts/chromosome_names.R new file mode 100644 index 0000000..cf1a59a --- /dev/null +++ b/scripts/chromosome_names.R @@ -0,0 +1,34 @@ +library(data.table) +library(httr) + +ensembl_api_url <- "https://rest.ensembl.org" + +#' Perform a request to the Ensembl REST API. +ensembl_request <- function(api_path) { + content(stop_for_status(GET( + paste0(ensembl_api_url, api_path), + content_type_json() + ))) +} + +#' Get IDs of all available vertebrates. +get_species_ids <- function() { + species <- ensembl_request("/info/species")$species + sapply(species, function(species) species$name) +} + +#' Get all chromosome names for a species. +get_species_chromosomes <- function(species_id) { + chromosomes <- unlist(ensembl_request( + paste0("/info/assembly/", species_id) + )$karyotype) +} + +#' Get a vector of all available unique chromosome names. +#' +#' There are multiple names for mitochondrial DNA which have to be removed +#' manually, unfortunately. 
+get_all_chromosomes <- function() { + chromosomes <- sapply(get_species_ids(), get_species_chromosomes) + unique(unlist(chromosomes)) +} diff --git a/scripts/ensembl.R b/scripts/ensembl.R index aa5189e..e640a0b 100644 --- a/scripts/ensembl.R +++ b/scripts/ensembl.R @@ -17,24 +17,279 @@ species <- ensembl_datasets[, .( name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2] )] +# List of assemblies that the Ensembl REST API advertises as chromosomes. +# Mitochondrial DNA has been manually removed. Unfortunately, species IDs from +# the Ensembl REST API don't map to dataset names in the BioMart interface. +# Because of that, we can't programmatically filter chromosome names. +# +# See get_all_chromosomes() in scripts/chromosome_names.R +valid_chromosome_names <- c( + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "Z", + "1A", + "4A", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "X", + "25LG1", + "25LG2", + "LGE22", + "Y", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "LG34", + "LG35", + "2A", + "2B", + "LG1", + "LG2", + "LG3", + "LG4", + "LG5", + "LG6", + "LG7", + "LG8", + "LG9", + "LG10", + "LG11", + "LG12", + "LG13", + "LG14", + "LG15", + "LG16", + "LG17", + "LG18", + "LG19", + "LG20", + "LG21", + "LG22", + "LG23", + "W", + "LG24", + "LG25", + "LG26", + "LG27", + "LG28", + "LG29", + "LG30", + "LG01", + "LG02", + "LG03", + "LG04", + "LG05", + "LG06", + "LG07", + "LG08", + "LG09", + "A1", + "A2", + "A3", + "B1", + "B2", + "B3", + "B4", + "C1", + "C2", + "D1", + "D2", + "D3", + "D4", + "E1", + "E2", + "E3", + "F1", + "F2", + "LGE64", + "LG7_11", + "a", + "b", + "c", + "d", + "f", + "g", + "h", + "LG28B", + "LG30F", + "LG36F", + "LG37M", + "LG42F", + "LG44F", + "LG45M", + "LG48F", + "LG49B", + "ssa01", + "ssa02", + "ssa03", + "ssa04", 
+ "ssa05", + "ssa06", + "ssa07", + "ssa08", + "ssa09", + "ssa10", + "ssa11", + "ssa12", + "ssa13", + "ssa14", + "ssa15", + "ssa16", + "ssa17", + "ssa18", + "ssa19", + "ssa20", + "ssa21", + "ssa22", + "ssa23", + "ssa24", + "ssa25", + "ssa26", + "ssa27", + "ssa28", + "ssa29", + "2a", + "2b", + "7a", + "7b", + "I", + "II", + "III", + "IV", + "V", + "VI", + "VII", + "VIII", + "IX", + "XI", + "XII", + "XIII", + "XIV", + "XV", + "XVI", + "LGE22C19W28_E50C23", + "1a", + "22a", + "sgr01", + "sgr02", + "sgr03", + "sgr04", + "sgr05", + "sgr06", + "sgr07", + "sgr08", + "sgr09", + "sgr10", + "sgr11", + "sgr12", + "sgr13", + "sgr14", + "sgr15", + "sgr16", + "sgr17", + "sgr18", + "sgr19", + "XVII", + "XVIII", + "XIX", + "XX", + "XXI", + "XXII", + "XXIII", + "XXIV", + "groupI", + "groupII", + "groupIII", + "groupIV", + "groupV", + "groupVI", + "groupVII", + "groupVIII", + "groupIX", + "groupX", + "groupXI", + "groupXII", + "groupXIII", + "groupXIV", + "groupXV", + "groupXVI", + "groupXVII", + "groupXVIII", + "groupXIX", + "groupXX", + "groupXXI", + "2L", + "2R", + "3L", + "3R", + "MIC_1", + "MIC_10", + "MIC_11", + "MIC_2", + "MIC_3", + "MIC_4", + "MIC_5", + "MIC_6", + "MIC_7", + "MIC_8", + "MIC_9", + "X1", + "X2", + "X3", + "X4", + "X5" +) + #' Get all chromosome names for an Ensembl dataset. #' -#' The following chromosome naming schemes will be recognized and have been -#' sourced from Ensembl by manually screening chromosome-level assemblies. -#' -#' - a decimal number (most species' autosomes) -#' - X, Y, W or Z (gonosomes) -#' - LG followed by a decimal number (some fishes) -#' - ssa/sgr followed by a number (Atlantic salmon/Turquoise killifish) -#' -#' The function tries to filter out those chromosome names from the available +#' The function keeps only the valid chromosome names among the available #' assemblies in the dataset. 
get_chromosome_names <- function(dataset) { chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") - chromosome_names[stringr::str_which( - chromosome_names, - "^(LG|sgr|ssa)?[0-9]+|[XYWZ]$" - )] + chromosome_names[chromosome_names %chin% valid_chromosome_names] } # Retrieve information on human genes. This will only include genes on