Update to Ensembl 106

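This also introduces additional columns to the stored data tables: n_chromosomes and median_chromosome_length for species, and chromosome_name, start_position and end_position for distances.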
This commit is contained in:
Elias Projahn 2022-04-22 13:21:54 +02:00
parent 2427e8c0c0
commit 5bfd20b6b6
7 changed files with 141 additions and 159 deletions

View file

@ -4,6 +4,8 @@
#' \describe{ #' \describe{
#' \item{id}{Unique species ID} #' \item{id}{Unique species ID}
#' \item{name}{Human readable species name} #' \item{name}{Human readable species name}
#' \item{n_chromosomes}{Number of chromosomes}
#' \item{median_chromosome_length}{Median length of chromosomes}
#' } #' }
"species" "species"
@ -29,6 +31,9 @@
#' \describe{ #' \describe{
#' \item{species}{Species ID} #' \item{species}{Species ID}
#' \item{gene}{Gene ID} #' \item{gene}{Gene ID}
#' \item{distance}{Distance to nearest telomere} #' \item{chromosome_name}{Chromosome name from the specified species}
#' \item{start_position}{Start position in base pairs}
#' \item{end_position}{End position in base pairs}
#' \item{distance}{Computed distance to nearest telomere}
#' } #' }
"distances" "distances"

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -9,7 +9,10 @@ A \link{data.table} with the following columns:
\describe{ \describe{
\item{species}{Species ID} \item{species}{Species ID}
\item{gene}{Gene ID} \item{gene}{Gene ID}
\item{distance}{Distance to nearest telomere} \item{chromosome_name}{Chromosome name from the specified species}
\item{start_position}{Start position in base pairs}
\item{end_position}{End position in base pairs}
\item{distance}{Computed distance to nearest telomere}
} }
} }
\usage{ \usage{

View file

@ -9,6 +9,8 @@ A \link{data.table} with the following columns:
\describe{ \describe{
\item{id}{Unique species ID} \item{id}{Unique species ID}
\item{name}{Human readable species name} \item{name}{Human readable species name}
\item{n_chromosomes}{Number of chromosomes}
\item{median_chromosome_length}{Median length of chromosomes}
} }
} }
\usage{ \usage{

View file

@ -3,7 +3,7 @@ library(data.table)
rlog::log_info("Connecting to Ensembl API") rlog::log_info("Connecting to Ensembl API")
# Object to access the Ensembl API. # Object to access the Ensembl API.
ensembl <- biomaRt::useEnsembl("ensembl", version = 105) ensembl <- biomaRt::useEnsembl("ensembl", version = 106)
# Retrieve species information. # Retrieve species information.
@ -43,88 +43,23 @@ valid_chromosome_names <- c(
"18", "18",
"19", "19",
"20", "20",
"X",
"Y",
"21", "21",
"4A",
"1A",
"22", "22",
"23", "23",
"24", "24",
"25", "25LG1",
"25LG2",
"26", "26",
"27", "27",
"28", "28",
"29",
"Z",
"1A",
"4A",
"30",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"39",
"40",
"X",
"25LG1",
"25LG2",
"LGE22", "LGE22",
"Y", "Z",
"41", "25",
"42", "29",
"43",
"44",
"45",
"46",
"47",
"48",
"49",
"50",
"LG34",
"LG35",
"2A",
"2B",
"LG1",
"LG2",
"LG3",
"LG4",
"LG5",
"LG6",
"LG7",
"LG8",
"LG9",
"LG10",
"LG11",
"LG12",
"LG13",
"LG14",
"LG15",
"LG16",
"LG17",
"LG18",
"LG19",
"LG20",
"LG21",
"LG22",
"LG23",
"W",
"LG24",
"LG25",
"LG26",
"LG27",
"LG28",
"LG29",
"LG30",
"LG01",
"LG02",
"LG03",
"LG04",
"LG05",
"LG06",
"LG07",
"LG08",
"LG09",
"A1", "A1",
"A2", "A2",
"A3", "A3",
@ -143,57 +78,33 @@ valid_chromosome_names <- c(
"E3", "E3",
"F1", "F1",
"F2", "F2",
"LGE64", "2A",
"LG7_11", "2B",
"a", "LG01",
"b", "LG02",
"c", "LG03",
"d", "LG04",
"f", "LG05",
"g", "LG06",
"h", "LG07",
"LG28B", "LG08",
"LG30F", "LG09",
"LG36F", "LG10",
"LG37M", "LG11",
"LG42F", "LG12",
"LG44F", "LG13",
"LG45M", "LG14",
"LG48F", "LG15",
"LG49B", "LG16",
"ssa01", "LG17",
"ssa02", "LG18",
"ssa03", "LG19",
"ssa04", "LG20",
"ssa05", "LG21",
"ssa06", "LG22",
"ssa07", "LG23",
"ssa08", "LG24",
"ssa09", "LG25",
"ssa10",
"ssa11",
"ssa12",
"ssa13",
"ssa14",
"ssa15",
"ssa16",
"ssa17",
"ssa18",
"ssa19",
"ssa20",
"ssa21",
"ssa22",
"ssa23",
"ssa24",
"ssa25",
"ssa26",
"ssa27",
"ssa28",
"ssa29",
"2a",
"2b",
"7a",
"7b",
"I", "I",
"II", "II",
"III", "III",
@ -209,9 +120,61 @@ valid_chromosome_names <- c(
"XIV", "XIV",
"XV", "XV",
"XVI", "XVI",
"LGE22C19W28_E50C23", "XVII",
"XVIII",
"XIX",
"XX",
"XXI",
"XXII",
"XXIII",
"XXIV",
"W",
"30",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"39",
"40",
"2L",
"2R",
"3L",
"3R",
"LG1",
"LG2",
"LG3",
"LG4",
"LG5",
"LG6",
"LG7",
"LG8",
"LG9",
"LG26",
"LG27",
"LG28",
"LG29",
"LG30",
"1a", "1a",
"7b",
"22a", "22a",
"LGE22C19W28_E50C23",
"LGE64",
"7a",
"MIC_1",
"MIC_10",
"MIC_11",
"MIC_2",
"MIC_3",
"MIC_4",
"MIC_5",
"MIC_6",
"MIC_7",
"MIC_8",
"MIC_9",
"sgr01", "sgr01",
"sgr02", "sgr02",
"sgr03", "sgr03",
@ -231,14 +194,40 @@ valid_chromosome_names <- c(
"sgr17", "sgr17",
"sgr18", "sgr18",
"sgr19", "sgr19",
"XVII", "X1",
"XVIII", "X2",
"XIX", "X3",
"XX", "X4",
"XXI", "X5",
"XXII", "a",
"XXIII", "b",
"XXIV", "c",
"d",
"f",
"g",
"h",
"41",
"42",
"43",
"44",
"45",
"46",
"47",
"48",
"49",
"50",
"LG28B",
"LG30F",
"LG36F",
"LG37M",
"LG42F",
"LG44F",
"LG45M",
"LG48F",
"LG49B",
"LG34",
"LG35",
"LG7_11",
"groupI", "groupI",
"groupII", "groupII",
"groupIII", "groupIII",
@ -259,27 +248,7 @@ valid_chromosome_names <- c(
"groupXVIII", "groupXVIII",
"groupXIX", "groupXIX",
"groupXX", "groupXX",
"groupXXI", "groupXXI"
"2L",
"2R",
"3L",
"3R",
"MIC_1",
"MIC_10",
"MIC_11",
"MIC_2",
"MIC_3",
"MIC_4",
"MIC_5",
"MIC_6",
"MIC_7",
"MIC_8",
"MIC_9",
"X1",
"X2",
"X3",
"X4",
"X5"
) )
#' Get all chromosome names for an Ensembl dataset. #' Get all chromosome names for an Ensembl dataset.
@ -347,6 +316,9 @@ handle_species <- function(species_id, species_data) {
.( .(
species = species_id, species = species_id,
gene = ensembl_gene_id, gene = ensembl_gene_id,
chromosome_name = chromosome_name,
start_position = start_position,
end_position = end_position,
distance = pmin( distance = pmin(
start_position, start_position,
chromosome_length - end_position chromosome_length - end_position
@ -364,7 +336,7 @@ handle_species <- function(species_id, species_data) {
handle_species("hsapiens", human_data) handle_species("hsapiens", human_data)
# Iterate through all other species and retrieve their distance data. # Iterate through all other species and retrieve their distance data.
for (species_id in species[86:nrow(species), id]) { for (species_id in species[id != "hsapiens", id]) {
rlog::log_info(sprintf("Loading species \"%s\"", species_id)) rlog::log_info(sprintf("Loading species \"%s\"", species_id))
dataset <- biomaRt::useDataset( dataset <- biomaRt::useDataset(