diff --git a/R/data.R b/R/data.R index 9ce0db8..534425e 100644 --- a/R/data.R +++ b/R/data.R @@ -4,6 +4,8 @@ #' \describe{ #' \item{id}{Unique species ID} #' \item{name}{Human readable species name} +#' \item{n_chromosomes}{Number of chromosomes} +#' \item{median_chromosome_length}{Median length of chromosomes} #' } "species" @@ -29,6 +31,9 @@ #' \describe{ #' \item{species}{Species ID} #' \item{gene}{Gene ID} -#' \item{distance}{Distance to nearest telomere} +#' \item{chromosome_name}{Chromosome name from the specified species} +#' \item{start_position}{Start position in base pairs} +#' \item{end_position}{End position in base pairs} +#' \item{distance}{Computed distance to nearest telomere} #' } "distances" diff --git a/data/distances.rda b/data/distances.rda index 24dde6f..9763b49 100644 Binary files a/data/distances.rda and b/data/distances.rda differ diff --git a/data/genes.rda b/data/genes.rda index 096208d..f132a7d 100644 Binary files a/data/genes.rda and b/data/genes.rda differ diff --git a/data/species.rda b/data/species.rda index 39e5d4f..5d0d7a5 100644 Binary files a/data/species.rda and b/data/species.rda differ diff --git a/man/distances.Rd b/man/distances.Rd index 66ccf83..b7b4782 100644 --- a/man/distances.Rd +++ b/man/distances.Rd @@ -9,7 +9,10 @@ A \link{data.table} with the following columns: \describe{ \item{species}{Species ID} \item{gene}{Gene ID} -\item{distance}{Distance to nearest telomere} +\item{chromosome_name}{Chromosome name from the specified species} +\item{start_position}{Start position in base pairs} +\item{end_position}{End position in base pairs} +\item{distance}{Computed distance to nearest telomere} } } \usage{ diff --git a/man/species.Rd b/man/species.Rd index 4b11b6b..6844654 100644 --- a/man/species.Rd +++ b/man/species.Rd @@ -9,6 +9,8 @@ A \link{data.table} with the following columns: \describe{ \item{id}{Unique species ID} \item{name}{Human readable species name} +\item{n_chromosomes}{Number of chromosomes} +\item{median_chromosome_length}{Median length of chromosomes} } } \usage{ diff --git a/scripts/ensembl.R b/scripts/ensembl.R index cfe908d..c66d706 100644 --- a/scripts/ensembl.R +++ b/scripts/ensembl.R @@ -3,7 +3,7 @@ library(data.table) rlog::log_info("Connecting to Ensembl API") # Object to access the Ensembl API. -ensembl <- biomaRt::useEnsembl("ensembl", version = 105) +ensembl <- biomaRt::useEnsembl("ensembl", version = 106) # Retrieve species information. @@ -43,88 +43,23 @@ valid_chromosome_names <- c( "18", "19", "20", + "X", + "Y", "21", + "4A", + "1A", "22", "23", "24", - "25", + "25LG1", + "25LG2", "26", "27", "28", - "29", - "Z", - "1A", - "4A", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "X", - "25LG1", - "25LG2", "LGE22", - "Y", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "LG34", - "LG35", - "2A", - "2B", - "LG1", - "LG2", - "LG3", - "LG4", - "LG5", - "LG6", - "LG7", - "LG8", - "LG9", - "LG10", - "LG11", - "LG12", - "LG13", - "LG14", - "LG15", - "LG16", - "LG17", - "LG18", - "LG19", - "LG20", - "LG21", - "LG22", - "LG23", - "W", - "LG24", - "LG25", - "LG26", - "LG27", - "LG28", - "LG29", - "LG30", - "LG01", - "LG02", - "LG03", - "LG04", - "LG05", - "LG06", - "LG07", - "LG08", - "LG09", + "Z", + "25", + "29", "A1", "A2", "A3", @@ -143,57 +78,33 @@ valid_chromosome_names <- c( "E3", "F1", "F2", - "LGE64", - "LG7_11", - "a", - "b", - "c", - "d", - "f", - "g", - "h", - "LG28B", - "LG30F", - "LG36F", - "LG37M", - "LG42F", - "LG44F", - "LG45M", - "LG48F", - "LG49B", - "ssa01", - "ssa02", - "ssa03", - "ssa04", - "ssa05", - "ssa06", - "ssa07", - "ssa08", - "ssa09", - "ssa10", - "ssa11", - "ssa12", - "ssa13", - "ssa14", - "ssa15", - "ssa16", - "ssa17", - "ssa18", - "ssa19", - "ssa20", - "ssa21", - "ssa22", - "ssa23", - "ssa24", - "ssa25", - "ssa26", - "ssa27", - "ssa28", - "ssa29", - "2a", - "2b", - "7a", - "7b", + "2A", + "2B", + "LG01", + "LG02", + "LG03", + "LG04", + "LG05", + "LG06", + "LG07", + "LG08", + "LG09", + "LG10", + "LG11", + "LG12", + "LG13", + "LG14", + "LG15", + "LG16", + "LG17", + "LG18", + "LG19", + "LG20", + "LG21", + "LG22", + "LG23", + "LG24", + "LG25", "I", "II", "III", @@ -209,9 +120,61 @@ valid_chromosome_names <- c( "XIV", "XV", "XVI", - "LGE22C19W28_E50C23", + "XVII", + "XVIII", + "XIX", + "XX", + "XXI", + "XXII", + "XXIII", + "XXIV", + "W", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "2L", + "2R", + "3L", + "3R", + "LG1", + "LG2", + "LG3", + "LG4", + "LG5", + "LG6", + "LG7", + "LG8", + "LG9", + "LG26", + "LG27", + "LG28", + "LG29", + "LG30", "1a", + "7b", "22a", + "LGE22C19W28_E50C23", + "LGE64", + "7a", + "MIC_1", + "MIC_10", + "MIC_11", + "MIC_2", + "MIC_3", + "MIC_4", + "MIC_5", + "MIC_6", + "MIC_7", + "MIC_8", + "MIC_9", "sgr01", "sgr02", "sgr03", @@ -231,14 +194,40 @@ valid_chromosome_names <- c( "sgr17", "sgr18", "sgr19", - "XVII", - "XVIII", - "XIX", - "XX", - "XXI", - "XXII", - "XXIII", - "XXIV", + "X1", + "X2", + "X3", + "X4", + "X5", + "a", + "b", + "c", + "d", + "f", + "g", + "h", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "LG28B", + "LG30F", + "LG36F", + "LG37M", + "LG42F", + "LG44F", + "LG45M", + "LG48F", + "LG49B", + "LG34", + "LG35", + "LG7_11", "groupI", "groupII", "groupIII", @@ -259,27 +248,7 @@ valid_chromosome_names <- c( "groupXVIII", "groupXIX", "groupXX", - "groupXXI", - "2L", - "2R", - "3L", - "3R", - "MIC_1", - "MIC_10", - "MIC_11", - "MIC_2", - "MIC_3", - "MIC_4", - "MIC_5", - "MIC_6", - "MIC_7", - "MIC_8", - "MIC_9", - "X1", - "X2", - "X3", - "X4", - "X5" + "groupXXI" ) #' Get all chromosome names for an Ensembl dataset. @@ -347,6 +316,9 @@ handle_species <- function(species_id, species_data) { .( species = species_id, gene = ensembl_gene_id, + chromosome_name = chromosome_name, + start_position = start_position, + end_position = end_position, distance = pmin( start_position, chromosome_length - end_position @@ -364,7 +336,7 @@ handle_species <- function(species_id, species_data) { handle_species("hsapiens", human_data) # Iterate through all other species and retrieve their distance data. -for (species_id in species[86:nrow(species), id]) { +for (species_id in species[id != "hsapiens", id]) { rlog::log_info(sprintf("Loading species \"%s\"", species_id)) dataset <- biomaRt::useDataset(