diff --git a/.gitignore b/.gitignore index 9180b12..746b3d0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ /cache .Rproj.user +/chromosomes.csv +/genes.csv +/species.csv diff --git a/R/data.R b/R/data.R index 78f82c4..10e2914 100644 --- a/R/data.R +++ b/R/data.R @@ -2,13 +2,26 @@ #' #' @format A [data.table] with the following columns: #' \describe{ -#' \item{id}{Unique species ID} +#' \item{id}{Unique species ID, these are NCBI taxon IDs} #' \item{name}{Human readable species name} +#' \item{scientific_name}{Scientific name of the species} +#' \item{table_name}{Table name within the Ensembl database} #' \item{n_chromosomes}{Number of chromosomes} #' \item{median_chromosome_length}{Median length of chromosomes} #' } "species" +#' Information on chromosomes for each included species. +#' +#' @format A [data.table] with the following columns: +#' \describe{ +#' \item{species}{Species ID} +#' \item{id}{Chromosome ID, theses are Ensembl sequence IDs} +#' \item{name}{Chromosome name} +#' \item{length}{Length in base pairs} +#' } +"chromosomes" + #' Information on human genes within the Ensembl database. #' #' This includes only genes on the primary suggested assembly of the human @@ -18,7 +31,7 @@ #' \describe{ #' \item{id}{Ensembl gene ID} #' \item{name}{The gene's HGNC name (if available)} -#' \item{chrosome}{The human chromosome the gene is located on} +#' \item{chromosome}{The human chromosome the gene is located on} #' } "genes" @@ -31,7 +44,7 @@ #' \describe{ #' \item{species}{Species ID} #' \item{gene}{Gene ID} -#' \item{chromosome_name}{Chromosome name from the specified species} +#' \item{chromosome}{Chromosome ID} #' \item{start_position}{Start position in base pairs} #' \item{end_position}{End position in base pairs} #' \item{distance}{Computed distance to nearest telomere} diff --git a/R/plots.R b/R/plots.R index bb38285..691d148 100644 --- a/R/plots.R +++ b/R/plots.R @@ -561,13 +561,14 @@ plot_scores_by_position <- function(ranking, } distance_data <- if (!is.null(chromosome_name)) { - chromosome_name_ <- chromosome_name - geposan::distances[ - species == "hsapiens" & - chromosome_name == chromosome_name_ + chromosome_id <- geposan::chromosomes[ + species == "9606" & name == chromosome_name, + id ] + + geposan::distances[species == "9606" & chromosome == chromosome_id] } else { - geposan::distances[species == "hsapiens"] + geposan::distances[species == "9606"] } data <- merge(ranking, distance_data, by = "gene") diff --git a/data/chromosomes.rda b/data/chromosomes.rda new file mode 100644 index 0000000..d65e6ed Binary files /dev/null and b/data/chromosomes.rda differ diff --git a/data/distances.rda b/data/distances.rda index deb1417..ae685fd 100644 Binary files a/data/distances.rda and b/data/distances.rda differ diff --git a/data/genes.rda b/data/genes.rda index d8a139e..2d43c6a 100644 Binary files a/data/genes.rda and b/data/genes.rda differ diff --git a/data/species.rda b/data/species.rda index 3fd6e69..40ef496 100644 Binary files a/data/species.rda and b/data/species.rda differ diff --git a/man/chromosomes.Rd b/man/chromosomes.Rd new file mode 100644 index 0000000..15e9aed --- /dev/null +++ b/man/chromosomes.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{chromosomes} +\alias{chromosomes} +\title{Information on chromosomes for each included species.} +\format{ +A \link{data.table} with the following columns: +\describe{ +\item{species}{Species ID} +\item{id}{Chromosome ID, theses are Ensembl sequence IDs} +\item{name}{Chromosome name} +\item{length}{Length in base pairs} +} +} +\usage{ +chromosomes +} +\description{ +Information on chromosomes for each included species. +} +\keyword{datasets} diff --git a/man/distances.Rd b/man/distances.Rd index b7b4782..196ad88 100644 --- a/man/distances.Rd +++ b/man/distances.Rd @@ -9,7 +9,7 @@ A \link{data.table} with the following columns: \describe{ \item{species}{Species ID} \item{gene}{Gene ID} -\item{chromosome_name}{Chromosome name from the specified species} +\item{chromosome}{Chromosome ID} \item{start_position}{Start position in base pairs} \item{end_position}{End position in base pairs} \item{distance}{Computed distance to nearest telomere} diff --git a/man/genes.Rd b/man/genes.Rd index 6d6c945..c8c9f54 100644 --- a/man/genes.Rd +++ b/man/genes.Rd @@ -9,7 +9,7 @@ A \link{data.table} with the following columns: \describe{ \item{id}{Ensembl gene ID} \item{name}{The gene's HGNC name (if available)} -\item{chrosome}{The human chromosome the gene is located on} +\item{chromosome}{The human chromosome the gene is located on} } } \usage{ diff --git a/man/species.Rd b/man/species.Rd index 6844654..c9e332a 100644 --- a/man/species.Rd +++ b/man/species.Rd @@ -7,8 +7,10 @@ \format{ A \link{data.table} with the following columns: \describe{ -\item{id}{Unique species ID} +\item{id}{Unique species ID, these are NCBI taxon IDs} \item{name}{Human readable species name} +\item{scientific_name}{Scientific name of the species} +\item{table_name}{Table name within the Ensembl database} \item{n_chromosomes}{Number of chromosomes} \item{median_chromosome_length}{Median length of chromosomes} } diff --git a/scripts/chromosome_names.R b/scripts/chromosome_names.R deleted file mode 100644 index 8312716..0000000 --- a/scripts/chromosome_names.R +++ /dev/null @@ -1,34 +0,0 @@ -library(data.table) -library(httr) - -ensembl_api_url <- "https://rest.ensembl.org" - -#' Perform a request to the Ensembl REST API. -ensembl_request <- function(api_path) { - content(stop_for_status(GET( - paste0(ensembl_api_url, api_path), - content_type_json() - ))) -} - -#' Get IDs of all available vertebrates. -get_species_ids <- function() { - species <- ensembl_request("/info/species")$species - sapply(species, function(species) species$name) -} - -#' Get all chromosomes names for a species. -get_species_chromosomes <- function(species_id) { - unlist(ensembl_request( - paste0("/info/assembly/", species_id) - )$karyotype) -} - -#' Get a vector of all available unqiue chromosome names. -#' -#' There are multiple names for mitochondrial DNA which have to be removed -#' manually, unfortunately. -get_all_chromosomes <- function() { - chromosomes <- sapply(get_species_ids(), get_species_chromosomes) - unique(unlist(chromosomes)) -} diff --git a/scripts/ensembl.R b/scripts/ensembl.R deleted file mode 100644 index 814ab3f..0000000 --- a/scripts/ensembl.R +++ /dev/null @@ -1,407 +0,0 @@ -library(data.table) - -rlog::log_info("Connecting to Ensembl API") - -# Object to access the Ensembl API. -ensembl <- biomaRt::useEnsembl("ensembl", version = 110) - -# Retrieve species information. - -rlog::log_info("Retrieving species information") -ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl)) - -# Filter out species ID and name from the result. -species <- ensembl_datasets[, .( - id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2], - name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2] -)] - -# List of assemblies that the Ensembl Rest API advertises as chromosomes. -# Mitochondrial DNA has been manually removed. Unfortunately, species IDs from -# the Ensembl REST API don't map to dataset names in the BioMart interface. -# Because of that, we can't programatically filter chromosome names. -# -# See get_all_chromosomes() -valid_chromosome_names <- c( - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "X", - "groupI", - "groupII", - "groupIII", - "groupIV", - "groupV", - "groupVI", - "groupVII", - "groupVIII", - "groupIX", - "groupX", - "groupXI", - "groupXII", - "groupXIII", - "groupXIV", - "groupXV", - "groupXVI", - "groupXVII", - "groupXVIII", - "groupXIX", - "groupXX", - "groupXXI", - "20", - "Y", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "I", - "II", - "III", - "IV", - "V", - "VI", - "VII", - "VIII", - "IX", - "XI", - "XII", - "XIII", - "XIV", - "XV", - "XVI", - "XVII", - "XVIII", - "XIX", - "XX", - "XXI", - "XXII", - "XXIII", - "XXIV", - "7a", - "7b", - "Z", - "W", - "a", - "b", - "c", - "d", - "f", - "g", - "h", - "39", - "40", - "1a", - "22a", - "sgr01", - "sgr02", - "sgr03", - "sgr04", - "sgr05", - "sgr06", - "sgr07", - "sgr08", - "sgr09", - "sgr10", - "sgr11", - "sgr12", - "sgr13", - "sgr14", - "sgr15", - "sgr16", - "sgr17", - "sgr18", - "sgr19", - "LGE64", - "2A", - "2B", - "X1", - "X2", - "X3", - "X4", - "X5", - "LG1", - "LG2", - "LG3", - "LG4", - "LG5", - "LG6", - "LG7", - "LG8", - "LG9", - "LG10", - "LG11", - "LG12", - "LG13", - "LG14", - "LG15", - "LG16", - "LG17", - "LG18", - "LG19", - "LG20", - "LG22", - "LG23", - "4A", - "1A", - "25LG1", - "25LG2", - "LGE22", - "LG21", - "A1", - "A2", - "A3", - "B1", - "B2", - "B3", - "B4", - "C1", - "C2", - "D1", - "D2", - "D3", - "D4", - "E1", - "E2", - "E3", - "F1", - "F2", - "LG34", - "LG35", - "LG24", - "LG25", - "LG26", - "LG27", - "LG28", - "LG29", - "LG30", - "MIC_1", - "MIC_10", - "MIC_11", - "MIC_2", - "MIC_3", - "MIC_4", - "MIC_5", - "MIC_6", - "MIC_7", - "MIC_8", - "MIC_9", - "2L", - "2R", - "3L", - "3R", - "LGE22C19W28_E50C23", - "LG01", - "LG02", - "LG03", - "LG04", - "LG05", - "LG06", - "LG07", - "LG08", - "LG09", - "LG7_11", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "LG28B", - "LG30F", - "LG36F", - "LG37M", - "LG42F", - "LG44F", - "LG45M", - "LG48F", - "LG49B" -) - -#' Get all chromosome names for an Ensembl dataset. -#' -#' The function tries to filter out valid chromosome names from the available -#' assemblies in the dataset. -get_chromosome_names <- function(dataset) { - chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") - chromosome_names[chromosome_names %chin% valid_chromosome_names] -} - -# Retrieve information on human genes. This will only include genes on -# assembled chromosomes. Chromosomes are filtered using get_chromosome_names(). - -rlog::log_info("Retrieving information on human genes") -dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl) - -human_data <- data.table(biomaRt::getBM( - attributes = c( - "ensembl_gene_id", - "hgnc_symbol", - "chromosome_name", - "start_position", - "end_position" - ), - filters = "chromosome_name", - values = get_chromosome_names(dataset), - mart = dataset -)) - -# Remove duplicated gene IDs (at the time of writing, there are a handful). -human_data <- unique(human_data, by = "ensembl_gene_id") - -# Only keep relevant information on genes. -genes <- human_data[, .( - id = ensembl_gene_id, - name = hgnc_symbol, - chromosome = chromosome_name -)] - -# Retrieve gene distance data across species. - -rlog::log_info("Retrieving distance data") -distances <- data.table() - -#' Handle data for one species. -handle_species <- function(species_id, species_data) { - chromosomes <- species_data[, - .(chromosome_length = max(end_position)), - by = chromosome_name - ] - - # Store the number of chromosomes in the species table. - species[id == species_id, n_chromosomes := nrow(chromosomes)] - - # Store the median chromosome length in the species table. - species[ - id == species_id, - median_chromosome_length := chromosomes[, median(chromosome_length)] - ] - - # Precompute the genes' distance to the nearest telomere. - species_distances <- species_data[ - chromosomes, - .( - species = species_id, - gene = ensembl_gene_id, - chromosome_name = chromosome_name, - start_position = start_position, - end_position = end_position, - distance = pmin( - start_position, - chromosome_length - end_position - ) - ), - on = "chromosome_name" - ] - - # Add species distances to the distances table. - distances <<- rbindlist(list(distances, species_distances)) -} - -# Handle the human first, as we already retrieved the data and don't need to -# filter based on orthologies. -handle_species("hsapiens", human_data) - -# Iterate through all other species and retrieve their distance data. -for (species_id in species[id != "hsapiens", id]) { - rlog::log_info(sprintf("Loading species \"%s\"", species_id)) - - dataset <- biomaRt::useDataset( - sprintf("%s_gene_ensembl", species_id), - mart = ensembl - ) - - # Besides the attributes that are always present, we need to check for - # human orthologs. Some species don't have that information and will be - # skipped. - if (!"hsapiens_homolog_ensembl_gene" %chin% - biomaRt::listAttributes(dataset, what = "name")) { - rlog::log_info("No data on human orthologs") - species <- species[id != species_id] - - next - } - - chromosome_names <- get_chromosome_names(dataset) - - # Skip the species, if there are no assembled chromosomes. - if (length(chromosome_names) <= 0) { - rlog::log_info("No matching chromosome assemblies") - species <- species[id != species_id] - - next - } - - # Retrieve information on all genes of the current species, that have - # human orthologs. This is called "homolog" in the Ensembl schema. - species_distances <- data.table(biomaRt::getBM( - attributes = c( - "hsapiens_homolog_ensembl_gene", - "chromosome_name", - "start_position", - "end_position" - ), - filters = c("with_hsapiens_homolog", "chromosome_name"), - values = list(TRUE, chromosome_names), - mart = dataset - )) - - # Only include human genes that we have information on. - species_distances <- species_distances[ - hsapiens_homolog_ensembl_gene %chin% genes$id - ] - - # Only include one ortholog per human gene. - species_distances <- unique( - species_distances, - by = "hsapiens_homolog_ensembl_gene" - ) - - # Rename gene ID column to match the human data. - setnames( - species_distances, - "hsapiens_homolog_ensembl_gene", - "ensembl_gene_id" - ) - - handle_species(species_id, species_distances) -} - -# Save data in the appropriate place. - -usethis::use_data(species, overwrite = TRUE) -usethis::use_data(genes, overwrite = TRUE) -usethis::use_data(distances, overwrite = TRUE) diff --git a/scripts/ensembl_data.R b/scripts/ensembl_data.R new file mode 100644 index 0000000..4a7719f --- /dev/null +++ b/scripts/ensembl_data.R @@ -0,0 +1,77 @@ +# This script does post processing on the data from Ensembl and imports it into +# the R package. Run this script after `ensembl_species.R` and +# `ensembl_species.R`. + +library(data.table) + +species <- fread("species.csv") +chromosomes <- fread("chromosomes.csv") +genes <- fread("genes.csv") + +species_metadata <- chromosomes[, + .( + n_chromosomes = .N, + median_chromosome_length = as.double(stats::median(length)) + ), + by = species +] + +species <- merge( + species, + species_metadata, + by.x = "id", + by.y = "species", + sort = FALSE +) + +# Remove duplicated genes within species. +genes <- genes[!duplicated(genes, by = c("species", "gene"))] + +genes_chromosomes <- merge( + genes, + chromosomes, + by.x = c("species", "chromosome"), + by.y = c("species", "id"), + sort = FALSE +) + +genes_chromosomes[, distance := ifelse( + start_position < length - end_position, + start_position, + length - end_position +)] + +distances <- genes_chromosomes[, .( + species, + gene, + chromosome, + start_position, + end_position, + distance +)] + +# This table will hold information on human genes. +genes <- genes_chromosomes[ + species == 9606, + .( + id = gene, + chromosome = name + ) +] + +genes[, name := gprofiler2::gconvert( + id, + target = "HGNC", + mthreshold = 1, + filter_na = FALSE +)$target] + +# Previous versions of geposan used different species IDs. For backwards +# compatibility, convert integer IDs to character. + +species[, id := as.character(id)] +distances[, species := as.character(species)] + +usethis::use_data(species, overwrite = TRUE) +usethis::use_data(genes, overwrite = TRUE) +usethis::use_data(distances, overwrite = TRUE) diff --git a/scripts/ensembl_genes.R b/scripts/ensembl_genes.R new file mode 100644 index 0000000..ae130ba --- /dev/null +++ b/scripts/ensembl_genes.R @@ -0,0 +1,125 @@ +# This script retrieves genome data from the Ensembl database. Run +# `ensembl_species.R` first and keep its output files "species.csv" and +# "chromosomes.csv". + +library(data.table) +library(DBI) +library(glue) + +compara_table <- "ensembl_compara_110" + +# This is the output table of this script: + +genes <- data.table( + species = integer(), + gene = character(), + chromosome = integer(), + start_position = integer(), + end_position = integer() +) + +species <- fread("species.csv") +chromosomes <- fread("chromosomes.csv") + +rlog::log_info("Connecting to Ensembl database server") + +db <- dbConnect( + RMariaDB::MariaDB(), + host = "ensembldb.ensembl.org", + port = 5306, + user = "anonymous" +) + +rlog::log_info("Retrieving human genes") + +human_species_id <- 9606 +human_present_row_count <- genes[species == human_species_id, .N] + +if (human_present_row_count > 0) { + rlog::log_info(glue("Skipping. Present rows: {human_present_row_count}")) +} else { + human_table <- species[id == human_species_id, table_name] + dbExecute(db, glue_sql("USE {`human_table`}", .con = db)) + + human_chromosome_ids <- chromosomes[species == human_species_id, id] + + human_genes <- db |> + dbGetQuery(glue_sql(" + SELECT stable_id, seq_region_id, seq_region_start, seq_region_end + FROM gene WHERE seq_region_id IN ({human_chromosome_ids*})")) |> + as.data.table() |> + setnames( + c("stable_id", "seq_region_id", "seq_region_start", "seq_region_end"), + c("gene", "chromosome", "start_position", "end_position") + ) + + human_genes[, species := human_species_id] + genes <- rbind(genes, human_genes) +} + +dbExecute(db, glue_sql("USE {`compara_table`}", .con = db)) + +for (species_id in species[id != human_species_id, id]) { + present_row_count <- genes[species == species_id, .N] + species_name <- species[id == species_id, name] + + if (present_row_count > 0) { + rlog::log_info(glue("Skipping species {species_id} ({species_name})")) + rlog::log_info(glue("Present rows: {present_row_count}")) + next + } + + rlog::log_info(glue( + "Retrieving genes for species {species_id} ({species_name})" + )) + + table_name <- species[id == species_id, table_name] + chromosome_ids <- chromosomes[species == species_id, id] + + species_genes <- db |> + dbGetQuery(glue_sql(" + SELECT + human.stable_id AS gene, + species.seq_region_id AS chromosome, + species.seq_region_start AS start_position, + species.seq_region_end AS end_position + FROM + ( + SELECT + homology_id, + stable_id, + seq_region_id, + seq_region_start, + seq_region_end + FROM {`table_name`}.gene + JOIN gene_member USING (stable_id) + JOIN homology_member USING (gene_member_id) + JOIN homology USING (homology_id) + WHERE seq_region_id IN ({chromosome_ids*}) + AND homology.description IN ( + 'ortholog_one2one', + 'ortholog_one2many', + 'ortholog_many2many' + ) + ) AS species + JOIN ( + SELECT + homology_id, + stable_id + FROM homology_member + JOIN gene_member USING (gene_member_id) + WHERE taxon_id = {human_species_id} + ) AS human ON species.homology_id = human.homology_id; + ", .con = db)) |> + as.data.table() + + if (nrow(species_genes) == 0) { + rlog::log_info("No human homologs found") + } + + species_genes[, species := species_id] + genes <- rbind(genes, species_genes) + fwrite(genes, "genes.csv") +} + +dbDisconnect(db) \ No newline at end of file diff --git a/scripts/ensembl_species.R b/scripts/ensembl_species.R new file mode 100644 index 0000000..bbb2dc6 --- /dev/null +++ b/scripts/ensembl_species.R @@ -0,0 +1,119 @@ +# This is an *interactive* script for retrieving information on species from the +# Ensembl database. There are taxons with more than one entry in the database. +# For each species that has already been seen, the script asks whether to keep +# it or replace it. We recommend to choose the most generic entry in most +# cases. + +library(data.table) +library(DBI) +library(glue) + +# These are the output tables of this script: + +species <- data.table( + id = integer(), + name = character(), + scientific_name = character(), + table_name = character() +) + +chromosomes <- data.table( + species = integer(), + id = integer(), + name = character(), + length = integer() +) + +rlog::log_info("Connecting to Ensembl database server") +db <- dbConnect( + RMariaDB::MariaDB(), + host = "ensembldb.ensembl.org", + port = 5306, + user = "anonymous" +) + +rlog::log_info("Retrieving list of databases") +tables <- dbGetQuery(db, "SHOW DATABASES LIKE '%_core_110_%'")[, 1] + +# Populates the species and chromosomes tables using data from each species' +# table within the Ensembl database. Species without a karyotype will be skipped +# without adding any information to the tables. +for (table in tables) { + rlog::log_info(glue("Reading species information from {table}")) + dbExecute(db, glue_sql("USE {`table`}", .con = db)) + + species_id <- db |> + dbGetQuery(" + SELECT meta_value FROM meta + WHERE meta_key = 'species.taxonomy_id'") |> + as.integer() + + species_name <- db |> + dbGetQuery(" + SELECT meta_value FROM meta + WHERE meta_key = 'species.display_name'") |> + as.character() + + species_scientific_name <- db |> + dbGetQuery(" + SELECT meta_value FROM meta + WHERE meta_key = 'species.scientific_name'") |> + as.character() + + rlog::log_info(glue( + "Found species {species_name} ({species_scientific_name})" + )) + + if (species[id == species_id, .N] > 0) { + old_name <- species[id == species_id, name] + old_scientific_name <- species[id == species_id, scientific_name] + input <- readline(glue("\\ + Taxon already present ({old_name}, {old_scientific_name}). \\ + Replace with {species_name} ({species_scientific_name})? [y/N] ")) + + if (input == "y") { + species <- species[id != species_id] + chromosomes <- chromosomes[species != species_id] + } else { + next + } + } + + species_chromosomes <- db |> + dbGetQuery(glue(" + SELECT seq_region_id, seq_region.name, length + FROM seq_region + JOIN seq_region_attrib USING (seq_region_id) + JOIN attrib_type USING (attrib_type_id) + WHERE code = 'karyotype_rank' + AND NOT EXISTS + (SELECT * FROM seq_region_attrib AS chromosome_attrib + JOIN attrib_type USING (attrib_type_id) + WHERE chromosome_attrib.seq_region_id = seq_region.seq_region_id + AND code = 'sequence_location' + AND chromosome_attrib.value != 'nuclear_chromosome'); + ")) |> + as.data.table() |> + setnames("seq_region_id", "id") + + species_chromosomes[, species := species_id] + + if (nrow(species_chromosomes) == 0) { + rlog::log_info("Skipping (no karyotype)") + next + } + + species <- rbind(species, data.table( + id = species_id, + name = species_name, + scientific_name = species_scientific_name, + table_name = table + )) + + chromosomes <- rbind(chromosomes, species_chromosomes) +} + +dbDisconnect(db) + +fwrite(species, "species.csv") +fwrite(chromosomes, "chromosomes.csv") \ No newline at end of file