Use ensembl database server instead of biomart

2025-10-26 02:37:25 +01:00 · 2023-11-08 12:15:26 +01:00 · 2023-11-08 12:15:26 +01:00 · 795fe99003
commit 795fe99003
parent 948cb337db
16 changed files with 373 additions and 452 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,5 @@
 /cache
 .Rproj.user
 /chromosomes.csv
 /genes.csv
 /species.csv
--- a/R/data.R
+++ b/R/data.R
@ -2,13 +2,26 @@
 #'
 #' @format A [data.table] with the following columns:
 #' \describe{
-#'   \item{id}{Unique species ID}
+#'   \item{id}{Unique species ID, these are NCBI taxon IDs}
 #'   \item{name}{Human readable species name}
 #'   \item{scientific_name}{Scientific name of the species}
 #'   \item{table_name}{Table name within the Ensembl database}
 #'   \item{n_chromosomes}{Number of chromosomes}
 #'   \item{median_chromosome_length}{Median length of chromosomes}
 #' }
 "species"
 #' Information on chromosomes for each included species.
 #'
 #' @format A [data.table] with the following columns:
 #' \describe{
 #'   \item{species}{Species ID}
 #'   \item{id}{Chromosome ID, these are Ensembl sequence IDs}
 #'   \item{name}{Chromosome name}
 #'   \item{length}{Length in base pairs}
 #' }
 "chromosomes"
 #' Information on human genes within the Ensembl database.
 #'
 #' This includes only genes on the primary suggested assembly of the human
@ -18,7 +31,7 @@
 #' \describe{
 #'   \item{id}{Ensembl gene ID}
 #'   \item{name}{The gene's HGNC name (if available)}
-#'   \item{chrosome}{The human chromosome the gene is located on}
+#'   \item{chromosome}{The human chromosome the gene is located on}
 #' }
 "genes"
@ -31,7 +44,7 @@
 #' \describe{
 #'   \item{species}{Species ID}
 #'   \item{gene}{Gene ID}
-#'   \item{chromosome_name}{Chromosome name from the specified species}
+#'   \item{chromosome}{Chromosome ID}
 #'   \item{start_position}{Start position in base pairs}
 #'   \item{end_position}{End position in base pairs}
 #'   \item{distance}{Computed distance to nearest telomere}
--- a/R/plots.R
+++ b/R/plots.R
@ -561,13 +561,14 @@ plot_scores_by_position <- function(ranking,
  }
  distance_data <- if (!is.null(chromosome_name)) {
-    chromosome_name_ <- chromosome_name
+    chromosome_id <- geposan::chromosomes[
-    geposan::distances[
+      species == "9606" & name == chromosome_name,
-      species == "hsapiens" &
+      id
        chromosome_name == chromosome_name_
    ]
    geposan::distances[species == "9606" & chromosome == chromosome_id]
  } else {
-    geposan::distances[species == "hsapiens"]
+    geposan::distances[species == "9606"]
  }
  data <- merge(ranking, distance_data, by = "gene")
--- a/data/chromosomes.rda
+++ b/data/chromosomes.rda
--- a/data/distances.rda
+++ b/data/distances.rda
--- a/data/genes.rda
+++ b/data/genes.rda
--- a/data/species.rda
+++ b/data/species.rda
--- a/man/chromosomes.Rd
+++ b/man/chromosomes.Rd
@ -0,0 +1,22 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/data.R
 \docType{data}
 \name{chromosomes}
 \alias{chromosomes}
 \title{Information on chromosomes for each included species.}
 \format{
 A \link{data.table} with the following columns:
 \describe{
 \item{species}{Species ID}
 \item{id}{Chromosome ID, these are Ensembl sequence IDs}
 \item{name}{Chromosome name}
 \item{length}{Length in base pairs}
 }
 }
 \usage{
 chromosomes
 }
 \description{
 Information on chromosomes for each included species.
 }
 \keyword{datasets}
--- a/man/distances.Rd
+++ b/man/distances.Rd
@ -9,7 +9,7 @@ A \link{data.table} with the following columns:
 \describe{
 \item{species}{Species ID}
 \item{gene}{Gene ID}
-\item{chromosome_name}{Chromosome name from the specified species}
+\item{chromosome}{Chromosome ID}
 \item{start_position}{Start position in base pairs}
 \item{end_position}{End position in base pairs}
 \item{distance}{Computed distance to nearest telomere}
--- a/man/genes.Rd
+++ b/man/genes.Rd
@ -9,7 +9,7 @@ A \link{data.table} with the following columns:
 \describe{
 \item{id}{Ensembl gene ID}
 \item{name}{The gene's HGNC name (if available)}
-\item{chrosome}{The human chromosome the gene is located on}
+\item{chromosome}{The human chromosome the gene is located on}
 }
 }
 \usage{
--- a/man/species.Rd
+++ b/man/species.Rd
@ -7,8 +7,10 @@
 \format{
 A \link{data.table} with the following columns:
 \describe{
-\item{id}{Unique species ID}
+\item{id}{Unique species ID, these are NCBI taxon IDs}
 \item{name}{Human readable species name}
 \item{scientific_name}{Scientific name of the species}
 \item{table_name}{Table name within the Ensembl database}
 \item{n_chromosomes}{Number of chromosomes}
 \item{median_chromosome_length}{Median length of chromosomes}
 }
--- a/scripts/chromosome_names.R
+++ b/scripts/chromosome_names.R
@ -1,34 +0,0 @@
 library(data.table)
 library(httr)
 ensembl_api_url <- "https://rest.ensembl.org"
 #' Perform a request to the Ensembl REST API.
 ensembl_request <- function(api_path) {
  content(stop_for_status(GET(
    paste0(ensembl_api_url, api_path),
    content_type_json()
  )))
 }
 #' Get IDs of all available vertebrates.
 get_species_ids <- function() {
  species <- ensembl_request("/info/species")$species
  sapply(species, function(species) species$name)
 }
 #' Get all chromosomes names for a species.
 get_species_chromosomes <- function(species_id) {
  unlist(ensembl_request(
    paste0("/info/assembly/", species_id)
  )$karyotype)
 }
 #' Get a vector of all available unqiue chromosome names.
 #'
 #' There are multiple names for mitochondrial DNA which have to be removed
 #' manually, unfortunately.
 get_all_chromosomes <- function() {
  chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
  unique(unlist(chromosomes))
 }
--- a/scripts/ensembl.R
+++ b/scripts/ensembl.R
@ -1,407 +0,0 @@
 library(data.table)
 rlog::log_info("Connecting to Ensembl API")
 # Object to access the Ensembl API.
 ensembl <- biomaRt::useEnsembl("ensembl", version = 110)
 # Retrieve species information.
 rlog::log_info("Retrieving species information")
 ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
 # Filter out species ID and name from the result.
 species <- ensembl_datasets[, .(
  id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
  name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
 )]
 # List of assemblies that the Ensembl Rest API advertises as chromosomes.
 # Mitochondrial DNA has been manually removed. Unfortunately, species IDs from
 # the Ensembl REST API don't map to dataset names in the BioMart interface.
 # Because of that, we can't programatically filter chromosome names.
 #
 # See get_all_chromosomes()
 valid_chromosome_names <- c(
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "10",
  "11",
  "12",
  "13",
  "14",
  "15",
  "16",
  "17",
  "18",
  "19",
  "X",
  "groupI",
  "groupII",
  "groupIII",
  "groupIV",
  "groupV",
  "groupVI",
  "groupVII",
  "groupVIII",
  "groupIX",
  "groupX",
  "groupXI",
  "groupXII",
  "groupXIII",
  "groupXIV",
  "groupXV",
  "groupXVI",
  "groupXVII",
  "groupXVIII",
  "groupXIX",
  "groupXX",
  "groupXXI",
  "20",
  "Y",
  "21",
  "22",
  "23",
  "24",
  "25",
  "26",
  "27",
  "28",
  "29",
  "30",
  "31",
  "32",
  "33",
  "34",
  "35",
  "36",
  "37",
  "38",
  "I",
  "II",
  "III",
  "IV",
  "V",
  "VI",
  "VII",
  "VIII",
  "IX",
  "XI",
  "XII",
  "XIII",
  "XIV",
  "XV",
  "XVI",
  "XVII",
  "XVIII",
  "XIX",
  "XX",
  "XXI",
  "XXII",
  "XXIII",
  "XXIV",
  "7a",
  "7b",
  "Z",
  "W",
  "a",
  "b",
  "c",
  "d",
  "f",
  "g",
  "h",
  "39",
  "40",
  "1a",
  "22a",
  "sgr01",
  "sgr02",
  "sgr03",
  "sgr04",
  "sgr05",
  "sgr06",
  "sgr07",
  "sgr08",
  "sgr09",
  "sgr10",
  "sgr11",
  "sgr12",
  "sgr13",
  "sgr14",
  "sgr15",
  "sgr16",
  "sgr17",
  "sgr18",
  "sgr19",
  "LGE64",
  "2A",
  "2B",
  "X1",
  "X2",
  "X3",
  "X4",
  "X5",
  "LG1",
  "LG2",
  "LG3",
  "LG4",
  "LG5",
  "LG6",
  "LG7",
  "LG8",
  "LG9",
  "LG10",
  "LG11",
  "LG12",
  "LG13",
  "LG14",
  "LG15",
  "LG16",
  "LG17",
  "LG18",
  "LG19",
  "LG20",
  "LG22",
  "LG23",
  "4A",
  "1A",
  "25LG1",
  "25LG2",
  "LGE22",
  "LG21",
  "A1",
  "A2",
  "A3",
  "B1",
  "B2",
  "B3",
  "B4",
  "C1",
  "C2",
  "D1",
  "D2",
  "D3",
  "D4",
  "E1",
  "E2",
  "E3",
  "F1",
  "F2",
  "LG34",
  "LG35",
  "LG24",
  "LG25",
  "LG26",
  "LG27",
  "LG28",
  "LG29",
  "LG30",
  "MIC_1",
  "MIC_10",
  "MIC_11",
  "MIC_2",
  "MIC_3",
  "MIC_4",
  "MIC_5",
  "MIC_6",
  "MIC_7",
  "MIC_8",
  "MIC_9",
  "2L",
  "2R",
  "3L",
  "3R",
  "LGE22C19W28_E50C23",
  "LG01",
  "LG02",
  "LG03",
  "LG04",
  "LG05",
  "LG06",
  "LG07",
  "LG08",
  "LG09",
  "LG7_11",
  "41",
  "42",
  "43",
  "44",
  "45",
  "46",
  "47",
  "48",
  "49",
  "50",
  "LG28B",
  "LG30F",
  "LG36F",
  "LG37M",
  "LG42F",
  "LG44F",
  "LG45M",
  "LG48F",
  "LG49B"
 )
 #' Get all chromosome names for an Ensembl dataset.
 #'
 #' The function tries to filter out valid chromosome names from the available
 #' assemblies in the dataset.
 get_chromosome_names <- function(dataset) {
  chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
  chromosome_names[chromosome_names %chin% valid_chromosome_names]
 }
 # Retrieve information on human genes. This will only include genes on
 # assembled chromosomes. Chromosomes are filtered using get_chromosome_names().
 rlog::log_info("Retrieving information on human genes")
 dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
 human_data <- data.table(biomaRt::getBM(
  attributes = c(
    "ensembl_gene_id",
    "hgnc_symbol",
    "chromosome_name",
    "start_position",
    "end_position"
  ),
  filters = "chromosome_name",
  values = get_chromosome_names(dataset),
  mart = dataset
 ))
 # Remove duplicated gene IDs (at the time of writing, there are a handful).
 human_data <- unique(human_data, by = "ensembl_gene_id")
 # Only keep relevant information on genes.
 genes <- human_data[, .(
  id = ensembl_gene_id,
  name = hgnc_symbol,
  chromosome = chromosome_name
 )]
 # Retrieve gene distance data across species.
 rlog::log_info("Retrieving distance data")
 distances <- data.table()
 #' Handle data for one species.
 handle_species <- function(species_id, species_data) {
  chromosomes <- species_data[,
    .(chromosome_length = max(end_position)),
    by = chromosome_name
  ]
  # Store the number of chromosomes in the species table.
  species[id == species_id, n_chromosomes := nrow(chromosomes)]
  # Store the median chromosome length in the species table.
  species[
    id == species_id,
    median_chromosome_length := chromosomes[, median(chromosome_length)]
  ]
  # Precompute the genes' distance to the nearest telomere.
  species_distances <- species_data[
    chromosomes,
    .(
      species = species_id,
      gene = ensembl_gene_id,
      chromosome_name = chromosome_name,
      start_position = start_position,
      end_position = end_position,
      distance = pmin(
        start_position,
        chromosome_length - end_position
      )
    ),
    on = "chromosome_name"
  ]
  # Add species distances to the distances table.
  distances <<- rbindlist(list(distances, species_distances))
 }
 # Handle the human first, as we already retrieved the data and don't need to
 # filter based on orthologies.
 handle_species("hsapiens", human_data)
 # Iterate through all other species and retrieve their distance data.
 for (species_id in species[id != "hsapiens", id]) {
  rlog::log_info(sprintf("Loading species \"%s\"", species_id))
  dataset <- biomaRt::useDataset(
    sprintf("%s_gene_ensembl", species_id),
    mart = ensembl
  )
  # Besides the attributes that are always present, we need to check for
  # human orthologs. Some species don't have that information and will be
  # skipped.
  if (!"hsapiens_homolog_ensembl_gene" %chin%
    biomaRt::listAttributes(dataset, what = "name")) {
    rlog::log_info("No data on human orthologs")
    species <- species[id != species_id]
    next
  }
  chromosome_names <- get_chromosome_names(dataset)
  # Skip the species, if there are no assembled chromosomes.
  if (length(chromosome_names) <= 0) {
    rlog::log_info("No matching chromosome assemblies")
    species <- species[id != species_id]
    next
  }
  # Retrieve information on all genes of the current species, that have
  # human orthologs. This is called "homolog" in the Ensembl schema.
  species_distances <- data.table(biomaRt::getBM(
    attributes = c(
      "hsapiens_homolog_ensembl_gene",
      "chromosome_name",
      "start_position",
      "end_position"
    ),
    filters = c("with_hsapiens_homolog", "chromosome_name"),
    values = list(TRUE, chromosome_names),
    mart = dataset
  ))
  # Only include human genes that we have information on.
  species_distances <- species_distances[
    hsapiens_homolog_ensembl_gene %chin% genes$id
  ]
  # Only include one ortholog per human gene.
  species_distances <- unique(
    species_distances,
    by = "hsapiens_homolog_ensembl_gene"
  )
  # Rename gene ID column to match the human data.
  setnames(
    species_distances,
    "hsapiens_homolog_ensembl_gene",
    "ensembl_gene_id"
  )
  handle_species(species_id, species_distances)
 }
 # Save data in the appropriate place.
 usethis::use_data(species, overwrite = TRUE)
 usethis::use_data(genes, overwrite = TRUE)
 usethis::use_data(distances, overwrite = TRUE)
--- a/scripts/ensembl_data.R
+++ b/scripts/ensembl_data.R
@ -0,0 +1,77 @@
 # This script does post processing on the data from Ensembl and imports it into
 # the R package. Run this script after `ensembl_species.R` and
 # `ensembl_genes.R`.
 library(data.table)
 # Load the intermediate CSV files from the working directory. Within this
 # repository, "species.csv" and "chromosomes.csv" are written by
 # `ensembl_species.R` and "genes.csv" is written by `ensembl_genes.R`.
 species <- fread("species.csv")
 chromosomes <- fread("chromosomes.csv")
 genes <- fread("genes.csv")
 # Summarise the chromosomes of each species: the number of chromosome rows and
 # the median chromosome length (coerced to double). Inside `[.data.table`,
 # `length` and `species` refer to the columns of `chromosomes`, not to
 # `base::length()` or the `species` table loaded above.
 species_metadata <- chromosomes[,
  .(
    n_chromosomes = .N,
    median_chromosome_length = as.double(stats::median(length))
  ),
  by = species
 ]
 # Attach the summary columns to the species table by matching `species$id`
 # with `species_metadata$species`. `merge()` performs an inner join by default,
 # so species without any row in `chromosomes` are dropped here. `sort = FALSE`
 # prevents the result from being sorted by the join column.
 species <- merge(
  species,
  species_metadata,
  by.x = "id",
  by.y = "species",
  sort = FALSE
 )
 # Remove duplicated genes within species.
 # `duplicated()` flags every occurrence after the first one, so the first row
 # per (species, gene) pair is the one that is kept.
 genes <- genes[!duplicated(genes, by = c("species", "gene"))]
 # Join each gene with the chromosome it is located on. The gene's `chromosome`
 # column is matched against the chromosome's `id` (within the same species),
 # which adds the chromosome's `name` and `length` columns. This is an inner
 # join as well: genes without a matching chromosome row are dropped.
 genes_chromosomes <- merge(
  genes,
  chromosomes,
  by.x = c("species", "chromosome"),
  by.y = c("species", "id"),
  sort = FALSE
 )
 # Precompute each gene's distance to the nearest end of its chromosome: the
 # smaller of the distance from the chromosome start to the gene start and the
 # distance from the gene end to the chromosome end. `pmin()` is the vectorised
 # element-wise minimum; like the comparison it replaces, it yields `NA` when
 # either operand is `NA`.
 genes_chromosomes[, distance := pmin(
  start_position,
  length - end_position
 )]
 # Keep only the columns that make up the exported `distances` table. The
 # `chromosome` column holds the chromosome ID that was used as the join key.
 distances <- genes_chromosomes[, .(
  species,
  gene,
  chromosome,
  start_position,
  end_position,
  distance
 )]
 # This table will hold information on human genes.
 # Rows are restricted to species 9606 (used as the human species ID throughout
 # these scripts). Unlike in `distances`, the `chromosome` column here is the
 # chromosome *name* that was joined in from the chromosomes table.
 genes <- genes_chromosomes[
  species == 9606,
  .(
    id = gene,
    chromosome = name
  )
 ]
 # Add a `name` column by converting the gene IDs to the "HGNC" namespace using
 # g:Profiler (this requires network access).
 # NOTE(review): the assignment relies on `gconvert()` returning exactly one
 # row per input ID, in input order, when called with `mthreshold = 1` and
 # `filter_na = FALSE` -- confirm against the gprofiler2 documentation.
 genes[, name := gprofiler2::gconvert(
  id,
  target = "HGNC",
  mthreshold = 1,
  filter_na = FALSE
 )$target]
 # Previous versions of geposan used different species IDs. For backwards
 # compatibility, convert integer IDs to character. This is done for every
 # exported table that carries a species ID, so that comparisons such as
 # `species == "9606"` behave the same way on all of them.
 species[, id := as.character(id)]
 chromosomes[, species := as.character(species)]
 distances[, species := as.character(species)]
 # Export all datasets that are documented in R/data.R. The `chromosomes` table
 # has to be exported as well: it is needed to translate the chromosome IDs
 # stored in `distances` back to chromosome names.
 usethis::use_data(species, overwrite = TRUE)
 usethis::use_data(chromosomes, overwrite = TRUE)
 usethis::use_data(genes, overwrite = TRUE)
 usethis::use_data(distances, overwrite = TRUE)
--- a/scripts/ensembl_genes.R
+++ b/scripts/ensembl_genes.R
@ -0,0 +1,125 @@
 # This script retrieves genome data from the Ensembl database. Run
 # `ensembl_species.R` first and keep its output files "species.csv" and
 # "chromosomes.csv".
 library(data.table)
 library(DBI)
 library(glue)
 # Name of the database that is selected with `USE` before the homology queries
 # are run. Despite the variable name, this is used as a database name.
 compara_table <- "ensembl_compara_110"
 # This is the output table of this script:
 # It starts out empty with fixed column types; rows are appended per species.
 genes <- data.table(
  species = integer(),
  gene = character(),
  chromosome = integer(),
  start_position = integer(),
  end_position = integer()
 )
 # Input tables, read from the working directory (see the note at the top of
 # this script about `ensembl_species.R`).
 species <- fread("species.csv")
 chromosomes <- fread("chromosomes.csv")
 rlog::log_info("Connecting to Ensembl database server")
 # Connect as user "anonymous" without a password. No database is selected
 # here; that is done with `USE` statements further below.
 db <- dbConnect(
  RMariaDB::MariaDB(),
  host = "ensembldb.ensembl.org",
  port = 5306,
  user = "anonymous"
 )
 rlog::log_info("Retrieving human genes")
 # Species ID that identifies the human rows in `species` and `chromosomes`. It
 # is also used as `taxon_id` filter in the homology query for other species.
 human_species_id <- 9606
 # Skip the retrieval if human rows are already present in `genes`.
 # NOTE(review): `genes` is created empty at the top of this script, so this
 # guard can only trigger if rows have been added to it beforehand.
 human_present_row_count <- genes[species == human_species_id, .N]
 if (human_present_row_count > 0) {
  rlog::log_info(glue("Skipping. Present rows: {human_present_row_count}"))
 } else {
  # Select the human database; `table_name` comes from "species.csv".
  human_table <- species[id == human_species_id, table_name]
  dbExecute(db, glue_sql("USE {`human_table`}", .con = db))
  human_chromosome_ids <- chromosomes[species == human_species_id, id]
  # Fetch all genes located on one of the known human chromosomes and rename
  # the database columns to the column names of the output table. `.con` is
  # passed explicitly, as for every other `glue_sql()` call in this script, so
  # that the interpolated values are quoted for this connection.
  human_genes <- db |>
    dbGetQuery(glue_sql("
      SELECT stable_id, seq_region_id, seq_region_start, seq_region_end
        FROM gene WHERE seq_region_id IN ({human_chromosome_ids*})",
      .con = db
    )) |>
    as.data.table() |>
    setnames(
      c("stable_id", "seq_region_id", "seq_region_start", "seq_region_end"),
      c("gene", "chromosome", "start_position", "end_position")
    )
  human_genes[, species := human_species_id]
  genes <- rbind(genes, human_genes)
 }
 # Make the database named by `compara_table` the default one. Unqualified
 # table names in the query below (`gene_member`, `homology_member`,
 # `homology`) are looked up there, while each species' `gene` table is
 # qualified with that species' own database name.
 dbExecute(db, glue_sql("USE {`compara_table`}", .con = db))
 # Retrieve the genes of every non-human species, one species at a time.
 for (species_id in species[id != human_species_id, id]) {
  # Skip species that already have rows in the output table.
  # NOTE(review): `genes` is never read back from "genes.csv" in this script,
  # so within a single run this branch looks unreachable -- confirm whether
  # resuming from an existing file was intended.
  present_row_count <- genes[species == species_id, .N]
  species_name <- species[id == species_id, name]
  if (present_row_count > 0) {
    rlog::log_info(glue("Skipping species {species_id} ({species_name})"))
    rlog::log_info(glue("Present rows: {present_row_count}"))
    next
  }
  rlog::log_info(glue(
    "Retrieving genes for species {species_id} ({species_name})"
  ))
  table_name <- species[id == species_id, table_name]
  chromosome_ids <- chromosomes[species == species_id, id]
  # The inner `species` subquery selects the genes of the current species that
  # lie on one of its known chromosomes and that are members of a homology
  # described as one of the three listed ortholog kinds. The `human` subquery
  # selects the homology members whose `taxon_id` is the human species ID. Both
  # are joined on `homology_id`, so each result row pairs a human gene ID with
  # the position of a gene of the current species. One human gene may therefore
  # appear in several rows.
  species_genes <- db |>
    dbGetQuery(glue_sql("
      SELECT
        human.stable_id AS gene,
        species.seq_region_id AS chromosome,
        species.seq_region_start AS start_position,
        species.seq_region_end AS end_position
      FROM
        (
          SELECT
            homology_id,
            stable_id,
            seq_region_id,
            seq_region_start,
            seq_region_end
          FROM {`table_name`}.gene
            JOIN gene_member USING (stable_id)
            JOIN homology_member USING (gene_member_id)
            JOIN homology USING (homology_id)
          WHERE seq_region_id IN ({chromosome_ids*})
            AND homology.description IN (
              'ortholog_one2one',
              'ortholog_one2many',
              'ortholog_many2many'
            )
        ) AS species
        JOIN (
          SELECT
            homology_id,
            stable_id
          FROM homology_member
            JOIN gene_member USING (gene_member_id)
          WHERE taxon_id = {human_species_id}
        ) AS human ON species.homology_id = human.homology_id;
    ", .con = db)) |>
    as.data.table()
  # An empty result is only logged; the code below still runs for it.
  if (nrow(species_genes) == 0) {
    rlog::log_info("No human homologs found")
  }
  species_genes[, species := species_id]
  genes <- rbind(genes, species_genes)
  # The complete table collected so far is rewritten after every species
  # (presumably so that partial results are kept if the script is interrupted).
  fwrite(genes, "genes.csv")
 }
 dbDisconnect(db)
--- a/scripts/ensembl_species.R
+++ b/scripts/ensembl_species.R
@ -0,0 +1,119 @@
 # This is an *interactive* script for retrieving information on species from the
 # Ensembl database. There are taxons with more than one entry in the database.
 # For each species that has already been seen, the script asks whether to keep
 # it or replace it. We recommend choosing the most generic entry in most
 # cases.
 library(data.table)
 library(DBI)
 library(glue)
 # These are the output tables of this script:
 # Both start out empty with fixed column types and are filled in the loop
 # below. `species$table_name` stores the name of the database that a species
 # was read from; `chromosomes$species` refers to `species$id`.
 species <- data.table(
  id = integer(),
  name = character(),
  scientific_name = character(),
  table_name = character()
 )
 chromosomes <- data.table(
  species = integer(),
  id = integer(),
  name = character(),
  length = integer()
 )
 rlog::log_info("Connecting to Ensembl database server")
 # Connect as user "anonymous" without a password. No database is selected
 # here; the loop below switches databases with `USE`.
 db <- dbConnect(
  RMariaDB::MariaDB(),
  host = "ensembldb.ensembl.org",
  port = 5306,
  user = "anonymous"
 )
 rlog::log_info("Retrieving list of databases")
 # Names of all databases matching the pattern, taken from the first column of
 # the result. Despite the variable name, these are database names.
 tables <- dbGetQuery(db, "SHOW DATABASES LIKE '%_core_110_%'")[, 1]
 # Populates the species and chromosomes tables using data from each species'
 # database within the Ensembl server. Databases without a karyotype are skipped
 # without changing the tables in any way.
 for (table in tables) {
  rlog::log_info(glue("Reading species information from {table}"))
  dbExecute(db, glue_sql("USE {`table`}", .con = db))
  # Read the species' identity from the `meta` table of the selected database.
  # NOTE(review): the coercions below assume that each query returns exactly
  # one row -- confirm that the keys are unique within `meta`.
  species_id <- db |>
    dbGetQuery("
      SELECT meta_value FROM meta
        WHERE meta_key = 'species.taxonomy_id'") |>
    as.integer()
  species_name <- db |>
    dbGetQuery("
      SELECT meta_value FROM meta
        WHERE meta_key = 'species.display_name'") |>
    as.character()
  species_scientific_name <- db |>
    dbGetQuery("
      SELECT meta_value FROM meta
        WHERE meta_key = 'species.scientific_name'") |>
    as.character()
  rlog::log_info(glue(
    "Found species {species_name} ({species_scientific_name})"
  ))
  # Select all sequence regions that carry a 'karyotype_rank' attribute, except
  # those that have a 'sequence_location' attribute with a value other than
  # 'nuclear_chromosome'.
  species_chromosomes <- db |>
    dbGetQuery(glue("
      SELECT seq_region_id, seq_region.name, length
      FROM seq_region
        JOIN seq_region_attrib USING (seq_region_id)
        JOIN attrib_type USING (attrib_type_id)
      WHERE code = 'karyotype_rank'
        AND NOT EXISTS
          (SELECT * FROM seq_region_attrib AS chromosome_attrib
            JOIN attrib_type USING (attrib_type_id)
            WHERE chromosome_attrib.seq_region_id = seq_region.seq_region_id
              AND code = 'sequence_location'
              AND chromosome_attrib.value != 'nuclear_chromosome');
    ")) |>
    as.data.table() |>
    setnames("seq_region_id", "id")
  # Check for a karyotype *before* dealing with an already present taxon.
  # Otherwise, confirming the replacement would delete the existing entry even
  # though this database has nothing to replace it with, and the taxon would
  # be lost from the output entirely.
  if (nrow(species_chromosomes) == 0) {
    rlog::log_info("Skipping (no karyotype)")
    next
  }
  species_chromosomes[, species := species_id]
  # The same taxon may be provided by more than one database. Ask the user
  # whether the entry that was seen earlier should be replaced by this one;
  # any answer other than "y" keeps the existing entry.
  if (species[id == species_id, .N] > 0) {
    old_name <- species[id == species_id, name]
    old_scientific_name <- species[id == species_id, scientific_name]
    input <- readline(glue("\\
      Taxon already present ({old_name}, {old_scientific_name}). \\
      Replace with {species_name} ({species_scientific_name})? [y/N] "))
    if (input == "y") {
      # Drop the old entry and its chromosomes. Inside `chromosomes[...]`,
      # `species` refers to the column of that table.
      species <- species[id != species_id]
      chromosomes <- chromosomes[species != species_id]
    } else {
      next
    }
  }
  species <- rbind(species, data.table(
    id = species_id,
    name = species_name,
    scientific_name = species_scientific_name,
    table_name = table
  ))
  chromosomes <- rbind(chromosomes, species_chromosomes)
 }
 # All queries are done; close the database connection.
 dbDisconnect(db)
 # Write the collected tables to the working directory. The other Ensembl
 # scripts in this directory read these files.
 fwrite(species, "species.csv")
 fwrite(chromosomes, "chromosomes.csv")