Use ensembl database server instead of biomart

2025-10-25 19:37:23 +02:00 · 2023-11-08 12:15:26 +01:00 · 2023-11-08 12:15:26 +01:00 · 795fe99003
commit 795fe99003
parent 948cb337db
16 changed files with 373 additions and 452 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,5 @@
 /cache
 .Rproj.user
+/chromosomes.csv
+/genes.csv
+/species.csv
--- a/R/data.R
+++ b/R/data.R
@ -2,13 +2,26 @@
 #'
 #' @format A [data.table] with the following columns:
 #' \describe{
-#'   \item{id}{Unique species ID}
+#'   \item{id}{Unique species ID, these are NCBI taxon IDs}
 #'   \item{name}{Human readable species name}
+#'   \item{scientific_name}{Scientific name of the species}
+#'   \item{table_name}{Table name within the Ensembl database}
 #'   \item{n_chromosomes}{Number of chromosomes}
 #'   \item{median_chromosome_length}{Median length of chromosomes}
 #' }
 "species"

+#' Information on chromosomes for each included species.
+#'
+#' @format A [data.table] with the following columns:
+#' \describe{
+#'   \item{species}{Species ID}
+#'   \item{id}{Chromosome ID, theses are Ensembl sequence IDs}
+#'   \item{name}{Chromosome name}
+#'   \item{length}{Length in base pairs}
+#' }
+"chromosomes"
+
 #' Information on human genes within the Ensembl database.
 #'
 #' This includes only genes on the primary suggested assembly of the human
@ -18,7 +31,7 @@
 #' \describe{
 #'   \item{id}{Ensembl gene ID}
 #'   \item{name}{The gene's HGNC name (if available)}
-#'   \item{chrosome}{The human chromosome the gene is located on}
+#'   \item{chromosome}{The human chromosome the gene is located on}
 #' }
 "genes"

@ -31,7 +44,7 @@
 #' \describe{
 #'   \item{species}{Species ID}
 #'   \item{gene}{Gene ID}
-#'   \item{chromosome_name}{Chromosome name from the specified species}
+#'   \item{chromosome}{Chromosome ID}
 #'   \item{start_position}{Start position in base pairs}
 #'   \item{end_position}{End position in base pairs}
 #'   \item{distance}{Computed distance to nearest telomere}
--- a/R/plots.R
+++ b/R/plots.R
@ -561,13 +561,14 @@ plot_scores_by_position <- function(ranking,
  }

  distance_data <- if (!is.null(chromosome_name)) {
-    chromosome_name_ <- chromosome_name
-    geposan::distances[
-      species == "hsapiens" &
-        chromosome_name == chromosome_name_
+    chromosome_id <- geposan::chromosomes[
+      species == "9606" & name == chromosome_name,
+      id
    ]
+
+    geposan::distances[species == "9606" & chromosome == chromosome_id]
  } else {
-    geposan::distances[species == "hsapiens"]
+    geposan::distances[species == "9606"]
  }

  data <- merge(ranking, distance_data, by = "gene")
--- a/data/chromosomes.rda
+++ b/data/chromosomes.rda
--- a/data/distances.rda
+++ b/data/distances.rda
--- a/data/genes.rda
+++ b/data/genes.rda
--- a/data/species.rda
+++ b/data/species.rda
--- a/man/chromosomes.Rd
+++ b/man/chromosomes.Rd
@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{chromosomes}
+\alias{chromosomes}
+\title{Information on chromosomes for each included species.}
+\format{
+A \link{data.table} with the following columns:
+\describe{
+\item{species}{Species ID}
+\item{id}{Chromosome ID, theses are Ensembl sequence IDs}
+\item{name}{Chromosome name}
+\item{length}{Length in base pairs}
+}
+}
+\usage{
+chromosomes
+}
+\description{
+Information on chromosomes for each included species.
+}
+\keyword{datasets}
--- a/man/distances.Rd
+++ b/man/distances.Rd
@ -9,7 +9,7 @@ A \link{data.table} with the following columns:
 \describe{
 \item{species}{Species ID}
 \item{gene}{Gene ID}
-\item{chromosome_name}{Chromosome name from the specified species}
+\item{chromosome}{Chromosome ID}
 \item{start_position}{Start position in base pairs}
 \item{end_position}{End position in base pairs}
 \item{distance}{Computed distance to nearest telomere}
--- a/man/genes.Rd
+++ b/man/genes.Rd
@ -9,7 +9,7 @@ A \link{data.table} with the following columns:
 \describe{
 \item{id}{Ensembl gene ID}
 \item{name}{The gene's HGNC name (if available)}
-\item{chrosome}{The human chromosome the gene is located on}
+\item{chromosome}{The human chromosome the gene is located on}
 }
 }
 \usage{
--- a/man/species.Rd
+++ b/man/species.Rd
@ -7,8 +7,10 @@
 \format{
 A \link{data.table} with the following columns:
 \describe{
-\item{id}{Unique species ID}
+\item{id}{Unique species ID, these are NCBI taxon IDs}
 \item{name}{Human readable species name}
+\item{scientific_name}{Scientific name of the species}
+\item{table_name}{Table name within the Ensembl database}
 \item{n_chromosomes}{Number of chromosomes}
 \item{median_chromosome_length}{Median length of chromosomes}
 }
--- a/scripts/chromosome_names.R
+++ b/scripts/chromosome_names.R
@ -1,34 +0,0 @@
-library(data.table)
-library(httr)
-
-ensembl_api_url <- "https://rest.ensembl.org"
-
-#' Perform a request to the Ensembl REST API.
-ensembl_request <- function(api_path) {
-  content(stop_for_status(GET(
-    paste0(ensembl_api_url, api_path),
-    content_type_json()
-  )))
-}
-
-#' Get IDs of all available vertebrates.
-get_species_ids <- function() {
-  species <- ensembl_request("/info/species")$species
-  sapply(species, function(species) species$name)
-}
-
-#' Get all chromosomes names for a species.
-get_species_chromosomes <- function(species_id) {
-  unlist(ensembl_request(
-    paste0("/info/assembly/", species_id)
-  )$karyotype)
-}
-
-#' Get a vector of all available unqiue chromosome names.
-#'
-#' There are multiple names for mitochondrial DNA which have to be removed
-#' manually, unfortunately.
-get_all_chromosomes <- function() {
-  chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
-  unique(unlist(chromosomes))
-}
--- a/scripts/ensembl.R
+++ b/scripts/ensembl.R
@ -1,407 +0,0 @@
-library(data.table)
-
-rlog::log_info("Connecting to Ensembl API")
-
-# Object to access the Ensembl API.
-ensembl <- biomaRt::useEnsembl("ensembl", version = 110)
-
-# Retrieve species information.
-
-rlog::log_info("Retrieving species information")
-ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
-
-# Filter out species ID and name from the result.
-species <- ensembl_datasets[, .(
-  id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
-  name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
-)]
-
-# List of assemblies that the Ensembl Rest API advertises as chromosomes.
-# Mitochondrial DNA has been manually removed. Unfortunately, species IDs from
-# the Ensembl REST API don't map to dataset names in the BioMart interface.
-# Because of that, we can't programatically filter chromosome names.
-#
-# See get_all_chromosomes()
-valid_chromosome_names <- c(
-  "1",
-  "2",
-  "3",
-  "4",
-  "5",
-  "6",
-  "7",
-  "8",
-  "9",
-  "10",
-  "11",
-  "12",
-  "13",
-  "14",
-  "15",
-  "16",
-  "17",
-  "18",
-  "19",
-  "X",
-  "groupI",
-  "groupII",
-  "groupIII",
-  "groupIV",
-  "groupV",
-  "groupVI",
-  "groupVII",
-  "groupVIII",
-  "groupIX",
-  "groupX",
-  "groupXI",
-  "groupXII",
-  "groupXIII",
-  "groupXIV",
-  "groupXV",
-  "groupXVI",
-  "groupXVII",
-  "groupXVIII",
-  "groupXIX",
-  "groupXX",
-  "groupXXI",
-  "20",
-  "Y",
-  "21",
-  "22",
-  "23",
-  "24",
-  "25",
-  "26",
-  "27",
-  "28",
-  "29",
-  "30",
-  "31",
-  "32",
-  "33",
-  "34",
-  "35",
-  "36",
-  "37",
-  "38",
-  "I",
-  "II",
-  "III",
-  "IV",
-  "V",
-  "VI",
-  "VII",
-  "VIII",
-  "IX",
-  "XI",
-  "XII",
-  "XIII",
-  "XIV",
-  "XV",
-  "XVI",
-  "XVII",
-  "XVIII",
-  "XIX",
-  "XX",
-  "XXI",
-  "XXII",
-  "XXIII",
-  "XXIV",
-  "7a",
-  "7b",
-  "Z",
-  "W",
-  "a",
-  "b",
-  "c",
-  "d",
-  "f",
-  "g",
-  "h",
-  "39",
-  "40",
-  "1a",
-  "22a",
-  "sgr01",
-  "sgr02",
-  "sgr03",
-  "sgr04",
-  "sgr05",
-  "sgr06",
-  "sgr07",
-  "sgr08",
-  "sgr09",
-  "sgr10",
-  "sgr11",
-  "sgr12",
-  "sgr13",
-  "sgr14",
-  "sgr15",
-  "sgr16",
-  "sgr17",
-  "sgr18",
-  "sgr19",
-  "LGE64",
-  "2A",
-  "2B",
-  "X1",
-  "X2",
-  "X3",
-  "X4",
-  "X5",
-  "LG1",
-  "LG2",
-  "LG3",
-  "LG4",
-  "LG5",
-  "LG6",
-  "LG7",
-  "LG8",
-  "LG9",
-  "LG10",
-  "LG11",
-  "LG12",
-  "LG13",
-  "LG14",
-  "LG15",
-  "LG16",
-  "LG17",
-  "LG18",
-  "LG19",
-  "LG20",
-  "LG22",
-  "LG23",
-  "4A",
-  "1A",
-  "25LG1",
-  "25LG2",
-  "LGE22",
-  "LG21",
-  "A1",
-  "A2",
-  "A3",
-  "B1",
-  "B2",
-  "B3",
-  "B4",
-  "C1",
-  "C2",
-  "D1",
-  "D2",
-  "D3",
-  "D4",
-  "E1",
-  "E2",
-  "E3",
-  "F1",
-  "F2",
-  "LG34",
-  "LG35",
-  "LG24",
-  "LG25",
-  "LG26",
-  "LG27",
-  "LG28",
-  "LG29",
-  "LG30",
-  "MIC_1",
-  "MIC_10",
-  "MIC_11",
-  "MIC_2",
-  "MIC_3",
-  "MIC_4",
-  "MIC_5",
-  "MIC_6",
-  "MIC_7",
-  "MIC_8",
-  "MIC_9",
-  "2L",
-  "2R",
-  "3L",
-  "3R",
-  "LGE22C19W28_E50C23",
-  "LG01",
-  "LG02",
-  "LG03",
-  "LG04",
-  "LG05",
-  "LG06",
-  "LG07",
-  "LG08",
-  "LG09",
-  "LG7_11",
-  "41",
-  "42",
-  "43",
-  "44",
-  "45",
-  "46",
-  "47",
-  "48",
-  "49",
-  "50",
-  "LG28B",
-  "LG30F",
-  "LG36F",
-  "LG37M",
-  "LG42F",
-  "LG44F",
-  "LG45M",
-  "LG48F",
-  "LG49B"
-)
-
-#' Get all chromosome names for an Ensembl dataset.
-#'
-#' The function tries to filter out valid chromosome names from the available
-#' assemblies in the dataset.
-get_chromosome_names <- function(dataset) {
-  chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
-  chromosome_names[chromosome_names %chin% valid_chromosome_names]
-}
-
-# Retrieve information on human genes. This will only include genes on
-# assembled chromosomes. Chromosomes are filtered using get_chromosome_names().
-
-rlog::log_info("Retrieving information on human genes")
-dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
-
-human_data <- data.table(biomaRt::getBM(
-  attributes = c(
-    "ensembl_gene_id",
-    "hgnc_symbol",
-    "chromosome_name",
-    "start_position",
-    "end_position"
-  ),
-  filters = "chromosome_name",
-  values = get_chromosome_names(dataset),
-  mart = dataset
-))
-
-# Remove duplicated gene IDs (at the time of writing, there are a handful).
-human_data <- unique(human_data, by = "ensembl_gene_id")
-
-# Only keep relevant information on genes.
-genes <- human_data[, .(
-  id = ensembl_gene_id,
-  name = hgnc_symbol,
-  chromosome = chromosome_name
-)]
-
-# Retrieve gene distance data across species.
-
-rlog::log_info("Retrieving distance data")
-distances <- data.table()
-
-#' Handle data for one species.
-handle_species <- function(species_id, species_data) {
-  chromosomes <- species_data[,
-    .(chromosome_length = max(end_position)),
-    by = chromosome_name
-  ]
-
-  # Store the number of chromosomes in the species table.
-  species[id == species_id, n_chromosomes := nrow(chromosomes)]
-
-  # Store the median chromosome length in the species table.
-  species[
-    id == species_id,
-    median_chromosome_length := chromosomes[, median(chromosome_length)]
-  ]
-
-  # Precompute the genes' distance to the nearest telomere.
-  species_distances <- species_data[
-    chromosomes,
-    .(
-      species = species_id,
-      gene = ensembl_gene_id,
-      chromosome_name = chromosome_name,
-      start_position = start_position,
-      end_position = end_position,
-      distance = pmin(
-        start_position,
-        chromosome_length - end_position
-      )
-    ),
-    on = "chromosome_name"
-  ]
-
-  # Add species distances to the distances table.
-  distances <<- rbindlist(list(distances, species_distances))
-}
-
-# Handle the human first, as we already retrieved the data and don't need to
-# filter based on orthologies.
-handle_species("hsapiens", human_data)
-
-# Iterate through all other species and retrieve their distance data.
-for (species_id in species[id != "hsapiens", id]) {
-  rlog::log_info(sprintf("Loading species \"%s\"", species_id))
-
-  dataset <- biomaRt::useDataset(
-    sprintf("%s_gene_ensembl", species_id),
-    mart = ensembl
-  )
-
-  # Besides the attributes that are always present, we need to check for
-  # human orthologs. Some species don't have that information and will be
-  # skipped.
-  if (!"hsapiens_homolog_ensembl_gene" %chin%
-    biomaRt::listAttributes(dataset, what = "name")) {
-    rlog::log_info("No data on human orthologs")
-    species <- species[id != species_id]
-
-    next
-  }
-
-  chromosome_names <- get_chromosome_names(dataset)
-
-  # Skip the species, if there are no assembled chromosomes.
-  if (length(chromosome_names) <= 0) {
-    rlog::log_info("No matching chromosome assemblies")
-    species <- species[id != species_id]
-
-    next
-  }
-
-  # Retrieve information on all genes of the current species, that have
-  # human orthologs. This is called "homolog" in the Ensembl schema.
-  species_distances <- data.table(biomaRt::getBM(
-    attributes = c(
-      "hsapiens_homolog_ensembl_gene",
-      "chromosome_name",
-      "start_position",
-      "end_position"
-    ),
-    filters = c("with_hsapiens_homolog", "chromosome_name"),
-    values = list(TRUE, chromosome_names),
-    mart = dataset
-  ))
-
-  # Only include human genes that we have information on.
-  species_distances <- species_distances[
-    hsapiens_homolog_ensembl_gene %chin% genes$id
-  ]
-
-  # Only include one ortholog per human gene.
-  species_distances <- unique(
-    species_distances,
-    by = "hsapiens_homolog_ensembl_gene"
-  )
-
-  # Rename gene ID column to match the human data.
-  setnames(
-    species_distances,
-    "hsapiens_homolog_ensembl_gene",
-    "ensembl_gene_id"
-  )
-
-  handle_species(species_id, species_distances)
-}
-
-# Save data in the appropriate place.
-
-usethis::use_data(species, overwrite = TRUE)
-usethis::use_data(genes, overwrite = TRUE)
-usethis::use_data(distances, overwrite = TRUE)
--- a/scripts/ensembl_data.R
+++ b/scripts/ensembl_data.R
@ -0,0 +1,77 @@
+# This script does post processing on the data from Ensembl and imports it into
+# the R package. Run this script after `ensembl_species.R` and
+# `ensembl_species.R`.
+
+library(data.table)
+
+species <- fread("species.csv")
+chromosomes <- fread("chromosomes.csv")
+genes <- fread("genes.csv")
+
+species_metadata <- chromosomes[,
+  .(
+    n_chromosomes = .N,
+    median_chromosome_length = as.double(stats::median(length))
+  ),
+  by = species
+]
+
+species <- merge(
+  species,
+  species_metadata,
+  by.x = "id",
+  by.y = "species",
+  sort = FALSE
+)
+
+# Remove duplicated genes within species.
+genes <- genes[!duplicated(genes, by = c("species", "gene"))]
+
+genes_chromosomes <- merge(
+  genes,
+  chromosomes,
+  by.x = c("species", "chromosome"),
+  by.y = c("species", "id"),
+  sort = FALSE
+)
+
+genes_chromosomes[, distance := ifelse(
+  start_position < length - end_position,
+  start_position,
+  length - end_position
+)]
+
+distances <- genes_chromosomes[, .(
+  species,
+  gene,
+  chromosome,
+  start_position,
+  end_position,
+  distance
+)]
+
+# This table will hold information on human genes.
+genes <- genes_chromosomes[
+  species == 9606,
+  .(
+    id = gene,
+    chromosome = name
+  )
+]
+
+genes[, name := gprofiler2::gconvert(
+  id,
+  target = "HGNC",
+  mthreshold = 1,
+  filter_na = FALSE
+)$target]
+
+# Previous versions of geposan used different species IDs. For backwards
+# compatibility, convert integer IDs to character.
+
+species[, id := as.character(id)]
+distances[, species := as.character(species)]
+
+usethis::use_data(species, overwrite = TRUE)
+usethis::use_data(genes, overwrite = TRUE)
+usethis::use_data(distances, overwrite = TRUE)
--- a/scripts/ensembl_genes.R
+++ b/scripts/ensembl_genes.R
@ -0,0 +1,125 @@
+# This script retrieves genome data from the Ensembl database. Run
+# `ensembl_species.R` first and keep its output files "species.csv" and
+# "chromosomes.csv".
+
+library(data.table)
+library(DBI)
+library(glue)
+
+compara_table <- "ensembl_compara_110"
+
+# This is the output table of this script:
+
+genes <- data.table(
+  species = integer(),
+  gene = character(),
+  chromosome = integer(),
+  start_position = integer(),
+  end_position = integer()
+)
+
+species <- fread("species.csv")
+chromosomes <- fread("chromosomes.csv")
+
+rlog::log_info("Connecting to Ensembl database server")
+
+db <- dbConnect(
+  RMariaDB::MariaDB(),
+  host = "ensembldb.ensembl.org",
+  port = 5306,
+  user = "anonymous"
+)
+
+rlog::log_info("Retrieving human genes")
+
+human_species_id <- 9606
+human_present_row_count <- genes[species == human_species_id, .N]
+
+if (human_present_row_count > 0) {
+  rlog::log_info(glue("Skipping. Present rows: {human_present_row_count}"))
+} else {
+  human_table <- species[id == human_species_id, table_name]
+  dbExecute(db, glue_sql("USE {`human_table`}", .con = db))
+
+  human_chromosome_ids <- chromosomes[species == human_species_id, id]
+
+  human_genes <- db |>
+    dbGetQuery(glue_sql("
+      SELECT stable_id, seq_region_id, seq_region_start, seq_region_end
+        FROM gene WHERE seq_region_id IN ({human_chromosome_ids*})")) |>
+    as.data.table() |>
+    setnames(
+      c("stable_id", "seq_region_id", "seq_region_start", "seq_region_end"),
+      c("gene", "chromosome", "start_position", "end_position")
+    )
+
+  human_genes[, species := human_species_id]
+  genes <- rbind(genes, human_genes)
+}
+
+dbExecute(db, glue_sql("USE {`compara_table`}", .con = db))
+
+for (species_id in species[id != human_species_id, id]) {
+  present_row_count <- genes[species == species_id, .N]
+  species_name <- species[id == species_id, name]
+
+  if (present_row_count > 0) {
+    rlog::log_info(glue("Skipping species {species_id} ({species_name})"))
+    rlog::log_info(glue("Present rows: {present_row_count}"))
+    next
+  }
+
+  rlog::log_info(glue(
+    "Retrieving genes for species {species_id} ({species_name})"
+  ))
+
+  table_name <- species[id == species_id, table_name]
+  chromosome_ids <- chromosomes[species == species_id, id]
+
+  species_genes <- db |>
+    dbGetQuery(glue_sql("
+      SELECT
+        human.stable_id AS gene,
+        species.seq_region_id AS chromosome,
+        species.seq_region_start AS start_position,
+        species.seq_region_end AS end_position
+      FROM
+        (
+          SELECT
+            homology_id,
+            stable_id,
+            seq_region_id,
+            seq_region_start,
+            seq_region_end
+          FROM {`table_name`}.gene
+            JOIN gene_member USING (stable_id)
+            JOIN homology_member USING (gene_member_id)
+            JOIN homology USING (homology_id)
+          WHERE seq_region_id IN ({chromosome_ids*})
+            AND homology.description IN (
+              'ortholog_one2one',
+              'ortholog_one2many',
+              'ortholog_many2many'
+            )
+        ) AS species
+        JOIN (
+          SELECT
+            homology_id,
+            stable_id
+          FROM homology_member
+            JOIN gene_member USING (gene_member_id)
+          WHERE taxon_id = {human_species_id}
+        ) AS human ON species.homology_id = human.homology_id;
+    ", .con = db)) |>
+    as.data.table()
+
+  if (nrow(species_genes) == 0) {
+    rlog::log_info("No human homologs found")
+  }
+
+  species_genes[, species := species_id]
+  genes <- rbind(genes, species_genes)
+  fwrite(genes, "genes.csv")
+}
+
+dbDisconnect(db)
--- a/scripts/ensembl_species.R
+++ b/scripts/ensembl_species.R
@ -0,0 +1,119 @@
+# This is an *interactive* script for retrieving information on species from the
+# Ensembl database. There are taxons with more than one entry in the database.
+# For each species that has already been seen, the script asks whether to keep
+# it or replace it. We recommend to choose the most generic entry in most
+# cases.
+
+library(data.table)
+library(DBI)
+library(glue)
+
+# These are the output tables of this script:
+
+species <- data.table(
+  id = integer(),
+  name = character(),
+  scientific_name = character(),
+  table_name = character()
+)
+
+chromosomes <- data.table(
+  species = integer(),
+  id = integer(),
+  name = character(),
+  length = integer()
+)
+
+rlog::log_info("Connecting to Ensembl database server")
+db <- dbConnect(
+  RMariaDB::MariaDB(),
+  host = "ensembldb.ensembl.org",
+  port = 5306,
+  user = "anonymous"
+)
+
+rlog::log_info("Retrieving list of databases")
+tables <- dbGetQuery(db, "SHOW DATABASES LIKE '%_core_110_%'")[, 1]
+
+# Populates the species and chromosomes tables using data from each species'
+# table within the Ensembl database. Species without a karyotype will be skipped
+# without adding any information to the tables.
+for (table in tables) {
+  rlog::log_info(glue("Reading species information from {table}"))
+  dbExecute(db, glue_sql("USE {`table`}", .con = db))
+
+  species_id <- db |>
+    dbGetQuery("
+      SELECT meta_value FROM meta
+        WHERE meta_key = 'species.taxonomy_id'") |>
+    as.integer()
+
+  species_name <- db |>
+    dbGetQuery("
+      SELECT meta_value FROM meta
+        WHERE meta_key = 'species.display_name'") |>
+    as.character()
+
+  species_scientific_name <- db |>
+    dbGetQuery("
+      SELECT meta_value FROM meta
+        WHERE meta_key = 'species.scientific_name'") |>
+    as.character()
+
+  rlog::log_info(glue(
+    "Found species {species_name} ({species_scientific_name})"
+  ))
+
+  if (species[id == species_id, .N] > 0) {
+    old_name <- species[id == species_id, name]
+    old_scientific_name <- species[id == species_id, scientific_name]
+    input <- readline(glue("\\
+      Taxon already present ({old_name}, {old_scientific_name}). \\
+      Replace with {species_name} ({species_scientific_name})? [y/N] "))
+
+    if (input == "y") {
+      species <- species[id != species_id]
+      chromosomes <- chromosomes[species != species_id]
+    } else {
+      next
+    }
+  }
+
+  species_chromosomes <- db |>
+    dbGetQuery(glue("
+      SELECT seq_region_id, seq_region.name, length
+      FROM seq_region
+        JOIN seq_region_attrib USING (seq_region_id)
+        JOIN attrib_type USING (attrib_type_id)
+      WHERE code = 'karyotype_rank'
+        AND NOT EXISTS
+          (SELECT * FROM seq_region_attrib AS chromosome_attrib
+            JOIN attrib_type USING (attrib_type_id)
+            WHERE chromosome_attrib.seq_region_id = seq_region.seq_region_id
+              AND code = 'sequence_location'
+              AND chromosome_attrib.value != 'nuclear_chromosome');
+    ")) |>
+    as.data.table() |>
+    setnames("seq_region_id", "id")
+
+  species_chromosomes[, species := species_id]
+
+  if (nrow(species_chromosomes) == 0) {
+    rlog::log_info("Skipping (no karyotype)")
+    next
+  }
+
+  species <- rbind(species, data.table(
+    id = species_id,
+    name = species_name,
+    scientific_name = species_scientific_name,
+    table_name = table
+  ))
+
+  chromosomes <- rbind(chromosomes, species_chromosomes)
+}
+
+dbDisconnect(db)
+
+fwrite(species, "species.csv")
+fwrite(chromosomes, "chromosomes.csv")