geposan/scripts/ensembl_species.R

# This is an *interactive* script for retrieving information on species from the
# Ensembl database. There are taxons with more than one entry in the database.
# For each species that has already been seen, the script asks whether to keep
# it or replace it. We recommend to choose the most generic entry in most
# cases.

library(data.table)
library(DBI)
library(glue)

# These are the output tables of this script:

species <- data.table(
  id = integer(),
  name = character(),
  scientific_name = character(),
  table_name = character()
)

chromosomes <- data.table(
  species = integer(),
  id = integer(),
  name = character(),
  length = integer()
)

rlog::log_info("Connecting to Ensembl database server")
db <- dbConnect(
  RMariaDB::MariaDB(),
  host = "ensembldb.ensembl.org",
  port = 5306,
  user = "anonymous"
)

rlog::log_info("Retrieving list of databases")
tables <- dbGetQuery(db, "SHOW DATABASES LIKE '%_core_110_%'")[, 1]

# Populates the species and chromosomes tables using data from each species'
# table within the Ensembl database. Species without a karyotype will be skipped
# without adding any information to the tables.
for (table in tables) {
  rlog::log_info(glue("Reading species information from {table}"))
  dbExecute(db, glue_sql("USE {`table`}", .con = db))

  species_id <- db |>
    dbGetQuery("
      SELECT meta_value FROM meta
        WHERE meta_key = 'species.taxonomy_id'") |>
    as.integer()

  species_name <- db |>
    dbGetQuery("
      SELECT meta_value FROM meta
        WHERE meta_key = 'species.display_name'") |>
    as.character()

  species_scientific_name <- db |>
    dbGetQuery("
      SELECT meta_value FROM meta
        WHERE meta_key = 'species.scientific_name'") |>
    as.character()

  rlog::log_info(glue(
    "Found species {species_name} ({species_scientific_name})"
  ))

  if (species[id == species_id, .N] > 0) {
    old_name <- species[id == species_id, name]
    old_scientific_name <- species[id == species_id, scientific_name]
    input <- readline(glue("\\
      Taxon already present ({old_name}, {old_scientific_name}). \\
      Replace with {species_name} ({species_scientific_name})? [y/N] "))

    if (input == "y") {
      species <- species[id != species_id]
      chromosomes <- chromosomes[species != species_id]
    } else {
      next
    }
  }

  species_chromosomes <- db |>
    dbGetQuery(glue("
      SELECT seq_region_id, seq_region.name, length
      FROM seq_region
        JOIN seq_region_attrib USING (seq_region_id)
        JOIN attrib_type USING (attrib_type_id)
      WHERE code = 'karyotype_rank'
        AND NOT EXISTS
          (SELECT * FROM seq_region_attrib AS chromosome_attrib
            JOIN attrib_type USING (attrib_type_id)
            WHERE chromosome_attrib.seq_region_id = seq_region.seq_region_id
              AND code = 'sequence_location'
              AND chromosome_attrib.value != 'nuclear_chromosome');
    ")) |>
    as.data.table() |>
    setnames("seq_region_id", "id")

  species_chromosomes[, species := species_id]

  if (nrow(species_chromosomes) == 0) {
    rlog::log_info("Skipping (no karyotype)")
    next
  }

  species <- rbind(species, data.table(
    id = species_id,
    name = species_name,
    scientific_name = species_scientific_name,
    table_name = table
  ))

  chromosomes <- rbind(chromosomes, species_chromosomes)
}

dbDisconnect(db)

fwrite(species, "species.csv")
fwrite(chromosomes, "chromosomes.csv")