geposan/scripts/ensembl_species.R

119 lines
No EOL
3.4 KiB
R

# This is an *interactive* script for retrieving information on species from the
# Ensembl database. There are taxons with more than one entry in the database.
# For each species that has already been seen, the script asks whether to keep
# it or replace it. We recommend to choose the most generic entry in most
# cases.
library(data.table)
library(DBI)
library(glue)
# These are the output tables of this script:
species <- data.table(
id = integer(),
name = character(),
scientific_name = character(),
table_name = character()
)
chromosomes <- data.table(
species = integer(),
id = integer(),
name = character(),
length = integer()
)
rlog::log_info("Connecting to Ensembl database server")
db <- dbConnect(
RMariaDB::MariaDB(),
host = "ensembldb.ensembl.org",
port = 5306,
user = "anonymous"
)
rlog::log_info("Retrieving list of databases")
tables <- dbGetQuery(db, "SHOW DATABASES LIKE '%_core_110_%'")[, 1]
# Populates the species and chromosomes tables using data from each species'
# table within the Ensembl database. Species without a karyotype will be skipped
# without adding any information to the tables.
for (table in tables) {
rlog::log_info(glue("Reading species information from {table}"))
dbExecute(db, glue_sql("USE {`table`}", .con = db))
species_id <- db |>
dbGetQuery("
SELECT meta_value FROM meta
WHERE meta_key = 'species.taxonomy_id'") |>
as.integer()
species_name <- db |>
dbGetQuery("
SELECT meta_value FROM meta
WHERE meta_key = 'species.display_name'") |>
as.character()
species_scientific_name <- db |>
dbGetQuery("
SELECT meta_value FROM meta
WHERE meta_key = 'species.scientific_name'") |>
as.character()
rlog::log_info(glue(
"Found species {species_name} ({species_scientific_name})"
))
if (species[id == species_id, .N] > 0) {
old_name <- species[id == species_id, name]
old_scientific_name <- species[id == species_id, scientific_name]
input <- readline(glue("\\
Taxon already present ({old_name}, {old_scientific_name}). \\
Replace with {species_name} ({species_scientific_name})? [y/N] "))
if (input == "y") {
species <- species[id != species_id]
chromosomes <- chromosomes[species != species_id]
} else {
next
}
}
species_chromosomes <- db |>
dbGetQuery(glue("
SELECT seq_region_id, seq_region.name, length
FROM seq_region
JOIN seq_region_attrib USING (seq_region_id)
JOIN attrib_type USING (attrib_type_id)
WHERE code = 'karyotype_rank'
AND NOT EXISTS
(SELECT * FROM seq_region_attrib AS chromosome_attrib
JOIN attrib_type USING (attrib_type_id)
WHERE chromosome_attrib.seq_region_id = seq_region.seq_region_id
AND code = 'sequence_location'
AND chromosome_attrib.value != 'nuclear_chromosome');
")) |>
as.data.table() |>
setnames("seq_region_id", "id")
species_chromosomes[, species := species_id]
if (nrow(species_chromosomes) == 0) {
rlog::log_info("Skipping (no karyotype)")
next
}
species <- rbind(species, data.table(
id = species_id,
name = species_name,
scientific_name = species_scientific_name,
table_name = table
))
chromosomes <- rbind(chromosomes, species_chromosomes)
}
dbDisconnect(db)
fwrite(species, "species.csv")
fwrite(chromosomes, "chromosomes.csv")