mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 02:37:25 +01:00
125 lines
No EOL
3.5 KiB
R
125 lines
No EOL
3.5 KiB
R
# This script retrieves genome data from the Ensembl database. Run
|
|
# `ensembl_species.R` first and keep its output files "species.csv" and
|
|
# "chromosomes.csv".
|
|
|
|
library(data.table)
|
|
library(DBI)
|
|
library(glue)
|
|
|
|
compara_table <- "ensembl_compara_110"
|
|
|
|
# This is the output table of this script:
|
|
|
|
genes <- data.table(
|
|
species = integer(),
|
|
gene = character(),
|
|
chromosome = integer(),
|
|
start_position = integer(),
|
|
end_position = integer()
|
|
)
|
|
|
|
species <- fread("species.csv")
|
|
chromosomes <- fread("chromosomes.csv")
|
|
|
|
rlog::log_info("Connecting to Ensembl database server")
|
|
|
|
db <- dbConnect(
|
|
RMariaDB::MariaDB(),
|
|
host = "ensembldb.ensembl.org",
|
|
port = 5306,
|
|
user = "anonymous"
|
|
)
|
|
|
|
rlog::log_info("Retrieving human genes")
|
|
|
|
human_species_id <- 9606
|
|
human_present_row_count <- genes[species == human_species_id, .N]
|
|
|
|
if (human_present_row_count > 0) {
|
|
rlog::log_info(glue("Skipping. Present rows: {human_present_row_count}"))
|
|
} else {
|
|
human_table <- species[id == human_species_id, table_name]
|
|
dbExecute(db, glue_sql("USE {`human_table`}", .con = db))
|
|
|
|
human_chromosome_ids <- chromosomes[species == human_species_id, id]
|
|
|
|
human_genes <- db |>
|
|
dbGetQuery(glue_sql("
|
|
SELECT stable_id, seq_region_id, seq_region_start, seq_region_end
|
|
FROM gene WHERE seq_region_id IN ({human_chromosome_ids*})")) |>
|
|
as.data.table() |>
|
|
setnames(
|
|
c("stable_id", "seq_region_id", "seq_region_start", "seq_region_end"),
|
|
c("gene", "chromosome", "start_position", "end_position")
|
|
)
|
|
|
|
human_genes[, species := human_species_id]
|
|
genes <- rbind(genes, human_genes)
|
|
}
|
|
|
|
dbExecute(db, glue_sql("USE {`compara_table`}", .con = db))
|
|
|
|
for (species_id in species[id != human_species_id, id]) {
|
|
present_row_count <- genes[species == species_id, .N]
|
|
species_name <- species[id == species_id, name]
|
|
|
|
if (present_row_count > 0) {
|
|
rlog::log_info(glue("Skipping species {species_id} ({species_name})"))
|
|
rlog::log_info(glue("Present rows: {present_row_count}"))
|
|
next
|
|
}
|
|
|
|
rlog::log_info(glue(
|
|
"Retrieving genes for species {species_id} ({species_name})"
|
|
))
|
|
|
|
table_name <- species[id == species_id, table_name]
|
|
chromosome_ids <- chromosomes[species == species_id, id]
|
|
|
|
species_genes <- db |>
|
|
dbGetQuery(glue_sql("
|
|
SELECT
|
|
human.stable_id AS gene,
|
|
species.seq_region_id AS chromosome,
|
|
species.seq_region_start AS start_position,
|
|
species.seq_region_end AS end_position
|
|
FROM
|
|
(
|
|
SELECT
|
|
homology_id,
|
|
stable_id,
|
|
seq_region_id,
|
|
seq_region_start,
|
|
seq_region_end
|
|
FROM {`table_name`}.gene
|
|
JOIN gene_member USING (stable_id)
|
|
JOIN homology_member USING (gene_member_id)
|
|
JOIN homology USING (homology_id)
|
|
WHERE seq_region_id IN ({chromosome_ids*})
|
|
AND homology.description IN (
|
|
'ortholog_one2one',
|
|
'ortholog_one2many',
|
|
'ortholog_many2many'
|
|
)
|
|
) AS species
|
|
JOIN (
|
|
SELECT
|
|
homology_id,
|
|
stable_id
|
|
FROM homology_member
|
|
JOIN gene_member USING (gene_member_id)
|
|
WHERE taxon_id = {human_species_id}
|
|
) AS human ON species.homology_id = human.homology_id;
|
|
", .con = db)) |>
|
|
as.data.table()
|
|
|
|
if (nrow(species_genes) == 0) {
|
|
rlog::log_info("No human homologs found")
|
|
}
|
|
|
|
species_genes[, species := species_id]
|
|
genes <- rbind(genes, species_genes)
|
|
fwrite(genes, "genes.csv")
|
|
}
|
|
|
|
dbDisconnect(db) |