mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 10:47:25 +01:00
Use ensembl database server instead of biomart
This commit is contained in:
parent
948cb337db
commit
795fe99003
16 changed files with 373 additions and 452 deletions
125
scripts/ensembl_genes.R
Normal file
125
scripts/ensembl_genes.R
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
# This script retrieves genome data from the Ensembl database. Run
|
||||
# `ensembl_species.R` first and keep its output files "species.csv" and
|
||||
# "chromosomes.csv".
|
||||
|
||||
library(data.table)
|
||||
library(DBI)
|
||||
library(glue)
|
||||
|
||||
compara_table <- "ensembl_compara_110"
|
||||
|
||||
# This is the output table of this script:
|
||||
|
||||
genes <- data.table(
|
||||
species = integer(),
|
||||
gene = character(),
|
||||
chromosome = integer(),
|
||||
start_position = integer(),
|
||||
end_position = integer()
|
||||
)
|
||||
|
||||
species <- fread("species.csv")
|
||||
chromosomes <- fread("chromosomes.csv")
|
||||
|
||||
rlog::log_info("Connecting to Ensembl database server")
|
||||
|
||||
db <- dbConnect(
|
||||
RMariaDB::MariaDB(),
|
||||
host = "ensembldb.ensembl.org",
|
||||
port = 5306,
|
||||
user = "anonymous"
|
||||
)
|
||||
|
||||
rlog::log_info("Retrieving human genes")
|
||||
|
||||
human_species_id <- 9606
|
||||
human_present_row_count <- genes[species == human_species_id, .N]
|
||||
|
||||
if (human_present_row_count > 0) {
|
||||
rlog::log_info(glue("Skipping. Present rows: {human_present_row_count}"))
|
||||
} else {
|
||||
human_table <- species[id == human_species_id, table_name]
|
||||
dbExecute(db, glue_sql("USE {`human_table`}", .con = db))
|
||||
|
||||
human_chromosome_ids <- chromosomes[species == human_species_id, id]
|
||||
|
||||
human_genes <- db |>
|
||||
dbGetQuery(glue_sql("
|
||||
SELECT stable_id, seq_region_id, seq_region_start, seq_region_end
|
||||
FROM gene WHERE seq_region_id IN ({human_chromosome_ids*})")) |>
|
||||
as.data.table() |>
|
||||
setnames(
|
||||
c("stable_id", "seq_region_id", "seq_region_start", "seq_region_end"),
|
||||
c("gene", "chromosome", "start_position", "end_position")
|
||||
)
|
||||
|
||||
human_genes[, species := human_species_id]
|
||||
genes <- rbind(genes, human_genes)
|
||||
}
|
||||
|
||||
dbExecute(db, glue_sql("USE {`compara_table`}", .con = db))
|
||||
|
||||
for (species_id in species[id != human_species_id, id]) {
|
||||
present_row_count <- genes[species == species_id, .N]
|
||||
species_name <- species[id == species_id, name]
|
||||
|
||||
if (present_row_count > 0) {
|
||||
rlog::log_info(glue("Skipping species {species_id} ({species_name})"))
|
||||
rlog::log_info(glue("Present rows: {present_row_count}"))
|
||||
next
|
||||
}
|
||||
|
||||
rlog::log_info(glue(
|
||||
"Retrieving genes for species {species_id} ({species_name})"
|
||||
))
|
||||
|
||||
table_name <- species[id == species_id, table_name]
|
||||
chromosome_ids <- chromosomes[species == species_id, id]
|
||||
|
||||
species_genes <- db |>
|
||||
dbGetQuery(glue_sql("
|
||||
SELECT
|
||||
human.stable_id AS gene,
|
||||
species.seq_region_id AS chromosome,
|
||||
species.seq_region_start AS start_position,
|
||||
species.seq_region_end AS end_position
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
homology_id,
|
||||
stable_id,
|
||||
seq_region_id,
|
||||
seq_region_start,
|
||||
seq_region_end
|
||||
FROM {`table_name`}.gene
|
||||
JOIN gene_member USING (stable_id)
|
||||
JOIN homology_member USING (gene_member_id)
|
||||
JOIN homology USING (homology_id)
|
||||
WHERE seq_region_id IN ({chromosome_ids*})
|
||||
AND homology.description IN (
|
||||
'ortholog_one2one',
|
||||
'ortholog_one2many',
|
||||
'ortholog_many2many'
|
||||
)
|
||||
) AS species
|
||||
JOIN (
|
||||
SELECT
|
||||
homology_id,
|
||||
stable_id
|
||||
FROM homology_member
|
||||
JOIN gene_member USING (gene_member_id)
|
||||
WHERE taxon_id = {human_species_id}
|
||||
) AS human ON species.homology_id = human.homology_id;
|
||||
", .con = db)) |>
|
||||
as.data.table()
|
||||
|
||||
if (nrow(species_genes) == 0) {
|
||||
rlog::log_info("No human homologs found")
|
||||
}
|
||||
|
||||
species_genes[, species := species_id]
|
||||
genes <- rbind(genes, species_genes)
|
||||
fwrite(genes, "genes.csv")
|
||||
}
|
||||
|
||||
dbDisconnect(db)
|
||||
Loading…
Add table
Add a link
Reference in a new issue