mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 02:37:25 +01:00
Use ensembl database server instead of biomart
This commit is contained in:
parent
948cb337db
commit
795fe99003
16 changed files with 373 additions and 452 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -1,2 +1,5 @@
|
||||||
/cache
|
/cache
|
||||||
.Rproj.user
|
.Rproj.user
|
||||||
|
/chromosomes.csv
|
||||||
|
/genes.csv
|
||||||
|
/species.csv
|
||||||
|
|
|
||||||
19
R/data.R
19
R/data.R
|
|
@ -2,13 +2,26 @@
|
||||||
#'
|
#'
|
||||||
#' @format A [data.table] with the following columns:
|
#' @format A [data.table] with the following columns:
|
||||||
#' \describe{
|
#' \describe{
|
||||||
#' \item{id}{Unique species ID}
|
#' \item{id}{Unique species ID, these are NCBI taxon IDs}
|
||||||
#' \item{name}{Human readable species name}
|
#' \item{name}{Human readable species name}
|
||||||
|
#' \item{scientific_name}{Scientific name of the species}
|
||||||
|
#' \item{table_name}{Table name within the Ensembl database}
|
||||||
#' \item{n_chromosomes}{Number of chromosomes}
|
#' \item{n_chromosomes}{Number of chromosomes}
|
||||||
#' \item{median_chromosome_length}{Median length of chromosomes}
|
#' \item{median_chromosome_length}{Median length of chromosomes}
|
||||||
#' }
|
#' }
|
||||||
"species"
|
"species"
|
||||||
|
|
||||||
|
#' Information on chromosomes for each included species.
|
||||||
|
#'
|
||||||
|
#' @format A [data.table] with the following columns:
|
||||||
|
#' \describe{
|
||||||
|
#' \item{species}{Species ID}
|
||||||
|
#' \item{id}{Chromosome ID, theses are Ensembl sequence IDs}
|
||||||
|
#' \item{name}{Chromosome name}
|
||||||
|
#' \item{length}{Length in base pairs}
|
||||||
|
#' }
|
||||||
|
"chromosomes"
|
||||||
|
|
||||||
#' Information on human genes within the Ensembl database.
|
#' Information on human genes within the Ensembl database.
|
||||||
#'
|
#'
|
||||||
#' This includes only genes on the primary suggested assembly of the human
|
#' This includes only genes on the primary suggested assembly of the human
|
||||||
|
|
@ -18,7 +31,7 @@
|
||||||
#' \describe{
|
#' \describe{
|
||||||
#' \item{id}{Ensembl gene ID}
|
#' \item{id}{Ensembl gene ID}
|
||||||
#' \item{name}{The gene's HGNC name (if available)}
|
#' \item{name}{The gene's HGNC name (if available)}
|
||||||
#' \item{chrosome}{The human chromosome the gene is located on}
|
#' \item{chromosome}{The human chromosome the gene is located on}
|
||||||
#' }
|
#' }
|
||||||
"genes"
|
"genes"
|
||||||
|
|
||||||
|
|
@ -31,7 +44,7 @@
|
||||||
#' \describe{
|
#' \describe{
|
||||||
#' \item{species}{Species ID}
|
#' \item{species}{Species ID}
|
||||||
#' \item{gene}{Gene ID}
|
#' \item{gene}{Gene ID}
|
||||||
#' \item{chromosome_name}{Chromosome name from the specified species}
|
#' \item{chromosome}{Chromosome ID}
|
||||||
#' \item{start_position}{Start position in base pairs}
|
#' \item{start_position}{Start position in base pairs}
|
||||||
#' \item{end_position}{End position in base pairs}
|
#' \item{end_position}{End position in base pairs}
|
||||||
#' \item{distance}{Computed distance to nearest telomere}
|
#' \item{distance}{Computed distance to nearest telomere}
|
||||||
|
|
|
||||||
11
R/plots.R
11
R/plots.R
|
|
@ -561,13 +561,14 @@ plot_scores_by_position <- function(ranking,
|
||||||
}
|
}
|
||||||
|
|
||||||
distance_data <- if (!is.null(chromosome_name)) {
|
distance_data <- if (!is.null(chromosome_name)) {
|
||||||
chromosome_name_ <- chromosome_name
|
chromosome_id <- geposan::chromosomes[
|
||||||
geposan::distances[
|
species == "9606" & name == chromosome_name,
|
||||||
species == "hsapiens" &
|
id
|
||||||
chromosome_name == chromosome_name_
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
geposan::distances[species == "9606" & chromosome == chromosome_id]
|
||||||
} else {
|
} else {
|
||||||
geposan::distances[species == "hsapiens"]
|
geposan::distances[species == "9606"]
|
||||||
}
|
}
|
||||||
|
|
||||||
data <- merge(ranking, distance_data, by = "gene")
|
data <- merge(ranking, distance_data, by = "gene")
|
||||||
|
|
|
||||||
BIN
data/chromosomes.rda
Normal file
BIN
data/chromosomes.rda
Normal file
Binary file not shown.
Binary file not shown.
BIN
data/genes.rda
BIN
data/genes.rda
Binary file not shown.
BIN
data/species.rda
BIN
data/species.rda
Binary file not shown.
22
man/chromosomes.Rd
Normal file
22
man/chromosomes.Rd
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/data.R
|
||||||
|
\docType{data}
|
||||||
|
\name{chromosomes}
|
||||||
|
\alias{chromosomes}
|
||||||
|
\title{Information on chromosomes for each included species.}
|
||||||
|
\format{
|
||||||
|
A \link{data.table} with the following columns:
|
||||||
|
\describe{
|
||||||
|
\item{species}{Species ID}
|
||||||
|
\item{id}{Chromosome ID, theses are Ensembl sequence IDs}
|
||||||
|
\item{name}{Chromosome name}
|
||||||
|
\item{length}{Length in base pairs}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
\usage{
|
||||||
|
chromosomes
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Information on chromosomes for each included species.
|
||||||
|
}
|
||||||
|
\keyword{datasets}
|
||||||
|
|
@ -9,7 +9,7 @@ A \link{data.table} with the following columns:
|
||||||
\describe{
|
\describe{
|
||||||
\item{species}{Species ID}
|
\item{species}{Species ID}
|
||||||
\item{gene}{Gene ID}
|
\item{gene}{Gene ID}
|
||||||
\item{chromosome_name}{Chromosome name from the specified species}
|
\item{chromosome}{Chromosome ID}
|
||||||
\item{start_position}{Start position in base pairs}
|
\item{start_position}{Start position in base pairs}
|
||||||
\item{end_position}{End position in base pairs}
|
\item{end_position}{End position in base pairs}
|
||||||
\item{distance}{Computed distance to nearest telomere}
|
\item{distance}{Computed distance to nearest telomere}
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ A \link{data.table} with the following columns:
|
||||||
\describe{
|
\describe{
|
||||||
\item{id}{Ensembl gene ID}
|
\item{id}{Ensembl gene ID}
|
||||||
\item{name}{The gene's HGNC name (if available)}
|
\item{name}{The gene's HGNC name (if available)}
|
||||||
\item{chrosome}{The human chromosome the gene is located on}
|
\item{chromosome}{The human chromosome the gene is located on}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
\usage{
|
\usage{
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,10 @@
|
||||||
\format{
|
\format{
|
||||||
A \link{data.table} with the following columns:
|
A \link{data.table} with the following columns:
|
||||||
\describe{
|
\describe{
|
||||||
\item{id}{Unique species ID}
|
\item{id}{Unique species ID, these are NCBI taxon IDs}
|
||||||
\item{name}{Human readable species name}
|
\item{name}{Human readable species name}
|
||||||
|
\item{scientific_name}{Scientific name of the species}
|
||||||
|
\item{table_name}{Table name within the Ensembl database}
|
||||||
\item{n_chromosomes}{Number of chromosomes}
|
\item{n_chromosomes}{Number of chromosomes}
|
||||||
\item{median_chromosome_length}{Median length of chromosomes}
|
\item{median_chromosome_length}{Median length of chromosomes}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
library(data.table)
|
|
||||||
library(httr)
|
|
||||||
|
|
||||||
ensembl_api_url <- "https://rest.ensembl.org"
|
|
||||||
|
|
||||||
#' Perform a request to the Ensembl REST API.
|
|
||||||
ensembl_request <- function(api_path) {
|
|
||||||
content(stop_for_status(GET(
|
|
||||||
paste0(ensembl_api_url, api_path),
|
|
||||||
content_type_json()
|
|
||||||
)))
|
|
||||||
}
|
|
||||||
|
|
||||||
#' Get IDs of all available vertebrates.
|
|
||||||
get_species_ids <- function() {
|
|
||||||
species <- ensembl_request("/info/species")$species
|
|
||||||
sapply(species, function(species) species$name)
|
|
||||||
}
|
|
||||||
|
|
||||||
#' Get all chromosomes names for a species.
|
|
||||||
get_species_chromosomes <- function(species_id) {
|
|
||||||
unlist(ensembl_request(
|
|
||||||
paste0("/info/assembly/", species_id)
|
|
||||||
)$karyotype)
|
|
||||||
}
|
|
||||||
|
|
||||||
#' Get a vector of all available unqiue chromosome names.
|
|
||||||
#'
|
|
||||||
#' There are multiple names for mitochondrial DNA which have to be removed
|
|
||||||
#' manually, unfortunately.
|
|
||||||
get_all_chromosomes <- function() {
|
|
||||||
chromosomes <- sapply(get_species_ids(), get_species_chromosomes)
|
|
||||||
unique(unlist(chromosomes))
|
|
||||||
}
|
|
||||||
|
|
@ -1,407 +0,0 @@
|
||||||
library(data.table)
|
|
||||||
|
|
||||||
rlog::log_info("Connecting to Ensembl API")
|
|
||||||
|
|
||||||
# Object to access the Ensembl API.
|
|
||||||
ensembl <- biomaRt::useEnsembl("ensembl", version = 110)
|
|
||||||
|
|
||||||
# Retrieve species information.
|
|
||||||
|
|
||||||
rlog::log_info("Retrieving species information")
|
|
||||||
ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
|
|
||||||
|
|
||||||
# Filter out species ID and name from the result.
|
|
||||||
species <- ensembl_datasets[, .(
|
|
||||||
id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
|
|
||||||
name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
|
|
||||||
)]
|
|
||||||
|
|
||||||
# List of assemblies that the Ensembl Rest API advertises as chromosomes.
|
|
||||||
# Mitochondrial DNA has been manually removed. Unfortunately, species IDs from
|
|
||||||
# the Ensembl REST API don't map to dataset names in the BioMart interface.
|
|
||||||
# Because of that, we can't programatically filter chromosome names.
|
|
||||||
#
|
|
||||||
# See get_all_chromosomes()
|
|
||||||
valid_chromosome_names <- c(
|
|
||||||
"1",
|
|
||||||
"2",
|
|
||||||
"3",
|
|
||||||
"4",
|
|
||||||
"5",
|
|
||||||
"6",
|
|
||||||
"7",
|
|
||||||
"8",
|
|
||||||
"9",
|
|
||||||
"10",
|
|
||||||
"11",
|
|
||||||
"12",
|
|
||||||
"13",
|
|
||||||
"14",
|
|
||||||
"15",
|
|
||||||
"16",
|
|
||||||
"17",
|
|
||||||
"18",
|
|
||||||
"19",
|
|
||||||
"X",
|
|
||||||
"groupI",
|
|
||||||
"groupII",
|
|
||||||
"groupIII",
|
|
||||||
"groupIV",
|
|
||||||
"groupV",
|
|
||||||
"groupVI",
|
|
||||||
"groupVII",
|
|
||||||
"groupVIII",
|
|
||||||
"groupIX",
|
|
||||||
"groupX",
|
|
||||||
"groupXI",
|
|
||||||
"groupXII",
|
|
||||||
"groupXIII",
|
|
||||||
"groupXIV",
|
|
||||||
"groupXV",
|
|
||||||
"groupXVI",
|
|
||||||
"groupXVII",
|
|
||||||
"groupXVIII",
|
|
||||||
"groupXIX",
|
|
||||||
"groupXX",
|
|
||||||
"groupXXI",
|
|
||||||
"20",
|
|
||||||
"Y",
|
|
||||||
"21",
|
|
||||||
"22",
|
|
||||||
"23",
|
|
||||||
"24",
|
|
||||||
"25",
|
|
||||||
"26",
|
|
||||||
"27",
|
|
||||||
"28",
|
|
||||||
"29",
|
|
||||||
"30",
|
|
||||||
"31",
|
|
||||||
"32",
|
|
||||||
"33",
|
|
||||||
"34",
|
|
||||||
"35",
|
|
||||||
"36",
|
|
||||||
"37",
|
|
||||||
"38",
|
|
||||||
"I",
|
|
||||||
"II",
|
|
||||||
"III",
|
|
||||||
"IV",
|
|
||||||
"V",
|
|
||||||
"VI",
|
|
||||||
"VII",
|
|
||||||
"VIII",
|
|
||||||
"IX",
|
|
||||||
"XI",
|
|
||||||
"XII",
|
|
||||||
"XIII",
|
|
||||||
"XIV",
|
|
||||||
"XV",
|
|
||||||
"XVI",
|
|
||||||
"XVII",
|
|
||||||
"XVIII",
|
|
||||||
"XIX",
|
|
||||||
"XX",
|
|
||||||
"XXI",
|
|
||||||
"XXII",
|
|
||||||
"XXIII",
|
|
||||||
"XXIV",
|
|
||||||
"7a",
|
|
||||||
"7b",
|
|
||||||
"Z",
|
|
||||||
"W",
|
|
||||||
"a",
|
|
||||||
"b",
|
|
||||||
"c",
|
|
||||||
"d",
|
|
||||||
"f",
|
|
||||||
"g",
|
|
||||||
"h",
|
|
||||||
"39",
|
|
||||||
"40",
|
|
||||||
"1a",
|
|
||||||
"22a",
|
|
||||||
"sgr01",
|
|
||||||
"sgr02",
|
|
||||||
"sgr03",
|
|
||||||
"sgr04",
|
|
||||||
"sgr05",
|
|
||||||
"sgr06",
|
|
||||||
"sgr07",
|
|
||||||
"sgr08",
|
|
||||||
"sgr09",
|
|
||||||
"sgr10",
|
|
||||||
"sgr11",
|
|
||||||
"sgr12",
|
|
||||||
"sgr13",
|
|
||||||
"sgr14",
|
|
||||||
"sgr15",
|
|
||||||
"sgr16",
|
|
||||||
"sgr17",
|
|
||||||
"sgr18",
|
|
||||||
"sgr19",
|
|
||||||
"LGE64",
|
|
||||||
"2A",
|
|
||||||
"2B",
|
|
||||||
"X1",
|
|
||||||
"X2",
|
|
||||||
"X3",
|
|
||||||
"X4",
|
|
||||||
"X5",
|
|
||||||
"LG1",
|
|
||||||
"LG2",
|
|
||||||
"LG3",
|
|
||||||
"LG4",
|
|
||||||
"LG5",
|
|
||||||
"LG6",
|
|
||||||
"LG7",
|
|
||||||
"LG8",
|
|
||||||
"LG9",
|
|
||||||
"LG10",
|
|
||||||
"LG11",
|
|
||||||
"LG12",
|
|
||||||
"LG13",
|
|
||||||
"LG14",
|
|
||||||
"LG15",
|
|
||||||
"LG16",
|
|
||||||
"LG17",
|
|
||||||
"LG18",
|
|
||||||
"LG19",
|
|
||||||
"LG20",
|
|
||||||
"LG22",
|
|
||||||
"LG23",
|
|
||||||
"4A",
|
|
||||||
"1A",
|
|
||||||
"25LG1",
|
|
||||||
"25LG2",
|
|
||||||
"LGE22",
|
|
||||||
"LG21",
|
|
||||||
"A1",
|
|
||||||
"A2",
|
|
||||||
"A3",
|
|
||||||
"B1",
|
|
||||||
"B2",
|
|
||||||
"B3",
|
|
||||||
"B4",
|
|
||||||
"C1",
|
|
||||||
"C2",
|
|
||||||
"D1",
|
|
||||||
"D2",
|
|
||||||
"D3",
|
|
||||||
"D4",
|
|
||||||
"E1",
|
|
||||||
"E2",
|
|
||||||
"E3",
|
|
||||||
"F1",
|
|
||||||
"F2",
|
|
||||||
"LG34",
|
|
||||||
"LG35",
|
|
||||||
"LG24",
|
|
||||||
"LG25",
|
|
||||||
"LG26",
|
|
||||||
"LG27",
|
|
||||||
"LG28",
|
|
||||||
"LG29",
|
|
||||||
"LG30",
|
|
||||||
"MIC_1",
|
|
||||||
"MIC_10",
|
|
||||||
"MIC_11",
|
|
||||||
"MIC_2",
|
|
||||||
"MIC_3",
|
|
||||||
"MIC_4",
|
|
||||||
"MIC_5",
|
|
||||||
"MIC_6",
|
|
||||||
"MIC_7",
|
|
||||||
"MIC_8",
|
|
||||||
"MIC_9",
|
|
||||||
"2L",
|
|
||||||
"2R",
|
|
||||||
"3L",
|
|
||||||
"3R",
|
|
||||||
"LGE22C19W28_E50C23",
|
|
||||||
"LG01",
|
|
||||||
"LG02",
|
|
||||||
"LG03",
|
|
||||||
"LG04",
|
|
||||||
"LG05",
|
|
||||||
"LG06",
|
|
||||||
"LG07",
|
|
||||||
"LG08",
|
|
||||||
"LG09",
|
|
||||||
"LG7_11",
|
|
||||||
"41",
|
|
||||||
"42",
|
|
||||||
"43",
|
|
||||||
"44",
|
|
||||||
"45",
|
|
||||||
"46",
|
|
||||||
"47",
|
|
||||||
"48",
|
|
||||||
"49",
|
|
||||||
"50",
|
|
||||||
"LG28B",
|
|
||||||
"LG30F",
|
|
||||||
"LG36F",
|
|
||||||
"LG37M",
|
|
||||||
"LG42F",
|
|
||||||
"LG44F",
|
|
||||||
"LG45M",
|
|
||||||
"LG48F",
|
|
||||||
"LG49B"
|
|
||||||
)
|
|
||||||
|
|
||||||
#' Get all chromosome names for an Ensembl dataset.
|
|
||||||
#'
|
|
||||||
#' The function tries to filter out valid chromosome names from the available
|
|
||||||
#' assemblies in the dataset.
|
|
||||||
get_chromosome_names <- function(dataset) {
|
|
||||||
chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
|
|
||||||
chromosome_names[chromosome_names %chin% valid_chromosome_names]
|
|
||||||
}
|
|
||||||
|
|
||||||
# Retrieve information on human genes. This will only include genes on
|
|
||||||
# assembled chromosomes. Chromosomes are filtered using get_chromosome_names().
|
|
||||||
|
|
||||||
rlog::log_info("Retrieving information on human genes")
|
|
||||||
dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
|
||||||
|
|
||||||
human_data <- data.table(biomaRt::getBM(
|
|
||||||
attributes = c(
|
|
||||||
"ensembl_gene_id",
|
|
||||||
"hgnc_symbol",
|
|
||||||
"chromosome_name",
|
|
||||||
"start_position",
|
|
||||||
"end_position"
|
|
||||||
),
|
|
||||||
filters = "chromosome_name",
|
|
||||||
values = get_chromosome_names(dataset),
|
|
||||||
mart = dataset
|
|
||||||
))
|
|
||||||
|
|
||||||
# Remove duplicated gene IDs (at the time of writing, there are a handful).
|
|
||||||
human_data <- unique(human_data, by = "ensembl_gene_id")
|
|
||||||
|
|
||||||
# Only keep relevant information on genes.
|
|
||||||
genes <- human_data[, .(
|
|
||||||
id = ensembl_gene_id,
|
|
||||||
name = hgnc_symbol,
|
|
||||||
chromosome = chromosome_name
|
|
||||||
)]
|
|
||||||
|
|
||||||
# Retrieve gene distance data across species.
|
|
||||||
|
|
||||||
rlog::log_info("Retrieving distance data")
|
|
||||||
distances <- data.table()
|
|
||||||
|
|
||||||
#' Handle data for one species.
|
|
||||||
handle_species <- function(species_id, species_data) {
|
|
||||||
chromosomes <- species_data[,
|
|
||||||
.(chromosome_length = max(end_position)),
|
|
||||||
by = chromosome_name
|
|
||||||
]
|
|
||||||
|
|
||||||
# Store the number of chromosomes in the species table.
|
|
||||||
species[id == species_id, n_chromosomes := nrow(chromosomes)]
|
|
||||||
|
|
||||||
# Store the median chromosome length in the species table.
|
|
||||||
species[
|
|
||||||
id == species_id,
|
|
||||||
median_chromosome_length := chromosomes[, median(chromosome_length)]
|
|
||||||
]
|
|
||||||
|
|
||||||
# Precompute the genes' distance to the nearest telomere.
|
|
||||||
species_distances <- species_data[
|
|
||||||
chromosomes,
|
|
||||||
.(
|
|
||||||
species = species_id,
|
|
||||||
gene = ensembl_gene_id,
|
|
||||||
chromosome_name = chromosome_name,
|
|
||||||
start_position = start_position,
|
|
||||||
end_position = end_position,
|
|
||||||
distance = pmin(
|
|
||||||
start_position,
|
|
||||||
chromosome_length - end_position
|
|
||||||
)
|
|
||||||
),
|
|
||||||
on = "chromosome_name"
|
|
||||||
]
|
|
||||||
|
|
||||||
# Add species distances to the distances table.
|
|
||||||
distances <<- rbindlist(list(distances, species_distances))
|
|
||||||
}
|
|
||||||
|
|
||||||
# Handle the human first, as we already retrieved the data and don't need to
|
|
||||||
# filter based on orthologies.
|
|
||||||
handle_species("hsapiens", human_data)
|
|
||||||
|
|
||||||
# Iterate through all other species and retrieve their distance data.
|
|
||||||
for (species_id in species[id != "hsapiens", id]) {
|
|
||||||
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
|
|
||||||
|
|
||||||
dataset <- biomaRt::useDataset(
|
|
||||||
sprintf("%s_gene_ensembl", species_id),
|
|
||||||
mart = ensembl
|
|
||||||
)
|
|
||||||
|
|
||||||
# Besides the attributes that are always present, we need to check for
|
|
||||||
# human orthologs. Some species don't have that information and will be
|
|
||||||
# skipped.
|
|
||||||
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
|
||||||
biomaRt::listAttributes(dataset, what = "name")) {
|
|
||||||
rlog::log_info("No data on human orthologs")
|
|
||||||
species <- species[id != species_id]
|
|
||||||
|
|
||||||
next
|
|
||||||
}
|
|
||||||
|
|
||||||
chromosome_names <- get_chromosome_names(dataset)
|
|
||||||
|
|
||||||
# Skip the species, if there are no assembled chromosomes.
|
|
||||||
if (length(chromosome_names) <= 0) {
|
|
||||||
rlog::log_info("No matching chromosome assemblies")
|
|
||||||
species <- species[id != species_id]
|
|
||||||
|
|
||||||
next
|
|
||||||
}
|
|
||||||
|
|
||||||
# Retrieve information on all genes of the current species, that have
|
|
||||||
# human orthologs. This is called "homolog" in the Ensembl schema.
|
|
||||||
species_distances <- data.table(biomaRt::getBM(
|
|
||||||
attributes = c(
|
|
||||||
"hsapiens_homolog_ensembl_gene",
|
|
||||||
"chromosome_name",
|
|
||||||
"start_position",
|
|
||||||
"end_position"
|
|
||||||
),
|
|
||||||
filters = c("with_hsapiens_homolog", "chromosome_name"),
|
|
||||||
values = list(TRUE, chromosome_names),
|
|
||||||
mart = dataset
|
|
||||||
))
|
|
||||||
|
|
||||||
# Only include human genes that we have information on.
|
|
||||||
species_distances <- species_distances[
|
|
||||||
hsapiens_homolog_ensembl_gene %chin% genes$id
|
|
||||||
]
|
|
||||||
|
|
||||||
# Only include one ortholog per human gene.
|
|
||||||
species_distances <- unique(
|
|
||||||
species_distances,
|
|
||||||
by = "hsapiens_homolog_ensembl_gene"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Rename gene ID column to match the human data.
|
|
||||||
setnames(
|
|
||||||
species_distances,
|
|
||||||
"hsapiens_homolog_ensembl_gene",
|
|
||||||
"ensembl_gene_id"
|
|
||||||
)
|
|
||||||
|
|
||||||
handle_species(species_id, species_distances)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Save data in the appropriate place.
|
|
||||||
|
|
||||||
usethis::use_data(species, overwrite = TRUE)
|
|
||||||
usethis::use_data(genes, overwrite = TRUE)
|
|
||||||
usethis::use_data(distances, overwrite = TRUE)
|
|
||||||
77
scripts/ensembl_data.R
Normal file
77
scripts/ensembl_data.R
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
# This script does post processing on the data from Ensembl and imports it into
|
||||||
|
# the R package. Run this script after `ensembl_species.R` and
|
||||||
|
# `ensembl_species.R`.
|
||||||
|
|
||||||
|
library(data.table)
|
||||||
|
|
||||||
|
species <- fread("species.csv")
|
||||||
|
chromosomes <- fread("chromosomes.csv")
|
||||||
|
genes <- fread("genes.csv")
|
||||||
|
|
||||||
|
species_metadata <- chromosomes[,
|
||||||
|
.(
|
||||||
|
n_chromosomes = .N,
|
||||||
|
median_chromosome_length = as.double(stats::median(length))
|
||||||
|
),
|
||||||
|
by = species
|
||||||
|
]
|
||||||
|
|
||||||
|
species <- merge(
|
||||||
|
species,
|
||||||
|
species_metadata,
|
||||||
|
by.x = "id",
|
||||||
|
by.y = "species",
|
||||||
|
sort = FALSE
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove duplicated genes within species.
|
||||||
|
genes <- genes[!duplicated(genes, by = c("species", "gene"))]
|
||||||
|
|
||||||
|
genes_chromosomes <- merge(
|
||||||
|
genes,
|
||||||
|
chromosomes,
|
||||||
|
by.x = c("species", "chromosome"),
|
||||||
|
by.y = c("species", "id"),
|
||||||
|
sort = FALSE
|
||||||
|
)
|
||||||
|
|
||||||
|
genes_chromosomes[, distance := ifelse(
|
||||||
|
start_position < length - end_position,
|
||||||
|
start_position,
|
||||||
|
length - end_position
|
||||||
|
)]
|
||||||
|
|
||||||
|
distances <- genes_chromosomes[, .(
|
||||||
|
species,
|
||||||
|
gene,
|
||||||
|
chromosome,
|
||||||
|
start_position,
|
||||||
|
end_position,
|
||||||
|
distance
|
||||||
|
)]
|
||||||
|
|
||||||
|
# This table will hold information on human genes.
|
||||||
|
genes <- genes_chromosomes[
|
||||||
|
species == 9606,
|
||||||
|
.(
|
||||||
|
id = gene,
|
||||||
|
chromosome = name
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
genes[, name := gprofiler2::gconvert(
|
||||||
|
id,
|
||||||
|
target = "HGNC",
|
||||||
|
mthreshold = 1,
|
||||||
|
filter_na = FALSE
|
||||||
|
)$target]
|
||||||
|
|
||||||
|
# Previous versions of geposan used different species IDs. For backwards
|
||||||
|
# compatibility, convert integer IDs to character.
|
||||||
|
|
||||||
|
species[, id := as.character(id)]
|
||||||
|
distances[, species := as.character(species)]
|
||||||
|
|
||||||
|
usethis::use_data(species, overwrite = TRUE)
|
||||||
|
usethis::use_data(genes, overwrite = TRUE)
|
||||||
|
usethis::use_data(distances, overwrite = TRUE)
|
||||||
125
scripts/ensembl_genes.R
Normal file
125
scripts/ensembl_genes.R
Normal file
|
|
@ -0,0 +1,125 @@
|
||||||
|
# This script retrieves genome data from the Ensembl database. Run
|
||||||
|
# `ensembl_species.R` first and keep its output files "species.csv" and
|
||||||
|
# "chromosomes.csv".
|
||||||
|
|
||||||
|
library(data.table)
|
||||||
|
library(DBI)
|
||||||
|
library(glue)
|
||||||
|
|
||||||
|
compara_table <- "ensembl_compara_110"
|
||||||
|
|
||||||
|
# This is the output table of this script:
|
||||||
|
|
||||||
|
genes <- data.table(
|
||||||
|
species = integer(),
|
||||||
|
gene = character(),
|
||||||
|
chromosome = integer(),
|
||||||
|
start_position = integer(),
|
||||||
|
end_position = integer()
|
||||||
|
)
|
||||||
|
|
||||||
|
species <- fread("species.csv")
|
||||||
|
chromosomes <- fread("chromosomes.csv")
|
||||||
|
|
||||||
|
rlog::log_info("Connecting to Ensembl database server")
|
||||||
|
|
||||||
|
db <- dbConnect(
|
||||||
|
RMariaDB::MariaDB(),
|
||||||
|
host = "ensembldb.ensembl.org",
|
||||||
|
port = 5306,
|
||||||
|
user = "anonymous"
|
||||||
|
)
|
||||||
|
|
||||||
|
rlog::log_info("Retrieving human genes")
|
||||||
|
|
||||||
|
human_species_id <- 9606
|
||||||
|
human_present_row_count <- genes[species == human_species_id, .N]
|
||||||
|
|
||||||
|
if (human_present_row_count > 0) {
|
||||||
|
rlog::log_info(glue("Skipping. Present rows: {human_present_row_count}"))
|
||||||
|
} else {
|
||||||
|
human_table <- species[id == human_species_id, table_name]
|
||||||
|
dbExecute(db, glue_sql("USE {`human_table`}", .con = db))
|
||||||
|
|
||||||
|
human_chromosome_ids <- chromosomes[species == human_species_id, id]
|
||||||
|
|
||||||
|
human_genes <- db |>
|
||||||
|
dbGetQuery(glue_sql("
|
||||||
|
SELECT stable_id, seq_region_id, seq_region_start, seq_region_end
|
||||||
|
FROM gene WHERE seq_region_id IN ({human_chromosome_ids*})")) |>
|
||||||
|
as.data.table() |>
|
||||||
|
setnames(
|
||||||
|
c("stable_id", "seq_region_id", "seq_region_start", "seq_region_end"),
|
||||||
|
c("gene", "chromosome", "start_position", "end_position")
|
||||||
|
)
|
||||||
|
|
||||||
|
human_genes[, species := human_species_id]
|
||||||
|
genes <- rbind(genes, human_genes)
|
||||||
|
}
|
||||||
|
|
||||||
|
dbExecute(db, glue_sql("USE {`compara_table`}", .con = db))
|
||||||
|
|
||||||
|
for (species_id in species[id != human_species_id, id]) {
|
||||||
|
present_row_count <- genes[species == species_id, .N]
|
||||||
|
species_name <- species[id == species_id, name]
|
||||||
|
|
||||||
|
if (present_row_count > 0) {
|
||||||
|
rlog::log_info(glue("Skipping species {species_id} ({species_name})"))
|
||||||
|
rlog::log_info(glue("Present rows: {present_row_count}"))
|
||||||
|
next
|
||||||
|
}
|
||||||
|
|
||||||
|
rlog::log_info(glue(
|
||||||
|
"Retrieving genes for species {species_id} ({species_name})"
|
||||||
|
))
|
||||||
|
|
||||||
|
table_name <- species[id == species_id, table_name]
|
||||||
|
chromosome_ids <- chromosomes[species == species_id, id]
|
||||||
|
|
||||||
|
species_genes <- db |>
|
||||||
|
dbGetQuery(glue_sql("
|
||||||
|
SELECT
|
||||||
|
human.stable_id AS gene,
|
||||||
|
species.seq_region_id AS chromosome,
|
||||||
|
species.seq_region_start AS start_position,
|
||||||
|
species.seq_region_end AS end_position
|
||||||
|
FROM
|
||||||
|
(
|
||||||
|
SELECT
|
||||||
|
homology_id,
|
||||||
|
stable_id,
|
||||||
|
seq_region_id,
|
||||||
|
seq_region_start,
|
||||||
|
seq_region_end
|
||||||
|
FROM {`table_name`}.gene
|
||||||
|
JOIN gene_member USING (stable_id)
|
||||||
|
JOIN homology_member USING (gene_member_id)
|
||||||
|
JOIN homology USING (homology_id)
|
||||||
|
WHERE seq_region_id IN ({chromosome_ids*})
|
||||||
|
AND homology.description IN (
|
||||||
|
'ortholog_one2one',
|
||||||
|
'ortholog_one2many',
|
||||||
|
'ortholog_many2many'
|
||||||
|
)
|
||||||
|
) AS species
|
||||||
|
JOIN (
|
||||||
|
SELECT
|
||||||
|
homology_id,
|
||||||
|
stable_id
|
||||||
|
FROM homology_member
|
||||||
|
JOIN gene_member USING (gene_member_id)
|
||||||
|
WHERE taxon_id = {human_species_id}
|
||||||
|
) AS human ON species.homology_id = human.homology_id;
|
||||||
|
", .con = db)) |>
|
||||||
|
as.data.table()
|
||||||
|
|
||||||
|
if (nrow(species_genes) == 0) {
|
||||||
|
rlog::log_info("No human homologs found")
|
||||||
|
}
|
||||||
|
|
||||||
|
species_genes[, species := species_id]
|
||||||
|
genes <- rbind(genes, species_genes)
|
||||||
|
fwrite(genes, "genes.csv")
|
||||||
|
}
|
||||||
|
|
||||||
|
dbDisconnect(db)
|
||||||
119
scripts/ensembl_species.R
Normal file
119
scripts/ensembl_species.R
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
# This is an *interactive* script for retrieving information on species from the
|
||||||
|
# Ensembl database. There are taxons with more than one entry in the database.
|
||||||
|
# For each species that has already been seen, the script asks whether to keep
|
||||||
|
# it or replace it. We recommend to choose the most generic entry in most
|
||||||
|
# cases.
|
||||||
|
|
||||||
|
library(data.table)
|
||||||
|
library(DBI)
|
||||||
|
library(glue)
|
||||||
|
|
||||||
|
# These are the output tables of this script:
|
||||||
|
|
||||||
|
species <- data.table(
|
||||||
|
id = integer(),
|
||||||
|
name = character(),
|
||||||
|
scientific_name = character(),
|
||||||
|
table_name = character()
|
||||||
|
)
|
||||||
|
|
||||||
|
chromosomes <- data.table(
|
||||||
|
species = integer(),
|
||||||
|
id = integer(),
|
||||||
|
name = character(),
|
||||||
|
length = integer()
|
||||||
|
)
|
||||||
|
|
||||||
|
rlog::log_info("Connecting to Ensembl database server")
|
||||||
|
db <- dbConnect(
|
||||||
|
RMariaDB::MariaDB(),
|
||||||
|
host = "ensembldb.ensembl.org",
|
||||||
|
port = 5306,
|
||||||
|
user = "anonymous"
|
||||||
|
)
|
||||||
|
|
||||||
|
rlog::log_info("Retrieving list of databases")
|
||||||
|
tables <- dbGetQuery(db, "SHOW DATABASES LIKE '%_core_110_%'")[, 1]
|
||||||
|
|
||||||
|
# Populates the species and chromosomes tables using data from each species'
|
||||||
|
# table within the Ensembl database. Species without a karyotype will be skipped
|
||||||
|
# without adding any information to the tables.
|
||||||
|
for (table in tables) {
|
||||||
|
rlog::log_info(glue("Reading species information from {table}"))
|
||||||
|
dbExecute(db, glue_sql("USE {`table`}", .con = db))
|
||||||
|
|
||||||
|
species_id <- db |>
|
||||||
|
dbGetQuery("
|
||||||
|
SELECT meta_value FROM meta
|
||||||
|
WHERE meta_key = 'species.taxonomy_id'") |>
|
||||||
|
as.integer()
|
||||||
|
|
||||||
|
species_name <- db |>
|
||||||
|
dbGetQuery("
|
||||||
|
SELECT meta_value FROM meta
|
||||||
|
WHERE meta_key = 'species.display_name'") |>
|
||||||
|
as.character()
|
||||||
|
|
||||||
|
species_scientific_name <- db |>
|
||||||
|
dbGetQuery("
|
||||||
|
SELECT meta_value FROM meta
|
||||||
|
WHERE meta_key = 'species.scientific_name'") |>
|
||||||
|
as.character()
|
||||||
|
|
||||||
|
rlog::log_info(glue(
|
||||||
|
"Found species {species_name} ({species_scientific_name})"
|
||||||
|
))
|
||||||
|
|
||||||
|
if (species[id == species_id, .N] > 0) {
|
||||||
|
old_name <- species[id == species_id, name]
|
||||||
|
old_scientific_name <- species[id == species_id, scientific_name]
|
||||||
|
input <- readline(glue("\\
|
||||||
|
Taxon already present ({old_name}, {old_scientific_name}). \\
|
||||||
|
Replace with {species_name} ({species_scientific_name})? [y/N] "))
|
||||||
|
|
||||||
|
if (input == "y") {
|
||||||
|
species <- species[id != species_id]
|
||||||
|
chromosomes <- chromosomes[species != species_id]
|
||||||
|
} else {
|
||||||
|
next
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
species_chromosomes <- db |>
|
||||||
|
dbGetQuery(glue("
|
||||||
|
SELECT seq_region_id, seq_region.name, length
|
||||||
|
FROM seq_region
|
||||||
|
JOIN seq_region_attrib USING (seq_region_id)
|
||||||
|
JOIN attrib_type USING (attrib_type_id)
|
||||||
|
WHERE code = 'karyotype_rank'
|
||||||
|
AND NOT EXISTS
|
||||||
|
(SELECT * FROM seq_region_attrib AS chromosome_attrib
|
||||||
|
JOIN attrib_type USING (attrib_type_id)
|
||||||
|
WHERE chromosome_attrib.seq_region_id = seq_region.seq_region_id
|
||||||
|
AND code = 'sequence_location'
|
||||||
|
AND chromosome_attrib.value != 'nuclear_chromosome');
|
||||||
|
")) |>
|
||||||
|
as.data.table() |>
|
||||||
|
setnames("seq_region_id", "id")
|
||||||
|
|
||||||
|
species_chromosomes[, species := species_id]
|
||||||
|
|
||||||
|
if (nrow(species_chromosomes) == 0) {
|
||||||
|
rlog::log_info("Skipping (no karyotype)")
|
||||||
|
next
|
||||||
|
}
|
||||||
|
|
||||||
|
species <- rbind(species, data.table(
|
||||||
|
id = species_id,
|
||||||
|
name = species_name,
|
||||||
|
scientific_name = species_scientific_name,
|
||||||
|
table_name = table
|
||||||
|
))
|
||||||
|
|
||||||
|
chromosomes <- rbind(chromosomes, species_chromosomes)
|
||||||
|
}
|
||||||
|
|
||||||
|
dbDisconnect(db)
|
||||||
|
|
||||||
|
fwrite(species, "species.csv")
|
||||||
|
fwrite(chromosomes, "chromosomes.csv")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue