mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 18:57:25 +01:00
data: Include more chromosomes
This commit is contained in:
parent
df6e23d219
commit
6494ae8200
5 changed files with 27 additions and 8 deletions
5
R/data.R
5
R/data.R
|
|
@ -1,6 +1,6 @@
|
||||||
#' Information on included species from the Ensembl database.
|
#' Information on included species from the Ensembl database.
|
||||||
#'
|
#'
|
||||||
#' @format A [data.table] with 91 rows and 2 variables:
|
#' @format A [data.table] with 99 rows and 2 variables:
|
||||||
#' \describe{
|
#' \describe{
|
||||||
#' \item{id}{Unique species ID}
|
#' \item{id}{Unique species ID}
|
||||||
#' \item{name}{Human readable species name}
|
#' \item{name}{Human readable species name}
|
||||||
|
|
@ -25,10 +25,11 @@
|
||||||
#' This dataset contains each known value for a gene's distance to the telomeres
|
#' This dataset contains each known value for a gene's distance to the telomeres
|
||||||
#' per species. The data is sourced from Ensembl.
|
#' per species. The data is sourced from Ensembl.
|
||||||
#'
|
#'
|
||||||
#' @format A [data.table] with 1390730 rows and 3 variables:
|
#' @format A [data.table] with 1506182 rows and 4 variables:
|
||||||
#' \describe{
|
#' \describe{
|
||||||
#' \item{species}{Species ID}
|
#' \item{species}{Species ID}
|
||||||
#' \item{gene}{Gene ID}
|
#' \item{gene}{Gene ID}
|
||||||
|
#' \item{position}{Gene start position}
|
||||||
#' \item{distance}{Distance to nearest telomere}
|
#' \item{distance}{Distance to nearest telomere}
|
||||||
#' }
|
#' }
|
||||||
"distances"
|
"distances"
|
||||||
|
|
|
||||||
Binary file not shown.
BIN
data/genes.rda
BIN
data/genes.rda
Binary file not shown.
BIN
data/species.rda
BIN
data/species.rda
Binary file not shown.
|
|
@ -2,8 +2,9 @@ library(data.table)
|
||||||
|
|
||||||
rlog::log_info("Connecting to Ensembl API")
|
rlog::log_info("Connecting to Ensembl API")
|
||||||
|
|
||||||
#' Object to access the Ensembl API.
|
# Object to access the Ensembl API. We use the US east mirror to circumvent
|
||||||
ensembl <- biomaRt::useEnsembl("ensembl")
|
# current issues with the main server being temporarily unreliable.
|
||||||
|
ensembl <- biomaRt::useEnsembl("ensembl", host = "useast.ensembl.org")
|
||||||
|
|
||||||
# Retrieve species information.
|
# Retrieve species information.
|
||||||
|
|
||||||
|
|
@ -18,11 +19,22 @@ species <- ensembl_datasets[, .(
|
||||||
|
|
||||||
#' Get all chromosome names for an Ensembl dataset.
|
#' Get all chromosome names for an Ensembl dataset.
|
||||||
#'
|
#'
|
||||||
#' Valid chromosome names include decimal numbers as well as typical sex
|
#' The following chromosome naming schemes will be recognized and have been
|
||||||
#' chromosome names (X, Y, W and Z).
|
#' sourced from Ensembl by manually screening chromosome-level assemblies.
|
||||||
|
#'
|
||||||
|
#' - a decimal number (most species' autosomes)
|
||||||
|
#' - X, Y, W or Z (gonosomes)
|
||||||
|
#' - LG followed by a decimal number (some fishes)
|
||||||
|
#' - ssa/sgr followed by a decimal number (Atlantic salmon/Turquoise killifish)
|
||||||
|
#'
|
||||||
|
#' The function tries to select only those chromosome names from the available
|
||||||
|
#' assemblies in the dataset.
|
||||||
get_chromosome_names <- function(dataset) {
    # All values that biomaRt lists for the "chromosome_name" filter of this
    # dataset; presumably this also contains scaffold/patch names, which is
    # why the result is narrowed down below (verify against biomaRt output).
    available <- biomaRt::listFilterOptions(dataset, "chromosome_name")

    # Recognized naming schemes: an optional LG/sgr/ssa prefix followed by
    # digits, or one of the letters X, Y, W and Z.
    #
    # NOTE(review): the alternation is not grouped, so "^" only applies to
    # the numeric alternative and "$" only to the letter alternative. Names
    # that merely start with digits (e.g. "2A") or merely end in X/Y/W/Z
    # are therefore accepted as well. Confirm whether this is intended
    # before tightening the pattern.
    pattern <- "^(LG|sgr|ssa)?[0-9]+|[XYWZ]$"

    # Keep the matching names in their original order; `which()` drops
    # entries for which the match result is NA.
    is_chromosome <- stringr::str_detect(available, pattern)
    available[which(is_chromosome)]
}
|
||||||
|
|
||||||
# Retrieve information on human genes. This will only include genes on
|
# Retrieve information on human genes. This will only include genes on
|
||||||
|
|
@ -66,6 +78,7 @@ human_data[, chromosome_length := max(end_position), by = chromosome_name]
|
||||||
distances <- human_data[, .(
|
distances <- human_data[, .(
|
||||||
species = "hsapiens",
|
species = "hsapiens",
|
||||||
gene = ensembl_gene_id,
|
gene = ensembl_gene_id,
|
||||||
|
position = start_position,
|
||||||
distance = pmin(
|
distance = pmin(
|
||||||
start_position,
|
start_position,
|
||||||
chromosome_length - end_position
|
chromosome_length - end_position
|
||||||
|
|
@ -86,7 +99,6 @@ for (species_id in species[!id == "hsapiens", id]) {
|
||||||
# skipped.
|
# skipped.
|
||||||
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
||||||
biomaRt::listAttributes(dataset, what = "name")) {
|
biomaRt::listAttributes(dataset, what = "name")) {
|
||||||
|
|
||||||
rlog::log_info("No data on human orthologs")
|
rlog::log_info("No data on human orthologs")
|
||||||
species <- species[id != species_id]
|
species <- species[id != species_id]
|
||||||
|
|
||||||
|
|
@ -117,6 +129,11 @@ for (species_id in species[!id == "hsapiens", id]) {
|
||||||
mart = dataset
|
mart = dataset
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# Only include human genes that we have information on.
|
||||||
|
species_distances <- species_distances[
|
||||||
|
hsapiens_homolog_ensembl_gene %chin% genes$id
|
||||||
|
]
|
||||||
|
|
||||||
# Only include one ortholog per human gene.
|
# Only include one ortholog per human gene.
|
||||||
species_distances <- unique(
|
species_distances <- unique(
|
||||||
species_distances,
|
species_distances,
|
||||||
|
|
@ -133,6 +150,7 @@ for (species_id in species[!id == "hsapiens", id]) {
|
||||||
species_distances <- species_distances[, .(
|
species_distances <- species_distances[, .(
|
||||||
species = species_id,
|
species = species_id,
|
||||||
gene = hsapiens_homolog_ensembl_gene,
|
gene = hsapiens_homolog_ensembl_gene,
|
||||||
|
position = start_position,
|
||||||
distance = pmin(
|
distance = pmin(
|
||||||
start_position,
|
start_position,
|
||||||
chromosome_length - end_position
|
chromosome_length - end_position
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue