mirror of
https://github.com/johrpan/geposanui.git
synced 2025-10-26 19:27:24 +01:00
Only retrieve assembled chromosomes
This commit is contained in:
parent
c26e4ff4a3
commit
d3edeefbe2
1 changed files with 51 additions and 34 deletions
85
input.R
85
input.R
|
|
@ -69,6 +69,14 @@ get_ensembl <- function() {
|
||||||
ensembl
|
ensembl
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#' Get all chromosome names for a Ensembl dataset.
|
||||||
|
#'
|
||||||
|
#' Valid chromosome names include decimal numbers as well as 'X' and 'Y'.
|
||||||
|
get_chromosome_names <- function(dataset) {
|
||||||
|
chromosome_names <- listFilterOptions(dataset, "chromosome_name")
|
||||||
|
chromosome_names[str_which(chromosome_names, "^[0-9]+|[XY]$")]
|
||||||
|
}
|
||||||
|
|
||||||
#' Retrieve information on species.
|
#' Retrieve information on species.
|
||||||
#'
|
#'
|
||||||
#' The result will be a `data.table` with the following columns:
|
#' The result will be a `data.table` with the following columns:
|
||||||
|
|
@ -91,14 +99,21 @@ retrieve_species <- function() {
|
||||||
|
|
||||||
#' Retrieve information on human genes.
|
#' Retrieve information on human genes.
|
||||||
#'
|
#'
|
||||||
|
#' This will only include genes on assembled chromosomes. Chromosomes are
|
||||||
|
#' filtered based on their name being either a decimal number, 'X' or 'Y'.
|
||||||
|
#'
|
||||||
#' The result will be a `data.table` with the following columns:
|
#' The result will be a `data.table` with the following columns:
|
||||||
#'
|
#'
|
||||||
#' - `id` Ensembl gene ID.
|
#' - `id` Ensembl gene ID.
|
||||||
#' - `ǹame` HGNC name of the gene.
|
#' - `ǹame` HGNC name of the gene.
|
||||||
#' - `chromosome` Human chromosome on which the gene is located.
|
#' - `chromosome` Human chromosome on which the gene is located.
|
||||||
retrieve_genes <- function() {
|
retrieve_genes <- function() {
|
||||||
|
dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
|
||||||
|
|
||||||
genes <- data.table(getBM(
|
genes <- data.table(getBM(
|
||||||
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
|
attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"),
|
||||||
|
filters = "chromosome_name",
|
||||||
|
values = get_chromosome_names(dataset),
|
||||||
mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
|
mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl())
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
@ -113,9 +128,8 @@ retrieve_genes <- function() {
|
||||||
|
|
||||||
#' Retrieve gene distance data.
|
#' Retrieve gene distance data.
|
||||||
#'
|
#'
|
||||||
#' The data will include all available values for the given species and genes.
|
#' The data will include all available values for the given species and genes
|
||||||
#' Specific values on naturally or artificially (e.g. due to incomplete
|
#' that are located on assembled chromosomes.
|
||||||
#' sequencing) short chromosomes will be excluded.
|
|
||||||
#'
|
#'
|
||||||
#' The result will be a `data.table` with the following columns:
|
#' The result will be a `data.table` with the following columns:
|
||||||
#'
|
#'
|
||||||
|
|
@ -145,7 +159,7 @@ retrieve_distances <- function(species_ids, gene_ids) {
|
||||||
# Special case the human species and retrieve all available distance
|
# Special case the human species and retrieve all available distance
|
||||||
# information.
|
# information.
|
||||||
|
|
||||||
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
|
||||||
|
|
||||||
human_distances <- data.table(getBM(
|
human_distances <- data.table(getBM(
|
||||||
attributes = c(
|
attributes = c(
|
||||||
|
|
@ -154,33 +168,33 @@ retrieve_distances <- function(species_ids, gene_ids) {
|
||||||
"start_position",
|
"start_position",
|
||||||
"end_position"
|
"end_position"
|
||||||
),
|
),
|
||||||
mart = ensembl
|
filters = "chromosome_name",
|
||||||
|
values = get_chromosome_names(dataset),
|
||||||
|
mart = dataset
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# Compute the nearest distance to telomeres.
|
||||||
|
|
||||||
human_distances[,
|
human_distances[,
|
||||||
chromosome_length := max(end_position),
|
chromosome_length := max(end_position),
|
||||||
by = chromosome_name
|
by = chromosome_name
|
||||||
]
|
]
|
||||||
|
|
||||||
# Filter out relevant information (see below).
|
distances <- human_distances[, .(
|
||||||
distances <- human_distances[
|
species = "hsapiens",
|
||||||
chromosome_length > 15000000,
|
gene = ensembl_gene_id,
|
||||||
.(
|
distance = pmin(
|
||||||
species = "hsapiens",
|
start_position,
|
||||||
gene = ensembl_gene_id,
|
chromosome_length - end_position
|
||||||
distance = pmin(
|
|
||||||
start_position,
|
|
||||||
chromosome_length - end_position
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
]
|
)]
|
||||||
|
|
||||||
for (i in 1:species_count) {
|
for (i in 1:species_count) {
|
||||||
species_id <- species_ids[i]
|
species_id <- species_ids[i]
|
||||||
|
|
||||||
progress$tick()
|
progress$tick()
|
||||||
|
|
||||||
ensembl <- useDataset(
|
dataset <- useDataset(
|
||||||
sprintf("%s_gene_ensembl", species_id),
|
sprintf("%s_gene_ensembl", species_id),
|
||||||
mart = ensembl
|
mart = ensembl
|
||||||
)
|
)
|
||||||
|
|
@ -189,43 +203,46 @@ retrieve_distances <- function(species_ids, gene_ids) {
|
||||||
# human orthologs. Some species don't have that information and will be
|
# human orthologs. Some species don't have that information and will be
|
||||||
# skipped.
|
# skipped.
|
||||||
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
if (!"hsapiens_homolog_ensembl_gene" %chin%
|
||||||
listAttributes(ensembl, what = "name")) {
|
listAttributes(dataset, what = "name")) {
|
||||||
|
next
|
||||||
|
}
|
||||||
|
|
||||||
|
chromosome_names <- get_chromosome_names(dataset)
|
||||||
|
|
||||||
|
# Skip the species, if there are no assembled chromosomes.
|
||||||
|
if (length(chromosome_names) <= 0) {
|
||||||
next
|
next
|
||||||
}
|
}
|
||||||
|
|
||||||
# Retrieve information on all genes of the current species, that have
|
# Retrieve information on all genes of the current species, that have
|
||||||
# human orthologs. This is called "homolog" in the Ensembl schema.
|
# human orthologs. This is called "homolog" in the Ensembl schema.
|
||||||
ensembl_distances <- data.table(getBM(
|
ensembl_distances <- data.table(getBM(
|
||||||
filters = c("with_hsapiens_homolog"),
|
|
||||||
values = c(TRUE),
|
|
||||||
attributes = c(
|
attributes = c(
|
||||||
"hsapiens_homolog_ensembl_gene",
|
"hsapiens_homolog_ensembl_gene",
|
||||||
"chromosome_name",
|
"chromosome_name",
|
||||||
"start_position",
|
"start_position",
|
||||||
"end_position"
|
"end_position"
|
||||||
),
|
),
|
||||||
mart = ensembl
|
filters = c("with_hsapiens_homolog", "chromosome_name"),
|
||||||
|
values = list(TRUE, chromosome_names),
|
||||||
|
mart = dataset
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# Precompute the genes' distance to the nearest telomere.
|
||||||
|
|
||||||
ensembl_distances[,
|
ensembl_distances[,
|
||||||
chromosome_length := max(end_position),
|
chromosome_length := max(end_position),
|
||||||
by = chromosome_name
|
by = chromosome_name
|
||||||
]
|
]
|
||||||
|
|
||||||
# Filter out relevant information and precompute the genes' distance to
|
species_distances <- ensembl_distances[, .(
|
||||||
# the nearest telomere. Exclude genes on naturally or artificially
|
species = species_id,
|
||||||
# short chromosomes.
|
gene = hsapiens_homolog_ensembl_gene,
|
||||||
species_distances <- ensembl_distances[
|
distance = pmin(
|
||||||
chromosome_length > 15000000,
|
start_position,
|
||||||
.(
|
chromosome_length - end_position
|
||||||
species = species_id,
|
|
||||||
gene = hsapiens_homolog_ensembl_gene,
|
|
||||||
distance = pmin(
|
|
||||||
start_position,
|
|
||||||
chromosome_length - end_position
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
]
|
)]
|
||||||
|
|
||||||
distances <- rbindlist(list(distances, species_distances))
|
distances <- rbindlist(list(distances, species_distances))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue