Reinclude human into distance data

This commit is contained in:
Elias Projahn 2021-09-19 12:07:59 +02:00
parent 22b25c9b83
commit e9c05fdbab
2 changed files with 43 additions and 16 deletions

14
init.R
View file

@ -5,11 +5,11 @@ source("util.R")
# Load input data
species <- run_cached("input/species", retrieve_species)
genes <- run_cached("input/genes", retrieve_genes)
species <- run_cached("input_species", retrieve_species)
genes <- run_cached("input_genes", retrieve_genes)
distances <- run_cached(
"input/distances",
"input_distances",
retrieve_distances,
species[, id],
genes[, id]
@ -23,7 +23,7 @@ all_genes <- genes[, id]
tpe_old_genes <- genes[suggested | verified == TRUE, id]
clustering_all <- run_cached(
"all_species/clustering",
"clustering_all",
process_clustering,
distances,
all_species,
@ -31,7 +31,7 @@ clustering_all <- run_cached(
)
clustering_replicative <- run_cached(
"replicative_species/clustering",
"clustering_replicative",
process_clustering,
distances,
replicative_species,
@ -39,7 +39,7 @@ clustering_replicative <- run_cached(
)
correlation_all <- run_cached(
"all_species/correlation",
"correlation_all",
process_correlation,
distances,
all_species,
@ -48,7 +48,7 @@ correlation_all <- run_cached(
)
correlation_replicative <- run_cached(
"replicative_species/correlation",
"correlation_replicative",
process_correlation,
distances,
replicative_species,

45
input.R
View file

@ -115,11 +115,41 @@ retrieve_genes <- function() {
#' - `gene` Ensembl gene ID.
#' - `distance` Distance to nearest telomere in base pairs.
retrieve_distances <- function(species_ids, gene_ids) {
distances <- data.table(
species = character(),
gene = character(),
distance = integer()
)
# Special case the human species and retrieve all available distance
# information.
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
human_distances <- data.table(getBM(
attributes = c(
"ensembl_gene_id",
"chromosome_name",
"start_position",
"end_position"
),
mart = ensembl
))
human_distances[,
chromosome_length := max(end_position),
by = chromosome_name
]
# Keep only the relevant information (see below).
distances <- human_distances[
chromosome_length > 15000000,
.(
species = "hsapiens",
gene = ensembl_gene_id,
distance = pmin(
start_position,
chromosome_length - end_position
)
)
]
# Exclude human from the species list in case it is present there.
species_ids <- species_ids[species_ids != "hsapiens"]
species_count <- length(species_ids)
@ -156,10 +186,7 @@ retrieve_distances <- function(species_ids, gene_ids) {
"start_position",
"end_position"
),
mart = useDataset(
sprintf("%s_gene_ensembl", species_id),
mart = ensembl
)
mart = ensembl
))
ensembl_distances[,