diff --git a/init.R b/init.R index 1c29603..17ba01c 100644 --- a/init.R +++ b/init.R @@ -5,11 +5,11 @@ source("util.R") # Load input data -species <- run_cached("input/species", retrieve_species) -genes <- run_cached("input/genes", retrieve_genes) +species <- run_cached("input_species", retrieve_species) +genes <- run_cached("input_genes", retrieve_genes) distances <- run_cached( - "input/distances", + "input_distances", retrieve_distances, species[, id], genes[, id] @@ -23,7 +23,7 @@ all_genes <- genes[, id] tpe_old_genes <- genes[suggested | verified == TRUE, id] clustering_all <- run_cached( - "all_species/clustering", + "clustering_all", process_clustering, distances, all_species, @@ -31,7 +31,7 @@ clustering_all <- run_cached( ) clustering_replicative <- run_cached( - "replicative_species/clustering", + "clustering_replicative", process_clustering, distances, replicative_species, @@ -39,7 +39,7 @@ clustering_replicative <- run_cached( ) correlation_all <- run_cached( - "all_species/correlation", + "correlation_all", process_correlation, distances, all_species, @@ -48,7 +48,7 @@ correlation_all <- run_cached( ) correlation_replicative <- run_cached( - "replicative_species/correlation", + "correlation_replicative", process_correlation, distances, replicative_species, diff --git a/input.R b/input.R index 337d494..c6eb9e7 100644 --- a/input.R +++ b/input.R @@ -115,11 +115,41 @@ retrieve_genes <- function() { #' - `gene` Ensembl gene ID. #' - `distance` Distance to nearest telomere in base pairs. retrieve_distances <- function(species_ids, gene_ids) { - distances <- data.table( - species = character(), - gene = character(), - distance = integer() - ) + # Special case the human species and retrieve all available distance + # information. 
+ + ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl) + + human_distances <- data.table(getBM( + attributes = c( + "ensembl_gene_id", + "chromosome_name", + "start_position", + "end_position" + ), + mart = ensembl + )) + + human_distances[, + chromosome_length := max(end_position), + by = chromosome_name + ] + + # Keep only the relevant information (see below). + distances <- human_distances[ + chromosome_length > 15000000, + .( + species = "hsapiens", + gene = ensembl_gene_id, + distance = pmin( + start_position, + chromosome_length - end_position + ) + ) + ] + + # Exclude human from the species list, in case it is present there. + species_ids <- species_ids[species_ids != "hsapiens"] species_count <- length(species_ids) @@ -156,10 +186,7 @@ retrieve_distances <- function(species_ids, gene_ids) { "start_position", "end_position" ), - mart = useDataset( - sprintf("%s_gene_ensembl", species_id), - mart = ensembl - ) + mart = ensembl )) ensembl_distances[,