Initial commit

2025-10-26 10:47:25 +01:00 · 2021-10-19 13:39:55 +02:00 · 2021-10-19 13:39:55 +02:00 · c52d42c2b6
commit c52d42c2b6
24 changed files with 1350 additions and 0 deletions
--- a/scripts/ensembl.R
+++ b/scripts/ensembl.R
@ -0,0 +1,154 @@
+library(data.table)
+
+rlog::log_info("Connecting to Ensembl API")
+
+#' Object to access the Ensembl API.
+ensembl <- biomaRt::useEnsembl("ensembl")
+
+# Retrieve species information.
+
+rlog::log_info("Retrieving species information")
+ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl))
+
+# Filter out species ID and name from the result.
+species <- ensembl_datasets[, .(
+    id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2],
+    name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2]
+)]
+
+#' Get all chromosome names for an Ensembl dataset.
+#'
+#' Valid chromosome names include decimal numbers as well as typical sex
+#' chromosome names (X, Y, W and Z).
+get_chromosome_names <- function(dataset) {
+    chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name")
+    chromosome_names[stringr::str_which(chromosome_names, "^[0-9]+|[XYWZ]$")]
+}
+
+# Retrieve information on human genes. This will only include genes on
+# assembled chromosomes. Chromosomes are filtered using get_chromosome_names().
+
+rlog::log_info("Retrieving information on human genes")
+dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl)
+
+human_data <- data.table(biomaRt::getBM(
+    attributes = c(
+        "ensembl_gene_id",
+        "hgnc_symbol",
+        "chromosome_name",
+        "start_position",
+        "end_position"
+    ),
+    filters = "chromosome_name",
+    values = get_chromosome_names(dataset),
+    mart = dataset
+))
+
+# Remove duplicated gene IDs (at the time of writing, there are a handful).
+human_data <- unique(human_data, by = "ensembl_gene_id")
+
+# Only keep relevant information on genes.
+genes <- human_data[, .(
+    id = ensembl_gene_id,
+    name = hgnc_symbol,
+    chromosome = chromosome_name
+)]
+
+# Retrieve gene distance data across species.
+
+rlog::log_info("Retrieving distance data")
+
+# Handle the human first, as we already retrieved the data and don't need to
+# filter based on orthologies.
+
+human_data[, chromosome_length := max(end_position), by = chromosome_name]
+
+distances <- human_data[, .(
+    species = "hsapiens",
+    gene = ensembl_gene_id,
+    distance = pmin(
+        start_position,
+        chromosome_length - end_position
+    )
+)]
+
+# Iterate through all other species and retrieve their distance data.
+for (species_id in species[!id == "hsapiens", id]) {
+    rlog::log_info(sprintf("Loading species \"%s\"", species_id))
+
+    dataset <- biomaRt::useDataset(
+        sprintf("%s_gene_ensembl", species_id),
+        mart = ensembl
+    )
+
+    # Besides the attributes that are always present, we need to check for
+    # human orthologs. Some species don't have that information and will be
+    # skipped.
+    if (!"hsapiens_homolog_ensembl_gene" %chin%
+        biomaRt::listAttributes(dataset, what = "name")) {
+
+        rlog::log_info("No data on human orthologs")
+        species <- species[id != species_id]
+
+        next
+    }
+
+    chromosome_names <- get_chromosome_names(dataset)
+
+    # Skip the species, if there are no assembled chromosomes.
+    if (length(chromosome_names) <= 0) {
+        rlog::log_info("No matching chromosome assemblies")
+        species <- species[id != species_id]
+
+        next
+    }
+
+    # Retrieve information on all genes of the current species, that have
+    # human orthologs. This is called "homolog" in the Ensembl schema.
+    species_distances <- data.table(biomaRt::getBM(
+        attributes = c(
+            "hsapiens_homolog_ensembl_gene",
+            "chromosome_name",
+            "start_position",
+            "end_position"
+        ),
+        filters = c("with_hsapiens_homolog", "chromosome_name"),
+        values = list(TRUE, chromosome_names),
+        mart = dataset
+    ))
+
+    # Only include one ortholog per human gene.
+    species_distances <- unique(
+        species_distances,
+        by = "hsapiens_homolog_ensembl_gene"
+    )
+
+    # Precompute the genes' distance to the nearest telomere.
+
+    species_distances[,
+        chromosome_length := max(end_position),
+        by = chromosome_name
+    ]
+
+    species_distances <- species_distances[, .(
+        species = species_id,
+        gene = hsapiens_homolog_ensembl_gene,
+        distance = pmin(
+            start_position,
+            chromosome_length - end_position
+        )
+    )]
+
+    distances <- rbindlist(list(distances, species_distances))
+}
+
+# Add information on number of species per gene.
+
+genes_n_species <- distances[, .(n_species = .N), by = "gene"]
+genes <- merge(genes, genes_n_species, by.x = "id", by.y = "gene")
+
+# Save data in the appropriate place.
+
+usethis::use_data(species, overwrite = TRUE)
+usethis::use_data(genes, overwrite = TRUE)
+usethis::use_data(distances, overwrite = TRUE)