| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | library(biomaRt) | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  | library(data.table) | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  | library(progress) | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  | library(rlog) | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | library(stringr) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Species IDs of known replicatively aging species. | 
					
						
							|  |  |  | species_ids_replicative <- c( | 
					
						
							|  |  |  |     "bihybrid", | 
					
						
							|  |  |  |     "btaurus", | 
					
						
							|  |  |  |     "bthybrid", | 
					
						
							|  |  |  |     "cfamiliaris", | 
					
						
							|  |  |  |     "chircus", | 
					
						
							|  |  |  |     "cjacchus", | 
					
						
							|  |  |  |     "clfamiliaris", | 
					
						
							|  |  |  |     "csabaeus", | 
					
						
							|  |  |  |     "ecaballus", | 
					
						
							|  |  |  |     "fcatus", | 
					
						
							|  |  |  |     "ggorilla", | 
					
						
							|  |  |  |     "hsapiens", | 
					
						
							|  |  |  |     "lafricana", | 
					
						
							|  |  |  |     "mfascicularis", | 
					
						
							|  |  |  |     "mmulatta", | 
					
						
							|  |  |  |     "mmurinus", | 
					
						
							|  |  |  |     "mnemestrina", | 
					
						
							|  |  |  |     "nleucogenys", | 
					
						
							|  |  |  |     "oaries", | 
					
						
							|  |  |  |     "pabelii", | 
					
						
							|  |  |  |     "panubis", | 
					
						
							|  |  |  |     "ppaniscus", | 
					
						
							|  |  |  |     "ptroglodytes", | 
					
						
							|  |  |  |     "sscrofa", | 
					
						
							|  |  |  |     "tgelada" | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-29 14:02:37 +02:00
										 |  |  | #' Gene names of genes for verified TPE-OLD genes. | 
					
						
							|  |  |  | genes_verified_tpe_old <- c( | 
					
						
							|  |  |  |     "C1S", | 
					
						
							|  |  |  |     "DSP", | 
					
						
							|  |  |  |     "ISG15", | 
					
						
							|  |  |  |     "SORBS2", | 
					
						
							|  |  |  |     "TERT" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Gene names of genes with a suggested TPE-OLD. | 
					
						
							|  |  |  | genes_suggested_tpe_old <- c( | 
					
						
							|  |  |  |     "AKAP3", | 
					
						
							|  |  |  |     "ANO2", | 
					
						
							|  |  |  |     "CCND2", | 
					
						
							|  |  |  |     "CD163L1", | 
					
						
							|  |  |  |     "CD9", | 
					
						
							|  |  |  |     "FOXM1", | 
					
						
							|  |  |  |     "GALNT8", | 
					
						
							|  |  |  |     "NDUFA9", | 
					
						
							|  |  |  |     "TEAD4", | 
					
						
							|  |  |  |     "TIGAR", | 
					
						
							|  |  |  |     "TSPAN9" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 10:17:58 +02:00
										 |  |  | #' Shared accessor for the Ensembl API. | 
					
						
							|  |  |  | ensembl <- NULL | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Get the ensembl accessor and initialize it if necessary. | 
					
						
							|  |  |  | get_ensembl <- function() { | 
					
						
							|  |  |  |     if (is.null(ensembl)) { | 
					
						
							|  |  |  |         ensembl <<- useEnsembl("ensembl", version = 104) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ensembl | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  | #' Get all chromosome names for a Ensembl dataset. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' Valid chromosome names include decimal numbers as well as 'X' and 'Y'. | 
					
						
							|  |  |  | get_chromosome_names <- function(dataset) { | 
					
						
							|  |  |  |     chromosome_names <- listFilterOptions(dataset, "chromosome_name") | 
					
						
							|  |  |  |     chromosome_names[str_which(chromosome_names, "^[0-9]+|[XY]$")] | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' Retrieve information on species. | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' The result will be a `data.table` with the following columns: | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #'  - `id` Species ID as presented by Ensembl. | 
					
						
							|  |  |  | #'  - `name` Human readable species name. | 
					
						
							|  |  |  | #'  - `replicative` Whether the species is likely to be aging replicatively. | 
					
						
							|  |  |  | retrieve_species <- function() { | 
					
						
							|  |  |  |     # Ensembl datasets correspond to distinct species. | 
					
						
							| 
									
										
										
										
											2021-10-11 10:17:58 +02:00
										 |  |  |     ensembl_datasets <- data.table(listDatasets(get_ensembl())) | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Filter out species ID and name from the result. | 
					
						
							|  |  |  |     species <- ensembl_datasets[, .( | 
					
						
							|  |  |  |         id = str_match(dataset, "(.*)_gene_ensembl")[, 2], | 
					
						
							|  |  |  |         name = str_match(description, "(.*) genes \\(.*\\)")[, 2] | 
					
						
							|  |  |  |     )] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     species[, replicative := id %chin% species_ids_replicative] | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Retrieve information on human genes. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  | #' This will only include genes on assembled chromosomes. Chromosomes are | 
					
						
							|  |  |  | #' filtered based on their name being either a decimal number, 'X' or 'Y'. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' The result will be a `data.table` with the following columns: | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #'  - `id` Ensembl gene ID. | 
					
						
							|  |  |  | #'  - `ǹame` HGNC name of the gene. | 
					
						
							|  |  |  | #'  - `chromosome` Human chromosome on which the gene is located. | 
					
						
							|  |  |  | retrieve_genes <- function() { | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |     dataset <- useDataset("hsapiens_gene_ensembl", mart = get_ensembl()) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     genes <- data.table(getBM( | 
					
						
							|  |  |  |         attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name"), | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |         filters = "chromosome_name", | 
					
						
							|  |  |  |         values = get_chromosome_names(dataset), | 
					
						
							| 
									
										
										
										
											2021-10-11 10:17:58 +02:00
										 |  |  |         mart = useDataset("hsapiens_gene_ensembl", mart = get_ensembl()) | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     )) | 
					
						
							| 
									
										
										
										
											2021-06-21 13:03:26 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     genes[, .( | 
					
						
							|  |  |  |         id = ensembl_gene_id, | 
					
						
							|  |  |  |         name = hgnc_symbol, | 
					
						
							|  |  |  |         chromosome = chromosome_name, | 
					
						
							|  |  |  |         verified = hgnc_symbol %chin% genes_verified_tpe_old, | 
					
						
							|  |  |  |         suggested = hgnc_symbol %chin% genes_suggested_tpe_old | 
					
						
							|  |  |  |     )] | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Retrieve gene distance data. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  | #' The data will include all available values for the given species and genes | 
					
						
							|  |  |  | #' that are located on assembled chromosomes. | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | #' | 
					
						
							|  |  |  | #' The result will be a `data.table` with the following columns: | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #'  - `species` Species ID. | 
					
						
							|  |  |  | #'  - `gene` Ensembl gene ID. | 
					
						
							|  |  |  | #'  - `distance` Distance to nearest telomere in base pairs. | 
					
						
							|  |  |  | retrieve_distances <- function(species_ids, gene_ids) { | 
					
						
							| 
									
										
										
										
											2021-10-11 10:17:58 +02:00
										 |  |  |     ensembl <- get_ensembl() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  |     # Exclude the human from the species, in case it is present there. | 
					
						
							|  |  |  |     species_ids <- species_ids[species_ids != "hsapiens"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     species_count <- length(species_ids) | 
					
						
							|  |  |  |     gene_count <- length(gene_ids) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     log_info(sprintf( | 
					
						
							|  |  |  |         "Retrieving distance data for %i genes from %i species", | 
					
						
							|  |  |  |         gene_count, | 
					
						
							|  |  |  |         species_count | 
					
						
							|  |  |  |     )) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     progress <- progress_bar$new( | 
					
						
							|  |  |  |         total = gene_count, | 
					
						
							|  |  |  |         format = "Retrieving distance data [:bar] :percent (ETA :eta)" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-19 12:07:59 +02:00
										 |  |  |     # Special case the human species and retrieve all available distance | 
					
						
							|  |  |  |     # information. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |     dataset <- useDataset("hsapiens_gene_ensembl", mart = ensembl) | 
					
						
							| 
									
										
										
										
											2021-09-19 12:07:59 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     human_distances <- data.table(getBM( | 
					
						
							|  |  |  |         attributes = c( | 
					
						
							|  |  |  |             "ensembl_gene_id", | 
					
						
							|  |  |  |             "chromosome_name", | 
					
						
							|  |  |  |             "start_position", | 
					
						
							|  |  |  |             "end_position" | 
					
						
							|  |  |  |         ), | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |         filters = "chromosome_name", | 
					
						
							|  |  |  |         values = get_chromosome_names(dataset), | 
					
						
							|  |  |  |         mart = dataset | 
					
						
							| 
									
										
										
										
											2021-09-19 12:07:59 +02:00
										 |  |  |     )) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |     # Compute the nearest distance to telomeres. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-19 12:07:59 +02:00
										 |  |  |     human_distances[, | 
					
						
							|  |  |  |         chromosome_length := max(end_position), | 
					
						
							|  |  |  |         by = chromosome_name | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |     distances <- human_distances[, .( | 
					
						
							|  |  |  |         species = "hsapiens", | 
					
						
							|  |  |  |         gene = ensembl_gene_id, | 
					
						
							|  |  |  |         distance = pmin( | 
					
						
							|  |  |  |             start_position, | 
					
						
							|  |  |  |             chromosome_length - end_position | 
					
						
							| 
									
										
										
										
											2021-09-19 12:07:59 +02:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |     )] | 
					
						
							| 
									
										
										
										
											2021-09-19 12:07:59 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     for (i in 1:species_count) { | 
					
						
							|  |  |  |         species_id <- species_ids[i] | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-21 16:47:13 +02:00
										 |  |  |         progress$tick() | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |         dataset <- useDataset( | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |             sprintf("%s_gene_ensembl", species_id), | 
					
						
							|  |  |  |             mart = ensembl | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         # Besides the attributes that are always present, we need to check for | 
					
						
							|  |  |  |         # human orthologs. Some species don't have that information and will be | 
					
						
							|  |  |  |         # skipped. | 
					
						
							|  |  |  |         if (!"hsapiens_homolog_ensembl_gene" %chin% | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |             listAttributes(dataset, what = "name")) { | 
					
						
							|  |  |  |             next | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         chromosome_names <- get_chromosome_names(dataset) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Skip the species, if there are no assembled chromosomes. | 
					
						
							|  |  |  |         if (length(chromosome_names) <= 0) { | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |             next | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Retrieve information on all genes of the current species, that have | 
					
						
							|  |  |  |         # human orthologs. This is called "homolog" in the Ensembl schema. | 
					
						
							|  |  |  |         ensembl_distances <- data.table(getBM( | 
					
						
							|  |  |  |             attributes = c( | 
					
						
							|  |  |  |                 "hsapiens_homolog_ensembl_gene", | 
					
						
							|  |  |  |                 "chromosome_name", | 
					
						
							|  |  |  |                 "start_position", | 
					
						
							|  |  |  |                 "end_position" | 
					
						
							|  |  |  |             ), | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |             filters = c("with_hsapiens_homolog", "chromosome_name"), | 
					
						
							|  |  |  |             values = list(TRUE, chromosome_names), | 
					
						
							|  |  |  |             mart = dataset | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         )) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |         # Precompute the genes' distance to the nearest telomere. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |         ensembl_distances[, | 
					
						
							|  |  |  |             chromosome_length := max(end_position), | 
					
						
							|  |  |  |             by = chromosome_name | 
					
						
							|  |  |  |         ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |         species_distances <- ensembl_distances[, .( | 
					
						
							|  |  |  |             species = species_id, | 
					
						
							|  |  |  |             gene = hsapiens_homolog_ensembl_gene, | 
					
						
							|  |  |  |             distance = pmin( | 
					
						
							|  |  |  |                 start_position, | 
					
						
							|  |  |  |                 chromosome_length - end_position | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2021-10-11 15:10:03 +02:00
										 |  |  |         )] | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |         distances <- rbindlist(list(distances, species_distances)) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-16 00:06:54 +02:00
										 |  |  |     # Arbitrarily exclude duplicated genes. | 
					
						
							|  |  |  |     # TODO: Consider a refined approach or work out how to include all | 
					
						
							|  |  |  |     # duplicates. | 
					
						
							|  |  |  |     unique(distances, by = c("species", "gene")) | 
					
						
							| 
									
										
										
										
											2021-06-21 13:03:26 +02:00
										 |  |  | } |