| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  | library(data.table) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							#' Load and preprocess input data from `path`, reusing a cached result.
#'
#' A file named `cache.rds` will be created within that directory to reuse the
#' results for future runs. To forcefully recompute, delete that file.
#'
#' The cache is first written to a temporary file next to its final location
#' and only then renamed to `cache.rds`, so an interrupted run does not leave
#' behind a truncated cache file that would break every later call.
#'
#' @param path Directory containing the input data. The cache file is stored
#'   in this directory as well.
#' @return The value returned by [load_data()], either freshly computed or
#'   restored from `cache.rds`.
#'
#' @seealso [load_data()]
load_data_cached <- function(path) {
    cache_file <- file.path(path, "cache.rds")

    # If the cache file exists, we restore the data from it.
    if (file.exists(cache_file)) {
        return(readRDS(cache_file))
    }

    # If the cache file doesn't exist, we have to do the computation.
    data <- load_data(path)

    # The results are cached for the next run. Writing to a temporary file in
    # the same directory and renaming it afterwards ensures that `cache.rds`
    # only ever appears once it has been written completely.
    temp_file <- tempfile(pattern = "cache-", tmpdir = path, fileext = ".rds")
    on.exit(unlink(temp_file), add = TRUE)
    saveRDS(data, temp_file)

    if (!file.rename(temp_file, cache_file)) {
        # The computation itself succeeded, so the result is still returned.
        warning("Could not create cache file ", cache_file, call. = FALSE)
    }

    data
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 #' Merge genome data from files in `path` into `data.table`s.
#'
#' Reads `genes.tsv` and `species.csv` from `path` as well as one file per
#' species from the `genomes` subdirectory. The part of each genome file name
#' before the first `.` is used as the species ID. Only species whose `group`
#' in `species.csv` is `"replicative"` are processed.
#'
#' The result will be a list with named elements:
#' - `genes` will be a table with metadata on human genes.
#' - `species` will contain metadata on each species (columns `id`, `label`
#'   and `median_distance`).
#' - `distances` will contain each species' genes' distances to the telomere
#'   (column `geneid` plus one column per included species, named by its ID).
#'
#' @param path Directory containing `genes.tsv`, `species.csv` and the
#'   `genomes` subdirectory.
#' @return A named list with the elements `genes`, `species` and `distances`
#'   as described above.
#'
#' @seealso [load_data_cached()]
load_data <- function(path) {
    genes <- fread(file.path(path, "genes.tsv"))
    original_species <- fread(file.path(path, "species.csv"))

    missing_columns <- setdiff(
        c("id", "label", "group"),
        names(original_species)
    )

    if (length(missing_columns) > 0) {
        stop(
            "species.csv is missing required column(s): ",
            paste(missing_columns, collapse = ", "),
            call. = FALSE
        )
    }

    # Each file will contain data on one species.
    genomes_path <- file.path(path, "genomes")
    file_names <- list.files(genomes_path)

    # Rows for the species table are collected here and bound once at the end
    # instead of growing the table on every iteration.
    species_rows <- vector("list", length(file_names))
    distances <- data.table(geneid = integer())

    for (i in seq_along(file_names)) {
        file_name <- file_names[[i]]
        species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1]

        # Look up the metadata outside of data.table's scope, so that
        # `species_id` can never be mistaken for a column name.
        matches <- which(original_species[["id"]] == species_id)

        if (length(matches) != 1L) {
            stop(
                "Expected exactly one entry for species '", species_id,
                "' (genome file '", file_name, "') in species.csv, found ",
                length(matches), ".",
                call. = FALSE
            )
        }

        # Only continue for replicatively aging species.
        # TODO: Which other species should be included?
        if (!isTRUE(original_species[["group"]][matches] == "replicative")) {
            next
        }

        species_distances <- fread(file.path(genomes_path, file_name))

        if (!all(c("geneid", "dist") %in% names(species_distances))) {
            stop(
                "Genome file '", file_name,
                "' must contain the columns 'geneid' and 'dist'.",
                call. = FALSE
            )
        }

        # Compute the median distance across all genes of this species and
        # keep it along other static data for the species table.
        species_rows[[i]] <- data.table(
            id = species_id,
            label = original_species[["label"]][matches],
            median_distance = median(species_distances[["dist"]])
        )

        # Column names have to be unique for each species.
        setnames(species_distances, "dist", species_id)

        distances <- merge(
            distances,
            species_distances,
            by = "geneid",
            all = TRUE
        )
    }

    # The empty template fixes the column names and types even if no species
    # was included. Entries of skipped files are still `NULL` and are dropped.
    species <- rbindlist(c(
        list(data.table(
            id = character(),
            label = character(),
            median_distance = numeric()
        )),
        Filter(Negate(is.null), species_rows)
    ))

    list(
        genes = genes,
        species = species,
        distances = distances
    )
}