| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  | library(data.table) | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  | library(rlog) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-29 14:02:37 +02:00
										 |  |  | #' Gene names of genes for verified TPE-OLD genes. | 
					
						
							|  |  |  | genes_verified_tpe_old <- c( | 
					
						
							|  |  |  |     "C1S", | 
					
						
							|  |  |  |     "DSP", | 
					
						
							|  |  |  |     "ISG15", | 
					
						
							|  |  |  |     "SORBS2", | 
					
						
							|  |  |  |     "TERT" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Gene names of genes with a suggested TPE-OLD. | 
					
						
							|  |  |  | genes_suggested_tpe_old <- c( | 
					
						
							|  |  |  |     "AKAP3", | 
					
						
							|  |  |  |     "ANO2", | 
					
						
							|  |  |  |     "CCND2", | 
					
						
							|  |  |  |     "CD163L1", | 
					
						
							|  |  |  |     "CD9", | 
					
						
							|  |  |  |     "FOXM1", | 
					
						
							|  |  |  |     "GALNT8", | 
					
						
							|  |  |  |     "NDUFA9", | 
					
						
							|  |  |  |     "TEAD4", | 
					
						
							|  |  |  |     "TIGAR", | 
					
						
							|  |  |  |     "TSPAN9" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  | #' Merge genome data from files in `path` into `data.table`s. | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-06-24 20:20:46 +02:00
										 |  |  | #' The result will be a list with named elements: | 
					
						
							|  |  |  | #' - `genes` will be a table with metadata on human genes. | 
					
						
							|  |  |  | #' - `species` will contain metadata on each species. | 
					
						
							|  |  |  | #' - `distances` will contain each species' genes' distances to the telomere. | 
					
						
							| 
									
										
										
										
											2021-08-25 12:03:11 +02:00
										 |  |  | load_input <- function(path) { | 
					
						
							| 
									
										
										
										
											2021-08-29 14:02:37 +02:00
										 |  |  |     # Include data on TPE-OLD status for genes. | 
					
						
							|  |  |  |      | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  |     genes <- fread(paste(path, "genes.tsv", sep = "/")) | 
					
						
							| 
									
										
										
										
											2021-08-29 14:02:37 +02:00
										 |  |  |     genes[name %chin% genes_verified_tpe_old, verified := TRUE] | 
					
						
							|  |  |  |     genes[name %chin% genes_suggested_tpe_old, suggested := TRUE] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Load and combine data on species and gene distances. | 
					
						
							|  |  |  |      | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  |     original_species <- fread(paste(path, "species.csv", sep = "/")) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  |     species <- data.table( | 
					
						
							| 
									
										
										
										
											2021-06-21 13:03:26 +02:00
										 |  |  |         id = character(), | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |         group = character(), | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  |         label = character(), | 
					
						
							| 
									
										
										
										
											2021-06-21 13:03:26 +02:00
										 |  |  |         median_distance = numeric() | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-16 17:21:01 +02:00
										 |  |  |     distances <- data.table( | 
					
						
							|  |  |  |         species = character(), | 
					
						
							|  |  |  |         gene = integer(), | 
					
						
							|  |  |  |         distance = integer() | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Each file will contain data on one species. | 
					
						
							|  |  |  |     file_names <- list.files(paste(path, "genomes", sep = "/")) | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |     n_species <- length(file_names) | 
					
						
							| 
									
										
										
										
											2021-06-24 22:36:02 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |     for (i in seq_along(file_names)) { | 
					
						
							|  |  |  |         file_name <- file_names[i] | 
					
						
							| 
									
										
										
										
											2021-06-24 20:20:46 +02:00
										 |  |  |         species_id <- strsplit(file_name, split = ".", fixed = TRUE)[[1]][1] | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |         species_path <- paste(path, "genomes", file_name, sep = "/") | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |         log_info(sprintf( | 
					
						
							|  |  |  |             "Reading species %i/%i (%s)", i, n_species, species_id | 
					
						
							|  |  |  |         )) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |         species_distances <- fread(species_path) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |         # Compute the median distance across all genes of this species and | 
					
						
							|  |  |  |         # add it to the species table along other static data. | 
					
						
							|  |  |  |         species <- rbindlist(list(species, data.table( | 
					
						
							|  |  |  |             id = species_id, | 
					
						
							| 
									
										
										
										
											2021-08-25 15:01:18 +02:00
										 |  |  |             group = original_species[id == species_id, group], | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |             label = original_species[id == species_id, label], | 
					
						
							|  |  |  |             median_distance = median(species_distances[, dist]) | 
					
						
							|  |  |  |         ))) | 
					
						
							| 
									
										
										
										
											2021-06-21 13:03:26 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |         species_distances <- data.table( | 
					
						
							|  |  |  |             species = species_id, | 
					
						
							|  |  |  |             gene = species_distances[, geneid], | 
					
						
							|  |  |  |             distance = species_distances[, dist] | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         distances <- rbindlist(list(distances, species_distances)) | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-08-25 12:00:22 +02:00
										 |  |  |     # Order species by their median distance. | 
					
						
							| 
									
										
										
										
											2021-08-16 17:21:01 +02:00
										 |  |  |     setorder(species, median_distance) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  |     list( | 
					
						
							| 
									
										
										
										
											2021-06-21 13:03:26 +02:00
										 |  |  |         genes = genes, | 
					
						
							| 
									
										
										
										
											2021-06-24 20:20:46 +02:00
										 |  |  |         species = species, | 
					
						
							|  |  |  |         distances = distances | 
					
						
							| 
									
										
										
										
											2021-06-16 22:01:09 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-06-21 13:03:26 +02:00
										 |  |  | } |