mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	Initial commit
This commit is contained in:
		
						commit
						c52d42c2b6
					
				
					 24 changed files with 1350 additions and 0 deletions
				
			
		
							
								
								
									
										154
									
								
								scripts/ensembl.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										154
									
								
								scripts/ensembl.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,154 @@ | |||
| library(data.table) | ||||
| 
 | ||||
| rlog::log_info("Connecting to Ensembl API") | ||||
| 
 | ||||
| #' Object to access the Ensembl API. | ||||
| ensembl <- biomaRt::useEnsembl("ensembl") | ||||
| 
 | ||||
| # Retrieve species information. | ||||
| 
 | ||||
| rlog::log_info("Retrieving species information") | ||||
| ensembl_datasets <- data.table(biomaRt::listDatasets(ensembl)) | ||||
| 
 | ||||
| # Filter out species ID and name from the result. | ||||
| species <- ensembl_datasets[, .( | ||||
|     id = stringr::str_match(dataset, "(.*)_gene_ensembl")[, 2], | ||||
|     name = stringr::str_match(description, "(.*) genes \\(.*\\)")[, 2] | ||||
| )] | ||||
| 
 | ||||
| #' Get all chromosome names for an Ensembl dataset. | ||||
| #' | ||||
| #' Valid chromosome names include decimal numbers as well as typical sex | ||||
| #' chromosome names (X, Y, W and Z). | ||||
| get_chromosome_names <- function(dataset) { | ||||
|     chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") | ||||
|     chromosome_names[stringr::str_which(chromosome_names, "^[0-9]+|[XYWZ]$")] | ||||
| } | ||||
| 
 | ||||
| # Retrieve information on human genes. This will only include genes on | ||||
| # assembled chromosomes. Chromosomes are filtered using get_chromosome_names(). | ||||
| 
 | ||||
| rlog::log_info("Retrieving information on human genes") | ||||
| dataset <- biomaRt::useDataset("hsapiens_gene_ensembl", mart = ensembl) | ||||
| 
 | ||||
| human_data <- data.table(biomaRt::getBM( | ||||
|     attributes = c( | ||||
|         "ensembl_gene_id", | ||||
|         "hgnc_symbol", | ||||
|         "chromosome_name", | ||||
|         "start_position", | ||||
|         "end_position" | ||||
|     ), | ||||
|     filters = "chromosome_name", | ||||
|     values = get_chromosome_names(dataset), | ||||
|     mart = dataset | ||||
| )) | ||||
| 
 | ||||
| # Remove duplicated gene IDs (at the time of writing, there are a handful). | ||||
| human_data <- unique(human_data, by = "ensembl_gene_id") | ||||
| 
 | ||||
| # Only keep relevant information on genes. | ||||
| genes <- human_data[, .( | ||||
|     id = ensembl_gene_id, | ||||
|     name = hgnc_symbol, | ||||
|     chromosome = chromosome_name | ||||
| )] | ||||
| 
 | ||||
| # Retrieve gene distance data across species. | ||||
| 
 | ||||
| rlog::log_info("Retrieving distance data") | ||||
| 
 | ||||
| # Handle the human first, as we already retrieved the data and don't need to | ||||
| # filter based on orthologies. | ||||
| 
 | ||||
| human_data[, chromosome_length := max(end_position), by = chromosome_name] | ||||
| 
 | ||||
| distances <- human_data[, .( | ||||
|     species = "hsapiens", | ||||
|     gene = ensembl_gene_id, | ||||
|     distance = pmin( | ||||
|         start_position, | ||||
|         chromosome_length - end_position | ||||
|     ) | ||||
| )] | ||||
| 
 | ||||
| # Iterate through all other species and retrieve their distance data. | ||||
| for (species_id in species[!id == "hsapiens", id]) { | ||||
|     rlog::log_info(sprintf("Loading species \"%s\"", species_id)) | ||||
| 
 | ||||
|     dataset <- biomaRt::useDataset( | ||||
|         sprintf("%s_gene_ensembl", species_id), | ||||
|         mart = ensembl | ||||
|     ) | ||||
| 
 | ||||
|     # Besides the attributes that are always present, we need to check for | ||||
|     # human orthologs. Some species don't have that information and will be | ||||
|     # skipped. | ||||
|     if (!"hsapiens_homolog_ensembl_gene" %chin% | ||||
|         biomaRt::listAttributes(dataset, what = "name")) { | ||||
| 
 | ||||
|         rlog::log_info("No data on human orthologs") | ||||
|         species <- species[id != species_id] | ||||
| 
 | ||||
|         next | ||||
|     } | ||||
| 
 | ||||
|     chromosome_names <- get_chromosome_names(dataset) | ||||
| 
 | ||||
|     # Skip the species, if there are no assembled chromosomes. | ||||
|     if (length(chromosome_names) <= 0) { | ||||
|         rlog::log_info("No matching chromosome assemblies") | ||||
|         species <- species[id != species_id] | ||||
| 
 | ||||
|         next | ||||
|     } | ||||
| 
 | ||||
|     # Retrieve information on all genes of the current species, that have | ||||
|     # human orthologs. This is called "homolog" in the Ensembl schema. | ||||
|     species_distances <- data.table(biomaRt::getBM( | ||||
|         attributes = c( | ||||
|             "hsapiens_homolog_ensembl_gene", | ||||
|             "chromosome_name", | ||||
|             "start_position", | ||||
|             "end_position" | ||||
|         ), | ||||
|         filters = c("with_hsapiens_homolog", "chromosome_name"), | ||||
|         values = list(TRUE, chromosome_names), | ||||
|         mart = dataset | ||||
|     )) | ||||
| 
 | ||||
|     # Only include one ortholog per human gene. | ||||
|     species_distances <- unique( | ||||
|         species_distances, | ||||
|         by = "hsapiens_homolog_ensembl_gene" | ||||
|     ) | ||||
| 
 | ||||
|     # Precompute the genes' distance to the nearest telomere. | ||||
| 
 | ||||
|     species_distances[, | ||||
|         chromosome_length := max(end_position), | ||||
|         by = chromosome_name | ||||
|     ] | ||||
| 
 | ||||
|     species_distances <- species_distances[, .( | ||||
|         species = species_id, | ||||
|         gene = hsapiens_homolog_ensembl_gene, | ||||
|         distance = pmin( | ||||
|             start_position, | ||||
|             chromosome_length - end_position | ||||
|         ) | ||||
|     )] | ||||
| 
 | ||||
|     distances <- rbindlist(list(distances, species_distances)) | ||||
| } | ||||
| 
 | ||||
| # Add information on number of species per gene. | ||||
| 
 | ||||
| genes_n_species <- distances[, .(n_species = .N), by = "gene"] | ||||
| genes <- merge(genes, genes_n_species, by.x = "id", by.y = "gene") | ||||
| 
 | ||||
| # Save data in the appropriate place. | ||||
| 
 | ||||
| usethis::use_data(species, overwrite = TRUE) | ||||
| usethis::use_data(genes, overwrite = TRUE) | ||||
| usethis::use_data(distances, overwrite = TRUE) | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue