mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 18:57:25 +01:00 
			
		
		
		
	data: Include more chromosomes
This commit is contained in:
		
							parent
							
								
									df6e23d219
								
							
						
					
					
						commit
						6494ae8200
					
				
					 5 changed files with 27 additions and 8 deletions
				
			
		
							
								
								
									
										5
									
								
								R/data.R
									
										
									
									
									
								
							
							
						
						
									
										5
									
								
								R/data.R
									
										
									
									
									
								
							|  | @ -1,6 +1,6 @@ | ||||||
| #' Information on included species from the Ensembl database. | #' Information on included species from the Ensembl database. | ||||||
| #' | #' | ||||||
| #' @format A [data.table] with 91 rows and 2 variables: | #' @format A [data.table] with 99 rows and 2 variables: | ||||||
| #' \describe{ | #' \describe{ | ||||||
| #'   \item{id}{Unique species ID} | #'   \item{id}{Unique species ID} | ||||||
| #'   \item{name}{Human readable species name} | #'   \item{name}{Human readable species name} | ||||||
|  | @ -25,10 +25,11 @@ | ||||||
| #' This dataset contains each known value for a gene's distance to the telomeres | #' This dataset contains each known value for a gene's distance to the telomeres | ||||||
| #' per species. The data is sourced from Ensembl. | #' per species. The data is sourced from Ensembl. | ||||||
| #' | #' | ||||||
| #' @format A [data.table] with 1390730 rows and 3 variables: | #' @format A [data.table] with 1506182 rows and 4 variables: | ||||||
| #' \describe{ | #' \describe{ | ||||||
| #'   \item{species}{Species ID} | #'   \item{species}{Species ID} | ||||||
| #'   \item{gene}{Gene ID} | #'   \item{gene}{Gene ID} | ||||||
|  | #'   \item{position}{Gene start position} | ||||||
| #'   \item{distance}{Distance to nearest telomere} | #'   \item{distance}{Distance to nearest telomere} | ||||||
| #' } | #' } | ||||||
| "distances" | "distances" | ||||||
|  |  | ||||||
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								data/genes.rda
									
										
									
									
									
								
							
							
						
						
									
										
											BIN
										
									
								
								data/genes.rda
									
										
									
									
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								data/species.rda
									
										
									
									
									
								
							
							
						
						
									
										
											BIN
										
									
								
								data/species.rda
									
										
									
									
									
								
							
										
											Binary file not shown.
										
									
								
							|  | @ -2,8 +2,9 @@ library(data.table) | ||||||
| 
 | 
 | ||||||
| rlog::log_info("Connecting to Ensembl API") | rlog::log_info("Connecting to Ensembl API") | ||||||
| 
 | 
 | ||||||
| #' Object to access the Ensembl API. | # Object to access the Ensembl API. We use the US east mirror to circumvent | ||||||
| ensembl <- biomaRt::useEnsembl("ensembl") | # current issues with the main server being temporarily unreliable. | ||||||
|  | ensembl <- biomaRt::useEnsembl("ensembl", host = "useast.ensembl.org") | ||||||
| 
 | 
 | ||||||
| # Retrieve species information. | # Retrieve species information. | ||||||
| 
 | 
 | ||||||
|  | @ -18,11 +19,22 @@ species <- ensembl_datasets[, .( | ||||||
| 
 | 
 | ||||||
| #' Get all chromosome names for an Ensembl dataset. | #' Get all chromosome names for an Ensembl dataset. | ||||||
| #' | #' | ||||||
| #' Valid chromosome names include decimal numbers as well as typical sex | #' The following chromosome naming schemes will be recognized and have been | ||||||
| #' chromosome names (X, Y, W and Z). | #' sourced from Ensembl by manually screening chromosome-level assemblies. | ||||||
|  | #' | ||||||
|  | #'  - a decimal number (most species' autosomes) | ||||||
|  | #'  - X, Y, W or Z (gonosomes) | ||||||
|  | #'  - LG followed by a decimal number (some fishes) | ||||||
|  | #'  - ssa/sgr followed by a number (Atlantic salmon/Turquoise killifish) | ||||||
|  | #' | ||||||
|  | #' The function tries to filter out those chromosome names from the available | ||||||
|  | #' assemblies in the dataset. | ||||||
| get_chromosome_names <- function(dataset) { | get_chromosome_names <- function(dataset) { | ||||||
|     chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") |     chromosome_names <- biomaRt::listFilterOptions(dataset, "chromosome_name") | ||||||
|     chromosome_names[stringr::str_which(chromosome_names, "^[0-9]+|[XYWZ]$")] |     chromosome_names[stringr::str_which( | ||||||
|  |         chromosome_names, | ||||||
|  |         "^(LG|sgr|ssa)?[0-9]+|[XYWZ]$" | ||||||
|  |     )] | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # Retrieve information on human genes. This will only include genes on | # Retrieve information on human genes. This will only include genes on | ||||||
|  | @ -66,6 +78,7 @@ human_data[, chromosome_length := max(end_position), by = chromosome_name] | ||||||
| distances <- human_data[, .( | distances <- human_data[, .( | ||||||
|     species = "hsapiens", |     species = "hsapiens", | ||||||
|     gene = ensembl_gene_id, |     gene = ensembl_gene_id, | ||||||
|  |     position = start_position, | ||||||
|     distance = pmin( |     distance = pmin( | ||||||
|         start_position, |         start_position, | ||||||
|         chromosome_length - end_position |         chromosome_length - end_position | ||||||
|  | @ -86,7 +99,6 @@ for (species_id in species[!id == "hsapiens", id]) { | ||||||
|     # skipped. |     # skipped. | ||||||
|     if (!"hsapiens_homolog_ensembl_gene" %chin% |     if (!"hsapiens_homolog_ensembl_gene" %chin% | ||||||
|         biomaRt::listAttributes(dataset, what = "name")) { |         biomaRt::listAttributes(dataset, what = "name")) { | ||||||
| 
 |  | ||||||
|         rlog::log_info("No data on human orthologs") |         rlog::log_info("No data on human orthologs") | ||||||
|         species <- species[id != species_id] |         species <- species[id != species_id] | ||||||
| 
 | 
 | ||||||
|  | @ -117,6 +129,11 @@ for (species_id in species[!id == "hsapiens", id]) { | ||||||
|         mart = dataset |         mart = dataset | ||||||
|     )) |     )) | ||||||
| 
 | 
 | ||||||
|  |     # Only include human genes that we have information on. | ||||||
|  |     species_distances <- species_distances[ | ||||||
|  |         hsapiens_homolog_ensembl_gene %chin% genes$id | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|     # Only include one ortholog per human gene. |     # Only include one ortholog per human gene. | ||||||
|     species_distances <- unique( |     species_distances <- unique( | ||||||
|         species_distances, |         species_distances, | ||||||
|  | @ -133,6 +150,7 @@ for (species_id in species[!id == "hsapiens", id]) { | ||||||
|     species_distances <- species_distances[, .( |     species_distances <- species_distances[, .( | ||||||
|         species = species_id, |         species = species_id, | ||||||
|         gene = hsapiens_homolog_ensembl_gene, |         gene = hsapiens_homolog_ensembl_gene, | ||||||
|  |         position = start_position, | ||||||
|         distance = pmin( |         distance = pmin( | ||||||
|             start_position, |             start_position, | ||||||
|             chromosome_length - end_position |             chromosome_length - end_position | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue