mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	ensembl: Save information on species chromosomes
This commit is contained in:
		
							parent
							
								
									910bf85719
								
							
						
					
					
						commit
						283f629696
					
				
					 3 changed files with 44 additions and 28 deletions
				
			
		
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								data/species.rda
									
										
									
									
									
								
							
							
						
						
									
										
											BIN
										
									
								
								data/species.rda
									
										
									
									
									
								
							
										
											Binary file not shown.
										
									
								
							|  | @ -323,23 +323,48 @@ genes <- human_data[, .( | ||||||
| # Retrieve gene distance data across species. | # Retrieve gene distance data across species. | ||||||
| 
 | 
 | ||||||
| rlog::log_info("Retrieving distance data") | rlog::log_info("Retrieving distance data") | ||||||
|  | distances <- data.table() | ||||||
|  | 
 | ||||||
|  | #' Handle data for one species. | ||||||
|  | handle_species <- function(species_id, species_data) { | ||||||
|  |     chromosomes <- species_data[, | ||||||
|  |         .(chromosome_length = max(end_position)), | ||||||
|  |         by = chromosome_name | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     # Store the number of chromosomes in the species table. | ||||||
|  |     species[id == species_id, n_chromosomes := nrow(chromosomes)] | ||||||
|  | 
 | ||||||
|  |     # Store the median chromosome length in the species table. | ||||||
|  |     species[ | ||||||
|  |         id == species_id, | ||||||
|  |         median_chromosome_length := chromosomes[, median(chromosome_length)] | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     # Precompute the genes' distance to the nearest telomere. | ||||||
|  |     species_distances <- species_data[ | ||||||
|  |         chromosomes, | ||||||
|  |         .( | ||||||
|  |             species = species_id, | ||||||
|  |             gene = ensembl_gene_id, | ||||||
|  |             distance = pmin( | ||||||
|  |                 start_position, | ||||||
|  |                 chromosome_length - end_position | ||||||
|  |             ) | ||||||
|  |         ), | ||||||
|  |         on = "chromosome_name" | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     # Add species distances to the distances table. | ||||||
|  |     distances <<- rbindlist(list(distances, species_distances)) | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| # Handle the human first, as we already retrieved the data and don't need to | # Handle the human first, as we already retrieved the data and don't need to | ||||||
| # filter based on orthologies. | # filter based on orthologies. | ||||||
| 
 | handle_species("hsapiens", human_data) | ||||||
| human_data[, chromosome_length := max(end_position), by = chromosome_name] |  | ||||||
| 
 |  | ||||||
| distances <- human_data[, .( |  | ||||||
|     species = "hsapiens", |  | ||||||
|     gene = ensembl_gene_id, |  | ||||||
|     distance = pmin( |  | ||||||
|         start_position, |  | ||||||
|         chromosome_length - end_position |  | ||||||
|     ) |  | ||||||
| )] |  | ||||||
| 
 | 
 | ||||||
| # Iterate through all other species and retrieve their distance data. | # Iterate through all other species and retrieve their distance data. | ||||||
| for (species_id in species[!id == "hsapiens", id]) { | for (species_id in species[86:nrow(species), id]) { | ||||||
|     rlog::log_info(sprintf("Loading species \"%s\"", species_id)) |     rlog::log_info(sprintf("Loading species \"%s\"", species_id)) | ||||||
| 
 | 
 | ||||||
|     dataset <- biomaRt::useDataset( |     dataset <- biomaRt::useDataset( | ||||||
|  | @ -393,23 +418,14 @@ for (species_id in species[!id == "hsapiens", id]) { | ||||||
|         by = "hsapiens_homolog_ensembl_gene" |         by = "hsapiens_homolog_ensembl_gene" | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     # Precompute the genes' distance to the nearest telomere. |     # Rename gene ID column to match the human data. | ||||||
|  |     setnames( | ||||||
|  |         species_distances, | ||||||
|  |         "hsapiens_homolog_ensembl_gene", | ||||||
|  |         "ensembl_gene_id" | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
|     species_distances[, |     handle_species(species_id, species_distances) | ||||||
|         chromosome_length := max(end_position), |  | ||||||
|         by = chromosome_name |  | ||||||
|     ] |  | ||||||
| 
 |  | ||||||
|     species_distances <- species_distances[, .( |  | ||||||
|         species = species_id, |  | ||||||
|         gene = hsapiens_homolog_ensembl_gene, |  | ||||||
|         distance = pmin( |  | ||||||
|             start_position, |  | ||||||
|             chromosome_length - end_position |  | ||||||
|         ) |  | ||||||
|     )] |  | ||||||
| 
 |  | ||||||
|     distances <- rbindlist(list(distances, species_distances)) |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # Save data in the appropriate place. | # Save data in the appropriate place. | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue