mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 02:37:25 +01:00 
			
		
		
		
	
		
			
	
	
		
			119 lines
		
	
	
	
		
			3.4 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
		
		
			
		
	
	
			119 lines
		
	
	
	
		
			3.4 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
# This is an *interactive* script for retrieving information on species from
# the Ensembl database. Some taxa have more than one entry in the database.
# Whenever a taxon has already been recorded, the script asks whether to keep
# the existing entry or to replace it with the new one. We recommend choosing
# the most generic entry in most cases.
#
# Results are written to `species.csv` and `chromosomes.csv` in the current
# working directory.

library(data.table)
library(DBI)
library(glue)

# These are the output tables of this script:

# One row per taxon: taxonomy ID, display name, scientific name and the name of
# the Ensembl database the information was read from.
species <- data.table(
  id = integer(),
  name = character(),
  scientific_name = character(),
  table_name = character()
)

# One row per chromosome: taxonomy ID of the owning species (`species`), the
# Ensembl `seq_region_id` (`id`), the chromosome name and its length.
chromosomes <- data.table(
  species = integer(),
  id = integer(),
  name = character(),
  length = integer()
)

# Selects all sequence regions that have a karyotype rank, excluding those
# whose `sequence_location` attribute is anything other than
# 'nuclear_chromosome'. The query is identical for every database, so it is
# built once instead of on every loop iteration.
chromosome_query <- glue("
  SELECT seq_region_id, seq_region.name, length
  FROM seq_region
    JOIN seq_region_attrib USING (seq_region_id)
    JOIN attrib_type USING (attrib_type_id)
  WHERE code = 'karyotype_rank'
    AND NOT EXISTS
      (SELECT * FROM seq_region_attrib AS chromosome_attrib
        JOIN attrib_type USING (attrib_type_id)
        WHERE chromosome_attrib.seq_region_id = seq_region.seq_region_id
          AND code = 'sequence_location'
          AND chromosome_attrib.value != 'nuclear_chromosome');
")

# Reads a single value from the `meta` table of the currently selected
# database. Returns the first `meta_value` stored under `key` as a character
# string, or NA if the key is absent.
read_meta <- function(db, key) {
  query <- glue_sql(
    "SELECT meta_value FROM meta WHERE meta_key = {key}",
    .con = db
  )

  # Indexing with [1] yields NA for an empty result instead of failing later.
  dbGetQuery(db, query)$meta_value[1]
}

rlog::log_info("Connecting to Ensembl database server")
db <- dbConnect(
  RMariaDB::MariaDB(),
  host = "ensembldb.ensembl.org",
  port = 5306,
  user = "anonymous"
)

# Everything that uses the connection runs inside `tryCatch()` so that the
# connection is closed even if a query fails or the user interrupts the script.
tryCatch(
  {
    rlog::log_info("Retrieving list of databases")
    tables <- dbGetQuery(db, "SHOW DATABASES LIKE '%_core_110_%'")[, 1]

    # Populates the species and chromosomes tables using data from each
    # species' database on the Ensembl server. Species without a karyotype are
    # skipped without adding any information to the tables.
    for (table in tables) {
      rlog::log_info(glue("Reading species information from {table}"))
      dbExecute(db, glue_sql("USE {`table`}", .con = db))

      species_id <- as.integer(read_meta(db, "species.taxonomy_id"))
      species_name <- as.character(read_meta(db, "species.display_name"))
      species_scientific_name <- as.character(
        read_meta(db, "species.scientific_name")
      )

      # Without a taxonomy ID the entry cannot be keyed or deduplicated.
      if (is.na(species_id)) {
        rlog::log_warn("Skipping (no taxonomy ID)")
        next
      }

      rlog::log_info(glue(
        "Found species {species_name} ({species_scientific_name})"
      ))

      species_chromosomes <- db |>
        dbGetQuery(chromosome_query) |>
        as.data.table() |>
        setnames("seq_region_id", "id")

      # This check has to happen *before* the replacement prompt below.
      # Otherwise, confirming a replacement with an entry that has no karyotype
      # would delete the existing entry and then add nothing in its place.
      if (nrow(species_chromosomes) == 0) {
        rlog::log_info("Skipping (no karyotype)")
        next
      }

      if (species[id == species_id, .N] > 0) {
        old_name <- species[id == species_id, name]
        old_scientific_name <- species[id == species_id, scientific_name]
        input <- readline(glue("\\
          Taxon already present ({old_name}, {old_scientific_name}). \\
          Replace with {species_name} ({species_scientific_name})? [y/N] "))

        # Anything but an explicit "y" (case-insensitive) keeps the old entry.
        if (!identical(tolower(trimws(input)), "y")) {
          next
        }

        species <- species[id != species_id]
        chromosomes <- chromosomes[species != species_id]
      }

      species_chromosomes[, species := species_id]

      species <- rbind(species, data.table(
        id = species_id,
        name = species_name,
        scientific_name = species_scientific_name,
        table_name = table
      ))

      # The column order differs between the two tables (`species` is the
      # first column in one and the last in the other), so match by name.
      chromosomes <- rbind(chromosomes, species_chromosomes, use.names = TRUE)
    }
  },
  finally = dbDisconnect(db)
)

fwrite(species, "species.csv")
fwrite(chromosomes, "chromosomes.csv")