ensembl: Save information on species chromosomes

This commit is contained in:
Elias Projahn 2022-01-08 17:54:54 +01:00
parent 910bf85719
commit 283f629696
3 changed files with 44 additions and 28 deletions

Binary file not shown.

Binary file not shown.

View file

@ -323,23 +323,48 @@ genes <- human_data[, .(
# Retrieve gene distance data across species.

rlog::log_info("Retrieving distance data")

# Accumulates the distance rows of all species; `handle_species` appends to it.
distances <- data.table()

#' Handle data for one species.
#'
#' Stores the number of chromosomes and the median chromosome length in the
#' `species` table and appends the genes' distances to the nearest chromosome
#' end to the `distances` table.
#'
#' @param species_id ID of the species; matched against the `id` column of
#'   `species`.
#' @param species_data Gene data with the columns `ensembl_gene_id`,
#'   `chromosome_name`, `start_position` and `end_position`.
handle_species <- function(species_id, species_data) {
    # The length of a chromosome is approximated by the largest end position
    # among its genes in `species_data`.
    chromosomes <- species_data[,
        .(chromosome_length = max(end_position)),
        by = chromosome_name
    ]

    # Store the number of chromosomes in the species table.
    species[id == species_id, n_chromosomes := nrow(chromosomes)]

    # Store the median chromosome length in the species table. For integer
    # positions, `median()` returns an integer for an odd number of
    # chromosomes and a double otherwise. Always store a double, so that the
    # type of the column does not depend on the first species handled and
    # fractional medians of later species are not truncated.
    species[
        id == species_id,
        median_chromosome_length := chromosomes[,
            as.numeric(median(chromosome_length))
        ]
    ]

    # Precompute the genes' distance to the nearest telomere.
    species_distances <- species_data[
        chromosomes,
        .(
            species = species_id,
            gene = ensembl_gene_id,
            distance = pmin(
                start_position,
                chromosome_length - end_position
            )
        ),
        on = "chromosome_name"
    ]

    # Add species distances to the distances table. The assignment targets the
    # table defined above, as callers rely on this side effect instead of the
    # return value.
    distances <<- rbindlist(list(distances, species_distances))
}

# Handle the human first, as we already retrieved the data and don't need to
# filter based on orthologies.
handle_species("hsapiens", human_data)
# Iterate through all other species and retrieve their distance data. # Iterate through all other species and retrieve their distance data.
for (species_id in species[!id == "hsapiens", id]) { for (species_id in species[86:nrow(species), id]) {
rlog::log_info(sprintf("Loading species \"%s\"", species_id)) rlog::log_info(sprintf("Loading species \"%s\"", species_id))
dataset <- biomaRt::useDataset( dataset <- biomaRt::useDataset(
@ -393,23 +418,14 @@ for (species_id in species[!id == "hsapiens", id]) {
by = "hsapiens_homolog_ensembl_gene" by = "hsapiens_homolog_ensembl_gene"
) )
# Precompute the genes' distance to the nearest telomere. # Rename gene ID column to match the human data.
setnames(
species_distances[, species_distances,
chromosome_length := max(end_position), "hsapiens_homolog_ensembl_gene",
by = chromosome_name "ensembl_gene_id"
]
species_distances <- species_distances[, .(
species = species_id,
gene = hsapiens_homolog_ensembl_gene,
distance = pmin(
start_position,
chromosome_length - end_position
) )
)]
distances <- rbindlist(list(distances, species_distances)) handle_species(species_id, species_distances)
} }
# Save data in the appropriate place. # Save data in the appropriate place.