mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 10:47:25 +01:00
ensembl: Save information on species chromosomes
This commit is contained in:
parent
910bf85719
commit
283f629696
3 changed files with 44 additions and 28 deletions
Binary file not shown.
BIN
data/species.rda
BIN
data/species.rda
Binary file not shown.
|
|
@ -323,23 +323,48 @@ genes <- human_data[, .(
|
||||||
# Retrieve gene distance data across species.
|
# Retrieve gene distance data across species.
|
||||||
|
|
||||||
rlog::log_info("Retrieving distance data")
|
rlog::log_info("Retrieving distance data")
|
||||||
|
distances <- data.table()
|
||||||
|
|
||||||
# Handle the human first, as we already retrieved the data and don't need to
|
#' Handle data for one species.
|
||||||
# filter based on orthologies.
|
handle_species <- function(species_id, species_data) {
|
||||||
|
chromosomes <- species_data[,
|
||||||
|
.(chromosome_length = max(end_position)),
|
||||||
|
by = chromosome_name
|
||||||
|
]
|
||||||
|
|
||||||
human_data[, chromosome_length := max(end_position), by = chromosome_name]
|
# Store the number of chromosomes in the species table.
|
||||||
|
species[id == species_id, n_chromosomes := nrow(chromosomes)]
|
||||||
|
|
||||||
distances <- human_data[, .(
|
# Store the median chromosome length in the species table.
|
||||||
species = "hsapiens",
|
species[
|
||||||
|
id == species_id,
|
||||||
|
median_chromosome_length := chromosomes[, median(chromosome_length)]
|
||||||
|
]
|
||||||
|
|
||||||
|
# Precompute the genes' distance to the nearest telomere.
|
||||||
|
species_distances <- species_data[
|
||||||
|
chromosomes,
|
||||||
|
.(
|
||||||
|
species = species_id,
|
||||||
gene = ensembl_gene_id,
|
gene = ensembl_gene_id,
|
||||||
distance = pmin(
|
distance = pmin(
|
||||||
start_position,
|
start_position,
|
||||||
chromosome_length - end_position
|
chromosome_length - end_position
|
||||||
)
|
)
|
||||||
)]
|
),
|
||||||
|
on = "chromosome_name"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add species distances to the distances table.
|
||||||
|
distances <<- rbindlist(list(distances, species_distances))
|
||||||
|
}
|
||||||
|
|
||||||
|
# Handle the human first, as we already retrieved the data and don't need to
|
||||||
|
# filter based on orthologies.
|
||||||
|
handle_species("hsapiens", human_data)
|
||||||
|
|
||||||
# Iterate through all other species and retrieve their distance data.
|
# Iterate through all other species and retrieve their distance data.
|
||||||
for (species_id in species[!id == "hsapiens", id]) {
|
for (species_id in species[86:nrow(species), id]) {
|
||||||
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
|
rlog::log_info(sprintf("Loading species \"%s\"", species_id))
|
||||||
|
|
||||||
dataset <- biomaRt::useDataset(
|
dataset <- biomaRt::useDataset(
|
||||||
|
|
@ -393,23 +418,14 @@ for (species_id in species[!id == "hsapiens", id]) {
|
||||||
by = "hsapiens_homolog_ensembl_gene"
|
by = "hsapiens_homolog_ensembl_gene"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Precompute the genes' distance to the nearest telomere.
|
# Rename gene ID column to match the human data.
|
||||||
|
setnames(
|
||||||
species_distances[,
|
species_distances,
|
||||||
chromosome_length := max(end_position),
|
"hsapiens_homolog_ensembl_gene",
|
||||||
by = chromosome_name
|
"ensembl_gene_id"
|
||||||
]
|
|
||||||
|
|
||||||
species_distances <- species_distances[, .(
|
|
||||||
species = species_id,
|
|
||||||
gene = hsapiens_homolog_ensembl_gene,
|
|
||||||
distance = pmin(
|
|
||||||
start_position,
|
|
||||||
chromosome_length - end_position
|
|
||||||
)
|
)
|
||||||
)]
|
|
||||||
|
|
||||||
distances <- rbindlist(list(distances, species_distances))
|
handle_species(species_id, species_distances)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Save data in the appropriate place.
|
# Save data in the appropriate place.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue