mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 18:57:25 +01:00
78 lines
1.6 KiB
R
78 lines
1.6 KiB
R
|
|
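# This script does post-processing on the data from Ensembl and imports it into
# the R package. It reads `species.csv`, `chromosomes.csv` and `genes.csv` from
# the working directory. Run this script after `ensembl_species.R` and the
# script that retrieves the gene data (presumably `ensembl_genes.R`; the
# original comment named `ensembl_species.R` twice -- TODO confirm).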
|
||
|
|
|
||
|
|
library(data.table)
|
||
|
|
|
||
|
|
# Load the species, chromosome and gene tables. The paths are relative, so the
# files are looked up in the current working directory.
species <- fread(file = "species.csv")
chromosomes <- fread(file = "chromosomes.csv")
genes <- fread(file = "genes.csv")
|
||
|
|
|
||
|
|
# Per-species chromosome statistics: the number of rows in `chromosomes` for
# the species and the median of their `length` values. The median is cast to
# double, presumably so that the column has one type across all groups
# (`median` can return an integer or a double depending on the group size).
species_metadata <- chromosomes[
  ,
  list(
    n_chromosomes = .N,
    median_chromosome_length = as.double(stats::median(length))
  ),
  by = "species"
]
|
||
|
|
|
||
|
|
# Attach the chromosome statistics to the species table by matching
# `species$id` with `species_metadata$species`. `merge` performs an inner join
# by default, so species without any chromosome rows are dropped.
# `sort = FALSE` keeps `merge` from sorting the result by the join column.
species <- merge(
  x = species,
  y = species_metadata,
  by.x = "id",
  by.y = "species",
  sort = FALSE
)
|
||
|
|
|
||
|
|
# Remove duplicated genes within species: only the first row of every
# (species, gene) combination is kept.
genes <- unique(genes, by = c("species", "gene"))
|
||
|
|
|
||
|
|
# Add the columns of the matching chromosome to every gene. Rows are matched
# on the species and on the gene's `chromosome` value against the chromosome's
# `id`. This is an inner join, so genes without a matching chromosome row are
# dropped. `sort = FALSE` keeps `merge` from sorting by the join columns.
genes_chromosomes <- merge(
  x = genes,
  y = chromosomes,
  by.x = c("species", "chromosome"),
  by.y = c("species", "id"),
  sort = FALSE
)
|
||
|
|
|
||
|
|
# Distance of each gene to the nearest end of its chromosome: the smaller of
# `start_position` (distance to the chromosome start) and
# `length - end_position` (distance from the gene's end to the chromosome end).
# `pmin` takes the element-wise minimum, so the second term is written and
# evaluated only once. A missing value in either term yields `NA`. The column
# is added to `genes_chromosomes` by reference.
genes_chromosomes[, distance := pmin(start_position, length - end_position)]
|
||
|
|
|
||
|
|
# Table of gene positions and distances across all species. Only the columns
# listed here are carried over from `genes_chromosomes`.
distances <- genes_chromosomes[
  ,
  c(
    "species", "gene", "chromosome",
    "start_position", "end_position", "distance"
  ),
  with = FALSE
]
|
||
|
|
|
||
|
|
# This table will hold information on human genes. Rows are selected by the
# species ID 9606. The gene ID is stored as `id` and the `name` column that
# came from the chromosome table is stored as `chromosome`.
genes <- genes_chromosomes[
  species == 9606,
  list(id = gene, chromosome = name)
]
|
||
|
|
|
||
|
|
# Look up a name for every gene ID via `gprofiler2::gconvert` with HGNC as the
# target namespace and store it in the new `name` column (added by reference).
# `mthreshold = 1` and `filter_na = FALSE` are intended to give one result row
# per input, including inputs without a match -- TODO confirm against the
# gprofiler2 documentation.
#
# The result comes from a web service, so it is aligned with `genes` through
# its `input` column instead of relying on the row order and row count of the
# response. IDs that are absent from the response get `NA`.
genes[, name := {
  converted <- gprofiler2::gconvert(
    id,
    target = "HGNC",
    mthreshold = 1,
    filter_na = FALSE
  )
  converted$target[match(id, converted$input)]
}]
|
||
|
|
|
||
|
|
# Previous versions of geposan used different species IDs. For backwards
# compatibility, the species IDs are stored as character in both tables. The
# columns are replaced by reference.
set(species, j = "id", value = as.character(species[["id"]]))
set(distances, j = "species", value = as.character(distances[["species"]]))
|
||
|
|
|
||
|
|
# Save the three tables as data sets of the package, overwriting any files
# that already exist.
usethis::use_data(species, genes, distances, overwrite = TRUE)
|