2022-06-22 19:10:28 +02:00
|
|
|
# This script takes the results (see results.csv) and computes a score for
# each gene. The resulting table is the data that will be used in the package.
|
|
|
|
|
|
|
|
|
|
library(data.table)
|
|
|
|
|
library(here)
|
|
|
|
|
|
|
|
|
|
# Tell `here` where this script sits relative to the project root so that the
# paths built with here() below resolve against that root.
i_am("scripts/input.R")

# Read the results table that all following steps operate on.
data <- here("scripts", "output", "results.csv") |> fread()
|
|
|
|
|
|
2022-07-02 17:52:05 +02:00
|
|
|
# Reduce every gene identifier to its actual Ensembl ID: everything from the
# first "." onwards is dropped, only the part in front of it is kept.
data[, gene := sub("[.].*", "", gene)]
|
2022-06-22 19:10:28 +02:00
|
|
|
|
|
|
|
|
# Raw score per gene: `above_95` carries half of the weight, the normalized
# mean expression adds a quarter, and the normalized standard deviation is
# subtracted with a weight of a quarter.
data[
  ,
  score := 0.5 * above_95 + 0.25 * mean_expression_normalized -
    0.25 * sd_expression_normalized
]
|
|
|
|
|
|
|
|
|
|
# Min-max scaling: map the scores onto the interval from 0.0 to 1.0. Missing
# scores are ignored when determining the bounds and stay missing.
data[, score := {
  bounds <- range(score, na.rm = TRUE)
  (score - bounds[1L]) / (bounds[2L] - bounds[1L])
}]
|
|
|
|
|
|
2022-07-02 17:52:05 +02:00
|
|
|
# Give every gene that still has no score the lowest possible value. These are
# genes that are not expressed at all or expressed just once, in case the
# standard deviation is used in the score.
set(data, i = which(is.na(data[["score"]])), j = "score", value = 0.0)
|
|
|
|
|
|
|
|
|
|
# Sort in place so that the highest score comes first.
setorderv(data, cols = "score", order = -1L)

# Drop duplicated genes. Only the first row of each gene survives, which,
# given the ordering above, is the best-scoring one.
data <- data[!duplicated(data, by = "gene")]
|
|
|
|
|
|
|
|
|
|
# Annotate every gene with its HGNC name and with its rank, i.e. its row
# number in the current order of the table.
#
# The conversion is requested once for all genes. Its result is then aligned
# explicitly with the table via the `input_number` column (the position of the
# query each returned row belongs to) instead of assuming that the service
# returns exactly one row per gene in query order. Genes for which no row
# comes back get `NA` as their name.
hgnc_conversion <- gprofiler2::gconvert(
  data$gene,
  target = "HGNC",
  mthreshold = 1,
  filter_na = FALSE
)

hgnc_names <- if (is.null(hgnc_conversion)) {
  # Guard against an empty (NULL) conversion result so that the column is
  # still created with the right length.
  rep(NA_character_, nrow(data))
} else {
  hgnc_conversion$target[
    match(seq_len(nrow(data)), hgnc_conversion$input_number)
  ]
}

data[, `:=`(hgnc_name = hgnc_names, rank = .I)]
|
|
|
|
|
|
|
|
|
|
# Persist the final gene table next to the input results.
data |> fwrite(file = here("scripts", "output", "genes.csv"))
|