Use logarithmic normalization

This commit also changes the way the standard deviation and mean
expression are computed in general. Now, samples where the gene is not
expressed at all are excluded before the computation.
This commit is contained in:
Elias Projahn 2022-07-02 17:52:05 +02:00
parent af6b5561e4
commit cc17c13888
5 changed files with 43 additions and 12 deletions

View file

@ -1,43 +0,0 @@
# This script uses the results (see results.csv) and computes a score for each
# gene. This is the data that will be used in the package.
#
# Input:  scripts/output/results.csv with (at least) the columns `gene`,
#         `mean_expression`, `sd_expression` and `above_95`.
# Output: scripts/output/genes.csv with one row per gene, ordered by
#         descending score, plus the columns `score`, `hgnc_name` and `rank`.

library(data.table)
library(here)

i_am("scripts/input.R")

data <- fread(here("scripts", "output", "results.csv"))

# Keep only the part of the gene ID in front of the first dot (presumably this
# strips a version suffix -- confirm against results.csv) and scale mean and
# standard deviation by their respective column maximum.
#
# The maxima are computed with `na.rm = TRUE`: a single missing value would
# otherwise turn the maximum into NA and thereby the normalized values, and
# ultimately the scores, of *all* genes into NA.
data[, `:=`(
  gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1),
  mean_expression_normalized =
    mean_expression / max(mean_expression, na.rm = TRUE),
  sd_expression_normalized =
    sd_expression / max(sd_expression, na.rm = TRUE)
)]

# Weighted sum of the three criteria. The standard deviation has a negative
# weight, so a higher variation lowers the score.
data[, score := 0.5 * above_95 +
  0.25 * mean_expression_normalized +
  -0.25 * sd_expression_normalized]

# Normalize scores to be between 0.0 and 1.0 (min-max scaling). Missing scores
# are ignored when determining the range and stay missing for now.
data[, score := {
  score_range <- range(score, na.rm = TRUE)
  (score - score_range[1]) / (score_range[2] - score_range[1])
}]

# These are genes that are not expressed at all.
data[is.na(score), score := 0.0]

# Best genes first. The ordering matters for the next two steps.
setorder(data, -score)

# Remove duplicates. This will keep the best row for each duplicated gene,
# because `unique()` retains the first occurrence and the table is sorted.
data <- unique(data, by = "gene")

# Look up the HGNC name for each gene and store the position in the sorted
# table as the rank (1 = highest score). `filter_na = FALSE` keeps genes
# without a match and `mthreshold = 1` limits the result to one target per
# gene.
# NOTE(review): this relies on gconvert() returning exactly one row per input
# gene in input order -- confirm.
data[, `:=`(
  hgnc_name = gprofiler2::gconvert(
    gene,
    target = "HGNC",
    mthreshold = 1,
    filter_na = FALSE
  )$target,
  rank = .I
)]

fwrite(data, file = here("scripts", "output", "genes.csv"))