Use logarithmic normalization

This commit also changes the way the standard deviation and mean
expression are computed in general. Now, samples where the gene is not
expressed at all are excluded before the computation.
This commit is contained in:
Elias Projahn 2022-07-02 17:52:05 +02:00
parent af6b5561e4
commit cc17c13888
5 changed files with 43 additions and 12 deletions

View file

@ -7,21 +7,54 @@ library(here)
i_am("scripts/input.R")
data <- fread(here("scripts", "input", "data_long.csv"))
data <- fread(here("scripts", "input", "data_long.csv.gz"))
data[, `:=`(
expression_median = median(expression),
expression_95 = quantile(expression, probs = 0.95)
), by = sample]
# Transform the expression logarithmically. The samples that don't express a
# gene at all will be left out intentionally.
data[expression > 0, expression_log := log2(expression)]
results <- data[, .(
median_expression = median(expression),
mean_expression = mean(expression),
sd_expression = sd(expression),
median_expression = median(expression[expression > 0]),
iqr_expression = IQR(expression[expression > 0]),
mean_expression = mean(expression[expression > 0]),
sd_expression = sd(expression[expression > 0]),
median_expression_normalized = median(expression_log, na.rm = TRUE),
iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
mean_expression_normalized = mean(expression_log, na.rm = TRUE),
sd_expression_normalized = sd(expression_log, na.rm = TRUE),
above_zero = mean(expression > 0.0),
above_threshold = mean(expression > 50.0),
above_median = mean(expression > expression_median),
above_95 = mean(expression > expression_95)
), by = "gene"]
# Normalize values to the range of 0.0 to 1.0.
results[, `:=`(
median_expression_normalized =
(median_expression_normalized -
min(median_expression_normalized, na.rm = TRUE)) /
(max(median_expression_normalized, na.rm = TRUE) -
min(median_expression_normalized, na.rm = TRUE)),
iqr_expression_normalized =
(iqr_expression_normalized -
min(iqr_expression_normalized, na.rm = TRUE)) /
(max(iqr_expression_normalized, na.rm = TRUE) -
min(iqr_expression_normalized, na.rm = TRUE)),
mean_expression_normalized =
(mean_expression_normalized -
min(mean_expression_normalized, na.rm = TRUE)) /
(max(mean_expression_normalized, na.rm = TRUE) -
min(mean_expression_normalized, na.rm = TRUE)),
sd_expression_normalized =
(sd_expression_normalized -
min(sd_expression_normalized, na.rm = TRUE)) /
(max(sd_expression_normalized, na.rm = TRUE) -
min(sd_expression_normalized, na.rm = TRUE))
)]
fwrite(results, file = here("scripts", "output", "results.csv"))

View file

@ -31,8 +31,8 @@ fwrite(
file = here(
"scripts",
"input",
"data_wide_samples.csv"
"data_wide_samples.csv.gz"
)
)
fwrite(data_long, file = here("scripts", "input", "data_long.csv"))
fwrite(data_long, file = here("scripts", "input", "data_long.csv.gz"))

View file

@ -8,11 +8,8 @@ i_am("scripts/input.R")
data <- fread(here("scripts", "output", "results.csv"))
data[, `:=`(
gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1),
mean_expression_normalized = mean_expression / max(mean_expression),
sd_expression_normalized = sd_expression / max(sd_expression)
)]
# Keep only the actual Ensembl ID for each gene.
data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]
data[, score := 0.5 * above_95 +
0.25 * mean_expression_normalized +
@ -22,7 +19,8 @@ data[, score := 0.5 * above_95 +
data[, score := (score - min(score, na.rm = TRUE)) /
(max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
# These are genes that are not expressed at all.
# These are genes that are not expressed at all or expressed just once, in case
# the standard deviation is used in the score.
data[is.na(score), score := 0.0]
setorder(data, -score)