ubigen/scripts/analyze.R
Elias Projahn cc17c13888 Use logarithmic normalization
This commit also changes the way the standard deviation and mean
expression are computed in general. Now, samples where the gene is not
expressed at all are excluded before the computation.
2022-07-02 17:52:05 +02:00

60 lines
2.4 KiB
R

# This scripts reads the input data (See input.R) and performs various
# computations on it in order to later use the results for computating scores
# for ubuiquitously expressed genes.
library(data.table)
library(here)
i_am("scripts/input.R")
data <- fread(here("scripts", "input", "data_long.csv.gz"))
data[, `:=`(
expression_median = median(expression),
expression_95 = quantile(expression, probs = 0.95)
), by = sample]
# Transform the expression logarithmically. The samples that don't express a
# gene at all will be left out intentionally.
data[expression > 0, expression_log := log2(expression)]
results <- data[, .(
median_expression = median(expression[expression > 0]),
iqr_expression = IQR(expression[expression > 0]),
mean_expression = mean(expression[expression > 0]),
sd_expression = sd(expression[expression > 0]),
median_expression_normalized = median(expression_log, na.rm = TRUE),
iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
mean_expression_normalized = mean(expression_log, na.rm = TRUE),
sd_expression_normalized = sd(expression_log, na.rm = TRUE),
above_zero = mean(expression > 0.0),
above_threshold = mean(expression > 50.0),
above_median = mean(expression > expression_median),
above_95 = mean(expression > expression_95)
), by = "gene"]
# Normalize values to the range of 0.0 to 1.0.
results[, `:=`(
median_expression_normalized =
(median_expression_normalized -
min(median_expression_normalized, na.rm = TRUE)) /
(max(median_expression_normalized, na.rm = TRUE) -
min(median_expression_normalized, na.rm = TRUE)),
iqr_expression_normalized =
(iqr_expression_normalized -
min(iqr_expression_normalized, na.rm = TRUE)) /
(max(iqr_expression_normalized, na.rm = TRUE) -
min(iqr_expression_normalized, na.rm = TRUE)),
mean_expression_normalized =
(mean_expression_normalized -
min(mean_expression_normalized, na.rm = TRUE)) /
(max(mean_expression_normalized, na.rm = TRUE) -
min(mean_expression_normalized, na.rm = TRUE)),
sd_expression_normalized =
(sd_expression_normalized -
min(sd_expression_normalized, na.rm = TRUE)) /
(max(sd_expression_normalized, na.rm = TRUE) -
min(sd_expression_normalized, na.rm = TRUE))
)]
fwrite(results, file = here("scripts", "output", "results.csv"))