mirror of
https://github.com/johrpan/ubigen.git
synced 2025-10-26 19:57:24 +01:00
Use logarithmic normalization
This commit also changes the way the standard deviation and mean expression are computed in general. Now, samples where the gene is not expressed at all are excluded before the computation.
This commit is contained in:
parent
af6b5561e4
commit
cc17c13888
5 changed files with 43 additions and 12 deletions
BIN
R/sysdata.rda
BIN
R/sysdata.rda
Binary file not shown.
BIN
data/genes.rda
BIN
data/genes.rda
Binary file not shown.
|
|
@ -7,21 +7,54 @@ library(here)
|
||||||
|
|
||||||
i_am("scripts/input.R")
|
i_am("scripts/input.R")
|
||||||
|
|
||||||
data <- fread(here("scripts", "input", "data_long.csv"))
|
data <- fread(here("scripts", "input", "data_long.csv.gz"))
|
||||||
|
|
||||||
data[, `:=`(
|
data[, `:=`(
|
||||||
expression_median = median(expression),
|
expression_median = median(expression),
|
||||||
expression_95 = quantile(expression, probs = 0.95)
|
expression_95 = quantile(expression, probs = 0.95)
|
||||||
), by = sample]
|
), by = sample]
|
||||||
|
|
||||||
|
# Transform the expression logarithmically. The samples that don't express a
|
||||||
|
# gene at all will be left out intentionally.
|
||||||
|
data[expression > 0, expression_log := log2(expression)]
|
||||||
|
|
||||||
results <- data[, .(
|
results <- data[, .(
|
||||||
median_expression = median(expression),
|
median_expression = median(expression[expression > 0]),
|
||||||
mean_expression = mean(expression),
|
iqr_expression = IQR(expression[expression > 0]),
|
||||||
sd_expression = sd(expression),
|
mean_expression = mean(expression[expression > 0]),
|
||||||
|
sd_expression = sd(expression[expression > 0]),
|
||||||
|
median_expression_normalized = median(expression_log, na.rm = TRUE),
|
||||||
|
iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
|
||||||
|
mean_expression_normalized = mean(expression_log, na.rm = TRUE),
|
||||||
|
sd_expression_normalized = sd(expression_log, na.rm = TRUE),
|
||||||
above_zero = mean(expression > 0.0),
|
above_zero = mean(expression > 0.0),
|
||||||
above_threshold = mean(expression > 50.0),
|
above_threshold = mean(expression > 50.0),
|
||||||
above_median = mean(expression > expression_median),
|
above_median = mean(expression > expression_median),
|
||||||
above_95 = mean(expression > expression_95)
|
above_95 = mean(expression > expression_95)
|
||||||
), by = "gene"]
|
), by = "gene"]
|
||||||
|
|
||||||
|
# Normalize values to the range of 0.0 to 1.0.
|
||||||
|
results[, `:=`(
|
||||||
|
median_expression_normalized =
|
||||||
|
(median_expression_normalized -
|
||||||
|
min(median_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(median_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(median_expression_normalized, na.rm = TRUE)),
|
||||||
|
iqr_expression_normalized =
|
||||||
|
(iqr_expression_normalized -
|
||||||
|
min(iqr_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(iqr_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(iqr_expression_normalized, na.rm = TRUE)),
|
||||||
|
mean_expression_normalized =
|
||||||
|
(mean_expression_normalized -
|
||||||
|
min(mean_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(mean_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(mean_expression_normalized, na.rm = TRUE)),
|
||||||
|
sd_expression_normalized =
|
||||||
|
(sd_expression_normalized -
|
||||||
|
min(sd_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(sd_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(sd_expression_normalized, na.rm = TRUE))
|
||||||
|
)]
|
||||||
|
|
||||||
fwrite(results, file = here("scripts", "output", "results.csv"))
|
fwrite(results, file = here("scripts", "output", "results.csv"))
|
||||||
|
|
|
||||||
|
|
@ -31,8 +31,8 @@ fwrite(
|
||||||
file = here(
|
file = here(
|
||||||
"scripts",
|
"scripts",
|
||||||
"input",
|
"input",
|
||||||
"data_wide_samples.csv"
|
"data_wide_samples.csv.gz"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
fwrite(data_long, file = here("scripts", "input", "data_long.csv"))
|
fwrite(data_long, file = here("scripts", "input", "data_long.csv.gz"))
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,8 @@ i_am("scripts/input.R")
|
||||||
|
|
||||||
data <- fread(here("scripts", "output", "results.csv"))
|
data <- fread(here("scripts", "output", "results.csv"))
|
||||||
|
|
||||||
data[, `:=`(
|
# Keep only the actual Ensembl ID for each gene.
|
||||||
gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1),
|
data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)]
|
||||||
mean_expression_normalized = mean_expression / max(mean_expression),
|
|
||||||
sd_expression_normalized = sd_expression / max(sd_expression)
|
|
||||||
)]
|
|
||||||
|
|
||||||
data[, score := 0.5 * above_95 +
|
data[, score := 0.5 * above_95 +
|
||||||
0.25 * mean_expression_normalized +
|
0.25 * mean_expression_normalized +
|
||||||
|
|
@ -22,7 +19,8 @@ data[, score := 0.5 * above_95 +
|
||||||
data[, score := (score - min(score, na.rm = TRUE)) /
|
data[, score := (score - min(score, na.rm = TRUE)) /
|
||||||
(max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
|
(max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
|
||||||
|
|
||||||
# These are genes that are not expressed at all.
|
# These are genes that are not expressed at all or expressed just once, in case
|
||||||
|
# the standard deviation is used in the score.
|
||||||
data[is.na(score), score := 0.0]
|
data[is.na(score), score := 0.0]
|
||||||
|
|
||||||
setorder(data, -score)
|
setorder(data, -score)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue