diff --git a/R/sysdata.rda b/R/sysdata.rda index c785d0c..fcc3232 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/data/genes.rda b/data/genes.rda index 5e2ad3a..147c955 100644 Binary files a/data/genes.rda and b/data/genes.rda differ diff --git a/scripts/analyze.R b/scripts/analyze.R index 800d49b..ab99ebf 100644 --- a/scripts/analyze.R +++ b/scripts/analyze.R @@ -7,21 +7,54 @@ library(here) i_am("scripts/input.R") -data <- fread(here("scripts", "input", "data_long.csv")) +data <- fread(here("scripts", "input", "data_long.csv.gz")) data[, `:=`( expression_median = median(expression), expression_95 = quantile(expression, probs = 0.95) ), by = sample] +# Transform the expression logarithmically. The samples that don't express a +# gene at all will be left out intentionally. +data[expression > 0, expression_log := log2(expression)] + results <- data[, .( - median_expression = median(expression), - mean_expression = mean(expression), - sd_expression = sd(expression), + median_expression = median(expression[expression > 0]), + iqr_expression = IQR(expression[expression > 0]), + mean_expression = mean(expression[expression > 0]), + sd_expression = sd(expression[expression > 0]), + median_expression_normalized = median(expression_log, na.rm = TRUE), + iqr_expression_normalized = IQR(expression_log, na.rm = TRUE), + mean_expression_normalized = mean(expression_log, na.rm = TRUE), + sd_expression_normalized = sd(expression_log, na.rm = TRUE), above_zero = mean(expression > 0.0), above_threshold = mean(expression > 50.0), above_median = mean(expression > expression_median), above_95 = mean(expression > expression_95) ), by = "gene"] +# Normalize values to the range of 0.0 to 1.0. +results[, `:=`( + median_expression_normalized = + (median_expression_normalized - + min(median_expression_normalized, na.rm = TRUE)) / + (max(median_expression_normalized, na.rm = TRUE) - + min(median_expression_normalized, na.rm = TRUE)), + iqr_expression_normalized = + (iqr_expression_normalized - + min(iqr_expression_normalized, na.rm = TRUE)) / + (max(iqr_expression_normalized, na.rm = TRUE) - + min(iqr_expression_normalized, na.rm = TRUE)), + mean_expression_normalized = + (mean_expression_normalized - + min(mean_expression_normalized, na.rm = TRUE)) / + (max(mean_expression_normalized, na.rm = TRUE) - + min(mean_expression_normalized, na.rm = TRUE)), + sd_expression_normalized = + (sd_expression_normalized - + min(sd_expression_normalized, na.rm = TRUE)) / + (max(sd_expression_normalized, na.rm = TRUE) - + min(sd_expression_normalized, na.rm = TRUE)) +)] + fwrite(results, file = here("scripts", "output", "results.csv")) diff --git a/scripts/input.R b/scripts/input.R index 1d686ba..d1ff95d 100644 --- a/scripts/input.R +++ b/scripts/input.R @@ -31,8 +31,8 @@ fwrite( file = here( "scripts", "input", - "data_wide_samples.csv" + "data_wide_samples.csv.gz" ) ) -fwrite(data_long, file = here("scripts", "input", "data_long.csv")) +fwrite(data_long, file = here("scripts", "input", "data_long.csv.gz")) diff --git a/scripts/genes.R b/scripts/ranking.R similarity index 77% rename from scripts/genes.R rename to scripts/ranking.R index ec41d7f..5312786 100644 --- a/scripts/genes.R +++ b/scripts/ranking.R @@ -8,11 +8,8 @@ i_am("scripts/input.R") data <- fread(here("scripts", "output", "results.csv")) -data[, `:=`( - gene = stringr::str_split(gene, "\\.") |> purrr::map_chr(1), - mean_expression_normalized = mean_expression / max(mean_expression), - sd_expression_normalized = sd_expression / max(sd_expression) -)] +# Keep only the actual Ensembl ID for each gene. +data[, gene := stringr::str_split(gene, "\\.") |> purrr::map_chr(1)] data[, score := 0.5 * above_95 + 0.25 * mean_expression_normalized + @@ -22,7 +19,8 @@ data[, score := 0.5 * above_95 + data[, score := (score - min(score, na.rm = TRUE)) / (max(score, na.rm = TRUE) - min(score, na.rm = TRUE))] -# These are genes that are not expressed at all. +# These are genes that are not expressed at all or expressed just once, in case +# the standard deviation is used in the score. data[is.na(score), score := 0.0] setorder(data, -score)