scripts: Add input and analysis script

2025-10-26 19:57:24 +01:00 · 2022-06-22 19:06:56 +02:00 · 2022-06-22 19:06:56 +02:00 · 4cb08de325
commit 4cb08de325
parent 51c4847949
3 changed files with 66 additions and 0 deletions
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@ -1 +1,2 @@
+/input/
 /output/
--- a/scripts/analyze.R
+++ b/scripts/analyze.R
@ -0,0 +1,27 @@
+# This script reads the input data (see input.R) and performs various
+# computations on it in order to later use the results for computing scores
+# for ubiquitously expressed genes.
+
+library(data.table)
+library(here)
+
+i_am("scripts/analyze.R")
+
+data <- fread(here("scripts", "input", "data_long.csv"))
+# Per-sample reference values, added by reference to every row of that sample.
+data[, `:=`(
+  expression_median = median(expression),
+  expression_95 = quantile(expression, probs = 0.95)
+), by = "sample"]
+# Per-gene summary; each above_* column is the fraction of the gene's rows.
+results <- data[, .(
+  median_expression = median(expression),
+  mean_expression = mean(expression),
+  sd_expression = sd(expression),
+  above_zero = mean(expression > 0.0),
+  above_threshold = mean(expression > 50.0),
+  above_median = mean(expression > expression_median),
+  above_95 = mean(expression > expression_95)
+), by = "gene"]
+
+fwrite(results, file = here("scripts", "output", "results.csv"))
--- a/scripts/input.R
+++ b/scripts/input.R
@ -0,0 +1,38 @@
+# Reads the GTEx expression data and writes it back out in several formats for
+# further analysis. The data set is large, so running this script requires
+# very good computational resources and, above all, a lot of RAM.
+
+library(data.table)
+library(here)
+
+i_am("scripts/input.R")
+
+# Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/
+# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
+# The file has been edited removing the lines above the column headers.
+gtex_path <- here("scripts", "input", "gtex.tsv.gz")
+wide_path <- here("scripts", "input", "data_wide_samples.csv")
+long_path <- here("scripts", "input", "data_long.csv")
+
+data_wide_samples <- fread(gtex_path)
+
+# Rename the two identifier columns in place.
+setnames(
+  data_wide_samples,
+  old = c("Name", "Description"),
+  new = c("gene", "hgnc_symbol")
+)
+
+# Wide format: every non-identifier column holds one sample.
+fwrite(data_wide_samples, file = wide_path)
+
+# Long format: one row per combination of gene and sample.
+data_long <- melt(
+  data_wide_samples,
+  id.vars = c("gene", "hgnc_symbol"),
+  variable.name = "sample",
+  value.name = "expression",
+  variable.factor = FALSE
+)
+
+fwrite(data_long, file = long_path)