From 4cb08de325223129f8ce9843502e928f90a60969 Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Wed, 22 Jun 2022 19:06:56 +0200 Subject: [PATCH] scripts: Add input and analysis script --- scripts/.gitignore | 1 + scripts/analyze.R | 27 +++++++++++++++++++++++++++ scripts/input.R | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 scripts/analyze.R create mode 100644 scripts/input.R diff --git a/scripts/.gitignore b/scripts/.gitignore index 16be8f2..4bf02be 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1 +1,2 @@ +/input/ /output/ diff --git a/scripts/analyze.R b/scripts/analyze.R new file mode 100644 index 0000000..800d49b --- /dev/null +++ b/scripts/analyze.R @@ -0,0 +1,27 @@ +# This scripts reads the input data (See input.R) and performs various +# computations on it in order to later use the results for computating scores +# for ubuiquitously expressed genes. + +library(data.table) +library(here) + +i_am("scripts/input.R") + +data <- fread(here("scripts", "input", "data_long.csv")) + +data[, `:=`( + expression_median = median(expression), + expression_95 = quantile(expression, probs = 0.95) +), by = sample] + +results <- data[, .( + median_expression = median(expression), + mean_expression = mean(expression), + sd_expression = sd(expression), + above_zero = mean(expression > 0.0), + above_threshold = mean(expression > 50.0), + above_median = mean(expression > expression_median), + above_95 = mean(expression > expression_95) +), by = "gene"] + +fwrite(results, file = here("scripts", "output", "results.csv")) diff --git a/scripts/input.R b/scripts/input.R new file mode 100644 index 0000000..1d686ba --- /dev/null +++ b/scripts/input.R @@ -0,0 +1,38 @@ +# This script reads data from GTEx and transforms it into various formats for +# further analysis. Note that this requires very good computational resources +# and especially a lot of RAM because of the size of the data. + +library(data.table) +library(here) + +i_am("scripts/input.R") + +# Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/ +# GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz +# The file has been edited removing the lines above the column headers. +data_wide_samples <- fread(here("scripts", "input", "gtex.tsv.gz")) + +setnames( + data_wide_samples, + c("Name", "Description"), + c("gene", "hgnc_symbol") +) + +data_long <- melt( + data_wide_samples, + id.vars = c("gene", "hgnc_symbol"), + variable.name = "sample", + value.name = "expression", + variable.factor = FALSE +) + +fwrite( + data_wide_samples, + file = here( + "scripts", + "input", + "data_wide_samples.csv" + ) +) + +fwrite(data_long, file = here("scripts", "input", "data_long.csv"))