mirror of
https://github.com/johrpan/ubigen.git
synced 2025-10-26 19:57:24 +01:00
scripts: Add input and analysis script
This commit is contained in:
parent
51c4847949
commit
4cb08de325
3 changed files with 66 additions and 0 deletions
1
scripts/.gitignore
vendored
1
scripts/.gitignore
vendored
|
|
@ -1 +1,2 @@
|
|||
/input/
|
||||
/output/
|
||||
|
|
|
|||
27
scripts/analyze.R
Normal file
27
scripts/analyze.R
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# This script reads the input data (see input.R) and performs various
# computations on it in order to later use the results for computing scores
# for ubiquitously expressed genes.

library(data.table)
library(here)

# Declare the location of *this* script relative to the project root so that
# here() resolves paths independently of the current working directory.
i_am("scripts/analyze.R")

# Long-format table as written by input.R. The code below relies on the
# columns `gene`, `sample` and `expression`.
data <- fread(here("scripts", "input", "data_long.csv"))

# Per-sample reference levels. `:=` adds the columns by reference, so every
# row carries the median and the 95th percentile of its own sample.
data[, `:=`(
  expression_median = median(expression),
  expression_95 = quantile(expression, probs = 0.95)
), by = "sample"]

# Per-gene summary over all rows of that gene. Each `above_*` column is the
# share of the gene's rows whose expression exceeds the respective level
# (zero, the fixed threshold of 50, the sample median, the sample's 95th
# percentile).
results <- data[, .(
  median_expression = median(expression),
  mean_expression = mean(expression),
  sd_expression = sd(expression),
  above_zero = mean(expression > 0.0),
  above_threshold = mean(expression > 50.0),
  above_median = mean(expression > expression_median),
  above_95 = mean(expression > expression_95)
), by = "gene"]

# The output directory is not tracked by git and may not exist yet; create it
# (silently, if already present) so that writing the results cannot fail on a
# missing directory.
dir.create(here("scripts", "output"), showWarnings = FALSE, recursive = TRUE)

fwrite(results, file = here("scripts", "output", "results.csv"))
|
||||
38
scripts/input.R
Normal file
38
scripts/input.R
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# Reads the GTEx expression data and stores it in several layouts that the
# later analysis steps build upon. Be aware that the data is large: running
# this needs very good computational resources and, above all, a lot of RAM.

library(data.table)
library(here)

i_am("scripts/input.R")

# Returns the path of `file_name` inside the scripts/input/ directory.
input_path <- function(file_name) {
  here("scripts", "input", file_name)
}

# Source: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/
#         GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
# The lines above the column headers have been removed from the file.
data_wide_samples <- fread(input_path("gtex.tsv.gz"))

# Replace the original identifier column names (renamed in place).
setnames(
  data_wide_samples,
  old = c("Name", "Description"),
  new = c("gene", "hgnc_symbol")
)

# Wide -> long: every column other than the two identifier columns becomes
# one row per gene, with the former column name kept as a character `sample`.
id_columns <- c("gene", "hgnc_symbol")
data_long <- melt(
  data_wide_samples,
  id.vars = id_columns,
  variable.name = "sample",
  value.name = "expression",
  variable.factor = FALSE
)

# Persist both layouts next to the raw input.
fwrite(data_wide_samples, file = input_path("data_wide_samples.csv"))
fwrite(data_long, file = input_path("data_long.csv"))
|
||||
Loading…
Add table
Add a link
Reference in a new issue