From 698ea5086a126a89cfd1210b5d0b6f998aa935a4 Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Wed, 30 Nov 2022 14:32:40 +0100 Subject: [PATCH] Move analysis into the package --- NAMESPACE | 1 + R/analyze.R | 81 +++++++++++++++++++++++++++++++++++++++++++++++ man/analyze.Rd | 21 ++++++++++++ man/genes.Rd | 2 +- scripts/analyze.R | 69 +--------------------------------------- 5 files changed, 105 insertions(+), 69 deletions(-) create mode 100644 R/analyze.R create mode 100644 man/analyze.Rd diff --git a/NAMESPACE b/NAMESPACE index 7482eaf..41009d9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(analyze) export(rank_genes) export(run_app) import(data.table) diff --git a/R/analyze.R b/R/analyze.R new file mode 100644 index 0000000..dec0570 --- /dev/null +++ b/R/analyze.R @@ -0,0 +1,81 @@ +#' Analyze the provided expression data for ubiquitously expressed genes. +#' +#' @param data A `data.table` in normalized, long format. There should be a +#' `gene` column containing Ensembl gene IDs, a `sample` column containing +#' abitrary sample identifiers that are unique per sample and an `expression` +#' column containing the actual expression value for each given combination +#' of gene and sample. +#' +#' @return A `data.table` containing all computed values per gene. +#' +#' @export +analyze <- function(data) { + data[, `:=`( + expression_median = median(expression), + expression_95 = quantile(expression, probs = 0.95) + ), by = sample] + + # Transform the expression logarithmically. The samples that don't express a + # gene at all will be left out intentionally. + data[expression > 0, expression_log := log2(expression)] + + results <- data[, .( + median_expression = median(expression[expression > 0]), + iqr_expression = IQR(expression[expression > 0]), + mean_expression = mean(expression[expression > 0]), + sd_expression = sd(expression[expression > 0]), + median_expression_normalized = median(expression_log, na.rm = TRUE), + iqr_expression_normalized = IQR(expression_log, na.rm = TRUE), + mean_expression_normalized = mean(expression_log, na.rm = TRUE), + sd_expression_normalized = sd(expression_log, na.rm = TRUE), + above_zero = mean(expression > 0.0), + above_threshold = mean(expression > 50.0), + above_median = mean(expression > expression_median), + above_95 = mean(expression > expression_95) + ), by = "gene"] + + results[, `:=`( + qcv_expression = iqr_expression / median_expression, + qcv_expression_normalized = + iqr_expression_normalized / median_expression_normalized, + cv_expression = sd_expression / mean_expression, + cv_expression_normalized = + sd_expression_normalized / mean_expression_normalized + )] + + # Normalize values to the range of 0.0 to 1.0. + results[, `:=`( + median_expression_normalized = + (median_expression_normalized - + min(median_expression_normalized, na.rm = TRUE)) / + (max(median_expression_normalized, na.rm = TRUE) - + min(median_expression_normalized, na.rm = TRUE)), + iqr_expression_normalized = + (iqr_expression_normalized - + min(iqr_expression_normalized, na.rm = TRUE)) / + (max(iqr_expression_normalized, na.rm = TRUE) - + min(iqr_expression_normalized, na.rm = TRUE)), + qcv_expression_normalized = + (qcv_expression_normalized - + min(qcv_expression_normalized, na.rm = TRUE)) / + (max(qcv_expression_normalized, na.rm = TRUE) - + min(qcv_expression_normalized, na.rm = TRUE)), + mean_expression_normalized = + (mean_expression_normalized - + min(mean_expression_normalized, na.rm = TRUE)) / + (max(mean_expression_normalized, na.rm = TRUE) - + min(mean_expression_normalized, na.rm = TRUE)), + sd_expression_normalized = + (sd_expression_normalized - + min(sd_expression_normalized, na.rm = TRUE)) / + (max(sd_expression_normalized, na.rm = TRUE) - + min(sd_expression_normalized, na.rm = TRUE)), + cv_expression_normalized = + (cv_expression_normalized - + min(cv_expression_normalized, na.rm = TRUE)) / + (max(cv_expression_normalized, na.rm = TRUE) - + min(cv_expression_normalized, na.rm = TRUE)) + )] + + results +} diff --git a/man/analyze.Rd b/man/analyze.Rd new file mode 100644 index 0000000..702b0ac --- /dev/null +++ b/man/analyze.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/analyze.R +\name{analyze} +\alias{analyze} +\title{Analyze the provided expression data for ubiquitously expressed genes.} +\usage{ +analyze(data) +} +\arguments{ +\item{data}{A \code{data.table} in normalized, long format. There should be a +\code{gene} column containing Ensembl gene IDs, a \code{sample} column containing +abitrary sample identifiers that are unique per sample and an \code{expression} +column containing the actual expression value for each given combination +of gene and sample.} +} +\value{ +A \code{data.table} containing all computed values per gene. +} +\description{ +Analyze the provided expression data for ubiquitously expressed genes. +} diff --git a/man/genes.Rd b/man/genes.Rd index 6a09d2c..0c06c32 100644 --- a/man/genes.Rd +++ b/man/genes.Rd @@ -5,7 +5,7 @@ \alias{genes} \title{A \code{data.table} containig data on genes and their expression behavior.} \format{ -An object of class \code{data.table} (inherits from \code{data.frame}) with 56156 rows and 20 columns. +An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 20 columns. } \usage{ genes diff --git a/scripts/analyze.R b/scripts/analyze.R index 34f1845..1429225 100644 --- a/scripts/analyze.R +++ b/scripts/analyze.R @@ -8,72 +8,5 @@ library(here) i_am("scripts/input.R") data <- fread(here("scripts", "input", "data_long.csv")) - -data[, `:=`( - expression_median = median(expression), - expression_95 = quantile(expression, probs = 0.95) -), by = sample] - -# Transform the expression logarithmically. The samples that don't express a -# gene at all will be left out intentionally. -data[expression > 0, expression_log := log2(expression)] - -results <- data[, .( - median_expression = median(expression[expression > 0]), - iqr_expression = IQR(expression[expression > 0]), - mean_expression = mean(expression[expression > 0]), - sd_expression = sd(expression[expression > 0]), - median_expression_normalized = median(expression_log, na.rm = TRUE), - iqr_expression_normalized = IQR(expression_log, na.rm = TRUE), - mean_expression_normalized = mean(expression_log, na.rm = TRUE), - sd_expression_normalized = sd(expression_log, na.rm = TRUE), - above_zero = mean(expression > 0.0), - above_threshold = mean(expression > 50.0), - above_median = mean(expression > expression_median), - above_95 = mean(expression > expression_95) -), by = "gene"] - -results[, `:=`( - qcv_expression = iqr_expression / median_expression, - qcv_expression_normalized = - iqr_expression_normalized / median_expression_normalized, - cv_expression = sd_expression / mean_expression, - cv_expression_normalized = - sd_expression_normalized / mean_expression_normalized -)] - -# Normalize values to the range of 0.0 to 1.0. -results[, `:=`( - median_expression_normalized = - (median_expression_normalized - - min(median_expression_normalized, na.rm = TRUE)) / - (max(median_expression_normalized, na.rm = TRUE) - - min(median_expression_normalized, na.rm = TRUE)), - iqr_expression_normalized = - (iqr_expression_normalized - - min(iqr_expression_normalized, na.rm = TRUE)) / - (max(iqr_expression_normalized, na.rm = TRUE) - - min(iqr_expression_normalized, na.rm = TRUE)), - qcv_expression_normalized = - (qcv_expression_normalized - - min(qcv_expression_normalized, na.rm = TRUE)) / - (max(qcv_expression_normalized, na.rm = TRUE) - - min(qcv_expression_normalized, na.rm = TRUE)), - mean_expression_normalized = - (mean_expression_normalized - - min(mean_expression_normalized, na.rm = TRUE)) / - (max(mean_expression_normalized, na.rm = TRUE) - - min(mean_expression_normalized, na.rm = TRUE)), - sd_expression_normalized = - (sd_expression_normalized - - min(sd_expression_normalized, na.rm = TRUE)) / - (max(sd_expression_normalized, na.rm = TRUE) - - min(sd_expression_normalized, na.rm = TRUE)), - cv_expression_normalized = - (cv_expression_normalized - - min(cv_expression_normalized, na.rm = TRUE)) / - (max(cv_expression_normalized, na.rm = TRUE) - - min(cv_expression_normalized, na.rm = TRUE)) -)] - +results <- ubigen::analyze(data) fwrite(results, file = here("scripts", "output", "results.csv"))