Move analysis into the package

This commit is contained in:
Elias Projahn 2022-11-30 14:32:40 +01:00
parent d25bb424b1
commit 698ea5086a
5 changed files with 105 additions and 69 deletions

View file

@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand # Generated by roxygen2: do not edit by hand
export(analyze)
export(rank_genes) export(rank_genes)
export(run_app) export(run_app)
import(data.table) import(data.table)

81
R/analyze.R Normal file
View file

@ -0,0 +1,81 @@
#' Analyze the provided expression data for ubiquitously expressed genes.
#'
#' @param data A `data.table` in normalized, long format. There should be a
#' `gene` column containing Ensembl gene IDs, a `sample` column containing
#' abitrary sample identifiers that are unique per sample and an `expression`
#' column containing the actual expression value for each given combination
#' of gene and sample.
#'
#' @return A `data.table` containing all computed values per gene.
#'
#' @export
analyze <- function(data) {
data[, `:=`(
expression_median = median(expression),
expression_95 = quantile(expression, probs = 0.95)
), by = sample]
# Transform the expression logarithmically. The samples that don't express a
# gene at all will be left out intentionally.
data[expression > 0, expression_log := log2(expression)]
results <- data[, .(
median_expression = median(expression[expression > 0]),
iqr_expression = IQR(expression[expression > 0]),
mean_expression = mean(expression[expression > 0]),
sd_expression = sd(expression[expression > 0]),
median_expression_normalized = median(expression_log, na.rm = TRUE),
iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
mean_expression_normalized = mean(expression_log, na.rm = TRUE),
sd_expression_normalized = sd(expression_log, na.rm = TRUE),
above_zero = mean(expression > 0.0),
above_threshold = mean(expression > 50.0),
above_median = mean(expression > expression_median),
above_95 = mean(expression > expression_95)
), by = "gene"]
results[, `:=`(
qcv_expression = iqr_expression / median_expression,
qcv_expression_normalized =
iqr_expression_normalized / median_expression_normalized,
cv_expression = sd_expression / mean_expression,
cv_expression_normalized =
sd_expression_normalized / mean_expression_normalized
)]
# Normalize values to the range of 0.0 to 1.0.
results[, `:=`(
median_expression_normalized =
(median_expression_normalized -
min(median_expression_normalized, na.rm = TRUE)) /
(max(median_expression_normalized, na.rm = TRUE) -
min(median_expression_normalized, na.rm = TRUE)),
iqr_expression_normalized =
(iqr_expression_normalized -
min(iqr_expression_normalized, na.rm = TRUE)) /
(max(iqr_expression_normalized, na.rm = TRUE) -
min(iqr_expression_normalized, na.rm = TRUE)),
qcv_expression_normalized =
(qcv_expression_normalized -
min(qcv_expression_normalized, na.rm = TRUE)) /
(max(qcv_expression_normalized, na.rm = TRUE) -
min(qcv_expression_normalized, na.rm = TRUE)),
mean_expression_normalized =
(mean_expression_normalized -
min(mean_expression_normalized, na.rm = TRUE)) /
(max(mean_expression_normalized, na.rm = TRUE) -
min(mean_expression_normalized, na.rm = TRUE)),
sd_expression_normalized =
(sd_expression_normalized -
min(sd_expression_normalized, na.rm = TRUE)) /
(max(sd_expression_normalized, na.rm = TRUE) -
min(sd_expression_normalized, na.rm = TRUE)),
cv_expression_normalized =
(cv_expression_normalized -
min(cv_expression_normalized, na.rm = TRUE)) /
(max(cv_expression_normalized, na.rm = TRUE) -
min(cv_expression_normalized, na.rm = TRUE))
)]
results
}

21
man/analyze.Rd Normal file
View file

@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/analyze.R
\name{analyze}
\alias{analyze}
\title{Analyze the provided expression data for ubiquitously expressed genes.}
\usage{
analyze(data)
}
\arguments{
\item{data}{A \code{data.table} in normalized, long format. There should be a
\code{gene} column containing Ensembl gene IDs, a \code{sample} column containing
abitrary sample identifiers that are unique per sample and an \code{expression}
column containing the actual expression value for each given combination
of gene and sample.}
}
\value{
A \code{data.table} containing all computed values per gene.
}
\description{
Analyze the provided expression data for ubiquitously expressed genes.
}

View file

@ -5,7 +5,7 @@
\alias{genes} \alias{genes}
\title{A \code{data.table} containig data on genes and their expression behavior.} \title{A \code{data.table} containig data on genes and their expression behavior.}
\format{ \format{
An object of class \code{data.table} (inherits from \code{data.frame}) with 56156 rows and 20 columns. An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 20 columns.
} }
\usage{ \usage{
genes genes

View file

@ -8,72 +8,5 @@ library(here)
i_am("scripts/input.R") i_am("scripts/input.R")
data <- fread(here("scripts", "input", "data_long.csv")) data <- fread(here("scripts", "input", "data_long.csv"))
results <- ubigen::analyze(data)
data[, `:=`(
expression_median = median(expression),
expression_95 = quantile(expression, probs = 0.95)
), by = sample]
# Transform the expression logarithmically. The samples that don't express a
# gene at all will be left out intentionally.
data[expression > 0, expression_log := log2(expression)]
results <- data[, .(
median_expression = median(expression[expression > 0]),
iqr_expression = IQR(expression[expression > 0]),
mean_expression = mean(expression[expression > 0]),
sd_expression = sd(expression[expression > 0]),
median_expression_normalized = median(expression_log, na.rm = TRUE),
iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
mean_expression_normalized = mean(expression_log, na.rm = TRUE),
sd_expression_normalized = sd(expression_log, na.rm = TRUE),
above_zero = mean(expression > 0.0),
above_threshold = mean(expression > 50.0),
above_median = mean(expression > expression_median),
above_95 = mean(expression > expression_95)
), by = "gene"]
results[, `:=`(
qcv_expression = iqr_expression / median_expression,
qcv_expression_normalized =
iqr_expression_normalized / median_expression_normalized,
cv_expression = sd_expression / mean_expression,
cv_expression_normalized =
sd_expression_normalized / mean_expression_normalized
)]
# Normalize values to the range of 0.0 to 1.0.
results[, `:=`(
median_expression_normalized =
(median_expression_normalized -
min(median_expression_normalized, na.rm = TRUE)) /
(max(median_expression_normalized, na.rm = TRUE) -
min(median_expression_normalized, na.rm = TRUE)),
iqr_expression_normalized =
(iqr_expression_normalized -
min(iqr_expression_normalized, na.rm = TRUE)) /
(max(iqr_expression_normalized, na.rm = TRUE) -
min(iqr_expression_normalized, na.rm = TRUE)),
qcv_expression_normalized =
(qcv_expression_normalized -
min(qcv_expression_normalized, na.rm = TRUE)) /
(max(qcv_expression_normalized, na.rm = TRUE) -
min(qcv_expression_normalized, na.rm = TRUE)),
mean_expression_normalized =
(mean_expression_normalized -
min(mean_expression_normalized, na.rm = TRUE)) /
(max(mean_expression_normalized, na.rm = TRUE) -
min(mean_expression_normalized, na.rm = TRUE)),
sd_expression_normalized =
(sd_expression_normalized -
min(sd_expression_normalized, na.rm = TRUE)) /
(max(sd_expression_normalized, na.rm = TRUE) -
min(sd_expression_normalized, na.rm = TRUE)),
cv_expression_normalized =
(cv_expression_normalized -
min(cv_expression_normalized, na.rm = TRUE)) /
(max(cv_expression_normalized, na.rm = TRUE) -
min(cv_expression_normalized, na.rm = TRUE))
)]
fwrite(results, file = here("scripts", "output", "results.csv")) fwrite(results, file = here("scripts", "output", "results.csv"))