mirror of
https://github.com/johrpan/ubigen.git
synced 2025-10-26 19:57:24 +01:00
Move analysis into the package
This commit is contained in:
parent
d25bb424b1
commit
698ea5086a
5 changed files with 105 additions and 69 deletions
|
|
@ -1,5 +1,6 @@
|
||||||
# Generated by roxygen2: do not edit by hand
|
# Generated by roxygen2: do not edit by hand
|
||||||
|
|
||||||
|
export(analyze)
|
||||||
export(rank_genes)
|
export(rank_genes)
|
||||||
export(run_app)
|
export(run_app)
|
||||||
import(data.table)
|
import(data.table)
|
||||||
|
|
|
||||||
81
R/analyze.R
Normal file
81
R/analyze.R
Normal file
|
|
@ -0,0 +1,81 @@
|
||||||
|
#' Analyze the provided expression data for ubiquitously expressed genes.
|
||||||
|
#'
|
||||||
|
#' @param data A `data.table` in normalized, long format. There should be a
|
||||||
|
#' `gene` column containing Ensembl gene IDs, a `sample` column containing
|
||||||
|
#' abitrary sample identifiers that are unique per sample and an `expression`
|
||||||
|
#' column containing the actual expression value for each given combination
|
||||||
|
#' of gene and sample.
|
||||||
|
#'
|
||||||
|
#' @return A `data.table` containing all computed values per gene.
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
analyze <- function(data) {
|
||||||
|
data[, `:=`(
|
||||||
|
expression_median = median(expression),
|
||||||
|
expression_95 = quantile(expression, probs = 0.95)
|
||||||
|
), by = sample]
|
||||||
|
|
||||||
|
# Transform the expression logarithmically. The samples that don't express a
|
||||||
|
# gene at all will be left out intentionally.
|
||||||
|
data[expression > 0, expression_log := log2(expression)]
|
||||||
|
|
||||||
|
results <- data[, .(
|
||||||
|
median_expression = median(expression[expression > 0]),
|
||||||
|
iqr_expression = IQR(expression[expression > 0]),
|
||||||
|
mean_expression = mean(expression[expression > 0]),
|
||||||
|
sd_expression = sd(expression[expression > 0]),
|
||||||
|
median_expression_normalized = median(expression_log, na.rm = TRUE),
|
||||||
|
iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
|
||||||
|
mean_expression_normalized = mean(expression_log, na.rm = TRUE),
|
||||||
|
sd_expression_normalized = sd(expression_log, na.rm = TRUE),
|
||||||
|
above_zero = mean(expression > 0.0),
|
||||||
|
above_threshold = mean(expression > 50.0),
|
||||||
|
above_median = mean(expression > expression_median),
|
||||||
|
above_95 = mean(expression > expression_95)
|
||||||
|
), by = "gene"]
|
||||||
|
|
||||||
|
results[, `:=`(
|
||||||
|
qcv_expression = iqr_expression / median_expression,
|
||||||
|
qcv_expression_normalized =
|
||||||
|
iqr_expression_normalized / median_expression_normalized,
|
||||||
|
cv_expression = sd_expression / mean_expression,
|
||||||
|
cv_expression_normalized =
|
||||||
|
sd_expression_normalized / mean_expression_normalized
|
||||||
|
)]
|
||||||
|
|
||||||
|
# Normalize values to the range of 0.0 to 1.0.
|
||||||
|
results[, `:=`(
|
||||||
|
median_expression_normalized =
|
||||||
|
(median_expression_normalized -
|
||||||
|
min(median_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(median_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(median_expression_normalized, na.rm = TRUE)),
|
||||||
|
iqr_expression_normalized =
|
||||||
|
(iqr_expression_normalized -
|
||||||
|
min(iqr_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(iqr_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(iqr_expression_normalized, na.rm = TRUE)),
|
||||||
|
qcv_expression_normalized =
|
||||||
|
(qcv_expression_normalized -
|
||||||
|
min(qcv_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(qcv_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(qcv_expression_normalized, na.rm = TRUE)),
|
||||||
|
mean_expression_normalized =
|
||||||
|
(mean_expression_normalized -
|
||||||
|
min(mean_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(mean_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(mean_expression_normalized, na.rm = TRUE)),
|
||||||
|
sd_expression_normalized =
|
||||||
|
(sd_expression_normalized -
|
||||||
|
min(sd_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(sd_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(sd_expression_normalized, na.rm = TRUE)),
|
||||||
|
cv_expression_normalized =
|
||||||
|
(cv_expression_normalized -
|
||||||
|
min(cv_expression_normalized, na.rm = TRUE)) /
|
||||||
|
(max(cv_expression_normalized, na.rm = TRUE) -
|
||||||
|
min(cv_expression_normalized, na.rm = TRUE))
|
||||||
|
)]
|
||||||
|
|
||||||
|
results
|
||||||
|
}
|
||||||
21
man/analyze.Rd
Normal file
21
man/analyze.Rd
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/analyze.R
|
||||||
|
\name{analyze}
|
||||||
|
\alias{analyze}
|
||||||
|
\title{Analyze the provided expression data for ubiquitously expressed genes.}
|
||||||
|
\usage{
|
||||||
|
analyze(data)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{A \code{data.table} in normalized, long format. There should be a
|
||||||
|
\code{gene} column containing Ensembl gene IDs, a \code{sample} column containing
|
||||||
|
abitrary sample identifiers that are unique per sample and an \code{expression}
|
||||||
|
column containing the actual expression value for each given combination
|
||||||
|
of gene and sample.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
A \code{data.table} containing all computed values per gene.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Analyze the provided expression data for ubiquitously expressed genes.
|
||||||
|
}
|
||||||
|
|
@ -5,7 +5,7 @@
|
||||||
\alias{genes}
|
\alias{genes}
|
||||||
\title{A \code{data.table} containig data on genes and their expression behavior.}
|
\title{A \code{data.table} containig data on genes and their expression behavior.}
|
||||||
\format{
|
\format{
|
||||||
An object of class \code{data.table} (inherits from \code{data.frame}) with 56156 rows and 20 columns.
|
An object of class \code{data.table} (inherits from \code{data.frame}) with 55507 rows and 20 columns.
|
||||||
}
|
}
|
||||||
\usage{
|
\usage{
|
||||||
genes
|
genes
|
||||||
|
|
|
||||||
|
|
@ -8,72 +8,5 @@ library(here)
|
||||||
i_am("scripts/input.R")
|
i_am("scripts/input.R")
|
||||||
|
|
||||||
data <- fread(here("scripts", "input", "data_long.csv"))
|
data <- fread(here("scripts", "input", "data_long.csv"))
|
||||||
|
results <- ubigen::analyze(data)
|
||||||
data[, `:=`(
|
|
||||||
expression_median = median(expression),
|
|
||||||
expression_95 = quantile(expression, probs = 0.95)
|
|
||||||
), by = sample]
|
|
||||||
|
|
||||||
# Transform the expression logarithmically. The samples that don't express a
|
|
||||||
# gene at all will be left out intentionally.
|
|
||||||
data[expression > 0, expression_log := log2(expression)]
|
|
||||||
|
|
||||||
results <- data[, .(
|
|
||||||
median_expression = median(expression[expression > 0]),
|
|
||||||
iqr_expression = IQR(expression[expression > 0]),
|
|
||||||
mean_expression = mean(expression[expression > 0]),
|
|
||||||
sd_expression = sd(expression[expression > 0]),
|
|
||||||
median_expression_normalized = median(expression_log, na.rm = TRUE),
|
|
||||||
iqr_expression_normalized = IQR(expression_log, na.rm = TRUE),
|
|
||||||
mean_expression_normalized = mean(expression_log, na.rm = TRUE),
|
|
||||||
sd_expression_normalized = sd(expression_log, na.rm = TRUE),
|
|
||||||
above_zero = mean(expression > 0.0),
|
|
||||||
above_threshold = mean(expression > 50.0),
|
|
||||||
above_median = mean(expression > expression_median),
|
|
||||||
above_95 = mean(expression > expression_95)
|
|
||||||
), by = "gene"]
|
|
||||||
|
|
||||||
results[, `:=`(
|
|
||||||
qcv_expression = iqr_expression / median_expression,
|
|
||||||
qcv_expression_normalized =
|
|
||||||
iqr_expression_normalized / median_expression_normalized,
|
|
||||||
cv_expression = sd_expression / mean_expression,
|
|
||||||
cv_expression_normalized =
|
|
||||||
sd_expression_normalized / mean_expression_normalized
|
|
||||||
)]
|
|
||||||
|
|
||||||
# Normalize values to the range of 0.0 to 1.0.
|
|
||||||
results[, `:=`(
|
|
||||||
median_expression_normalized =
|
|
||||||
(median_expression_normalized -
|
|
||||||
min(median_expression_normalized, na.rm = TRUE)) /
|
|
||||||
(max(median_expression_normalized, na.rm = TRUE) -
|
|
||||||
min(median_expression_normalized, na.rm = TRUE)),
|
|
||||||
iqr_expression_normalized =
|
|
||||||
(iqr_expression_normalized -
|
|
||||||
min(iqr_expression_normalized, na.rm = TRUE)) /
|
|
||||||
(max(iqr_expression_normalized, na.rm = TRUE) -
|
|
||||||
min(iqr_expression_normalized, na.rm = TRUE)),
|
|
||||||
qcv_expression_normalized =
|
|
||||||
(qcv_expression_normalized -
|
|
||||||
min(qcv_expression_normalized, na.rm = TRUE)) /
|
|
||||||
(max(qcv_expression_normalized, na.rm = TRUE) -
|
|
||||||
min(qcv_expression_normalized, na.rm = TRUE)),
|
|
||||||
mean_expression_normalized =
|
|
||||||
(mean_expression_normalized -
|
|
||||||
min(mean_expression_normalized, na.rm = TRUE)) /
|
|
||||||
(max(mean_expression_normalized, na.rm = TRUE) -
|
|
||||||
min(mean_expression_normalized, na.rm = TRUE)),
|
|
||||||
sd_expression_normalized =
|
|
||||||
(sd_expression_normalized -
|
|
||||||
min(sd_expression_normalized, na.rm = TRUE)) /
|
|
||||||
(max(sd_expression_normalized, na.rm = TRUE) -
|
|
||||||
min(sd_expression_normalized, na.rm = TRUE)),
|
|
||||||
cv_expression_normalized =
|
|
||||||
(cv_expression_normalized -
|
|
||||||
min(cv_expression_normalized, na.rm = TRUE)) /
|
|
||||||
(max(cv_expression_normalized, na.rm = TRUE) -
|
|
||||||
min(cv_expression_normalized, na.rm = TRUE))
|
|
||||||
)]
|
|
||||||
|
|
||||||
fwrite(results, file = here("scripts", "output", "results.csv"))
|
fwrite(results, file = here("scripts", "output", "results.csv"))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue