Use package's ranking function for included data

This commit is contained in:
Elias Projahn 2022-11-30 14:49:42 +01:00
parent 698ea5086a
commit a6f0a64c2c
3 changed files with 18 additions and 16 deletions

View file

@ -3,22 +3,26 @@
#' This function will compute a weighted average across multiple metrics that
#' define how ubiquitous a gene is based on its expression across samples.
#'
#' @param data The input data to use. This should either be the result of a
#' previous call to this function or the return value of [analyze()].
#'
#' @return A `data.table` with gene data as well as the scores, ranks and
#' percentiles for each gene.
#'
#' @export
rank_genes <- function(cross_sample_metric = "above_95",
rank_genes <- function(data = ubigen::genes,
cross_sample_metric = "above_95",
cross_sample_weight = 0.5,
level_metric = "median_expression_normalized",
level_weight = 0.25,
variation_metric = "qcv_expression_normalized",
variation_weight = -0.25) {
data <- copy(data)
total_weight <- abs(cross_sample_weight) +
abs(level_weight) +
abs(variation_weight)
data <- copy(ubigen::genes)
data[, score :=
(cross_sample_weight * get(cross_sample_metric) +
level_weight * get(level_metric) +

View file

@ -5,6 +5,7 @@
\title{Rank genes based on how ubiquitous they are.}
\usage{
rank_genes(
data = ubigen::genes,
cross_sample_metric = "above_95",
cross_sample_weight = 0.5,
level_metric = "median_expression_normalized",
@ -13,6 +14,10 @@ rank_genes(
variation_weight = -0.25
)
}
\arguments{
\item{data}{The input data to use. This should either be the result of a
previous call to this function or the return value of \code{\link[=analyze]{analyze()}}.}
}
\value{
A \code{data.table} with gene data as well as the scores, ranks and
percentiles for each gene.

View file

@ -6,22 +6,14 @@ library(here)
i_am("scripts/input.R")
# To save memory, the data includes fake IDs for genes. The actual Ensembl IDs
# are part of the separate genes table.
genes <- fread(here("scripts", "input", "genes.csv"))
data <- fread(here("scripts", "output", "results.csv"))
data[, score := 0.5 * above_95 +
0.25 * median_expression_normalized +
-0.25 * qcv_expression_normalized]
# Normalize scores to be between 0.0 and 1.0.
data[, score := (score - min(score, na.rm = TRUE)) /
(max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
# Missing scores occur for genes that are not expressed at all or expressed
# just once, in case the standard deviation is used in the score.
data[is.na(score), score := 0.0]
setorder(data, -score)
# Rank the data using default parameters.
data <- ubigen::rank_genes(data = data)
# Reintroduce gene IDs and HGNC symbols.
@ -41,6 +33,7 @@ data[, id := NULL]
# Remove duplicates. This will keep the best row for each duplicated gene.
data <- unique(data, by = "gene")
# Reassign ranks, because duplicates may have been removed.
data[, rank := .I]
fwrite(data, file = here("scripts", "output", "genes.csv"))