From a6f0a64c2c32cce623732777cd93188d548db5f1 Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Wed, 30 Nov 2022 14:49:42 +0100 Subject: [PATCH] Use package's ranking function for included data --- R/ranking.R | 10 +++++++--- man/rank_genes.Rd | 5 +++++ scripts/ranking.R | 19 ++++++------------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/R/ranking.R b/R/ranking.R index dba5618..2f71468 100644 --- a/R/ranking.R +++ b/R/ranking.R @@ -3,22 +3,26 @@ #' This function will compute a weighted average across multiple metrics that #' define how ubiquitous a gene is based on its expression across samples. #' +#' @param data The input data to use. This should either be the result of a +#' previous call to this function or the return value of [analyze()]. +#' #' @return A `data.table` with gene data as well as the scores, ranks and #' percentiles for each gene. #' #' @export -rank_genes <- function(cross_sample_metric = "above_95", +rank_genes <- function(data = ubigen::genes, + cross_sample_metric = "above_95", cross_sample_weight = 0.5, level_metric = "median_expression_normalized", level_weight = 0.25, variation_metric = "qcv_expression_normalized", variation_weight = -0.25) { + data <- copy(data) + total_weight <- abs(cross_sample_weight) + abs(level_weight) + abs(variation_weight) - data <- copy(ubigen::genes) - data[, score := (cross_sample_weight * get(cross_sample_metric) + level_weight * get(level_metric) + diff --git a/man/rank_genes.Rd b/man/rank_genes.Rd index 68de9a6..e7f078f 100644 --- a/man/rank_genes.Rd +++ b/man/rank_genes.Rd @@ -5,6 +5,7 @@ \title{Rank genes based on how ubiquitous they are.} \usage{ rank_genes( + data = ubigen::genes, cross_sample_metric = "above_95", cross_sample_weight = 0.5, level_metric = "median_expression_normalized", @@ -13,6 +14,10 @@ rank_genes( variation_weight = -0.25 ) } +\arguments{ +\item{data}{The input data to use. This should either be the result of a +previous call to this function or the return value of \code{\link[=analyze]{analyze()}}.} +} \value{ A \code{data.table} with gene data as well as the scores, ranks and percentiles for each gene. diff --git a/scripts/ranking.R b/scripts/ranking.R index ea1ea2c..a96c1cb 100644 --- a/scripts/ranking.R +++ b/scripts/ranking.R @@ -6,22 +6,14 @@ library(here) i_am("scripts/input.R") +# To save memory, the data includes fake IDs for genes. The actual Ensembl IDs +# are part of the separate genes table. + genes <- fread(here("scripts", "input", "genes.csv")) data <- fread(here("scripts", "output", "results.csv")) -data[, score := 0.5 * above_95 + - 0.25 * median_expression_normalized + - -0.25 * qcv_expression_normalized] - -# Normalize scores to be between 0.0 and 1.0. -data[, score := (score - min(score, na.rm = TRUE)) / - (max(score, na.rm = TRUE) - min(score, na.rm = TRUE))] - -# These are genes that are not expressed at all or expressed just once, in case -# the standard deviation is used in the score. -data[is.na(score), score := 0.0] - -setorder(data, -score) +# Rank the data using default parameters. +data <- ubigen::rank_genes(data = data) # Reintroduce gene IDs and HGNC symbols. @@ -41,6 +33,7 @@ data[, id := NULL] # Remove duplicates. This will keep the best row for each duplicated gene. data <- unique(data, by = "gene") +# Reassign ranks, because duplicates may have been removed. data[, rank := .I] fwrite(data, file = here("scripts", "output", "genes.csv"))