From a6f0a64c2c32cce623732777cd93188d548db5f1 Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Wed, 30 Nov 2022 14:49:42 +0100
Subject: [PATCH] Use package's ranking function for included data

---
 R/ranking.R       | 10 +++++++---
 man/rank_genes.Rd |  5 +++++
 scripts/ranking.R | 19 ++++++-------------
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/R/ranking.R b/R/ranking.R
index dba5618..2f71468 100644
--- a/R/ranking.R
+++ b/R/ranking.R
@@ -3,22 +3,26 @@
 #' This function will compute a weighted average across multiple metrics that
 #' define how ubiquitous a gene is based on its expression across samples.
 #'
+#' @param data The input data to use. This should either be the result of a
+#'   previous call to this function or the return value of [analyze()].
+#'
 #' @return A `data.table` with gene data as well as the scores, ranks and
 #'   percentiles for each gene.
 #'
 #' @export
-rank_genes <- function(cross_sample_metric = "above_95",
+rank_genes <- function(data = ubigen::genes,
+                       cross_sample_metric = "above_95",
                        cross_sample_weight = 0.5,
                        level_metric = "median_expression_normalized",
                        level_weight = 0.25,
                        variation_metric = "qcv_expression_normalized",
                        variation_weight = -0.25) {
+  data <- copy(data)
+
   total_weight <- abs(cross_sample_weight) +
     abs(level_weight) +
     abs(variation_weight)
 
-  data <- copy(ubigen::genes)
-
   data[, score :=
     (cross_sample_weight * get(cross_sample_metric) +
       level_weight * get(level_metric) +
diff --git a/man/rank_genes.Rd b/man/rank_genes.Rd
index 68de9a6..e7f078f 100644
--- a/man/rank_genes.Rd
+++ b/man/rank_genes.Rd
@@ -5,6 +5,7 @@
 \title{Rank genes based on how ubiquitous they are.}
 \usage{
 rank_genes(
+  data = ubigen::genes,
   cross_sample_metric = "above_95",
   cross_sample_weight = 0.5,
   level_metric = "median_expression_normalized",
@@ -13,6 +14,10 @@ rank_genes(
   variation_weight = -0.25
 )
 }
+\arguments{
+\item{data}{The input data to use. This should either be the result of a
+previous call to this function or the return value of \code{\link[=analyze]{analyze()}}.}
+}
 \value{
 A \code{data.table} with gene data as well as the scores, ranks and
 percentiles for each gene.
diff --git a/scripts/ranking.R b/scripts/ranking.R
index ea1ea2c..a96c1cb 100644
--- a/scripts/ranking.R
+++ b/scripts/ranking.R
@@ -6,22 +6,14 @@ library(here)
 
 i_am("scripts/input.R")
 
+# To save memory, the results table refers to genes by placeholder IDs. The
+# actual Ensembl IDs are stored in the separate genes table.
+
 genes <- fread(here("scripts", "input", "genes.csv"))
 data <- fread(here("scripts", "output", "results.csv"))
 
-data[, score := 0.5 * above_95 +
-  0.25 * median_expression_normalized +
-  -0.25 * qcv_expression_normalized]
-
-# Normalize scores to be between 0.0 and 1.0.
-data[, score := (score - min(score, na.rm = TRUE)) /
-  (max(score, na.rm = TRUE) - min(score, na.rm = TRUE))]
-
-# These are genes that are not expressed at all or expressed just once, in case
-# the standard deviation is used in the score.
-data[is.na(score), score := 0.0]
-
-setorder(data, -score)
+# Rank the data using default parameters.
+data <- ubigen::rank_genes(data = data)
 
 # Reintroduce gene IDs and HGNC symbols.
 
@@ -41,6 +33,7 @@ data[, id := NULL]
 # Remove duplicates. This will keep the best row for each duplicated gene.
 data <- unique(data, by = "gene")
 
+# Reassign ranks, because removing duplicates may have left gaps.
 data[, rank := .I]
 
 fwrite(data, file = here("scripts", "output", "genes.csv"))