validate: Update and extend

2025-10-26 10:47:25 +01:00 · 2022-01-26 11:38:39 +01:00 · 2022-01-26 11:38:39 +01:00 · 3cedc4fea4
commit 3cedc4fea4
parent 016a9ada9d
2 changed files with 106 additions and 80 deletions
--- a/R/validate.R
+++ b/R/validate.R
@ -1,80 +1,94 @@
-#' Perform cross-validation for the analysis.
+#' Perform cross-validation for the ranking.
 #'
-#' This function reoptimizes the analysis leaving out one of the original
+#' This function reoptimizes the ranking leaving out one of the original
 #' reference genes at a time.
 #'
-#' @param analysis The analysis to validate.
+#' @param ranking The ranking to validate.
 #' @param reference_gene_ids The reference gene IDs whose ranking should be
 #'   validated.
 #' @param method_ids IDs of the methods that were used.
 #' @param progress An optional progress function that should accept a single
 #'   value between 0.0 and 1.0 for progress information.
 #'
-#' @returns An object containing the mean absolute error and the mean percent
+#' @returns A validation object with the following items:
-#'   rank for the original analysis as well as the validation.
+#'   \describe{
 #'     \item{`validation`}{A `data.table` containing percentiles of the
 #'       comparison genes from the original ranking as well as their validation.
 #'     }
 #'     \item{`mean_score`}{The mean score of the genes.}
 #'     \item{`mean_percentile_original`}{The mean percentile of the genes in
 #'       the original ranking.
 #'     }
 #'     \item{`mean_percentile_validation`}{The mean percentile of the genes
 #'       when optimizing without themselves.
 #'     }
 #'     \item{`mean_error`}{The mean absolute error.}
 #'   }
 #'
 #' @export
-validate <- function(analysis, progress = NULL) {
+validate <- function(ranking, reference_gene_ids, method_ids, progress = NULL) {
-    if (inherits(analysis, "geposan_analysis")) {
+    if (!inherits(ranking, "geposan_ranking")) {
-        stop("Analysis is invalid. Use geposan::analyze().")
+        stop("Ranking is invalid. Use geposan::ranking().")
    }
-    cached("validation", analysis$preset, {
+    if (is.null(progress)) {
-        reference_gene_ids <- analysis$preset$reference_gene_ids
+        progress_bar <- progress::progress_bar$new()
-        n_references <- length(reference_gene_ids)
+        progress_bar$update(0.0)
        methods <- analysis$preset$methods
        ranking_reference <- analysis$ranking
        n_ranks <- nrow(ranking_reference)
-        mean_error_reference <- mean(
+        progress <- function(progress_value) {
-            1.0 - ranking_reference[gene %chin% reference_gene_ids, score]
+            if (!progress_bar$finished) {
-        )
+                progress_bar$update(progress_value)
-
+                if (progress_value >= 1.0) {
-        mean_rank_reference <- mean(
+                    progress_bar$terminate()
-            1.0 - ranking_reference[gene %chin% reference_gene_ids, rank] /
+                }
                n_ranks
        )
        mean_error_validation <- 0.0
        mean_rank_validation <- 0.0
        progress_state <- 0.0
        progress_step <- 1.0 / n_references
        for (validation_gene_id in reference_gene_ids) {
            included_gene_ids <- reference_gene_ids[
                reference_gene_ids != validation_gene_id
            ]
            weights <- optimal_weights(
                analysis,
                methods,
                included_gene_ids
            )
            ranking_validation <- ranking(analysis, weights)
            mean_error_validation <- mean_error_validation +
                (1.0 - ranking_validation[gene == validation_gene_id, score]) /
                    n_references
            mean_rank_validation <- mean_rank_validation +
                (1.0 - ranking_validation[gene == validation_gene_id, rank] /
                    n_ranks) / n_references
            if (!is.null(progress)) {
                progress_state <- progress_state + progress_step
                progress(progress_state)
            }
        }
    }
-        structure(
+    progress_state <- 0.0
-            list(
+    progress_step <- 1.0 / length(reference_gene_ids)
-                mean_error_reference = mean_error_reference,
+
-                mean_error_validation = mean_error_validation,
+    results <- ranking[gene %chin% reference_gene_ids, .(gene, percentile)]
-                mean_rank_reference = mean_rank_reference,
+
-                mean_rank_validation = mean_rank_validation
+    for (gene_id in reference_gene_ids) {
-            ),
+        included_gene_ids <- reference_gene_ids[
-            class = "geposan_validation"
+            reference_gene_ids != gene_id
        ]
        weights <- optimal_weights(
            ranking,
            method_ids,
            included_gene_ids
        )
-    })
+
        ranking_validation <- ranking(ranking, weights)
        results[
            gene == gene_id,
            percentile_validation := ranking_validation[
                gene == gene_id,
                percentile
            ]
        ]
        if (!is.null(progress)) {
            progress_state <- progress_state + progress_step
            progress(progress_state)
        }
    }
    results[, error := percentile - percentile_validation]
    setorder(results, error)
    structure(
        list(
            validation = results,
            mean_percentile_original = results[, mean(percentile)],
            mean_percentile_validation = results[, mean(percentile_validation)],
            mean_error = results[, mean(error)]
        ),
        class = "geposan_validation"
    )
 }
 #' S3 method to print a validation object.
@ -86,22 +100,17 @@ validate <- function(analysis, progress = NULL) {
 #'
 #' @export
 print.geposan_validation <- function(x, ...) {
    cat("geposan validation:\n")
    cat(sprintf(
        paste0(
-            "\n  Absolute scores:",
+            "geposan validation:",
-            "\n  Mean error reference:  %.3f",
+            "\n  Mean percentile original: %.1f%%",
-            "\n  Mean error validation: %.3f",
+            "\n  Mean percentile validation: %.1f%%",
-            "\n",
+            "\n  Mean error: %.1f percent points",
            "\n  Ranks:",
            "\n  Mean rank reference:   %.1f%%",
            "\n  Mean rank validation:  %.1f%%",
            "\n"
        ),
-        x$mean_error_reference,
+        x$mean_percentile_original * 100,
-        x$mean_error_validation,
+        x$mean_percentile_validation * 100,
-        x$mean_rank_reference * 100,
+        x$mean_error * 100
        x$mean_rank_validation * 100
    ))
    invisible(x)
--- a/man/validate.Rd
+++ b/man/validate.Rd
@ -2,21 +2,38 @@
 % Please edit documentation in R/validate.R
 \name{validate}
 \alias{validate}
-\title{Perform cross-validation for the analysis.}
+\title{Perform cross-validation for the ranking.}
 \usage{
-validate(analysis, progress = NULL)
+validate(ranking, reference_gene_ids, method_ids, progress = NULL)
 }
 \arguments{
-\item{analysis}{The analysis to validate.}
+\item{ranking}{The ranking to validate.}
 \item{reference_gene_ids}{The reference gene IDs whose ranking should be
 validated.}
 \item{method_ids}{IDs of the methods that were used.}
 \item{progress}{An optional progress function that should accept a single
 value between 0.0 and 1.0 for progress information.}
 }
 \value{
-An object containing the mean absolute error and the mean percent
+A validation object with the following items:
-rank for the original analysis as well as the validation.
+\describe{
 \item{\code{validation}}{A \code{data.table} containing percentiles of the
 comparison genes from the original ranking as well as their validation.
 }
 \item{\code{mean_score}}{The mean score of the genes.}
 \item{\code{mean_percentile_original}}{The mean percentile of the genes in
 the original ranking.
 }
 \item{\code{mean_percentile_validation}}{The mean percentile of the genes
 when optimizing without themselves.
 }
 \item{\code{mean_error}}{The mean absolute error.}
 }
 }
 \description{
-This function reoptimizes the analysis leaving out one of the original
+This function reoptimizes the ranking leaving out one of the original
 reference genes at a time.
 }