From 33056bfa4073ec3cbd05b46f5b9cdaa88f356e8f Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Wed, 17 Nov 2021 22:57:31 +0100 Subject: [PATCH] Move species count to analysis --- R/analyze.R | 18 ++++++++++++++---- R/preset.R | 6 ++++++ R/ranking.R | 30 ++++-------------------------- man/optimal_weights.Rd | 12 +----------- man/preset.Rd | 4 ++++ man/ranking.Rd | 5 +---- 6 files changed, 30 insertions(+), 45 deletions(-) diff --git a/R/analyze.R b/R/analyze.R index 2fb865a..2278ded 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -75,13 +75,23 @@ analyze <- function(preset, progress = NULL) { total_progress <- total_progress + 1 / method_count } - if (!is.null(progress)) { - progress(1.0) - } + # Count included species from the preset per gene. + genes_n_species <- geposan::distances[ + species %chin% preset$species_ids, + .(n_species = .N), + by = "gene" + ] - results + setkey(genes_n_species, "gene") + + # Return the results for genes with enough species. + results[genes_n_species[gene, n_species] >= preset$min_n_species] }) + if (!is.null(progress)) { + progress(1.0) + } + structure( list( preset = preset, diff --git a/R/preset.R b/R/preset.R index a08a80f..6fe8e7f 100644 --- a/R/preset.R +++ b/R/preset.R @@ -14,6 +14,8 @@ #' @param methods Methods to apply. #' @param species_ids IDs of species to include. #' @param gene_ids IDs of genes to screen. +#' @param min_n_species Minimum number of orthologs that a gene should have to +#' be included in the analysis. #' @param reference_gene_ids IDs of reference genes to compare to. #' #' @return The preset to use with [analyze()]. @@ -27,6 +29,7 @@ preset <- function(methods = c( ), species_ids = NULL, gene_ids = NULL, + min_n_species = 10, reference_gene_ids = NULL) { # The included data gets sorted to be able to produce predictable hashes # for the object later. @@ -35,6 +38,7 @@ preset <- function(methods = c( methods = sort(methods), species_ids = sort(species_ids), gene_ids = sort(gene_ids), + min_n_species = as.numeric(min_n_species), reference_gene_ids = sort(reference_gene_ids) ), class = "geposan_preset" @@ -60,6 +64,8 @@ print.geposan_preset <- function(x, ...) { length(x$gene_ids) )) + cat(sprintf("\n Species per gene: \u2265 %i", x$min_n_species)) + cat(sprintf( "\n Comparison data: %i reference genes\n", length(x$reference_gene_ids) diff --git a/R/ranking.R b/R/ranking.R index de412a0..3930ca3 100644 --- a/R/ranking.R +++ b/R/ranking.R @@ -6,33 +6,18 @@ #' @param analysis Analysis object resulting from [analyze()]. #' @param weights Named list pairing method names with weighting factors. Only #' methods that are contained within this list will be included. -#' @param min_n_species Minimum number of required species per gene. Genes that -#' have fewer species will not be included in the ranking. #' #' @returns A ranking object. The object extends the analysis result with #' additional columns containing the `score` and the `rank` of each gene. It #' will be ordered by rank. #' #' @export -ranking <- function(analysis, weights, min_n_species = 10) { +ranking <- function(analysis, weights) { if (!"geposan_analysis" %chin% class(analysis)) { stop("Invalid analyis. Use geposan::analyze().") } - # Count included species from the preset per gene. - genes_n_species <- geposan::distances[ - species %chin% analysis$preset$species_ids, - .(n_species = .N), - by = "gene" - ] - - setkey(genes_n_species, gene) - - # Exclude genes with too few species. - ranking <- analysis$results[ - genes_n_species[gene, n_species] >= min_n_species - ] - + ranking <- copy(analysis$results) ranking[, score := 0.0] for (method in names(weights)) { @@ -65,16 +50,13 @@ ranking <- function(analysis, weights, min_n_species = 10) { #' @param reference_gene_ids IDs of the reference genes. #' @param target The optimization target. It may be one of "mean", "min" or #' "max" and results in the respective rank being optimized. -#' @param min_n_species Minimum number of required species per gene. Genes that -#' have fewer species will not be included in the rankings used to find the -#' optimal weights. #' #' @returns Named list pairing method names with their optimal weights. This #' can be used as an argument to [ranking()]. #' #' @export optimal_weights <- function(analysis, methods, reference_gene_ids, - target = "mean", min_n_species = 10) { + target = "mean") { if (!"geposan_analysis" %chin% class(analysis)) { stop("Invalid analyis. Use geposan::analyze().") } @@ -92,11 +74,7 @@ optimal_weights <- function(analysis, methods, reference_gene_ids, # Compute the target rank of the reference genes when applying the weights. target_rank <- function(factors) { - data <- ranking( - analysis, - weights(factors), - min_n_species = min_n_species - ) + data <- ranking(analysis, weights(factors)) result <- data[gene %chin% reference_gene_ids, if (target == "min") { min(rank) diff --git a/man/optimal_weights.Rd b/man/optimal_weights.Rd index 1d82970..0f0fe81 100644 --- a/man/optimal_weights.Rd +++ b/man/optimal_weights.Rd @@ -4,13 +4,7 @@ \alias{optimal_weights} \title{Find the best weights to rank the results.} \usage{ -optimal_weights( - analysis, - methods, - reference_gene_ids, - target = "mean", - min_n_species = 10 -) +optimal_weights(analysis, methods, reference_gene_ids, target = "mean") } \arguments{ \item{analysis}{Results from \code{\link[=analyze]{analyze()}} or \code{\link[=ranking]{ranking()}}.} @@ -21,10 +15,6 @@ optimal_weights( \item{target}{The optimization target. It may be one of "mean", "min" or "max" and results in the respective rank being optimized.} - -\item{min_n_species}{Minimum number of required species per gene. Genes that -have fewer species will not be included in the rankings used to find the -optimal weights.} } \value{ Named list pairing method names with their optimal weights. This diff --git a/man/preset.Rd b/man/preset.Rd index e30b1cd..015900e 100644 --- a/man/preset.Rd +++ b/man/preset.Rd @@ -8,6 +8,7 @@ preset( methods = c("clusteriness", "correlation", "neural", "proximity"), species_ids = NULL, gene_ids = NULL, + min_n_species = 10, reference_gene_ids = NULL ) } @@ -18,6 +19,9 @@ preset( \item{gene_ids}{IDs of genes to screen.} +\item{min_n_species}{Minimum number of orthologs that a gene should have to +be included in the analysis.} + \item{reference_gene_ids}{IDs of reference genes to compare to.} } \value{ diff --git a/man/ranking.Rd b/man/ranking.Rd index 21154c1..93c7f39 100644 --- a/man/ranking.Rd +++ b/man/ranking.Rd @@ -4,16 +4,13 @@ \alias{ranking} \title{Rank the results by computing a score.} \usage{ -ranking(analysis, weights, min_n_species = 10) +ranking(analysis, weights) } \arguments{ \item{analysis}{Analysis object resulting from \code{\link[=analyze]{analyze()}}.} \item{weights}{Named list pairing method names with weighting factors. Only methods that are contained within this list will be included.} - -\item{min_n_species}{Minimum number of required species per gene. Genes that -have fewer species will not be included in the ranking.} } \value{ A ranking object. The object extends the analysis result with