From 4dda7fa49e5f53f44007578045d68de76ba9374f Mon Sep 17 00:00:00 2001 From: Elias Projahn Date: Fri, 12 Nov 2021 10:25:07 +0100 Subject: [PATCH] ranking: Filter out genes with too few species --- R/ranking.R | 31 ++++++++++++++++++++++++++----- man/optimal_weights.Rd | 12 +++++++++++- man/ranking.Rd | 5 ++++- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/R/ranking.R b/R/ranking.R index b87cb42..711b7c5 100644 --- a/R/ranking.R +++ b/R/ranking.R @@ -6,19 +6,33 @@ #' @param analysis Analysis object resulting from [analyze()]. #' @param weights Named list pairing method names with weighting factors. Only #' methods that are contained within this list will be included. +#' @param min_n_species Minimum number of required species per gene. Genes that +#' have fewer species will not be included in the ranking. #' #' @returns A ranking object. The object extends the analysis result with #' additional columns containing the `score` and the `rank` of each gene. It #' will be ordered by rank. #' #' @export -ranking <- function(analysis, weights) { +ranking <- function(analysis, weights, min_n_species = 10) { if (!"geposan_analysis" %chin% class(analysis)) { stop("Invalid analyis. Use geposan::analyze().") } - ranking <- copy(analysis$results) - ranking[, score := 0.0] + # Count included species from the preset per gene. + genes_n_species <- geposan::distances[ + species %chin% analysis$preset$species_ids, + .(n_species = .N), + by = "gene" + ] + + setkey(genes_n_species, gene) + + # Exclude genes with too few species. + ranking <- analysis$results[ + genes_n_species[gene, n_species] >= min_n_species, + .(score = 0.0) + ] for (method in names(weights)) { weighted <- weights[[method]] * ranking[, ..method] @@ -47,13 +61,16 @@ ranking <- function(analysis, weights) { #' @param reference_gene_ids IDs of the reference genes. #' @param target The optimization target. It may be one of "mean", "min" or #' "max" and results in the respective rank being optimized.
+#' @param min_n_species Minimum number of required species per gene. Genes that +#' have fewer species will not be included in the rankings used to find the +#' optimal weights. #' #' @returns Named list pairing method names with their optimal weights. This #' can be used as an argument to [ranking()]. #' #' @export optimal_weights <- function(analysis, methods, reference_gene_ids, - target = "mean") { + target = "mean", min_n_species = 10) { if (!"geposan_analysis" %chin% class(analysis)) { stop("Invalid analyis. Use geposan::analyze().") } @@ -71,7 +88,11 @@ optimal_weights <- function(analysis, methods, reference_gene_ids, # Compute the target rank of the reference genes when applying the weights. target_rank <- function(factors) { - data <- ranking(analysis, weights(factors)) + data <- ranking( + analysis, + weights(factors), + min_n_species = min_n_species + ) data[gene %chin% reference_gene_ids, if (target == "min") { min(rank) diff --git a/man/optimal_weights.Rd b/man/optimal_weights.Rd index 0f0fe81..1d82970 100644 --- a/man/optimal_weights.Rd +++ b/man/optimal_weights.Rd @@ -4,7 +4,13 @@ \alias{optimal_weights} \title{Find the best weights to rank the results.} \usage{ -optimal_weights(analysis, methods, reference_gene_ids, target = "mean") +optimal_weights( + analysis, + methods, + reference_gene_ids, + target = "mean", + min_n_species = 10 +) } \arguments{ \item{analysis}{Results from \code{\link[=analyze]{analyze()}} or \code{\link[=ranking]{ranking()}}.} @@ -15,6 +21,10 @@ optimal_weights(analysis, methods, reference_gene_ids, target = "mean") \item{target}{The optimization target. It may be one of "mean", "min" or "max" and results in the respective rank being optimized.} + +\item{min_n_species}{Minimum number of required species per gene. Genes that +have fewer species will not be included in the rankings used to find the +optimal weights.} } \value{ Named list pairing method names with their optimal weights. 
This diff --git a/man/ranking.Rd b/man/ranking.Rd index 93c7f39..21154c1 100644 --- a/man/ranking.Rd +++ b/man/ranking.Rd @@ -4,13 +4,16 @@ \alias{ranking} \title{Rank the results by computing a score.} \usage{ -ranking(analysis, weights) +ranking(analysis, weights, min_n_species = 10) } \arguments{ \item{analysis}{Analysis object resulting from \code{\link[=analyze]{analyze()}}.} \item{weights}{Named list pairing method names with weighting factors. Only methods that are contained within this list will be included.} + +\item{min_n_species}{Minimum number of required species per gene. Genes that +have fewer species will not be included in the ranking.} } \value{ A ranking object. The object extends the analysis result with