preset: Remove min_n_species customization

2025-10-26 10:47:25 +01:00 · 2021-11-18 12:30:19 +01:00 · 2021-11-18 12:30:19 +01:00 · de1c1ed40e
commit de1c1ed40e
parent 33056bfa40
3 changed files with 48 additions and 31 deletions
--- a/R/analyze.R
+++ b/R/analyze.R
@ -75,17 +75,7 @@ analyze <- function(preset, progress = NULL) {
            total_progress <- total_progress + 1 / method_count
        }
-        # Count included species from the preset per gene.
+        results
        genes_n_species <- geposan::distances[
            species %chin% preset$species_ids,
            .(n_species = .N),
            by = "gene"
        ]
        setkey(genes_n_species, "gene")
        # Return the results for genes with enough species.
        results[genes_n_species[gene, n_species] >= preset$min_n_species]
    })
    if (!is.null(progress)) {
--- a/R/preset.R
+++ b/R/preset.R
@ -2,20 +2,28 @@
 #'
 #' A preset is used to specify which methods and inputs should be used for an
 #' analysis. Note that the genes to process should normally include the
-#' reference genes to be able to assess the results later.
+#' reference genes to be able to assess the results later. The genes will be
 #' filtered based on how many species have data for them. Genes which only have
 #' orthologs for less than 25% of the input species will be excluded from the
 #' preset and the analysis.
 #'
 #' Available methods are:
 #'
-#'  - `clusteriness` How much the gene distances cluster across species.
+#'  - `clusteriness` How much the gene distances to the nearest telomere
-#'  - `correlation` The mean correlation with the reference genes.
+#'    cluster across species.
 #'  - `clusteriness_positions` The same as `clusteriness` but using absolute
 #'    gene positions instead of distances.
 #'  - `correlation` The mean correlation of gene distances to the nearest
 #'    telomere across species.
 #'  - `correlation_positions` Correlation using position data.
 #'  - `neural` Assessment by neural network trained using distances.
 #'  - `neural_positions` Assessment by neural network trained using absolute
 #'    position data.
 #'  - `proximity` Mean proximity to telomeres.
 #'  - `neural` Assessment by neural network.
 #'
 #' @param methods Methods to apply.
 #' @param species_ids IDs of species to include.
 #' @param gene_ids IDs of genes to screen.
 #' @param min_n_species Minimum number of orthologs that a gene should have to
 #'   be included in the analysis.
 #' @param reference_gene_ids IDs of reference genes to compare to.
 #'
 #' @return The preset to use with [analyze()].
@ -23,22 +31,36 @@
 #' @export
 preset <- function(methods = c(
                       "clusteriness",
                       "clusteriness_positions",
                       "correlation",
                       "correlation_positions",
                       "neural",
                       "neural_positions",
                       "proximity"
                   ),
                   species_ids = NULL,
                   gene_ids = NULL,
                   min_n_species = 10,
                   reference_gene_ids = NULL) {
    # Count included species per gene.
    genes_n_species <- geposan::distances[
        species %chin% species_ids,
        .(n_species = .N),
        by = "gene"
    ]
    # Filter out genes with less than 25% existing orthologs.
    gene_ids_filtered <- genes_n_species[
        n_species >= 0.25 * length(species_ids),
        gene
    ]
    # The included data gets sorted to be able to produce predictable hashes
    # for the object later.
    structure(
        list(
            methods = sort(methods),
            species_ids = sort(species_ids),
-            gene_ids = sort(gene_ids),
+            gene_ids = sort(gene_ids_filtered),
            min_n_species = as.numeric(min_n_species),
            reference_gene_ids = sort(reference_gene_ids)
        ),
        class = "geposan_preset"
@ -64,8 +86,6 @@ print.geposan_preset <- function(x, ...) {
        length(x$gene_ids)
    ))
    cat(sprintf("\n  Species per gene: \u2265 %i", x$min_n_species))
    cat(sprintf(
        "\n  Comparison data: %i reference genes\n",
        length(x$reference_gene_ids)
--- a/man/preset.Rd
+++ b/man/preset.Rd
@ -5,10 +5,10 @@
 \title{Create a new preset.}
 \usage{
 preset(
-  methods = c("clusteriness", "correlation", "neural", "proximity"),
+  methods = c("clusteriness", "clusteriness_positions", "correlation",
    "correlation_positions", "neural", "neural_positions", "proximity"),
  species_ids = NULL,
  gene_ids = NULL,
  min_n_species = 10,
  reference_gene_ids = NULL
 )
 }
@ -19,9 +19,6 @@ preset(
 \item{gene_ids}{IDs of genes to screen.}
 \item{min_n_species}{Minimum number of orthologs that a gene should have to
 be included in the analysis.}
 \item{reference_gene_ids}{IDs of reference genes to compare to.}
 }
 \value{
@ -30,14 +27,24 @@ The preset to use with \code{\link[=analyze]{analyze()}}.
 \description{
 A preset is used to specify which methods and inputs should be used for an
 analysis. Note that the genes to process should normally include the
-reference genes to be able to assess the results later.
+reference genes to be able to assess the results later. The genes will be
 filtered based on how many species have data for them. Genes which only have
 orthologs for less than 25\% of the input species will be excluded from the
 preset and the analysis.
 }
 \details{
 Available methods are:
 \itemize{
-\item \code{clusteriness} How much the gene distances cluster across species.
+\item \code{clusteriness} How much the gene distances to the nearest telomere
-\item \code{correlation} The mean correlation with the reference genes.
+cluster across species.
 \item \code{clusteriness_positions} The same as \code{clusteriness} but using absolute
 gene positions instead of distances.
 \item \code{correlation} The mean correlation of gene distances to the nearest
 telomere across species.
 \item \code{correlation_positions} Correlation using position data.
 \item \code{neural} Assessment by neural network trained using distances.
 \item \code{neural_positions} Assessment by neural network trained using absolute
 position data.
 \item \code{proximity} Mean proximity to telomeres.
 \item \code{neural} Assessment by neural network.
 }
 }