From de1c1ed40e53c7ad336e62dc537621b76b04e0f9 Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Thu, 18 Nov 2021 12:30:19 +0100
Subject: [PATCH] preset: Remove min_n_species customization

---
 R/analyze.R   | 12 +-----------
 R/preset.R    | 42 +++++++++++++++++++++++++++++++-----------
 man/preset.Rd | 25 ++++++++++++++++---------
 3 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/R/analyze.R b/R/analyze.R
index 2278ded..8d9b2de 100644
--- a/R/analyze.R
+++ b/R/analyze.R
@@ -75,17 +75,7 @@ analyze <- function(preset, progress = NULL) {
             total_progress <- total_progress + 1 / method_count
         }
 
-        # Count included species from the preset per gene.
-        genes_n_species <- geposan::distances[
-            species %chin% preset$species_ids,
-            .(n_species = .N),
-            by = "gene"
-        ]
-
-        setkey(genes_n_species, "gene")
-
-        # Return the results for genes with enough species.
-        results[genes_n_species[gene, n_species] >= preset$min_n_species]
+        results
     })
 
     if (!is.null(progress)) {
diff --git a/R/preset.R b/R/preset.R
index 6fe8e7f..50ee098 100644
--- a/R/preset.R
+++ b/R/preset.R
@@ -2,20 +2,28 @@
 #'
 #' A preset is used to specify which methods and inputs should be used for an
 #' analysis. Note that the genes to process should normally include the
-#' reference genes to be able to assess the results later.
+#' reference genes to be able to assess the results later. The genes will be
+#' filtered based on how many species have data for them. Genes which only have
+#' orthologs for less than 25% of the input species will be excluded from the
+#' preset and the analysis.
 #'
 #' Available methods are:
 #'
-#'  - `clusteriness` How much the gene distances cluster across species.
-#'  - `correlation` The mean correlation with the reference genes.
+#'  - `clusteriness` How much the gene distances to the nearest telomere
+#'    cluster across species.
+#'  - `clusteriness_positions` The same as `clusteriness` but using absolute
+#'    gene positions instead of distances.
+#'  - `correlation` The mean correlation of gene distances to the nearest
+#'    telomere across species.
+#'  - `correlation_positions` Correlation using position data.
+#'  - `neural` Assessment by neural network trained using distances.
+#'  - `neural_positions` Assessment by neural network trained using absolute
+#'    position data.
 #'  - `proximity` Mean proximity to telomeres.
-#'  - `neural` Assessment by neural network.
 #'
 #' @param methods Methods to apply.
 #' @param species_ids IDs of species to include.
 #' @param gene_ids IDs of genes to screen.
-#' @param min_n_species Minimum number of orthologs that a gene should have to
-#'   be included in the analysis.
 #' @param reference_gene_ids IDs of reference genes to compare to.
 #'
 #' @return The preset to use with [analyze()].
@@ -23,22 +31,36 @@
 #' @export
 preset <- function(methods = c(
                        "clusteriness",
+                       "clusteriness_positions",
                        "correlation",
+                       "correlation_positions",
                        "neural",
+                       "neural_positions",
                        "proximity"
                    ),
                    species_ids = NULL,
                    gene_ids = NULL,
-                   min_n_species = 10,
                    reference_gene_ids = NULL) {
+    # Count included species per gene.
+    genes_n_species <- geposan::distances[
+        species %chin% species_ids,
+        .(n_species = .N),
+        by = "gene"
+    ]
+
+    # Filter out genes with less than 25% existing orthologs.
+    gene_ids_filtered <- genes_n_species[
+        n_species >= 0.25 * length(species_ids),
+        gene
+    ]
+
     # The included data gets sorted to be able to produce predictable hashes
     # for the object later.
     structure(
         list(
             methods = sort(methods),
             species_ids = sort(species_ids),
-            gene_ids = sort(gene_ids),
-            min_n_species = as.numeric(min_n_species),
+            gene_ids = sort(gene_ids_filtered),
             reference_gene_ids = sort(reference_gene_ids)
         ),
         class = "geposan_preset"
@@ -64,8 +86,6 @@ print.geposan_preset <- function(x, ...) {
         length(x$gene_ids)
     ))
 
-    cat(sprintf("\n  Species per gene: \u2265 %i", x$min_n_species))
-
     cat(sprintf(
         "\n  Comparison data: %i reference genes\n",
         length(x$reference_gene_ids)
diff --git a/man/preset.Rd b/man/preset.Rd
index 015900e..e6414c5 100644
--- a/man/preset.Rd
+++ b/man/preset.Rd
@@ -5,10 +5,10 @@
 \title{Create a new preset.}
 \usage{
 preset(
-  methods = c("clusteriness", "correlation", "neural", "proximity"),
+  methods = c("clusteriness", "clusteriness_positions", "correlation",
+    "correlation_positions", "neural", "neural_positions", "proximity"),
   species_ids = NULL,
   gene_ids = NULL,
-  min_n_species = 10,
   reference_gene_ids = NULL
 )
 }
@@ -19,9 +19,6 @@ preset(
 
 \item{gene_ids}{IDs of genes to screen.}
 
-\item{min_n_species}{Minimum number of orthologs that a gene should have to
-be included in the analysis.}
-
 \item{reference_gene_ids}{IDs of reference genes to compare to.}
 }
 \value{
@@ -30,14 +27,24 @@ The preset to use with \code{\link[=analyze]{analyze()}}.
 \description{
 A preset is used to specify which methods and inputs should be used for an
 analysis. Note that the genes to process should normally include the
-reference genes to be able to assess the results later.
+reference genes to be able to assess the results later. The genes will be
+filtered based on how many species have data for them. Genes which only have
+orthologs for less than 25\% of the input species will be excluded from the
+preset and the analysis.
 }
 \details{
 Available methods are:
 \itemize{
-\item \code{clusteriness} How much the gene distances cluster across species.
-\item \code{correlation} The mean correlation with the reference genes.
+\item \code{clusteriness} How much the gene distances to the nearest telomere
+cluster across species.
+\item \code{clusteriness_positions} The same as \code{clusteriness} but using absolute
+gene positions instead of distances.
+\item \code{correlation} The mean correlation of gene distances to the nearest
+telomere across species.
+\item \code{correlation_positions} Correlation using position data.
+\item \code{neural} Assessment by neural network trained using distances.
+\item \code{neural_positions} Assessment by neural network trained using absolute
+position data.
 \item \code{proximity} Mean proximity to telomeres.
-\item \code{neural} Assessment by neural network.
 }
 }