From 3217c9bd298444531fcef5f6067ce373f048c89b Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Mon, 30 May 2022 13:49:52 +0200
Subject: [PATCH] preset: Filter species in addition to genes

---
 R/method_neural.R | 24 ++++++------------------
 R/preset.R        | 44 +++++++++++++++++++++++++++++---------------
 man/neural.Rd     | 10 +---------
 man/preset.Rd     | 19 +++++++++++++------
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/R/method_neural.R b/R/method_neural.R
index 9f9c2b1..dafd0cf 100644
--- a/R/method_neural.R
+++ b/R/method_neural.R
@@ -7,8 +7,6 @@
 #'   final score will be the mean of the result of applying the different
 #'   models. There should be at least two training sets. The analysis will only
 #'   work, if there is at least one reference gene per training set.
-#' @param gene_requirement Minimum proportion of genes from the preset that a
-#'   species has to have in order to be included in the models.
 #' @param control_ratio The proportion of random control genes that is included
 #'   in the training data sets in addition to the reference genes. This should
 #'   be a numeric value between 0.0 and 1.0.
@@ -16,10 +14,7 @@
 #' @return An object of class `geposan_method`.
 #'
 #' @export
-neural <- function(seed = 180199,
-                       n_models = 5,
-                       gene_requirement = 0.5,
-                       control_ratio = 0.5) {
+neural <- function(seed = 180199, n_models = 5, control_ratio = 0.5) {
   method(
     id = "neural",
     name = "Neural",
@@ -37,7 +32,6 @@ neural <- function(seed = 180199,
           reference_gene_ids,
           seed,
           n_models,
-          gene_requirement,
           control_ratio
         ),
         { # nolint
@@ -57,12 +51,6 @@ neural <- function(seed = 180199,
           distances <- geposan::distances[species %chin% species_ids &
             gene %chin% gene_ids]
 
-          # Only include species that have at least 25% of the included genes.
-          distances[, species_n_genes := .N, by = species]
-          distances <- distances[species_n_genes >=
-            gene_requirement * length(gene_ids)]
-          included_species <- distances[, unique(species)]
-
           # Reshape data to put species into columns.
           data <- dcast(
             distances,
@@ -72,7 +60,7 @@ neural <- function(seed = 180199,
 
           # Replace values that are still missing with mean values for the
           # species in question.
-          data[, (included_species) := lapply(included_species, \(species) {
+          data[, (species_ids) := lapply(species_ids, \(species) {
             species <- get(species)
             species[is.na(species)] <- mean(species, na.rm = TRUE)
             species
@@ -129,7 +117,7 @@ neural <- function(seed = 180199,
           # Step 3: Create, train and apply neural network.
           # -----------------------------------------------
 
-          data_matrix <- prepare_data(data, included_species)
+          data_matrix <- prepare_data(data, species_ids)
           output_vars <- NULL
 
           for (i in seq_along(networks)) {
@@ -138,14 +126,14 @@ neural <- function(seed = 180199,
             # Create a new model for each training session, because
             # the model would keep its state across training
             # sessions otherwise.
-            model <- create_model(length(included_species))
+            model <- create_model(length(species_ids))
 
             # Train the model.
             fit <- train_model(
               model,
               network$training_data,
               network$validation_data,
-              included_species
+              species_ids
             )
 
             # Apply the model.
@@ -180,7 +168,7 @@ neural <- function(seed = 180199,
             details = list(
               seed = seed,
               n_models = n_models,
-              all_results = data[, !..included_species],
+              all_results = data[, !..species_ids],
               networks = networks
             )
           )
diff --git a/R/preset.R b/R/preset.R
index e798bbb..39e0d46 100644
--- a/R/preset.R
+++ b/R/preset.R
@@ -3,16 +3,19 @@
 #' A preset is used to specify which methods and inputs should be used for an
 #' analysis. Note that the genes to process should normally include the
 #' reference genes to be able to assess the results later. The genes will be
-#' filtered based on how many species have data for them. Genes which only have
-#' orthologs for less than 25% of the input species will be excluded from the
-#' preset and the analyis. See the different method functions for the available
-#' methods: [clustering()], [correlation()], [neural()], [adjacency()] and
-#' [species_adjacency()].
+#' filtered based on how many species have data for them. Afterwards, species
+#' that still have many missing genes will also be excluded. See the different
+#' method functions for the available methods: [clustering()], [correlation()],
+#' [neural()], [adjacency()] and [species_adjacency()].
 #'
 #' @param reference_gene_ids IDs of reference genes to compare to.
 #' @param methods List of methods to apply.
 #' @param species_ids IDs of species to include.
 #' @param gene_ids IDs of genes to screen.
+#' @param species_requirement The proportion of species in which a gene has
+#'  to have orthologs in order for the gene to qualify.
+#' @param gene_requirement The proportion of genes that a species has to have
+#'  in order for the species to be included in the analysis.
 #'
 #' @return The preset to use with [analyze()].
 #'
@@ -20,21 +23,32 @@
 preset <- function(reference_gene_ids,
                    methods = all_methods(),
                    species_ids = geposan::species$id,
-                   gene_ids = geposan::genes$id) {
-  # Count included species per gene.
-  genes_n_species <- geposan::distances[
-    species %chin% species_ids,
-    .(n_species = .N),
-    by = "gene"
+                   gene_ids = geposan::genes$id,
+                   species_requirement = 0.25,
+                   gene_requirement = 0.5) {
+  # Prefilter distances.
+  distances <- geposan::distances[
+    species %chin% species_ids & gene %chin% gene_ids
   ]
 
-  # Filter out genes with less than 25% existing orthologs.
+  # Count included species per gene.
+  genes_n_species <- distances[, .(n_species = .N), by = "gene"]
+
+  # Filter out genes with too few existing orthologs.
   gene_ids_filtered <- genes_n_species[
-    gene %chin% gene_ids &
-      n_species >= 0.25 * length(species_ids),
+    n_species >= species_requirement * length(species_ids),
     gene
   ]
 
+  # Count genes per species. NOTE(review): uses unfiltered geposan::distances.
+  species_n_genes <- geposan::distances[, .(n_genes = .N), by = "species"]
+
+  # Filter out species that have too few of the genes.
+  species_ids_filtered <- species_n_genes[
+    n_genes >= gene_requirement * length(gene_ids_filtered),
+    species
+  ]
+
   reference_gene_ids_excluded <- reference_gene_ids[
     !reference_gene_ids %chin% gene_ids_filtered
   ]
@@ -65,7 +79,7 @@ preset <- function(reference_gene_ids,
     list(
       reference_gene_ids = sort(reference_gene_ids_included),
       methods = methods,
-      species_ids = sort(species_ids),
+      species_ids = sort(species_ids_filtered),
       gene_ids = sort(gene_ids_filtered)
     ),
     class = "geposan_preset"
diff --git a/man/neural.Rd b/man/neural.Rd
index 9c79be6..671494c 100644
--- a/man/neural.Rd
+++ b/man/neural.Rd
@@ -4,12 +4,7 @@
 \alias{neural}
 \title{Find genes by training and applying a neural network.}
 \usage{
-neural(
-  seed = 180199,
-  n_models = 5,
-  gene_requirement = 0.5,
-  control_ratio = 0.5
-)
+neural(seed = 180199, n_models = 5, control_ratio = 0.5)
 }
 \arguments{
 \item{seed}{The seed will be used to make the results reproducible.}
@@ -21,9 +16,6 @@ final score will be the mean of the result of applying the different
 models. There should be at least two training sets. The analysis will only
 work, if there is at least one reference gene per training set.}
 
-\item{gene_requirement}{Minimum proportion of genes from the preset that a
-species has to have in order to be included in the models.}
-
 \item{control_ratio}{The proportion of random control genes that is included
 in the training data sets in addition to the reference genes. This should
 be a numeric value between 0.0 and 1.0.}
diff --git a/man/preset.Rd b/man/preset.Rd
index 3f3d44d..41b1085 100644
--- a/man/preset.Rd
+++ b/man/preset.Rd
@@ -8,7 +8,9 @@ preset(
   reference_gene_ids,
   methods = all_methods(),
   species_ids = geposan::species$id,
-  gene_ids = geposan::genes$id
+  gene_ids = geposan::genes$id,
+  species_requirement = 0.25,
+  gene_requirement = 0.5
 )
 }
 \arguments{
@@ -19,6 +21,12 @@ preset(
 \item{species_ids}{IDs of species to include.}
 
 \item{gene_ids}{IDs of genes to screen.}
+
+\item{species_requirement}{The proportion of species in which a gene has
+to have orthologs in order for the gene to qualify.}
+
+\item{gene_requirement}{The proportion of genes that a species has to have
+in order for the species to be included in the analysis.}
 }
 \value{
 The preset to use with \code{\link[=analyze]{analyze()}}.
@@ -27,9 +35,8 @@ The preset to use with \code{\link[=analyze]{analyze()}}.
 A preset is used to specify which methods and inputs should be used for an
 analysis. Note that the genes to process should normally include the
 reference genes to be able to assess the results later. The genes will be
-filtered based on how many species have data for them. Genes which only have
-orthologs for less than 25\% of the input species will be excluded from the
-preset and the analyis. See the different method functions for the available
-methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and
-\code{\link[=species_adjacency]{species_adjacency()}}.
+filtered based on how many species have data for them. Afterwards, species
+that still have many missing genes will also be excluded. See the different
+method functions for the available methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}},
+\code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and \code{\link[=species_adjacency]{species_adjacency()}}.
 }