neural: Refactor and increase gene requirement

Elias Projahn 2022-05-26 19:50:23 +02:00
parent c04b6337e9
commit 49981300fb
2 changed files with 168 additions and 140 deletions
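For illustration, the refactored signature can be called with the new tuning parameters spelled out (the values shown are this commit's defaults; the variable name is arbitrary):

method <- neural(
    seed = 180199,
    n_models = 5,
    gene_requirement = 0.5, # Species need distances for half of the genes.
    control_ratio = 0.5 # Half of the training data are random controls.
)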


@@ -7,11 +7,19 @@
#' final score will be the mean of the result of applying the different
#' models. There should be at least two training sets. The analysis will only
#' work if there is at least one reference gene per training set.
#' @param gene_requirement Minimum proportion of genes from the preset that a
#' species must have in order to be included in the models.
#' @param control_ratio The proportion of random control genes included in the
#' training data sets in addition to the reference genes. This should
#' be a numeric value between 0.0 and 1.0.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
neural <- function(seed = 180199, n_models = 5) {
neural <- function(seed = 180199,
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5) {
method(
id = "neural",
name = "Neural",
@@ -23,65 +31,52 @@ neural <- function(seed = 180199, n_models = 5) {
cached(
"neural",
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
c(
species_ids,
gene_ids,
reference_gene_ids,
seed,
n_models,
gene_requirement,
control_ratio
),
{ # nolint
reference_count <- length(reference_gene_ids)
stopifnot(n_models %in% 2:reference_count)
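# Derive the number of control genes such that controls make up
# control_ratio of the combined training data:
# control_count / (control_count + reference_count) ≈ control_ratio.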
control_count <- ceiling(reference_count * control_ratio /
(1 - control_ratio))
# Make results reproducible.
tensorflow::set_random_seed(seed)
# Step 1: Prepare input data.
# ---------------------------
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
# Prefilter distances by species and gene.
distances <- geposan::distances[species %chin% species_ids &
gene %chin% gene_ids]
# Input data for the network. This contains the gene ID as
# an identifier as well as the per-species gene distances as
# input variables.
data <- data.table(gene = gene_ids)
# Only include species that have at least the required proportion of the included genes.
distances[, species_n_genes := .N, by = species]
distances <- distances[species_n_genes >=
gene_requirement * length(gene_ids)]
included_species <- distances[, unique(species)]
# Buffer to keep track of the names of the input variables.
input_vars <- NULL
# Reshape data to put species into columns.
data <- dcast(
distances,
gene ~ species,
value.var = "distance"
)
# Make a column containing the distances for each
# species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Only include species with at least 25% known values.
# As positions and distances always coexist, we don't
# lose any data here.
species_data <- stats::na.omit(species_data)
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
data <- merge(data, species_data, all.x = TRUE)
# Replace missing data with mean values. The neural
# network can't handle NAs in a meaningful way.
# Choosing extreme values here would result in
# heavily biased results. Therefore, the mean value
# is chosen as a compromise. However, this will of
# course lessen the significance of the results.
mean_distance <- round(
species_data[, mean(distance)]
)
data[is.na(distance), distance := mean_distance]
# Name the new column after the species.
setnames(data, "distance", species_id)
# Add the input variable to the buffer.
input_vars <- c(input_vars, species_id)
}
}
# Replace values that are still missing with mean values for the
# species in question.
data[, (included_species) := lapply(included_species, \(species) {
species <- get(species)
species[is.na(species)] <- mean(species, na.rm = TRUE)
species
})]
progress(0.1)
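The reshaping and imputation above can be traced on toy data (a sketch with hypothetical values; only data.table is required):

library(data.table)
distances <- data.table(
    gene = c("g1", "g1", "g2", "g2", "g3"),
    species = c("s1", "s2", "s1", "s2", "s1"),
    distance = c(10, 20, 30, 40, 50)
)
# One row per gene, one column per species; g3 has no value for s2.
data <- dcast(distances, gene ~ species, value.var = "distance")
# Replace the remaining NAs with the per-species (per-column) mean.
species_cols <- c("s1", "s2")
data[, (species_cols) := lapply(species_cols, \(col) {
    values <- get(col)
    values[is.na(values)] <- mean(values, na.rm = TRUE)
    values
})]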
@@ -89,140 +84,81 @@ neural <- function(seed = 180199, n_models = 5) {
# ------------------------------
# Take out the reference data.
reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0]
# Take out random samples from the remaining genes. This is
# another compromise with a negative impact on
# significance. We assume that a random gene is not likely
# to match the reference genes.
without_reference_data <- data[
!gene %chin% reference_gene_ids
# Draw control data from the remaining genes.
control_data <- data[!gene %chin% reference_gene_ids][
sample(.N, control_count)
]
control_data <- without_reference_data[
sample(
nrow(without_reference_data),
reference_count
)
]
control_data[, score := 0.0]
# Split the training data into random sets to have
# validation data for each model.
# Randomly distribute the indices of the reference and control genes
# into one bucket per model.
# Scramble the source tables.
reference_data <- reference_data[sample(reference_count)]
control_data <- control_data[sample(control_count)]
reference_sets <- split(
sample(reference_count),
seq_len(reference_count) %% n_models
)
networks <- list()
control_sets <- split(
sample(control_count),
seq_len(control_count) %% n_models
)
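# split() groups the shuffled indices by their position modulo n_models,
# dealing them out round-robin into n_models buckets of near-equal size.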
indices <- seq_len(reference_count)
indices_split <- split(indices, indices %% n_models)
for (i in seq_len(n_models)) {
# Prepare the data for each model. Each model will have one pair of
# reference and control gene sets left out for validation. The
# training data consists of all the remaining sets.
networks <- lapply(seq_len(n_models), \(index) {
training_data <- rbindlist(list(
reference_data[!indices_split[[i]]],
control_data[!indices_split[[i]]]
reference_data[!reference_sets[[index]]],
control_data[!control_sets[[index]]]
))
validation_data <- rbindlist(list(
reference_data[indices_split[[i]]],
control_data[indices_split[[i]]]
reference_data[reference_sets[[index]]],
control_data[control_sets[[index]]]
))
networks[[i]] <- list(
list(
training_data = training_data,
validation_data = validation_data
)
}
})
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
# Layers for the neural network.
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
data_matrix <- to_matrix(data)
data_matrix <- prepare_data(data, included_species)
output_vars <- NULL
for (i in seq_along(networks)) {
network <- networks[[i]]
# Create a new model for each training session, because
# the model would keep its state across training
# sessions otherwise.
model <- keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
model <- create_model(length(included_species))
# Train the model.
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
fit <- train_model(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
network$training_data,
network$validation_data,
included_species
)
# Apply the model.
data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA]
data[gene %chin% network$training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit
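In isolation, the leave-one-bucket-out scheme looks like this (hypothetical sizes):

n_models <- 5
reference_count <- 20
reference_sets <- split(
    sample(reference_count),
    seq_len(reference_count) %% n_models
)
# Model 1 trains on buckets 2 to 5 and validates on bucket 1.
training_indices <- unlist(reference_sets[-1])
validation_indices <- reference_sets[[1]]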
@@ -244,7 +180,7 @@ neural <- function(seed = 180199, n_models = 5) {
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..input_vars],
all_results = data[, !..included_species],
networks = networks
)
)
@@ -253,3 +189,83 @@ neural <- function(seed = 180199, n_models = 5) {
}
)
}
#' Create a `keras` model based on the number of input variables.
#'
#' @param n_input_vars Number of input variables (i.e. species).
#' @return A `keras` model.
#'
#' @noRd
create_model <- function(n_input_vars) {
# Layers for the neural network.
layer1 <- n_input_vars
layer2 <- 0.5 * layer1
layer3 <- 0.5 * layer2
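# Funnel architecture: each hidden layer halves the width of the previous one.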
keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = n_input_vars
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
}
#' Train a model on a specific training dataset.
#'
#' @param model The model created using [create_model()]. The model will be
#' modified in place to reflect its state after training.
#' @param training_data Data to fit the model to.
#' @param validation_data Additional data to assess the model performance.
#' @param input_vars Character vector of input variables that should be
#' included.
#'
#' @return The `keras` fit object describing the training process.
#' @noRd
train_model <- function(model, training_data, validation_data, input_vars) {
training_matrix <- prepare_data(training_data, input_vars)
validation_matrix <- prepare_data(validation_data, input_vars)
keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
}
#' Convert data to a matrix and normalize it.
#'
#' @param data Input data.
#' @param input_vars Character vector of input variables that should be
#' included.
#'
#' @return A data matrix that can be used within the models.
#' @noRd
prepare_data <- function(data, input_vars) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
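# By default, keras::normalize() scales each row to unit L2 norm.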
keras::normalize(data_matrix)
}
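Taken together, the three helpers can be exercised on toy data. A minimal sketch with hypothetical values; it assumes a working keras/tensorflow installation and uses four species so that the halved layer widths stay whole:

library(data.table)
species <- c("s1", "s2", "s3", "s4")
toy <- data.table(
    gene = sprintf("g%i", 1:8),
    s1 = runif(8),
    s2 = runif(8),
    s3 = runif(8),
    s4 = runif(8),
    score = rep(c(1, 0), each = 4)
)
model <- create_model(length(species))
fit <- train_model(model, toy[1:6], toy[7:8], species)
scores <- stats::predict(model, prepare_data(toy, species))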


@@ -4,7 +4,12 @@
\alias{neural}
\title{Find genes by training and applying a neural network.}
\usage{
neural(seed = 180199, n_models = 5)
neural(
seed = 180199,
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5
)
}
\arguments{
\item{seed}{The seed will be used to make the results reproducible.}
@@ -15,6 +20,13 @@ training data and validated using this set. For non-training genes, the
final score will be the mean of the result of applying the different
models. There should be at least two training sets. The analysis will only
work if there is at least one reference gene per training set.}
\item{gene_requirement}{Minimum proportion of genes from the preset that a
species must have in order to be included in the models.}
\item{control_ratio}{The proportion of random control genes included in the
training data sets in addition to the reference genes. This should
be a numeric value between 0.0 and 1.0.}
}
\value{
An object of class \code{geposan_method}.