neural: Refactor and increase gene requirement

2025-10-26 02:37:25 +01:00 · 2022-05-26 19:50:23 +02:00 · 2022-05-26 19:50:23 +02:00 · 49981300fb
commit 49981300fb
parent c04b6337e9
2 changed files with 168 additions and 140 deletions
--- a/R/method_neural.R
+++ b/R/method_neural.R
@ -7,11 +7,19 @@
 #'   final score will be the mean of the result of applying the different
 #'   models. There should be at least two training sets. The analysis will only
 #'   work, if there is at least one reference gene per training set.
 #' @param gene_requirement Minimum proportion of genes from the preset that a
 #'   species has to have in order to be included in the models.
 #' @param control_ratio The proportion of random control genes that is included
 #'   in the training data sets in addition to the reference genes. This should
 #'   be a numeric value between 0.0 and 1.0.
 #'
 #' @return An object of class `geposan_method`.
 #'
 #' @export
-neural <- function(seed = 180199, n_models = 5) {
+neural <- function(seed = 180199,
                       n_models = 5,
                       gene_requirement = 0.5,
                       control_ratio = 0.5) {
  method(
    id = "neural",
    name = "Neural",
@ -23,65 +31,52 @@ neural <- function(seed = 180199, n_models = 5) {
      cached(
        "neural",
-        c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
+        c(
          species_ids,
          gene_ids,
          reference_gene_ids,
          seed,
          n_models,
          gene_requirement,
          control_ratio
        ),
        { # nolint
          reference_count <- length(reference_gene_ids)
          stopifnot(n_models %in% 2:reference_count)
          control_count <- ceiling(reference_count * control_ratio /
            (1 - control_ratio))
          # Make results reproducible.
          tensorflow::set_random_seed(seed)
          # Step 1: Prepare input data.
          # ---------------------------
-          # Prefilter distances by species.
+          # Prefilter distances by species and gene.
-          distances <- geposan::distances[species %chin% species_ids]
+          distances <- geposan::distances[species %chin% species_ids &
            gene %chin% gene_ids]
-          # Input data for the network. This contains the gene ID as
+          # Only include species that have at least 25% of the included genes.
-          # an identifier as well as the per-species gene distances as
+          distances[, species_n_genes := .N, by = species]
-          # input variables.
+          distances <- distances[species_n_genes >=
-          data <- data.table(gene = gene_ids)
+            gene_requirement * length(gene_ids)]
          included_species <- distances[, unique(species)]
-          # Buffer to keep track of the names of the input variables.
+          # Reshape data to put species into columns.
-          input_vars <- NULL
+          data <- dcast(
            distances,
            gene ~ species,
            value.var = "distance"
          )
-          # Make a columns containing positions and distances for each
+          # Replace values that are still missing with mean values for the
-          # species.
+          # species in question.
-          for (species_id in species_ids) {
+          data[, (included_species) := lapply(included_species, \(species) {
-            species_data <- distances[
+            species <- get(species)
-              species == species_id,
+            species[is.na(species)] <- mean(species, na.rm = TRUE)
-              .(gene, distance)
+            species
-            ]
+          })]
            # Only include species with at least 25% known values.
            # As positions and distances always coexist, we don't
            # loose any data here.
            species_data <- stats::na.omit(species_data)
            if (nrow(species_data) >= 0.25 * length(gene_ids)) {
              data <- merge(data, species_data, all.x = TRUE)
              # Replace missing data with mean values. The neural
              # network can't handle NAs in a meaningful way.
              # Choosing extreme values here would result in
              # heavily biased results. Therefore, the mean value
              # is chosen as a compromise. However, this will of
              # course lessen the significance of the results.
              mean_distance <- round(
                species_data[, mean(distance)]
              )
              data[is.na(distance), distance := mean_distance]
              # Name the new column after the species.
              setnames(data, "distance", species_id)
              # Add the input variable to the buffer.
              input_vars <- c(input_vars, species_id)
            }
          }
          progress(0.1)
@ -89,140 +84,81 @@ neural <- function(seed = 180199, n_models = 5) {
          # ------------------------------
          # Take out the reference data.
          reference_data <- data[gene %chin% reference_gene_ids]
          reference_data[, score := 1.0]
-          # Take out random samples from the remaining genes. This is
+          # Draw control data from the remaining genes.
-          # another compromise with a negative impact on
+          control_data <- data[!gene %chin% reference_gene_ids][
-          # significance. We assume that a random gene is not likely
+            sample(.N, control_count)
          # to match the reference genes.
          without_reference_data <- data[
            !gene %chin% reference_gene_ids
          ]
          control_data <- without_reference_data[
            sample(
              nrow(without_reference_data),
              reference_count
            )
          ]
          control_data[, score := 0.0]
-          # Split the training data into random sets to have
+          # Randomly distribute the indices of the reference and control genes
-          # validation data for each model.
+          # across one bucket per model.
-          # Scramble the source tables.
+          reference_sets <- split(
-          reference_data <- reference_data[sample(reference_count)]
+            sample(reference_count),
-          control_data <- control_data[sample(reference_count)]
+            seq_len(reference_count) %% n_models
          )
-          networks <- list()
+          control_sets <- split(
            sample(control_count),
            seq_len(control_count) %% n_models
          )
-          indices <- seq_len(reference_count)
+          # Prepare the data for each model. Each model will have one pair of
-          indices_split <- split(indices, indices %% n_models)
+          # reference and control gene sets left out for validation. The
-
+          # training data consists of all the remaining sets.
-          for (i in seq_len(n_models)) {
+          networks <- lapply(seq_len(n_models), \(index) {
            training_data <- rbindlist(list(
-              reference_data[!indices_split[[i]]],
+              reference_data[!reference_sets[[index]]],
-              control_data[!indices_split[[i]]]
+              control_data[!control_sets[[index]]]
            ))
            validation_data <- rbindlist(list(
-              reference_data[indices_split[[i]]],
+              reference_data[reference_sets[[index]]],
-              control_data[indices_split[[i]]]
+              control_data[control_sets[[index]]]
            ))
-            networks[[i]] <- list(
+            list(
              training_data = training_data,
              validation_data = validation_data
            )
-          }
+          })
          # Step 3: Create, train and apply neural network.
          # -----------------------------------------------
-          # Layers for the neural network.
+          data_matrix <- prepare_data(data, included_species)
          input_layer <- length(input_vars)
          layer1 <- input_layer
          layer2 <- 0.5 * input_layer
          layer3 <- 0.5 * layer2
          # Convert data to matrix and normalize it.
          to_matrix <- function(data) {
            data_matrix <- as.matrix(data[, ..input_vars])
            colnames(data_matrix) <- NULL
            keras::normalize(data_matrix)
          }
          data_matrix <- to_matrix(data)
          output_vars <- NULL
          for (i in seq_along(networks)) {
            network <- networks[[i]]
            # Create a new model for each training session, because
            # the model would keep its state across training
            # sessions otherwise.
-            model <- keras::keras_model_sequential() |>
+            model <- create_model(length(included_species))
              keras::layer_dense(
                units = layer1,
                activation = "relu",
                input_shape = input_layer,
              ) |>
              keras::layer_dense(
                units = layer2,
                activation = "relu",
                kernel_regularizer = keras::regularizer_l2()
              ) |>
              keras::layer_dense(
                units = layer3,
                activation = "relu",
                kernel_regularizer = keras::regularizer_l2()
              ) |>
              keras::layer_dense(
                units = 1,
                activation = "sigmoid"
              ) |>
              keras::compile(
                loss = keras::loss_mean_absolute_error(),
                optimizer = keras::optimizer_adam()
              )
            # Train the model.
-
+            fit <- train_model(
            network <- networks[[i]]
            training_data <- network$training_data
            training_matrix <- to_matrix(training_data)
            validation_data <- network$validation_data
            validation_matrix <- to_matrix(validation_data)
            fit <- keras::fit(
              model,
-              x = training_matrix,
+              network$training_data,
-              y = training_data$score,
+              network$validation_data,
-              validation_data = list(
+              included_species
                x_val = validation_matrix,
                y_val = validation_data$score
              ),
              epochs = 500,
              verbose = FALSE
            )
            # Apply the model.
            data[, new_score := stats::predict(model, data_matrix)]
            # Remove the values of the training data itself.
-            data[gene %chin% training_data$gene, new_score := NA]
+            data[gene %chin% network$training_data$gene, new_score := NA]
            output_var <- sprintf("score%i", i)
            setnames(data, "new_score", output_var)
            output_vars <- c(output_vars, output_var)
            # Store the details.
            networks[[i]]$model <- keras::serialize_model(model)
            networks[[i]]$fit <- fit
@ -244,7 +180,7 @@ neural <- function(seed = 180199, n_models = 5) {
            details = list(
              seed = seed,
              n_models = n_models,
-              all_results = data[, !..input_vars],
+              all_results = data[, !..included_species],
              networks = networks
            )
          )
@ -253,3 +189,83 @@ neural <- function(seed = 180199, n_models = 5) {
    }
  )
 }
 #' Create a `keras` model based on the number of input variables.
 #'
 #' @param n_input_vars Number of input variables (i.e. species).
 #' @return A `keras` model.
 #'
 #' @noRd
 create_model <- function(n_input_vars) {
  # Layers for the neural network.
  layer1 <- n_input_vars
  layer2 <- 0.5 * layer1
  layer3 <- 0.5 * layer2
  keras::keras_model_sequential() |>
    keras::layer_dense(
      units = layer1,
      activation = "relu",
      input_shape = n_input_vars,
    ) |>
    keras::layer_dense(
      units = layer2,
      activation = "relu",
      kernel_regularizer = keras::regularizer_l2()
    ) |>
    keras::layer_dense(
      units = layer3,
      activation = "relu",
      kernel_regularizer = keras::regularizer_l2()
    ) |>
    keras::layer_dense(
      units = 1,
      activation = "sigmoid"
    ) |>
    keras::compile(
      loss = keras::loss_mean_absolute_error(),
      optimizer = keras::optimizer_adam()
    )
 }
 #' Train a model on a specific training dataset.
 #'
 #' @param model The model created using [create_model()]. The model will be
 #'   changed reflecting the state after training.
 #' @param training_data Data to fit the model to.
 #' @param validation_data Additional data to assess the model performance.
 #' @param input_vars Character vector of input variables that should be
 #'   included.
 #'
 #' @return The `keras` fit object describing the training process.
 #' @noRd
 train_model <- function(model, training_data, validation_data, input_vars) {
  training_matrix <- prepare_data(training_data, input_vars)
  validation_matrix <- prepare_data(validation_data, input_vars)
  keras::fit(
    model,
    x = training_matrix,
    y = training_data$score,
    validation_data = list(
      x_val = validation_matrix,
      y_val = validation_data$score
    ),
    epochs = 500,
    verbose = FALSE
  )
 }
 #' Convert data to a matrix and normalize it.
 #'
 #' @param data Input data.
 #' @param input_vars Character vector of input variables that should be
 #'   included.
 #'
 #' @return A data matrix that can be used within the models.
 #' @noRd
 prepare_data <- function(data, input_vars) {
  data_matrix <- as.matrix(data[, ..input_vars])
  colnames(data_matrix) <- NULL
  keras::normalize(data_matrix)
 }
--- a/man/neural.Rd
+++ b/man/neural.Rd
@ -4,7 +4,12 @@
 \alias{neural}
 \title{Find genes by training and applying a neural network.}
 \usage{
-neural(seed = 180199, n_models = 5)
+neural(
  seed = 180199,
  n_models = 5,
  gene_requirement = 0.5,
  control_ratio = 0.5
 )
 }
 \arguments{
 \item{seed}{The seed will be used to make the results reproducible.}
@ -15,6 +20,13 @@ training data and validated using this set. For non-training genes, the
 final score will be the mean of the result of applying the different
 models. There should be at least two training sets. The analysis will only
 work, if there is at least one reference gene per training set.}
 \item{gene_requirement}{Minimum proportion of genes from the preset that a
 species has to have in order to be included in the models.}
 \item{control_ratio}{The proportion of random control genes that is included
 in the training data sets in addition to the reference genes. This should
 be a numeric value between 0.0 and 1.0.}
 }
 \value{
 An object of class \code{geposan_method}.