From c60f6a8aff6e397a3ed0a7d4e376751f37c5e5fb Mon Sep 17 00:00:00 2001
From: Elias Projahn
Date: Sat, 14 Oct 2023 11:33:19 +0200
Subject: [PATCH] Remove neural network

---
 DESCRIPTION       |   4 +-
 NAMESPACE         |   1 -
 R/method.R        |   1 -
 R/method_neural.R | 267 ----------------------------------------------
 R/preset.R        |   4 +-
 man/neural.Rd     |  41 -------
 man/preset.Rd     |   4 +-
 7 files changed, 5 insertions(+), 317 deletions(-)
 delete mode 100644 R/method_neural.R
 delete mode 100644 man/neural.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 21f60f5..d13c223 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -24,11 +24,9 @@ Depends:
 Imports:
     data.table,
     glue,
-    keras,
     ranger,
     rlang,
-    progress,
-    tensorflow
+    progress
 Suggests:
     biomaRt,
     httr,
diff --git a/NAMESPACE b/NAMESPACE
index 60d16ad..cb6ef98 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -16,7 +16,6 @@ export(correlation)
 export(densest)
 export(distance)
 export(method)
-export(neural)
 export(optimal_weights)
 export(plot_boxplot)
 export(plot_chromosomes)
diff --git a/R/method.R b/R/method.R
index 6f8b3f2..a407f57 100644
--- a/R/method.R
+++ b/R/method.R
@@ -37,7 +37,6 @@ all_methods <- function() {
         adjacency(),
         clustering(),
         correlation(),
-        neural(),
         random_forest()
     )
 }
diff --git a/R/method_neural.R b/R/method_neural.R
deleted file mode 100644
index 438aff1..0000000
--- a/R/method_neural.R
+++ /dev/null
@@ -1,267 +0,0 @@
-#' Find genes by training and applying a neural network.
-#'
-#' @param id Unique ID for the method and its results.
-#' @param name Human readable name for the method.
-#' @param description Method description.
-#' @param seed The seed will be used to make the results reproducible.
-#' @param n_models This number specifies how many sets of training data should
-#' be created. For each set, there will be a model trained on the remaining
-#' training data and validated using this set. For non-training genes, the
-#' final score will be the mean of the result of applying the different
-#' models. There should be at least two training sets. The analysis will only
-#' work, if there is at least one reference gene per training set.
-#' @param control_ratio The proportion of random control genes that is included
-#' in the training data sets in addition to the reference genes. This should
-#' be a numeric value between 0.0 and 1.0.
-#'
-#' @return An object of class `geposan_method`.
-#'
-#' @export
-neural <- function(id = "neural",
-                   name = "Neural",
-                   description = "Assessment by neural network",
-                   seed = 180199,
-                   n_models = 5,
-                   control_ratio = 0.5) {
-    method(
-        id = id,
-        name = name,
-        description = description,
-        function(preset, progress) {
-            species_ids <- preset$species_ids
-            gene_ids <- preset$gene_ids
-            reference_gene_ids <- preset$reference_gene_ids
-
-            cached(
-                id,
-                c(
-                    species_ids,
-                    gene_ids,
-                    reference_gene_ids,
-                    seed,
-                    n_models,
-                    control_ratio
-                ),
-                { # nolint
-                    reference_count <- length(reference_gene_ids)
-                    stopifnot(n_models %in% 2:reference_count)
-
-                    control_count <- ceiling(reference_count * control_ratio /
-                        (1 - control_ratio))
-
-                    # Make results reproducible.
-                    tensorflow::set_random_seed(seed)
-
-                    # Step 1: Prepare input data.
-                    # ---------------------------
-
-                    # Prefilter distances by species and gene.
-                    distances <- geposan::distances[species %chin% species_ids &
-                        gene %chin% gene_ids]
-
-                    # Reshape data to put species into columns.
-                    data <- dcast(
-                        distances,
-                        gene ~ species,
-                        value.var = "distance"
-                    )
-
-                    # Replace values that are still missing with mean values for the
-                    # species in question.
-                    data[, (species_ids) := lapply(species_ids, \(species) {
-                        species <- get(species)
-                        species[is.na(species)] <- mean(species, na.rm = TRUE)
-                        species
-                    })]
-
-                    progress(0.1)
-
-                    # Step 2: Prepare training data.
-                    # ------------------------------
-
-                    # Take out the reference data.
-                    reference_data <- data[gene %chin% reference_gene_ids]
-                    reference_data[, score := 1.0]
-
-                    # Draw control data from the remaining genes.
-                    control_data <- data[!gene %chin% reference_gene_ids][
-                        sample(.N, control_count)
-                    ]
-                    control_data[, score := 0.0]
-
-                    # Randomly distribute the indices of the reference and control genes
-                    # across one bucket per model.
-
-                    reference_sets <- split(
-                        sample(reference_count),
-                        seq_len(reference_count) %% n_models
-                    )
-
-                    control_sets <- split(
-                        sample(control_count),
-                        seq_len(control_count) %% n_models
-                    )
-
-                    # Prepare the data for each model. Each model will have one pair of
-                    # reference and control gene sets left out for validation. The
-                    # training data consists of all the remaining sets.
-                    networks <- lapply(seq_len(n_models), \(index) {
-                        training_data <- rbindlist(list(
-                            reference_data[!reference_sets[[index]]],
-                            control_data[!control_sets[[index]]]
-                        ))
-
-                        validation_data <- rbindlist(list(
-                            reference_data[reference_sets[[index]]],
-                            control_data[control_sets[[index]]]
-                        ))
-
-                        list(
-                            training_data = training_data,
-                            validation_data = validation_data
-                        )
-                    })
-
-                    # Step 3: Create, train and apply neural network.
-                    # -----------------------------------------------
-
-                    data_matrix <- prepare_data(data, species_ids)
-                    output_vars <- NULL
-
-                    for (i in seq_along(networks)) {
-                        network <- networks[[i]]
-
-                        # Create a new model for each training session, because
-                        # the model would keep its state across training
-                        # sessions otherwise.
-                        model <- create_model(length(species_ids))
-
-                        # Train the model.
-                        fit <- train_model(
-                            model,
-                            network$training_data,
-                            network$validation_data,
-                            species_ids
-                        )
-
-                        # Apply the model.
-                        data[, new_score := stats::predict(model, data_matrix)]
-
-                        # Remove the values of the training data itself.
-                        data[gene %chin% network$training_data$gene, new_score := NA]
-
-                        output_var <- sprintf("score%i", i)
-                        setnames(data, "new_score", output_var)
-                        output_vars <- c(output_vars, output_var)
-
-                        # Store the details.
-                        networks[[i]]$model <- keras::serialize_model(model)
-                        networks[[i]]$fit <- fit
-
-                        progress(0.1 + i * (0.9 / n_models))
-                    }
-
-                    # Compute the final score as the mean score.
-                    data[,
-                        score := mean(as.numeric(.SD), na.rm = TRUE),
-                        .SDcols = output_vars,
-                        by = gene
-                    ]
-
-                    progress(1.0)
-
-                    result(
-                        method = "neural",
-                        scores = data[, .(gene, score)],
-                        details = list(
-                            seed = seed,
-                            n_models = n_models,
-                            all_results = data[, !..species_ids],
-                            networks = networks
-                        )
-                    )
-                }
-            )
-        }
-    )
-}
-
-#' Create a `keras` model based on the number of input variables.
-#'
-#' @param n_input_vars Number of input variables (i.e. species).
-#' @return A `keras` model.
-#'
-#' @noRd
-create_model <- function(n_input_vars) {
-    # Layers for the neural network.
-    layer1 <- n_input_vars
-    layer2 <- 0.5 * layer1
-    layer3 <- 0.5 * layer2
-
-    keras::keras_model_sequential() |>
-        keras::layer_dense(
-            units = layer1,
-            activation = "relu",
-            input_shape = n_input_vars,
-        ) |>
-        keras::layer_dense(
-            units = layer2,
-            activation = "relu",
-            kernel_regularizer = keras::regularizer_l2()
-        ) |>
-        keras::layer_dense(
-            units = layer3,
-            activation = "relu",
-            kernel_regularizer = keras::regularizer_l2()
-        ) |>
-        keras::layer_dense(
-            units = 1,
-            activation = "sigmoid"
-        ) |>
-        keras::compile(
-            loss = keras::loss_mean_absolute_error(),
-            optimizer = keras::optimizer_adam()
-        )
-}
-
-#' Train a model on a specific training dataset.
-#'
-#' @param model The model created using [create_model()]. The model will be
-#' changed reflecting the state after training.
-#' @param training_data Data to fit the model to.
-#' @param validation_data Additional data to assess the model performance.
-#' @param input_vars Character vector of input variables that should be
-#' included.
-#'
-#' @return The `keras` fit object describing the training process.
-#' @noRd
-train_model <- function(model, training_data, validation_data, input_vars) {
-    training_matrix <- prepare_data(training_data, input_vars)
-    validation_matrix <- prepare_data(validation_data, input_vars)
-
-    keras::fit(
-        model,
-        x = training_matrix,
-        y = training_data$score,
-        validation_data = list(
-            x_val = validation_matrix,
-            y_val = validation_data$score
-        ),
-        epochs = 500,
-        verbose = FALSE
-    )
-}
-
-#' Convert data to a matrix and normalize it.
-#'
-#' @param data Input data.
-#' @param input_vars Character vector of input variables that should be
-#' included.
-#'
-#' @return A data matrix that can be used within the models.
-#' @noRd
-prepare_data <- function(data, input_vars) {
-    data_matrix <- as.matrix(data[, ..input_vars])
-    colnames(data_matrix) <- NULL
-    keras::normalize(data_matrix)
-}
diff --git a/R/preset.R b/R/preset.R
index d44d8d2..4178f65 100644
--- a/R/preset.R
+++ b/R/preset.R
@@ -5,8 +5,8 @@
 #' reference genes to be able to assess the results later. The genes will be
 #' filtered based on how many species have data for them. Afterwards, species
 #' that still have many missing genes will also be excluded. See the different
-#' method functions for the available methods: [clustering()], [correlation()],
-#' [distance()], [neural()] and [random_forest()].
+#' method functions for the available methods: [distance()], [variation()],
+#' [clustering()], [adjacency()], [correlation()] and [random_forest()].
 #'
 #' @param reference_gene_ids IDs of reference genes to compare to.
 #' @param methods List of methods to apply.
diff --git a/man/neural.Rd b/man/neural.Rd
deleted file mode 100644
index c7962ea..0000000
--- a/man/neural.Rd
+++ /dev/null
@@ -1,41 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/method_neural.R
-\name{neural}
-\alias{neural}
-\title{Find genes by training and applying a neural network.}
-\usage{
-neural(
-  id = "neural",
-  name = "Neural",
-  description = "Assessment by neural network",
-  seed = 180199,
-  n_models = 5,
-  control_ratio = 0.5
-)
-}
-\arguments{
-\item{id}{Unique ID for the method and its results.}
-
-\item{name}{Human readable name for the method.}
-
-\item{description}{Method description.}
-
-\item{seed}{The seed will be used to make the results reproducible.}
-
-\item{n_models}{This number specifies how many sets of training data should
-be created. For each set, there will be a model trained on the remaining
-training data and validated using this set. For non-training genes, the
-final score will be the mean of the result of applying the different
-models. There should be at least two training sets. The analysis will only
-work, if there is at least one reference gene per training set.}
-
-\item{control_ratio}{The proportion of random control genes that is included
-in the training data sets in addition to the reference genes. This should
-be a numeric value between 0.0 and 1.0.}
-}
-\value{
-An object of class \code{geposan_method}.
-}
-\description{
-Find genes by training and applying a neural network.
-}
diff --git a/man/preset.Rd b/man/preset.Rd
index c53fc9e..d5a8bcb 100644
--- a/man/preset.Rd
+++ b/man/preset.Rd
@@ -37,6 +37,6 @@ analysis. Note that the genes to process should normally include the
 reference genes to be able to assess the results later. The genes will be
 filtered based on how many species have data for them. Afterwards, species
 that still have many missing genes will also be excluded. See the different
-method functions for the available methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}},
-\code{\link[=distance]{distance()}}, \code{\link[=neural]{neural()}} and \code{\link[=random_forest]{random_forest()}}.
+method functions for the available methods: \code{\link[=distance]{distance()}}, \code{\link[=variation]{variation()}},
+\code{\link[=clustering]{clustering()}}, \code{\link[=adjacency]{adjacency()}}, \code{\link[=correlation]{correlation()}} and \code{\link[=random_forest]{random_forest()}}.
 }