From 61422a6a0633c42131fb1e1f233109cd4e404b58 Mon Sep 17 00:00:00 2001
From: Elias Projahn
Date: Sun, 21 Nov 2021 23:55:56 +0100
Subject: [PATCH] neural: Reimplement merging positions and distances

---
 DESCRIPTION |   2 +-
 R/analyze.R |   3 -
 R/neural.R  | 269 ++++++++++++++++++++++++++++------------------
 R/preset.R  |   1 -
 4 files changed, 146 insertions(+), 129 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a74b1fe..2b0f092 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -22,7 +22,7 @@ Depends:
     R (>= 2.10)
 Imports:
     data.table,
-    neuralnet,
+    keras,
     rlang
 Suggests:
     biomaRt,
diff --git a/R/analyze.R b/R/analyze.R
index 54e4f46..6db623b 100644
--- a/R/analyze.R
+++ b/R/analyze.R
@@ -40,9 +40,6 @@ analyze <- function(preset, progress = NULL) {
             correlation(..., use_positions = TRUE)
         },
         "neural" = neural,
-        "neural_positions" = function(...) {
-            neural(..., use_positions = TRUE)
-        },
         "proximity" = proximity
     )

diff --git a/R/neural.R b/R/neural.R
index 2987ed1..5c9e8d2 100644
--- a/R/neural.R
+++ b/R/neural.R
@@ -1,146 +1,167 @@
-# Find genes by training a neural network on reference position data.
-#
-# @param seed A seed to get reproducible results.
-neural <- function(preset,
-                   use_positions = FALSE,
-                   progress = NULL,
-                   seed = 448077) {
+# Find genes by training and applying a neural network.
+neural <- function(preset, progress = NULL, seed = 49641) {
     species_ids <- preset$species_ids
     gene_ids <- preset$gene_ids
     reference_gene_ids <- preset$reference_gene_ids

-    cached(
-        "neural",
-        c(species_ids, gene_ids, reference_gene_ids, use_positions),
-        { # nolint
-            set.seed(seed)
-            gene_count <- length(gene_ids)
+    cached("neural", c(species_ids, gene_ids, reference_gene_ids), {
+        set.seed(seed)
+        gene_count <- length(gene_ids)

-            # Prefilter distances by species.
-            distances <- geposan::distances[species %chin% species_ids]
+        # Prefilter distances by species.
+        distances <- geposan::distances[species %chin% species_ids]

-            # Input data for the network. This contains the gene ID as an
-            # identifier as well as the per-species gene distances as input
-            # variables.
-            data <- data.table(gene = gene_ids)
+        # Input data for the network. This contains the gene ID as an
+        # identifier as well as the per-species gene distances as input
+        # variables.
+        data <- data.table(gene = gene_ids)

-            # Buffer to keep track of species included in the computation.
-            # Species from `species_ids` may be excluded if they don't have
-            # enough data.
-            species_ids_included <- NULL
+        # Buffer to keep track of the names of the input variables.
+        input_vars <- NULL

-            # Make a column containing positions for each species.
-            for (species_id in species_ids) {
-                species_data <- if (use_positions) {
-                    setnames(distances[
-                        species == species_id,
-                        .(gene, position)
-                    ], "position", "distance")
-                } else {
-                    distances[
-                        species == species_id,
-                        .(gene, distance)
-                    ]
-                }
-
-                # Only include species with at least 25% known values.
-
-                species_data <- stats::na.omit(species_data)
-
-                if (nrow(species_data) >= 0.25 * gene_count) {
-                    species_ids_included <- c(species_ids_included, species_id)
-                    data <- merge(data, species_data, all.x = TRUE)
-
-                    # Replace missing data with mean values. The neural network
-                    # can't handle NAs in a meaningful way. Choosing extreme
-                    # values here would result in heavily biased results.
-                    # Therefore, the mean value is chosen as a compromise.
-                    # However, this will of course lessen the significance of
-                    # the results.
-
-                    mean_distance <- round(species_data[, mean(distance)])
-                    data[is.na(distance), distance := mean_distance]
-
-                    # Name the new column after the species.
-                    setnames(data, "distance", species_id)
-                }
-            }
-
-            # Extract the reference genes.
-
-            reference_data <- data[gene %chin% reference_gene_ids]
-            reference_data[, neural := 1.0]
-
-            # Take out random samples from the remaining genes. This is another
-            # compromise with a negative impact on significance. Because there
-            # is no information on genes with are explicitely *not* TPE-OLD
-            # genes, we have to assume that a random sample of genes has a low
-            # probability of including TPE-OLD genes.
-
-            without_reference_data <- data[!gene %chin% reference_gene_ids]
-
-            reference_samples <- without_reference_data[
-                sample(
-                    nrow(without_reference_data),
-                    nrow(reference_data)
-                )
+        # Make columns containing positions and distances for each
+        # species.
+        for (species_id in species_ids) {
+            species_data <- distances[
+                species == species_id,
+                .(gene, position, distance)
             ]

-            reference_samples[, neural := 0.0]
+            # Only include species with at least 25% known values. As
+            # positions and distances always coexist, we don't lose any
+            # data here.

-            # Merge training data. The training data includes all reference
-            # genes as well as an equal number of random sample genes.
-            training_data <- rbindlist(list(reference_data, reference_samples))
+            species_data <- stats::na.omit(species_data)

-            # Construct and train the neural network.
+            if (nrow(species_data) >= 0.25 * gene_count) {
+                data <- merge(data, species_data, all.x = TRUE)

-            nn_formula <- stats::as.formula(sprintf(
-                "neural~%s",
-                paste(species_ids_included, collapse = "+")
-            ))
+                # Replace missing data with mean values. The neural network
+                # can't handle NAs in a meaningful way. Choosing extreme
+                # values here would result in heavily biased results.
+                # Therefore, the mean value is chosen as a compromise.
+                # However, this will of course lessen the significance of
+                # the results.

-            layer1 <- length(species_ids) * 0.66
-            layer2 <- layer1 * 0.66
-            layer3 <- layer2 * 0.66
+                mean_position <- round(species_data[, mean(position)])
+                mean_distance <- round(species_data[, mean(distance)])

-            nn <- neuralnet::neuralnet(
-                nn_formula,
-                training_data,
-                hidden = c(layer1, layer2, layer3),
-                linear.output = FALSE
-            )
+                data[is.na(distance), `:=`(
+                    position = mean_position,
+                    distance = mean_distance
+                )]

-            if (!is.null(progress)) {
-                progress(0.33)
-            }
+                input_position <- sprintf("%s_position", species_id)
+                input_distance <- sprintf("%s_distance", species_id)

-            # Apply the neural network.
-            data[, score := neuralnet::compute(nn, data)$net.result]
-
-            # Reconstruct and run the neural network for each training gene
-            # by training it without the respective gene.
-            for (gene_id in training_data$gene) {
-                nn <- neuralnet::neuralnet(
-                    nn_formula,
-                    training_data[gene != gene_id],
-                    hidden = c(layer1, layer2, layer3),
-                    linear.output = FALSE
+                # Name the new columns after the species.
+                setnames(
+                    data,
+                    c("position", "distance"),
+                    c(input_position, input_distance)
                 )

-                data[
-                    gene == gene_id,
-                    score := neuralnet::compute(
-                        nn,
-                        training_data[gene == gene_id]
-                    )$net.result
-                ]
+                # Add the input variables to the buffer.
+                input_vars <- c(input_vars, input_position, input_distance)
             }
-
-            if (!is.null(progress)) {
-                progress(1.0)
-            }
-
-            data[, .(gene, score)]
         }
-    )
+
+        # Extract the reference genes.
+
+        reference_data <- data[gene %chin% reference_gene_ids]
+        reference_data[, score := 1.0]
+
+        # Take out random samples from the remaining genes. This is another
+        # compromise with a negative impact on significance. Because there
+        # is no information on genes which are explicitly *not* TPE-OLD
+        # genes, we have to assume that a random sample of genes has a low
+        # probability of including TPE-OLD genes.
+
+        without_reference_data <- data[!gene %chin% reference_gene_ids]
+
+        reference_samples <- without_reference_data[
+            sample(
+                nrow(without_reference_data),
+                nrow(reference_data)
+            )
+        ]
+
+        reference_samples[, score := 0.0]
+
+        # Merge training data. The training data includes all reference
+        # genes as well as an equal number of random sample genes.
+        training_data <- rbindlist(list(reference_data, reference_samples))
+
+        # Layers for the neural network.
+        input_layer <- length(input_vars)
+        layer1 <- input_layer
+        layer2 <- 0.5 * input_layer
+        layer3 <- 0.5 * layer2
+
+        # Train the model using the specified subset of the training data and
+        # apply it to predict the given genes.
+        apply_network <- function(training_gene_ids, gene_ids) {
+            # Create a new model for each training session, because the model
+            # would keep its state across training sessions otherwise.
+            model <- keras::keras_model_sequential() |>
+                keras::layer_dense(
+                    units = layer1,
+                    activation = "relu",
+                    input_shape = input_layer
+                ) |>
+                keras::layer_dense(
+                    units = layer2,
+                    activation = "relu",
+                    kernel_regularizer = keras::regularizer_l2()
+                ) |>
+                keras::layer_dense(
+                    units = layer3,
+                    activation = "relu",
+                    kernel_regularizer = keras::regularizer_l2()
+                ) |>
+                keras::layer_dense(
+                    units = 1,
+                    activation = "sigmoid"
+                ) |>
+                keras::compile(loss = "binary_crossentropy")
+
+            # Prepare training data by filtering it to the given genes and
+            # converting it to a matrix.
+            training_data <- training_data[gene %chin% training_gene_ids]
+            training_matrix <- as.matrix(training_data[, ..input_vars])
+            colnames(training_matrix) <- NULL
+            training_matrix <- keras::normalize(training_matrix)
+
+            keras::fit(
+                model,
+                x = training_matrix,
+                y = training_data$score,
+                epochs = 300,
+                verbose = FALSE
+            )
+
+            # Convert the input data to a matrix.
+            data_matrix <- as.matrix(data[gene %chin% gene_ids, ..input_vars])
+            colnames(data_matrix) <- NULL
+            data_matrix <- keras::normalize(data_matrix)
+
+            data[gene %chin% gene_ids, score := predict(model, data_matrix)]
+        }
+
+        # Apply the network to all non-training genes first.
+        apply_network(
+            training_data$gene,
+            gene_ids[!gene_ids %chin% training_data$gene]
+        )
+
+        # Apply the network to each training gene, excluding it from training.
+        for (training_gene_id in training_data$gene) {
+            apply_network(
+                training_data[gene != training_gene_id, gene],
+                training_gene_id
+            )
+        }
+
+        data[, .(gene, score)]
+    })
 }
diff --git a/R/preset.R b/R/preset.R
index 9d3bb84..441082c 100644
--- a/R/preset.R
+++ b/R/preset.R
@@ -43,7 +43,6 @@ preset <- function(methods = c(
                        "correlation",
                        "correlation_positions",
                        "neural",
-                       "neural_positions",
                        "proximity"
                    ),
                    species_ids = NULL,