| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | #' Find genes by training and applying a neural network. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @param seed The seed will be used to make the results reproducible. | 
					
						
							|  |  |  | #' @param n_models This number specifies how many sets of training data should | 
					
						
							|  |  |  | #'   be created. For each set, there will be a model trained on the remaining | 
					
						
							|  |  |  | #'   training data and validated using this set. For non-training genes, the | 
					
						
							|  |  |  | #'   final score will be the mean of the result of applying the different | 
					
						
							|  |  |  | #'   models. There should be at least two training sets. The analysis will only | 
					
						
							|  |  |  | #'   work, if there is at least one reference gene per training set. | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  | #' @param control_ratio The proportion of random control genes that is included | 
					
						
							|  |  |  | #'   in the training data sets in addition to the reference genes. This should | 
					
						
							|  |  |  | #'   be a numeric value between 0.0 and 1.0. | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | #' | 
					
						
							|  |  |  | #' @return An object of class `geposan_method`. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @export | 
					
						
							| 
									
										
										
										
											2022-05-30 13:49:52 +02:00
										 |  |  | neural <- function(seed = 180199, n_models = 5, control_ratio = 0.5) { | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |   method( | 
					
						
							|  |  |  |     id = "neural", | 
					
						
							|  |  |  |     name = "Neural", | 
					
						
							|  |  |  |     description = "Assessment by neural network", | 
					
						
							|  |  |  |     function(preset, progress) { | 
					
						
							|  |  |  |       species_ids <- preset$species_ids | 
					
						
							|  |  |  |       gene_ids <- preset$gene_ids | 
					
						
							|  |  |  |       reference_gene_ids <- preset$reference_gene_ids | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       cached( | 
					
						
							|  |  |  |         "neural", | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |         c( | 
					
						
							|  |  |  |           species_ids, | 
					
						
							|  |  |  |           gene_ids, | 
					
						
							|  |  |  |           reference_gene_ids, | 
					
						
							|  |  |  |           seed, | 
					
						
							|  |  |  |           n_models, | 
					
						
							|  |  |  |           control_ratio | 
					
						
							|  |  |  |         ), | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |         { # nolint | 
					
						
							|  |  |  |           reference_count <- length(reference_gene_ids) | 
					
						
							|  |  |  |           stopifnot(n_models %in% 2:reference_count) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           control_count <- ceiling(reference_count * control_ratio / | 
					
						
							|  |  |  |             (1 - control_ratio)) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           # Make results reproducible. | 
					
						
							|  |  |  |           tensorflow::set_random_seed(seed) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Step 1: Prepare input data. | 
					
						
							|  |  |  |           # --------------------------- | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           # Prefilter distances by species and gene. | 
					
						
							|  |  |  |           distances <- geposan::distances[species %chin% species_ids & | 
					
						
							|  |  |  |             gene %chin% gene_ids] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Reshape data to put species into columns. | 
					
						
							|  |  |  |           data <- dcast( | 
					
						
							|  |  |  |             distances, | 
					
						
							|  |  |  |             gene ~ species, | 
					
						
							|  |  |  |             value.var = "distance" | 
					
						
							|  |  |  |           ) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           # Replace values that are still missing with mean values for the | 
					
						
							|  |  |  |           # species in question. | 
					
						
							| 
									
										
										
										
											2022-05-30 13:49:52 +02:00
										 |  |  |           data[, (species_ids) := lapply(species_ids, \(species) { | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |             species <- get(species) | 
					
						
							|  |  |  |             species[is.na(species)] <- mean(species, na.rm = TRUE) | 
					
						
							|  |  |  |             species | 
					
						
							|  |  |  |           })] | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |           progress(0.1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Step 2: Prepare training data. | 
					
						
							|  |  |  |           # ------------------------------ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           # Take out the reference data. | 
					
						
							|  |  |  |           reference_data <- data[gene %chin% reference_gene_ids] | 
					
						
							|  |  |  |           reference_data[, score := 1.0] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           # Draw control data from the remaining genes. | 
					
						
							|  |  |  |           control_data <- data[!gene %chin% reference_gene_ids][ | 
					
						
							|  |  |  |             sample(.N, control_count) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           ] | 
					
						
							|  |  |  |           control_data[, score := 0.0] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           # Randomly distribute the indices of the reference and control genes | 
					
						
							|  |  |  |           # across one bucket per model. | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           reference_sets <- split( | 
					
						
							|  |  |  |             sample(reference_count), | 
					
						
							|  |  |  |             seq_len(reference_count) %% n_models | 
					
						
							|  |  |  |           ) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           control_sets <- split( | 
					
						
							|  |  |  |             sample(control_count), | 
					
						
							|  |  |  |             seq_len(control_count) %% n_models | 
					
						
							|  |  |  |           ) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           # Prepare the data for each model. Each model will have one pair of | 
					
						
							|  |  |  |           # reference and control gene sets left out for validation. The | 
					
						
							|  |  |  |           # training data consists of all the remaining sets. | 
					
						
							|  |  |  |           networks <- lapply(seq_len(n_models), \(index) { | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |             training_data <- rbindlist(list( | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |               reference_data[!reference_sets[[index]]], | 
					
						
							|  |  |  |               control_data[!control_sets[[index]]] | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |             )) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             validation_data <- rbindlist(list( | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |               reference_data[reference_sets[[index]]], | 
					
						
							|  |  |  |               control_data[control_sets[[index]]] | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |             )) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |             list( | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |               training_data = training_data, | 
					
						
							|  |  |  |               validation_data = validation_data | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |           }) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |           # Step 3: Create, train and apply neural network. | 
					
						
							|  |  |  |           # ----------------------------------------------- | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-30 13:49:52 +02:00
										 |  |  |           data_matrix <- prepare_data(data, species_ids) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           output_vars <- NULL | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |           for (i in seq_along(networks)) { | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |             network <- networks[[i]] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |             # Create a new model for each training session, because | 
					
						
							|  |  |  |             # the model would keep its state across training | 
					
						
							|  |  |  |             # sessions otherwise. | 
					
						
							| 
									
										
										
										
											2022-05-30 13:49:52 +02:00
										 |  |  |             model <- create_model(length(species_ids)) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             # Train the model. | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |             fit <- train_model( | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |               model, | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |               network$training_data, | 
					
						
							|  |  |  |               network$validation_data, | 
					
						
							| 
									
										
										
										
											2022-05-30 13:49:52 +02:00
										 |  |  |               species_ids | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # Apply the model. | 
					
						
							|  |  |  |             data[, new_score := stats::predict(model, data_matrix)] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # Remove the values of the training data itself. | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  |             data[gene %chin% network$training_data$gene, new_score := NA] | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             output_var <- sprintf("score%i", i) | 
					
						
							|  |  |  |             setnames(data, "new_score", output_var) | 
					
						
							|  |  |  |             output_vars <- c(output_vars, output_var) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # Store the details. | 
					
						
							|  |  |  |             networks[[i]]$model <- keras::serialize_model(model) | 
					
						
							|  |  |  |             networks[[i]]$fit <- fit | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             progress(0.1 + i * (0.9 / n_models)) | 
					
						
							|  |  |  |           } | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           # Compute the final score as the mean score. | 
					
						
							|  |  |  |           data[, | 
					
						
							|  |  |  |             score := mean(as.numeric(.SD), na.rm = TRUE), | 
					
						
							|  |  |  |             .SDcols = output_vars, | 
					
						
							|  |  |  |             by = gene | 
					
						
							|  |  |  |           ] | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           progress(1.0) | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           result( | 
					
						
							|  |  |  |             method = "neural", | 
					
						
							|  |  |  |             scores = data[, .(gene, score)], | 
					
						
							|  |  |  |             details = list( | 
					
						
							|  |  |  |               seed = seed, | 
					
						
							|  |  |  |               n_models = n_models, | 
					
						
							| 
									
										
										
										
											2022-05-30 13:49:52 +02:00
										 |  |  |               all_results = data[, !..species_ids], | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |               networks = networks | 
					
						
							| 
									
										
										
										
											2021-11-23 16:26:04 +01:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |           ) | 
					
						
							| 
									
										
										
										
											2021-11-23 16:26:04 +01:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2022-05-26 12:42:19 +02:00
										 |  |  |       ) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   ) | 
					
						
							| 
									
										
										
										
											2021-10-19 13:39:55 +02:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2022-05-26 19:50:23 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | #' Create a `keras` model based on the number of input variables. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @param n_input_vars Number of input variables (i.e. species). | 
					
						
							|  |  |  | #' @return A `keras` model. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @noRd | 
					
						
							|  |  |  | create_model <- function(n_input_vars) { | 
					
						
							|  |  |  |   # Layers for the neural network. | 
					
						
							|  |  |  |   layer1 <- n_input_vars | 
					
						
							|  |  |  |   layer2 <- 0.5 * layer1 | 
					
						
							|  |  |  |   layer3 <- 0.5 * layer2 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   keras::keras_model_sequential() |> | 
					
						
							|  |  |  |     keras::layer_dense( | 
					
						
							|  |  |  |       units = layer1, | 
					
						
							|  |  |  |       activation = "relu", | 
					
						
							|  |  |  |       input_shape = n_input_vars, | 
					
						
							|  |  |  |     ) |> | 
					
						
							|  |  |  |     keras::layer_dense( | 
					
						
							|  |  |  |       units = layer2, | 
					
						
							|  |  |  |       activation = "relu", | 
					
						
							|  |  |  |       kernel_regularizer = keras::regularizer_l2() | 
					
						
							|  |  |  |     ) |> | 
					
						
							|  |  |  |     keras::layer_dense( | 
					
						
							|  |  |  |       units = layer3, | 
					
						
							|  |  |  |       activation = "relu", | 
					
						
							|  |  |  |       kernel_regularizer = keras::regularizer_l2() | 
					
						
							|  |  |  |     ) |> | 
					
						
							|  |  |  |     keras::layer_dense( | 
					
						
							|  |  |  |       units = 1, | 
					
						
							|  |  |  |       activation = "sigmoid" | 
					
						
							|  |  |  |     ) |> | 
					
						
							|  |  |  |     keras::compile( | 
					
						
							|  |  |  |       loss = keras::loss_mean_absolute_error(), | 
					
						
							|  |  |  |       optimizer = keras::optimizer_adam() | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Train a model on a specific training dataset. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @param model The model created using [create_model()]. The model will be | 
					
						
							|  |  |  | #'   changed reflecting the state after training. | 
					
						
							|  |  |  | #' @param training_data Data to fit the model to. | 
					
						
							|  |  |  | #' @param validation_data Additional data to assess the model performance. | 
					
						
							|  |  |  | #' @param input_vars Character vector of input variables that should be | 
					
						
							|  |  |  | #'   included. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @return The `keras` fit object describing the training process. | 
					
						
							|  |  |  | #' @noRd | 
					
						
							|  |  |  | train_model <- function(model, training_data, validation_data, input_vars) { | 
					
						
							|  |  |  |   training_matrix <- prepare_data(training_data, input_vars) | 
					
						
							|  |  |  |   validation_matrix <- prepare_data(validation_data, input_vars) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   keras::fit( | 
					
						
							|  |  |  |     model, | 
					
						
							|  |  |  |     x = training_matrix, | 
					
						
							|  |  |  |     y = training_data$score, | 
					
						
							|  |  |  |     validation_data = list( | 
					
						
							|  |  |  |       x_val = validation_matrix, | 
					
						
							|  |  |  |       y_val = validation_data$score | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     epochs = 500, | 
					
						
							|  |  |  |     verbose = FALSE | 
					
						
							|  |  |  |   ) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' Convert data to a matrix and normalize it. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @param data Input data. | 
					
						
							|  |  |  | #' @param input_vars Character vector of input variables that should be | 
					
						
							|  |  |  | #'   included. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @return A data matrix that can be used within the models. | 
					
						
							|  |  |  | #' @noRd | 
					
						
							|  |  |  | prepare_data <- function(data, input_vars) { | 
					
						
							|  |  |  |   data_matrix <- as.matrix(data[, ..input_vars]) | 
					
						
							|  |  |  |   colnames(data_matrix) <- NULL | 
					
						
							|  |  |  |   keras::normalize(data_matrix) | 
					
						
							|  |  |  | } |