neural: Refactor and increase gene requirement

This commit is contained in:
Elias Projahn 2022-05-26 19:50:23 +02:00
parent c04b6337e9
commit 49981300fb
2 changed files with 168 additions and 140 deletions

View file

@ -7,11 +7,19 @@
#' final score will be the mean of the result of applying the different #' final score will be the mean of the result of applying the different
#' models. There should be at least two training sets. The analysis will only #' models. There should be at least two training sets. The analysis will only
#' work, if there is at least one reference gene per training set. #' work, if there is at least one reference gene per training set.
#' @param gene_requirement Minimum proportion of genes from the preset that a
#' species has to have in order to be included in the models.
#' @param control_ratio The proportion of random control genes that is included
#' in the training data sets in addition to the reference genes. This should
#' be a numeric value between 0.0 and 1.0.
#' #'
#' @return An object of class `geposan_method`. #' @return An object of class `geposan_method`.
#' #'
#' @export #' @export
neural <- function(seed = 180199, n_models = 5) { neural <- function(seed = 180199,
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5) {
method( method(
id = "neural", id = "neural",
name = "Neural", name = "Neural",
@ -23,65 +31,52 @@ neural <- function(seed = 180199, n_models = 5) {
cached( cached(
"neural", "neural",
c(species_ids, gene_ids, reference_gene_ids, seed, n_models), c(
species_ids,
gene_ids,
reference_gene_ids,
seed,
n_models,
gene_requirement,
control_ratio
),
{ # nolint { # nolint
reference_count <- length(reference_gene_ids) reference_count <- length(reference_gene_ids)
stopifnot(n_models %in% 2:reference_count) stopifnot(n_models %in% 2:reference_count)
control_count <- ceiling(reference_count * control_ratio /
(1 - control_ratio))
# Make results reproducible. # Make results reproducible.
tensorflow::set_random_seed(seed) tensorflow::set_random_seed(seed)
# Step 1: Prepare input data. # Step 1: Prepare input data.
# --------------------------- # ---------------------------
# Prefilter distances by species. # Prefilter distances by species and gene.
distances <- geposan::distances[species %chin% species_ids] distances <- geposan::distances[species %chin% species_ids &
gene %chin% gene_ids]
# Input data for the network. This contains the gene ID as # Only include species that have at least 25% of the included genes.
# an identifier as well as the per-species gene distances as distances[, species_n_genes := .N, by = species]
# input variables. distances <- distances[species_n_genes >=
data <- data.table(gene = gene_ids) gene_requirement * length(gene_ids)]
included_species <- distances[, unique(species)]
# Buffer to keep track of the names of the input variables. # Reshape data to put species into columns.
input_vars <- NULL data <- dcast(
distances,
# Make a columns containing positions and distances for each gene ~ species,
# species. value.var = "distance"
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Only include species with at least 25% known values.
# As positions and distances always coexist, we don't
# loose any data here.
species_data <- stats::na.omit(species_data)
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
data <- merge(data, species_data, all.x = TRUE)
# Replace missing data with mean values. The neural
# network can't handle NAs in a meaningful way.
# Choosing extreme values here would result in
# heavily biased results. Therefore, the mean value
# is chosen as a compromise. However, this will of
# course lessen the significance of the results.
mean_distance <- round(
species_data[, mean(distance)]
) )
data[is.na(distance), distance := mean_distance] # Replace values that are still missing with mean values for the
# species in question.
# Name the new column after the species. data[, (included_species) := lapply(included_species, \(species) {
setnames(data, "distance", species_id) species <- get(species)
species[is.na(species)] <- mean(species, na.rm = TRUE)
# Add the input variable to the buffer. species
input_vars <- c(input_vars, species_id) })]
}
}
progress(0.1) progress(0.1)
@ -89,140 +84,81 @@ neural <- function(seed = 180199, n_models = 5) {
# ------------------------------ # ------------------------------
# Take out the reference data. # Take out the reference data.
reference_data <- data[gene %chin% reference_gene_ids] reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0] reference_data[, score := 1.0]
# Take out random samples from the remaining genes. This is # Draw control data from the remaining genes.
# another compromise with a negative impact on control_data <- data[!gene %chin% reference_gene_ids][
# significance. We assume that a random gene is not likely sample(.N, control_count)
# to match the reference genes.
without_reference_data <- data[
!gene %chin% reference_gene_ids
] ]
control_data <- without_reference_data[
sample(
nrow(without_reference_data),
reference_count
)
]
control_data[, score := 0.0] control_data[, score := 0.0]
# Split the training data into random sets to have # Randomly distribute the indices of the reference and control genes
# validation data for each model. # across one bucket per model.
# Scramble the source tables. reference_sets <- split(
reference_data <- reference_data[sample(reference_count)] sample(reference_count),
control_data <- control_data[sample(reference_count)] seq_len(reference_count) %% n_models
)
networks <- list() control_sets <- split(
sample(control_count),
seq_len(control_count) %% n_models
)
indices <- seq_len(reference_count) # Prepare the data for each model. Each model will have one pair of
indices_split <- split(indices, indices %% n_models) # reference and control gene sets left out for validation. The
# training data consists of all the remaining sets.
for (i in seq_len(n_models)) { networks <- lapply(seq_len(n_models), \(index) {
training_data <- rbindlist(list( training_data <- rbindlist(list(
reference_data[!indices_split[[i]]], reference_data[!reference_sets[[index]]],
control_data[!indices_split[[i]]] control_data[!control_sets[[index]]]
)) ))
validation_data <- rbindlist(list( validation_data <- rbindlist(list(
reference_data[indices_split[[i]]], reference_data[reference_sets[[index]]],
control_data[indices_split[[i]]] control_data[control_sets[[index]]]
)) ))
networks[[i]] <- list( list(
training_data = training_data, training_data = training_data,
validation_data = validation_data validation_data = validation_data
) )
} })
# Step 3: Create, train and apply neural network. # Step 3: Create, train and apply neural network.
# ----------------------------------------------- # -----------------------------------------------
# Layers for the neural network. data_matrix <- prepare_data(data, included_species)
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
data_matrix <- to_matrix(data)
output_vars <- NULL output_vars <- NULL
for (i in seq_along(networks)) { for (i in seq_along(networks)) {
network <- networks[[i]]
# Create a new model for each training session, because # Create a new model for each training session, because
# the model would keep its state across training # the model would keep its state across training
# sessions otherwise. # sessions otherwise.
model <- keras::keras_model_sequential() |> model <- create_model(length(included_species))
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
# Train the model. # Train the model.
fit <- train_model(
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
model, model,
x = training_matrix, network$training_data,
y = training_data$score, network$validation_data,
validation_data = list( included_species
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
) )
# Apply the model. # Apply the model.
data[, new_score := stats::predict(model, data_matrix)] data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself. # Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA] data[gene %chin% network$training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i) output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var) setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var) output_vars <- c(output_vars, output_var)
# Store the details. # Store the details.
networks[[i]]$model <- keras::serialize_model(model) networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit networks[[i]]$fit <- fit
@ -244,7 +180,7 @@ neural <- function(seed = 180199, n_models = 5) {
details = list( details = list(
seed = seed, seed = seed,
n_models = n_models, n_models = n_models,
all_results = data[, !..input_vars], all_results = data[, !..included_species],
networks = networks networks = networks
) )
) )
@ -253,3 +189,83 @@ neural <- function(seed = 180199, n_models = 5) {
} }
) )
} }
#' Create a `keras` model based on the number of input variables.
#'
#' @param n_input_vars Number of input variables (i.e. species).
#' @return A `keras` model.
#'
#' @noRd
create_model <- function(n_input_vars) {
# Layers for the neural network.
layer1 <- n_input_vars
layer2 <- 0.5 * layer1
layer3 <- 0.5 * layer2
keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = n_input_vars,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
}
#' Train a model on a specific training dataset.
#'
#' @param model The model created using [create_model()]. The model will be
#' changed reflecting the state after training.
#' @param training_data Data to fit the model to.
#' @param validation_data Additional data to assess the model performance.
#' @param input_vars Character vector of input variables that should be
#' included.
#'
#' @return The `keras` fit object describing the training process.
#' @noRd
train_model <- function(model, training_data, validation_data, input_vars) {
training_matrix <- prepare_data(training_data, input_vars)
validation_matrix <- prepare_data(validation_data, input_vars)
keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
}
#' Convert data to a matrix and normalize it.
#'
#' @param data Input data.
#' @param input_vars Character vector of input variables that should be
#' included.
#'
#' @return A data matrix that can be used within the models.
#' @noRd
prepare_data <- function(data, input_vars) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}

View file

@ -4,7 +4,12 @@
\alias{neural} \alias{neural}
\title{Find genes by training and applying a neural network.} \title{Find genes by training and applying a neural network.}
\usage{ \usage{
neural(seed = 180199, n_models = 5) neural(
seed = 180199,
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5
)
} }
\arguments{ \arguments{
\item{seed}{The seed will be used to make the results reproducible.} \item{seed}{The seed will be used to make the results reproducible.}
@ -15,6 +20,13 @@ training data and validated using this set. For non-training genes, the
final score will be the mean of the result of applying the different final score will be the mean of the result of applying the different
models. There should be at least two training sets. The analysis will only models. There should be at least two training sets. The analysis will only
work, if there is at least one reference gene per training set.} work, if there is at least one reference gene per training set.}
\item{gene_requirement}{Minimum proportion of genes from the preset that a
species has to have in order to be included in the models.}
\item{control_ratio}{The proportion of random control genes that is included
in the training data sets in addition to the reference genes. This should
be a numeric value between 0.0 and 1.0.}
} }
\value{ \value{
An object of class \code{geposan_method}. An object of class \code{geposan_method}.