Implement random forest model method

This commit is contained in:
Elias Projahn 2022-08-11 12:32:57 +02:00
parent 6cc9196a0d
commit 038208bfbd
3 changed files with 227 additions and 1 deletions

View file

@ -37,7 +37,8 @@ all_methods <- function() {
adjacency(),
clustering(),
correlation(),
neural()
neural(),
random_forest()
)
}

183
R/method_random_forest.R Normal file
View file

@ -0,0 +1,183 @@
#' Predict scores using a random forest.
#'
#' @param id Unique ID for the method and its results.
#' @param name Human readable name for the method.
#' @param description Method description.
#' @param seed The seed will be used to make the results reproducible.
#' @param n_models This number specifies how many sets of training data should
#' be created. For each set, there will be a model trained on the remaining
#' training data and validated using this set. For non-training genes, the
#' final score will be the mean of the result of applying the different
#' models. There should be at least two training sets. The analysis will only
#' work, if there is at least one reference gene per training set. By default,
#' one model per reference gene will be used.
#' @param control_ratio The proportion of random control genes that is included
#' in the training data sets in addition to the reference genes. This should
#' be a numeric value between 0.0 and 1.0.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
random_forest <- function(id = "rforest",
name = "Random forest",
description = "Assessment by random forest",
seed = 180199,
n_models = NULL,
control_ratio = 0.75) {
method(
id = id,
name = name,
description = description,
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
reference_count <- length(reference_gene_ids)
if (is.null(n_models)) {
n_models = reference_count
}
cached(
id,
c(
species_ids,
gene_ids,
reference_gene_ids,
seed,
n_models,
control_ratio
),
{ # nolint
stopifnot(n_models %in% 2:reference_count)
control_count <- ceiling(reference_count * control_ratio /
(1 - control_ratio))
# Make results reproducible.
set.seed(seed)
# Step 1: Prepare input data.
# ---------------------------
# Prefilter distances by species and gene.
distances <- geposan::distances[species %chin% species_ids &
gene %chin% gene_ids]
# Reshape data to put species into columns.
data <- dcast(
distances,
gene ~ species,
value.var = "distance"
)
# Replace values that are still missing with mean values for the
# species in question.
data[, (species_ids) := lapply(species_ids, \(species) {
species <- get(species)
species[is.na(species)] <- mean(species, na.rm = TRUE)
species
})]
progress(0.1)
# Step 2: Prepare training data.
# ------------------------------
# Take out the reference data.
reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0]
# Draw control data from the remaining genes.
control_data <- data[!gene %chin% reference_gene_ids][
sample(.N, control_count)
]
control_data[, score := 0.0]
# Randomly distribute the indices of the reference and control genes
# across one bucket per model.
reference_sets <- split(
sample(reference_count),
seq_len(reference_count) %% n_models
)
control_sets <- split(
sample(control_count),
seq_len(control_count) %% n_models
)
# Prepare the data for each model. Each model will have one pair of
# reference and control gene sets left out for validation. The
# training data consists of all the remaining sets.
models <- lapply(seq_len(n_models), \(index) {
training_data <- rbindlist(list(
reference_data[!reference_sets[[index]]],
control_data[!control_sets[[index]]]
))
validation_data <- rbindlist(list(
reference_data[reference_sets[[index]]],
control_data[control_sets[[index]]]
))
list(
training_data = training_data,
validation_data = validation_data
)
})
# Step 3: Create, train and apply the models.
# -------------------------------------------
output_vars <- NULL
for (i in seq_along(models)) {
model <- models[[i]]
forest <- ranger::ranger(
x = model$training_data[, ..species_ids],
y = model$training_data$score
)
# TODO: Make use of validation data.
# Apply the model.
data[, new_score := stats::predict(forest, data)$predictions]
# Remove the values of the training data itself.
data[gene %chin% model$training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
models[[i]]$forest <- forest
progress(0.1 + i * (0.9 / n_models))
}
# Compute the final score as the mean score.
data[,
score := mean(as.numeric(.SD), na.rm = TRUE),
.SDcols = output_vars,
by = gene
]
progress(1.0)
result(
method = id,
scores = data[, .(gene, score)],
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..species_ids],
models = models
)
)
}
)
}
)
}

42
man/random_forest.Rd Normal file
View file

@ -0,0 +1,42 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/method_random_forest.R
\name{random_forest}
\alias{random_forest}
\title{Predict scores using a random forest.}
\usage{
random_forest(
id = "rforest",
name = "Random forest",
description = "Assessment by random forest",
seed = 180199,
n_models = NULL,
control_ratio = 0.75
)
}
\arguments{
\item{id}{Unique ID for the method and its results.}
\item{name}{Human readable name for the method.}
\item{description}{Method description.}
\item{seed}{The seed will be used to make the results reproducible.}
\item{n_models}{This number specifies how many sets of training data should
be created. For each set, there will be a model trained on the remaining
training data and validated using this set. For non-training genes, the
final score will be the mean of the result of applying the different
models. There should be at least two training sets. The analysis will only
work, if there is at least one reference gene per training set. By default,
one model per reference gene will be used.}
\item{control_ratio}{The proportion of random control genes that is included
in the training data sets in addition to the reference genes. This should
be a numeric value between 0.0 and 1.0.}
}
\value{
An object of class \code{geposan_method}.
}
\description{
Predict scores using a random forest.
}