Implement random forest model method

2025-10-25 19:37:23 +02:00 · 2022-08-11 12:32:57 +02:00 · 2022-08-11 12:32:57 +02:00 · 038208bfbd
commit 038208bfbd
parent 6cc9196a0d
3 changed files with 227 additions and 1 deletions
--- a/R/method.R
+++ b/R/method.R
@ -37,7 +37,8 @@ all_methods <- function() {
    adjacency(),
    clustering(),
    correlation(),
-    neural()
+    neural(),
+    random_forest()
  )
 }

--- a/R/method_random_forest.R
+++ b/R/method_random_forest.R
@ -0,0 +1,183 @@
+#' Predict scores using a random forest.
+#'
+#' @param id Unique ID for the method and its results.
+#' @param name Human readable name for the method.
+#' @param description Method description.
+#' @param seed The seed will be used to make the results reproducible.
+#' @param n_models This number specifies how many sets of training data should
+#'   be created. For each set, there will be a model trained on the remaining
+#'   training data and validated using this set. For non-training genes, the
+#'   final score will be the mean of the result of applying the different
+#'   models. There should be at least two training sets. The analysis will only
+#'   work, if there is at least one reference gene per training set. By default,
+#'   one model per reference gene will be used.
+#' @param control_ratio The proportion of random control genes that is included
+#'   in the training data sets in addition to the reference genes. This should
+#'   be a numeric value between 0.0 and 1.0.
+#'
+#' @return An object of class `geposan_method`.
+#'
+#' @export
+random_forest <- function(id = "rforest",
+                   name = "Random forest",
+                   description = "Assessment by random forest",
+                   seed = 180199,
+                   n_models = NULL,
+                   control_ratio = 0.75) {
+  method(
+    id = id,
+    name = name,
+    description = description,
+    function(preset, progress) {
+      species_ids <- preset$species_ids
+      gene_ids <- preset$gene_ids
+      reference_gene_ids <- preset$reference_gene_ids
+
+      reference_count <- length(reference_gene_ids)
+      if (is.null(n_models)) {
+        n_models = reference_count
+      }
+
+      cached(
+        id,
+        c(
+          species_ids,
+          gene_ids,
+          reference_gene_ids,
+          seed,
+          n_models,
+          control_ratio
+        ),
+        { # nolint
+          stopifnot(n_models %in% 2:reference_count)
+
+          control_count <- ceiling(reference_count * control_ratio /
+            (1 - control_ratio))
+
+          # Make results reproducible.
+          set.seed(seed)
+
+          # Step 1: Prepare input data.
+          # ---------------------------
+
+          # Prefilter distances by species and gene.
+          distances <- geposan::distances[species %chin% species_ids &
+            gene %chin% gene_ids]
+
+          # Reshape data to put species into columns.
+          data <- dcast(
+            distances,
+            gene ~ species,
+            value.var = "distance"
+          )
+
+          # Replace values that are still missing with mean values for the
+          # species in question.
+          data[, (species_ids) := lapply(species_ids, \(species) {
+            species <- get(species)
+            species[is.na(species)] <- mean(species, na.rm = TRUE)
+            species
+          })]
+
+          progress(0.1)
+
+          # Step 2: Prepare training data.
+          # ------------------------------
+
+          # Take out the reference data.
+          reference_data <- data[gene %chin% reference_gene_ids]
+          reference_data[, score := 1.0]
+
+          # Draw control data from the remaining genes.
+          control_data <- data[!gene %chin% reference_gene_ids][
+            sample(.N, control_count)
+          ]
+          control_data[, score := 0.0]
+
+          # Randomly distribute the indices of the reference and control genes
+          # across one bucket per model.
+
+          reference_sets <- split(
+            sample(reference_count),
+            seq_len(reference_count) %% n_models
+          )
+
+          control_sets <- split(
+            sample(control_count),
+            seq_len(control_count) %% n_models
+          )
+
+          # Prepare the data for each model. Each model will have one pair of
+          # reference and control gene sets left out for validation. The
+          # training data consists of all the remaining sets.
+          models <- lapply(seq_len(n_models), \(index) {
+            training_data <- rbindlist(list(
+              reference_data[!reference_sets[[index]]],
+              control_data[!control_sets[[index]]]
+            ))
+
+            validation_data <- rbindlist(list(
+              reference_data[reference_sets[[index]]],
+              control_data[control_sets[[index]]]
+            ))
+
+            list(
+              training_data = training_data,
+              validation_data = validation_data
+            )
+          })
+
+          # Step 3: Create, train and apply the models.
+          # -------------------------------------------
+
+          output_vars <- NULL
+
+          for (i in seq_along(models)) {
+            model <- models[[i]]
+            forest <- ranger::ranger(
+              x = model$training_data[, ..species_ids],
+              y = model$training_data$score
+            )
+
+            # TODO: Make use of validation data.
+
+            # Apply the model.
+            data[, new_score := stats::predict(forest, data)$predictions]
+
+            # Remove the values of the training data itself.
+            data[gene %chin% model$training_data$gene, new_score := NA]
+
+            output_var <- sprintf("score%i", i)
+            setnames(data, "new_score", output_var)
+            output_vars <- c(output_vars, output_var)
+
+            # Store the details.
+            models[[i]]$forest <- forest
+
+            progress(0.1 + i * (0.9 / n_models))
+          }
+
+          # Compute the final score as the mean score.
+          data[,
+            score := mean(as.numeric(.SD), na.rm = TRUE),
+            .SDcols = output_vars,
+            by = gene
+          ]
+
+          progress(1.0)
+
+          result(
+            method = id,
+            scores = data[, .(gene, score)],
+            details = list(
+              seed = seed,
+              n_models = n_models,
+              all_results = data[, !..species_ids],
+              models = models
+            )
+          )
+        }
+      )
+    }
+  )
+}
--- a/man/random_forest.Rd
+++ b/man/random_forest.Rd
@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/method_random_forest.R
+\name{random_forest}
+\alias{random_forest}
+\title{Predict scores using a random forest.}
+\usage{
+random_forest(
+  id = "rforest",
+  name = "Random forest",
+  description = "Assessment by random forest",
+  seed = 180199,
+  n_models = NULL,
+  control_ratio = 0.75
+)
+}
+\arguments{
+\item{id}{Unique ID for the method and its results.}
+
+\item{name}{Human readable name for the method.}
+
+\item{description}{Method description.}
+
+\item{seed}{The seed will be used to make the results reproducible.}
+
+\item{n_models}{This number specifies how many sets of training data should
+be created. For each set, there will be a model trained on the remaining
+training data and validated using this set. For non-training genes, the
+final score will be the mean of the result of applying the different
+models. There should be at least two training sets. The analysis will only
+work, if there is at least one reference gene per training set. By default,
+one model per reference gene will be used.}
+
+\item{control_ratio}{The proportion of random control genes that is included
+in the training data sets in addition to the reference genes. This should
+be a numeric value between 0.0 and 1.0.}
+}
+\value{
+An object of class \code{geposan_method}.
+}
+\description{
+Predict scores using a random forest.
+}