Restructure classes and their responsibilities

This commit is contained in:
Elias Projahn 2021-12-16 13:01:44 +01:00
parent 01ec301d6d
commit e2b93babe5
27 changed files with 974 additions and 634 deletions

View file

@ -1,81 +1,89 @@
# Score genes based on their proximity to the reference genes.
#
# This method finds the distance value with the maximum density for each gene
# (i.e. the mode of its estimated distribution). Genes are scored by comparing
# those distance values with the values of the reference genes.
adjacency <- function(preset, progress = NULL) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
#' Score genes based on their proximity to the reference genes.
#'
#' This method finds the distance value with the maximum density for each gene
#' (i.e. the mode of its estimated distribution). Genes are scored by comparing
#' those distance values with the values of the reference genes.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
adjacency <- function() {
method(
id = "adjacency",
name = "Adjacency",
description = "Adjacency to reference genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), {
# Get the virtual distance value with the highest density.
compute_densest_distance <- function(distances) {
if (length(distances) <= 2) {
mean(distances)
} else {
d <- stats::density(distances)
d$x[which.max(d$y)]
}
cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), {
# Get the virtual distance value with the highest density.
compute_densest_distance <- function(distances) {
if (length(distances) <= 2) {
mean(distances)
} else {
d <- stats::density(distances)
d$x[which.max(d$y)]
}
}
# Filter distances by species and gene and find the distance
# with the highest density of values for each gene.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids,
.(densest_distance = compute_densest_distance(distance)),
by = gene
]
# Compute the absolute value of the difference between the
# provided densest distance value in comparison to the mean of
# the densest distances of the comparison genes.
compute_difference <- function(densest_distance,
comparison_ids) {
# Get the mean of the densest distances of the reference
# genes.
mean_densest_distance <- data[
gene %chin% comparison_ids,
mean(densest_distance)
]
abs(densest_distance - mean_densest_distance)
}
# Compute the differences to the reference genes.
data[
!gene %chin% reference_gene_ids,
difference := compute_difference(
densest_distance,
reference_gene_ids
)
]
progress(0.5)
# Exclude the reference gene itself when computing its
# difference.
data[
gene %chin% reference_gene_ids,
difference := compute_difference(
densest_distance,
reference_gene_ids[reference_gene_ids != gene]
),
by = gene
]
# Compute the final score by normalizing the difference.
data[, score := 1 - difference / max(difference)]
progress(1.0)
result(
method = "adjacency",
scores = data[, .(gene, score)],
details = list(data = data)
)
})
}
# Filter distances by species and gene and find the distance with the
# highest density of values for each gene.
data <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids,
.(densest_distance = compute_densest_distance(distance)),
by = gene
]
# Compute the absolute value of the difference between the provided
# densest distance value in comparison to the mean of the densest
# distances of the comparison genes.
compute_difference <- function(densest_distance, comparison_ids) {
# Get the mean of the densest distances of the reference genes.
mean_densest_distance <- data[
gene %chin% comparison_ids,
mean(densest_distance)
]
abs(densest_distance - mean_densest_distance)
}
# Compute the differences to the reference genes.
data[
!gene %chin% reference_gene_ids,
difference := compute_difference(
densest_distance,
reference_gene_ids
)
]
if (!is.null(progress)) {
progress(0.5)
}
# Exclude the reference gene itself when computing its difference.
data[
gene %chin% reference_gene_ids,
difference := compute_difference(
densest_distance,
reference_gene_ids[reference_gene_ids != gene]
),
by = gene
]
# Compute the final score by normalizing the difference.
data[, score := 1 - difference / max(difference)]
if (!is.null(progress)) {
progress(1.0)
}
structure(
list(
results = data[, .(gene, score)],
details = data
),
class = "geposan_method_results"
)
})
)
}

View file

@ -1,16 +1,17 @@
#' Analyze by applying the specified preset.
#' Analyze genes based on position data.
#'
#' @param preset The preset to use which should be created using [preset()].
#' @param progress A function to be called for progress information. The
#' function should accept a number between 0.0 and 1.0 for the current
#' progress.
#' progress. If no function is provided, a simple text progress bar will be
#' shown.
#'
#' @returns An object containing the results of the analysis with the following
#' items:
#' \describe{
#' \item{`preset`}{The preset that was used.}
#' \item{`weights`}{The optimal weights for ranking the reference genes.}
#' \item{`ranking`}{The optimal ranking created using the weights.}
#' \item{`scores`}{Table containing all scores for each gene.}
#' \item{`results`}{Results from the different methods including details.}
#' }
#'
#' @export
@ -19,80 +20,69 @@ analyze <- function(preset, progress = NULL) {
stop("Preset is invalid. Use geposan::preset() to create one.")
}
# Available methods by ID.
#
# A method describes a way to perform a computation on gene distance data
# that results in a single score per gene. The function should accept the
# preset to apply (see [preset()]) and an optional progress function (that
# may be called with a number between 0.0 and 1.0) as its parameters.
#
# The function should return a [data.table] with the following columns:
#
# - `gene` Gene ID of the processed gene.
# - `score` Score for the gene between 0.0 and 1.0.
methods <- list(
"clusteriness" = clusteriness,
"correlation" = correlation,
"neural" = neural,
"adjacency" = adjacency,
"proximity" = proximity
)
if (is.null(progress)) {
progress_bar <- progress::progress_bar$new()
progress_bar$update(0.0)
analysis <- cached("analysis", preset, {
total_progress <- 0.0
method_count <- length(preset$methods)
results <- data.table(gene = preset$gene_ids)
for (method_id in preset$methods) {
method_progress <- if (!is.null(progress)) {
function(p) {
progress(total_progress + p / method_count)
progress <- function(progress_value) {
if (!progress_bar$finished) {
progress_bar$update(progress_value)
if (progress_value >= 1.0) {
progress_bar$terminate()
}
}
method_results <- methods[[method_id]](
preset,
progress = method_progress
)$results
setnames(method_results, "score", method_id)
results <- merge(
results,
method_results,
by = "gene"
)
total_progress <- total_progress + 1 / method_count
}
results <- structure(
results,
class = c("geposan_results", class(results))
)
weights <- optimal_weights(
results,
preset$methods,
preset$reference_gene_ids,
target = preset$optimization_target
)
ranking <- ranking(results, weights)
structure(
list(
preset = preset,
weights = weights,
ranking = ranking
),
class = "geposan_analysis"
)
})
if (!is.null(progress)) {
progress(1.0)
}
analysis
progress_buffer <- 0.0
method_count <- length(preset$methods)
method_progress <- function(progress_value) {
progress(progress_buffer + progress_value / method_count)
}
scores <- data.table(gene = preset$gene_id)
results <- list()
for (method in preset$methods) {
method_results <- method$func(preset, method_progress)
scores <- merge(scores, method_results$scores)
setnames(scores, "score", method$id)
results <- c(results, list(method_results))
progress_buffer <- progress_buffer + 1 / method_count
progress(progress_buffer)
}
structure(
list(
preset = preset,
scores = scores,
results = results
),
class = "geposan_analysis"
)
}
#' Print an analysis object.
#'
#' @param x The analysis to print.
#' @param ... Other parameters.
#'
#' @seealso [analyze()]
#'
#' @export
print.geposan_analysis <- function(x, ...) {
cat("geposan analysis:\n\n")
print(x$preset)
cat("\n")
for (result in x$results) {
print(result)
cat("\n")
}
invisible(x)
}

View file

@ -1,84 +0,0 @@
# Perform a cluster analysis.
#
# This function will cluster the data using `hclust` and `cutree` (with the
# specified height). Every cluster with at least two members qualifies for
# further analysis. Clusters are then ranked based on their size in relation
# to the number of values. The return value is a final score between zero and
# one. Lower ranking clusters contribute less to this score.
#
# @param data The values that should be scored.
# @param height The maximum span of values considered to be in one cluster.
# @param weight The weight that will be given to the next largest cluster in
# relation to the previous one. For example, if `weight` is 0.7 (the default),
# the first cluster will weigh 1.0, the second 0.7, the third 0.49 etc.
clusteriness_priv <- function(data, height = 1000000, weight = 0.7) {
n <- length(data)
# Return a score of 0.0 if there is just one or no value at all.
if (n < 2) {
return(0.0)
}
# Cluster the data and compute the cluster sizes.
tree <- stats::hclust(stats::dist(data))
clusters <- stats::cutree(tree, h = height)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "clusteriness" score.
score <- 0.0
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + weight ^ (i - 1) * cluster_score
}
}
score
}
# Process genes clustering their distance to telomeres.
clusteriness <- function(preset, progress = NULL) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
cached("clusteriness", c(species_ids, gene_ids), {
results <- data.table(gene = gene_ids)
# Prefilter the input data by species.
distances <- geposan::distances[species %chin% species_ids]
# Add an index for quickly accessing data per gene.
setkey(distances, gene)
genes_done <- 0
genes_total <- length(gene_ids)
# Perform the cluster analysis for one gene.
compute <- function(gene_id) {
data <- distances[gene_id, distance]
score <- clusteriness_priv(data)
if (!is.null(progress)) {
genes_done <<- genes_done + 1
progress(genes_done / genes_total)
}
score
}
structure(
list(
results = results[,
score := compute(gene),
by = gene
]
),
class = "geposan_method_results"
)
})
}

93
R/clustering.R Normal file
View file

@ -0,0 +1,93 @@
#' Perform a cluster analysis.
#'
#' This function will cluster the data using [stats::hclust()] and
#' [stats::cutree()]. Every cluster with at least two members qualifies for
#' further analysis. Clusters are then ranked based on their size in relation
#' to the total number of values. The return value is a final score between
#' 0.0 and 1.0. Lower ranking clusters contribute less to this score.
#'
#' @param data The values that should be scored.
#' @param span The maximum span of values considered to be in one cluster.
#' @param weight The weight that will be given to the next largest cluster in
#' relation to the previous one. For example, if `weight` is 0.7 (the
#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
#' etc.
clusteriness <- function(data, span = 1000000, weight = 0.7) {
n <- length(data)
# Return a score of 0.0 if there is just one or no value at all.
if (n < 2) {
return(0.0)
}
# Cluster the data and compute the cluster sizes.
tree <- stats::hclust(stats::dist(data))
clusters <- stats::cutree(tree, h = span)
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
# Compute the "clusteriness" score.
score <- 0.0
for (i in seq_along(cluster_sizes)) {
cluster_size <- cluster_sizes[i]
if (cluster_size >= 2) {
cluster_score <- cluster_size / n
score <- score + weight^(i - 1) * cluster_score
}
}
score
}
#' Process genes clustering their distance to telomeres.
#'
#' The result will be cached and can be reused for different presets, because
#' it is independent of the reference genes in use.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [clusteriness()]
#'
#' @export
clustering <- function() {
method(
id = "clustering",
name = "Clustering",
description = "Clustering of genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
cached("clustering", c(species_ids, gene_ids), {
scores <- data.table(gene = gene_ids)
# Prefilter the input data by species.
distances <- geposan::distances[species %chin% species_ids]
genes_done <- 0
genes_total <- length(gene_ids)
# Perform the cluster analysis for one gene.
compute <- function(gene_id) {
data <- distances[gene == gene_id, distance]
score <- clusteriness(data)
genes_done <<- genes_done + 1
progress(genes_done / genes_total)
score
}
scores[, score := compute(gene), by = gene]
result(
method = "clustering",
scores = scores
)
})
}
)
}

View file

@ -1,88 +1,101 @@
# Compute the mean correlation coefficient comparing gene distances with a set
# of reference genes.
correlation <- function(preset, progress = NULL) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
#' Compute the mean correlation coefficient comparing gene distances with a set
#' of reference genes.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
correlation <- function() {
method(
id = "correlation",
name = "Correlation",
description = "Correlation with reference genes",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
cached(
"correlation", c(species_ids, gene_ids, reference_gene_ids), {
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
cached(
"correlation",
c(species_ids, gene_ids, reference_gene_ids),
{ # nolint
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
# Tranform data to get species as rows and genes as columns. We
# construct columns per species, because it requires fewer
# iterations, and transpose the table afterwards.
# Tranform data to get species as rows and genes as columns.
# We construct columns per species, because it requires
# fewer iterations, and transpose the table afterwards.
data <- data.table(gene = gene_ids)
data <- data.table(gene = gene_ids)
# Make a column containing distance data for each species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Make a column containing distance data for each species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
data <- merge(data, species_data, all.x = TRUE)
setnames(data, "distance", species_id)
}
data <- merge(data, species_data, all.x = TRUE)
setnames(data, "distance", species_id)
}
# Transpose to the desired format.
data <- transpose(data, make.names = "gene")
# Transpose to the desired format.
data <- transpose(data, make.names = "gene")
if (!is.null(progress)) progress(0.33)
progress(0.33)
# Take the reference data.
reference_data <- data[, ..reference_gene_ids]
# Take the reference data.
reference_data <- data[, ..reference_gene_ids]
# Perform the correlation between all possible pairs.
results <- stats::cor(
data[, ..gene_ids],
reference_data,
use = "pairwise.complete.obs",
method = "spearman"
)
# Perform the correlation between all possible pairs.
results <- stats::cor(
data[, ..gene_ids],
reference_data,
use = "pairwise.complete.obs",
method = "spearman"
)
results <- data.table(results, keep.rownames = TRUE)
setnames(results, "rn", "gene")
results <- data.table(results, keep.rownames = TRUE)
setnames(results, "rn", "gene")
# Remove correlations between the reference genes themselves.
for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id)
results[gene == reference_gene_id, eval(column) := NA]
}
# Remove correlations between the reference genes
# themselves.
for (reference_gene_id in reference_gene_ids) {
column <- quote(reference_gene_id)
results[gene == reference_gene_id, eval(column) := NA]
}
if (!is.null(progress)) progress(0.66)
progress(0.66)
# Compute the final score as the mean of known correlation scores.
# Negative correlations will correctly lessen the score, which will
# be clamped to zero as its lower bound. Genes with no possible
# correlations at all will be assumed to have a score of 0.0.
# Compute the final score as the mean of known correlation
# scores. Negative correlations will correctly lessen the
# score, which will be clamped to zero as its lower bound.
# Genes with no possible correlations at all will be assumed
# to have a score of 0.0.
compute_score <- function(scores) {
score <- mean(scores, na.rm = TRUE)
compute_score <- function(scores) {
score <- mean(scores, na.rm = TRUE)
if (is.na(score) | score < 0.0) {
score <- 0.0
if (is.na(score) | score < 0.0) {
score <- 0.0
}
score
}
results[,
score := compute_score(as.matrix(.SD)),
.SDcols = reference_gene_ids,
by = gene
]
results[, .(gene, score)]
result(
method = "correlation",
scores = results[, .(gene, score)],
details = list(all_correlations = results)
)
}
score
}
results[,
score := compute_score(as.matrix(.SD)),
.SDcols = reference_gene_ids,
by = gene
]
results[, .(gene, score)]
structure(
list(
results = results[, .(gene, score)],
all_correlations = results
),
class = "geposan_method_results"
)
}
)

67
R/method.R Normal file
View file

@ -0,0 +1,67 @@
#' Describe a new method for analyzing gene position data.
#'
#' @param id Unique identifier for the method.
#' @param name Human readable name.
#' @param description Slightly longer description.
#' @param func Function to apply the method. The function should accept two
#' parameters: an object of class `geposan_preset` as input and a function to
#' report progress information to as a numeric value. The return value should
#' be an object of class `geposan_result`.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
method <- function(id, name, description, func) {
stopifnot(is.character(id) & length(id) == 1)
stopifnot(is.character(name) & length(name) == 1)
stopifnot(is.character(description) & length(description) == 1)
stopifnot(is.function(func))
structure(
list(
id = id,
name = name,
description = description,
func = func
),
class = "geposan_method"
)
}
#' Get a list of all available methods.
#'
#' @export
all_methods <- function() {
list(
clustering(),
correlation(),
neural(),
adjacency(),
proximity()
)
}
#' Print a method object.
#'
#' @param x The method to print.
#' @param ... Other parameters.
#'
#' @seealso [method()]
#'
#' @export
print.geposan_method <- function(x, ...) {
cat(sprintf(
paste0(
"geposan method:",
"\n Method ID: %s",
"\n Name: %s",
"\n Description: %s",
"\n"
),
x$id,
x$name,
x$description
))
invisible(x)
}

View file

@ -1,248 +1,254 @@
# Find genes by training and applying a neural network.
#
# @param seed The seed will be used to make the results reproducible.
# @param n_models This number specifies how many sets of training data should
# be created. For each set, there will be a model trained on the remaining
# training data and validated using this set. For non-training genes, the
# final score will be the mean of the result of applying the different
# models.
neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
#' Find genes by training and applying a neural network.
#'
#' @param seed The seed will be used to make the results reproducible.
#' @param n_models This number specifies how many sets of training data should
#' be created. For each set, there will be a model trained on the remaining
#' training data and validated using this set. For non-training genes, the
#' final score will be the mean of the result of applying the different
#' models. There should be at least two training sets. The analysis will only
#' work, if there is at least one reference gene per training set.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
neural <- function(seed = 180199, n_models = 5) {
method(
id = "neural",
name = "Neural",
description = "Assessment by neural network",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
reference_gene_ids <- preset$reference_gene_ids
cached(
"neural",
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
{ # nolint
reference_count <- length(reference_gene_ids)
if (!n_models %in% 2:reference_count) {
stop(paste0(
"n_models has to be between 2 and the number of reference ",
"genes."
))
}
cached(
"neural",
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
{ # nolint
reference_count <- length(reference_gene_ids)
stopifnot(n_models %in% 2:reference_count)
# Make results reproducible.
tensorflow::set_random_seed(seed)
# Make results reproducible.
tensorflow::set_random_seed(seed)
# Step 1: Prepare input data.
# ---------------------------
# Step 1: Prepare input data.
# ---------------------------
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
# Prefilter distances by species.
distances <- geposan::distances[species %chin% species_ids]
# Input data for the network. This contains the gene ID as an
# identifier as well as the per-species gene distances as input
# variables.
data <- data.table(gene = gene_ids)
# Input data for the network. This contains the gene ID as
# an identifier as well as the per-species gene distances as
# input variables.
data <- data.table(gene = gene_ids)
# Buffer to keep track of the names of the input variables.
input_vars <- NULL
# Buffer to keep track of the names of the input variables.
input_vars <- NULL
# Make a columns containing positions and distances for each
# species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Make a columns containing positions and distances for each
# species.
for (species_id in species_ids) {
species_data <- distances[
species == species_id,
.(gene, distance)
]
# Only include species with at least 25% known values. As
# positions and distances always coexist, we don't loose any
# data here.
# Only include species with at least 25% known values.
# As positions and distances always coexist, we don't
# loose any data here.
species_data <- stats::na.omit(species_data)
species_data <- stats::na.omit(species_data)
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
data <- merge(data, species_data, all.x = TRUE)
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
data <- merge(data, species_data, all.x = TRUE)
# Replace missing data with mean values. The neural network
# can't handle NAs in a meaningful way. Choosing extreme
# values here would result in heavily biased results.
# Therefore, the mean value is chosen as a compromise.
# However, this will of course lessen the significance of
# the results.
# Replace missing data with mean values. The neural
# network can't handle NAs in a meaningful way.
# Choosing extreme values here would result in
# heavily biased results. Therefore, the mean value
# is chosen as a compromise. However, this will of
# course lessen the significance of the results.
mean_distance <- round(species_data[, mean(distance)])
data[is.na(distance), `:=`(distance = mean_distance)]
mean_distance <- round(
species_data[, mean(distance)]
)
# Name the new column after the species.
setnames(data, "distance", species_id)
data[is.na(distance), distance := mean_distance]
# Add the input variable to the buffer.
input_vars <- c(input_vars, species_id)
}
}
# Name the new column after the species.
setnames(data, "distance", species_id)
if (!is.null(progress)) {
progress(0.1)
}
# Add the input variable to the buffer.
input_vars <- c(input_vars, species_id)
}
}
# Step 2: Prepare training data.
# ------------------------------
progress(0.1)
# Take out the reference data.
# Step 2: Prepare training data.
# ------------------------------
reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0]
# Take out the reference data.
# Take out random samples from the remaining genes. This is another
# compromise with a negative impact on significance. Because there
# is no information on genes with are explicitely *not* TPE-OLD
# genes, we have to assume that a random sample of genes has a low
# probability of including TPE-OLD genes.
reference_data <- data[gene %chin% reference_gene_ids]
reference_data[, score := 1.0]
without_reference_data <- data[!gene %chin% reference_gene_ids]
# Take out random samples from the remaining genes. This is
# another compromise with a negative impact on
# significance. We assume that a random gene is not likely
# to match the reference genes.
control_data <- without_reference_data[
sample(
nrow(without_reference_data),
reference_count
)
]
without_reference_data <- data[
!gene %chin% reference_gene_ids
]
control_data[, score := 0.0]
control_data <- without_reference_data[
sample(
nrow(without_reference_data),
reference_count
)
]
# Split the training data into random sets to have validation data
# for each model.
control_data[, score := 0.0]
# Scramble the source tables.
reference_data <- reference_data[sample(reference_count)]
control_data <- control_data[sample(reference_count)]
# Split the training data into random sets to have
# validation data for each model.
networks <- list()
# Scramble the source tables.
reference_data <- reference_data[sample(reference_count)]
control_data <- control_data[sample(reference_count)]
indices <- seq_len(reference_count)
indices_split <- split(indices, indices %% n_models)
networks <- list()
for (i in seq_len(n_models)) {
training_data <- rbindlist(list(
reference_data[!indices_split[[i]]],
control_data[!indices_split[[i]]]
))
indices <- seq_len(reference_count)
indices_split <- split(indices, indices %% n_models)
validation_data <- rbindlist(list(
reference_data[indices_split[[i]]],
control_data[indices_split[[i]]]
))
for (i in seq_len(n_models)) {
training_data <- rbindlist(list(
reference_data[!indices_split[[i]]],
control_data[!indices_split[[i]]]
))
networks[[i]] <- list(
training_data = training_data,
validation_data = validation_data
)
}
validation_data <- rbindlist(list(
reference_data[indices_split[[i]]],
control_data[indices_split[[i]]]
))
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
networks[[i]] <- list(
training_data = training_data,
validation_data = validation_data
)
}
# Layers for the neural network.
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
# Layers for the neural network.
input_layer <- length(input_vars)
layer1 <- input_layer
layer2 <- 0.5 * input_layer
layer3 <- 0.5 * layer2
data_matrix <- to_matrix(data)
output_vars <- NULL
# Convert data to matrix and normalize it.
to_matrix <- function(data) {
data_matrix <- as.matrix(data[, ..input_vars])
colnames(data_matrix) <- NULL
keras::normalize(data_matrix)
}
for (i in seq_along(networks)) {
# Create a new model for each training session, because the
# model would keep its state across training sessions otherwise.
model <- keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
data_matrix <- to_matrix(data)
output_vars <- NULL
for (i in seq_along(networks)) {
# Create a new model for each training session, because
# the model would keep its state across training
# sessions otherwise.
model <- keras::keras_model_sequential() |>
keras::layer_dense(
units = layer1,
activation = "relu",
input_shape = input_layer,
) |>
keras::layer_dense(
units = layer2,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = layer3,
activation = "relu",
kernel_regularizer = keras::regularizer_l2()
) |>
keras::layer_dense(
units = 1,
activation = "sigmoid"
) |>
keras::compile(
loss = keras::loss_mean_absolute_error(),
optimizer = keras::optimizer_adam()
)
# Train the model.
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
# Apply the model.
data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit
progress(0.1 + i * (0.9 / n_models))
}
# Compute the final score as the mean score.
data[,
score := mean(as.numeric(.SD), na.rm = TRUE),
.SDcols = output_vars,
by = gene
]
progress(1.0)
result(
method = "neural",
scores = data[, .(gene, score)],
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..input_vars],
networks = networks
)
)
# Train the model.
network <- networks[[i]]
training_data <- network$training_data
training_matrix <- to_matrix(training_data)
validation_data <- network$validation_data
validation_matrix <- to_matrix(validation_data)
fit <- keras::fit(
model,
x = training_matrix,
y = training_data$score,
validation_data = list(
x_val = validation_matrix,
y_val = validation_data$score
),
epochs = 500,
verbose = FALSE
)
# Apply the model.
data[, new_score := stats::predict(model, data_matrix)]
# Remove the values of the training data itself.
data[gene %chin% training_data$gene, new_score := NA]
output_var <- sprintf("score%i", i)
setnames(data, "new_score", output_var)
output_vars <- c(output_vars, output_var)
# Store the details.
networks[[i]]$model <- keras::serialize_model(model)
networks[[i]]$fit <- fit
if (!is.null(progress)) {
progress(0.1 + i * (0.9 / n_models))
}
}
# Compute the final score as the mean score.
data[,
score := mean(as.numeric(.SD), na.rm = TRUE),
.SDcols = output_vars,
by = gene
]
if (!is.null(progress)) {
progress(1.0)
}
structure(
list(
results = data[, .(gene, score)],
seed = seed,
n_models = n_models,
all_results = data[, !..input_vars],
networks = networks
),
class = "geposan_method_results"
)
}
)

View file

@ -5,46 +5,22 @@
#' reference genes to be able to assess the results later. The genes will be
#' filtered based on how many species have data for them. Genes which only have
#' orthologs for less than 25% of the input species will be excluded from the
#' preset and the analyis.
#' preset and the analyis. See the different method functions for the available
#' methods: [clustering()], [correlation()], [neural()], [adjacency()] and
#' [proximity()].
#'
#' Available methods are:
#'
#' - `clusteriness` How much the gene distances to the nearest telomere
#' cluster across species.
#' - `correlation` The mean correlation of gene distances to the nearest
#' telomere across species.
#' - `neural` Assessment by neural network trained on the reference genes.
#' - `adjacency` Proximity to reference genes.
#' - `proximity` Mean proximity to telomeres.
#'
#' Available optimization targets are:
#'
#' - `mean` Mean rank of the reference genes.
#' - `median` Median rank of the reference genes.
#' - `max` First rank of the reference genes.
#' - `min` Last rank of the reference genes.
#'
#' @param methods Methods to apply.
#' @param methods List of methods to apply.
#' @param species_ids IDs of species to include.
#' @param gene_ids IDs of genes to screen.
#' @param reference_gene_ids IDs of reference genes to compare to.
#' @param optimization_target Parameter of the reference genes that the ranking
#' should be optimized for.
#'
#' @return The preset to use with [analyze()].
#'
#' @export
preset <- function(methods = c(
"clusteriness",
"correlation",
"neural",
"adjacency",
"proximity"
),
species_ids = NULL,
gene_ids = NULL,
reference_gene_ids = NULL,
optimization_target = "mean_rank") {
preset <- function(methods = all_methods(),
species_ids = geposan::species$id,
gene_ids = geposan::genes$id,
reference_gene_ids) {
# Count included species per gene.
genes_n_species <- geposan::distances[
species %chin% species_ids,
@ -63,11 +39,10 @@ preset <- function(methods = c(
# for the object later.
structure(
list(
methods = sort(methods),
methods = methods,
species_ids = sort(species_ids),
gene_ids = sort(gene_ids_filtered),
reference_gene_ids = sort(reference_gene_ids),
optimization_target = optimization_target
reference_gene_ids = sort(reference_gene_ids)
),
class = "geposan_preset"
)
@ -82,25 +57,20 @@ preset <- function(methods = c(
#'
#' @export
print.geposan_preset <- function(x, ...) {
cat("geposan preset:")
cat("\n Included methods: ")
cat(x$methods, sep = ", ")
cat(sprintf(
"\n Input data: %i species, %i genes",
paste0(
"geposan preset:",
"\n Included methods: %s",
"\n Number of species: %i",
"\n Number of genes: %i",
"\n Reference genes: %i",
"\n"
),
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
length(x$species_ids),
length(x$gene_ids)
))
cat(sprintf(
"\n Comparison data: %i reference genes",
length(x$gene_ids),
length(x$reference_gene_ids)
))
cat(sprintf(
"\n Optimization target: %s\n",
x$optimization_target
))
invisible(x)
}

View file

@ -1,34 +1,39 @@
# Score the mean distance of genes to the telomeres across species.
#
# A score will be given to each gene such that 0.0 corresponds to the maximal
# mean distance across all genes and 1.0 corresponds to a distance of 0.
proximity <- function(preset, progress = NULL) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
#' Score the mean distance of genes to the telomeres across species.
#'
#' A score will be given to each gene such that 0.0 corresponds to the maximal
#' mean distance across all genes and 1.0 corresponds to a distance of 0.
#'
#' @return An object of class `geposan_method`.
#'
#' @export
proximity <- function() {
method(
id = "proximity",
name = "Proximity",
description = "Proximity to telomeres",
function(preset, progress) {
species_ids <- preset$species_ids
gene_ids <- preset$gene_ids
cached("proximity", c(species_ids, gene_ids), {
# Prefilter distances by species and gene.
data <- geposan::distances[
species %chin% preset$species_ids & gene %chin% preset$gene_ids
]
cached("proximity", c(species_ids, gene_ids), {
# Prefilter distances by species and gene.
data <- geposan::distances[
species %chin% preset$species_ids &
gene %chin% preset$gene_ids
]
# Compute the score as described above.
data <- data[, .(mean_distance = mean(distance)), by = "gene"]
max_distance <- data[, max(mean_distance)]
data[, score := 1 - mean_distance / max_distance]
# Compute the score as described above.
data <- data[, .(mean_distance = mean(distance)), by = "gene"]
max_distance <- data[, max(mean_distance)]
data[, score := 1 - mean_distance / max_distance]
if (!is.null(progress)) {
# We do everything in one go, so it's not possible to report
# detailed progress information. As the method is relatively quick,
# this should not be a problem.
progress(1.0)
progress(1.0)
result(
method = "proximity",
scores = data[, .(gene, score)]
)
})
}
structure(
list(
results = data[, .(gene, score)]
),
class = "geposan_method_results"
)
})
)
}

View file

@ -13,10 +13,10 @@
#'
#' @export
ranking <- function(analysis, weights) {
if (inherits(analysis, "geposan_analysis")) {
ranking <- copy(analysis$ranking)
} else if (inherits(analysis, "geposan_results")) {
ranking <- copy(analysis)
ranking <- if (inherits(analysis, "geposan_analysis")) {
copy(analysis$scores)
} else if (inherits(analysis, "geposan_ranking")) {
copy(analysis)
} else {
stop("Invalid analyis. Use geposan::analyze().")
}
@ -39,7 +39,7 @@ ranking <- function(analysis, weights) {
structure(
ranking,
class = c("geposan_ranking", "geposan_results", class(ranking))
class = c("geposan_ranking", class(ranking))
)
}
@ -60,7 +60,7 @@ ranking <- function(analysis, weights) {
#' @export
optimal_weights <- function(analysis, methods, reference_gene_ids,
target = "mean") {
if (!inherits(analysis, c("geposan_analysis", "geposan_results"))) {
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
stop("Invalid analyis. Use geposan::analyze().")
}

50
R/result.R Normal file
View file

@ -0,0 +1,50 @@
#' Result of applying a method on gene position data.
#'
#' @param method_id ID of the method that produced this result.
#' @param scores A `data.frame` mapping gene IDs (`gene`) to computed scores
#' between 0.0 and 1.0 (`score`).
#' @param details Optional details that may contain intermediate results as
#' well as other information on the method application.
#'
#' @return An object of class `geposan_result`.
#'
#' @export
result <- function(method_id, scores, details = list()) {
stopifnot(is.data.frame(scores) &
c("gene", "score") %chin% colnames(scores))
stopifnot(is.list(details))
structure(
list(
method_id = method_id,
scores = scores,
details = details
),
class = "geposan_result"
)
}
#' Print a result object.
#'
#' @param x The result to print.
#' @param ... Other parameters.
#'
#' @seealso [result()]
#'
#' @export
print.geposan_result <- function(x, ...) {
cat(sprintf(
paste0(
"geposan result:",
"\n Method: %s",
"\n Number of genes: %i",
"\n Available details: %s",
"\n"
),
x$method_id,
nrow(x$scores),
paste(names(x$details), collapse = ", ")
))
invisible(x)
}