mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 10:47:25 +01:00
Restructure classes and their responsibilities
This commit is contained in:
parent
01ec301d6d
commit
e2b93babe5
27 changed files with 974 additions and 634 deletions
|
|
@ -24,6 +24,7 @@ Imports:
|
|||
data.table,
|
||||
keras,
|
||||
rlang,
|
||||
progress,
|
||||
tensorflow
|
||||
Suggests:
|
||||
biomaRt,
|
||||
|
|
|
|||
11
NAMESPACE
11
NAMESPACE
|
|
@ -1,10 +1,19 @@
|
|||
# Generated by roxygen2: do not edit by hand
|
||||
|
||||
S3method(print,geposan_analysis)
|
||||
S3method(print,geposan_comparison)
|
||||
S3method(print,geposan_method)
|
||||
S3method(print,geposan_preset)
|
||||
S3method(print,geposan_result)
|
||||
S3method(print,geposan_validation)
|
||||
export(adjacency)
|
||||
export(all_methods)
|
||||
export(analyze)
|
||||
export(clustering)
|
||||
export(compare)
|
||||
export(correlation)
|
||||
export(method)
|
||||
export(neural)
|
||||
export(optimal_weights)
|
||||
export(plot_boxplot)
|
||||
export(plot_chromosomes)
|
||||
|
|
@ -12,6 +21,8 @@ export(plot_positions)
|
|||
export(plot_rankings)
|
||||
export(plot_scores)
|
||||
export(preset)
|
||||
export(proximity)
|
||||
export(ranking)
|
||||
export(result)
|
||||
export(validate)
|
||||
import(data.table)
|
||||
|
|
|
|||
|
|
@ -1,9 +1,18 @@
|
|||
# Score genes based on their proximity to the reference genes.
|
||||
#
|
||||
# This method finds the distance value with the maximum density for each gene
|
||||
# (i.e. the mode of its estimated distribution). Genes are scored by comparing
|
||||
# those distance values with the values of the reference genes.
|
||||
adjacency <- function(preset, progress = NULL) {
|
||||
#' Score genes based on their proximity to the reference genes.
|
||||
#'
|
||||
#' This method finds the distance value with the maximum density for each gene
|
||||
#' (i.e. the mode of its estimated distribution). Genes are scored by comparing
|
||||
#' those distance values with the values of the reference genes.
|
||||
#'
|
||||
#' @return An object of class `geposan_method`.
|
||||
#'
|
||||
#' @export
|
||||
adjacency <- function() {
|
||||
method(
|
||||
id = "adjacency",
|
||||
name = "Adjacency",
|
||||
description = "Adjacency to reference genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
|
@ -19,19 +28,21 @@ adjacency <- function(preset, progress = NULL) {
|
|||
}
|
||||
}
|
||||
|
||||
# Filter distances by species and gene and find the distance with the
|
||||
# highest density of values for each gene.
|
||||
# Filter distances by species and gene and find the distance
|
||||
# with the highest density of values for each gene.
|
||||
data <- geposan::distances[
|
||||
species %chin% species_ids & gene %chin% gene_ids,
|
||||
.(densest_distance = compute_densest_distance(distance)),
|
||||
by = gene
|
||||
]
|
||||
|
||||
# Compute the absolute value of the difference between the provided
|
||||
# densest distance value in comparison to the mean of the densest
|
||||
# distances of the comparison genes.
|
||||
compute_difference <- function(densest_distance, comparison_ids) {
|
||||
# Get the mean of the densest distances of the reference genes.
|
||||
# Compute the absolute value of the difference between the
|
||||
# provided densest distance value in comparison to the mean of
|
||||
# the densest distances of the comparison genes.
|
||||
compute_difference <- function(densest_distance,
|
||||
comparison_ids) {
|
||||
# Get the mean of the densest distances of the reference
|
||||
# genes.
|
||||
mean_densest_distance <- data[
|
||||
gene %chin% comparison_ids,
|
||||
mean(densest_distance)
|
||||
|
|
@ -49,11 +60,10 @@ adjacency <- function(preset, progress = NULL) {
|
|||
)
|
||||
]
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress(0.5)
|
||||
}
|
||||
|
||||
# Exclude the reference gene itself when computing its difference.
|
||||
# Exclude the reference gene itself when computing its
|
||||
# difference.
|
||||
data[
|
||||
gene %chin% reference_gene_ids,
|
||||
difference := compute_difference(
|
||||
|
|
@ -66,16 +76,14 @@ adjacency <- function(preset, progress = NULL) {
|
|||
# Compute the final score by normalizing the difference.
|
||||
data[, score := 1 - difference / max(difference)]
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress(1.0)
|
||||
}
|
||||
|
||||
structure(
|
||||
list(
|
||||
results = data[, .(gene, score)],
|
||||
details = data
|
||||
),
|
||||
class = "geposan_method_results"
|
||||
result(
|
||||
method = "adjacency",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(data = data)
|
||||
)
|
||||
})
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
114
R/analyze.R
114
R/analyze.R
|
|
@ -1,16 +1,17 @@
|
|||
#' Analyze by applying the specified preset.
|
||||
#' Analyze genes based on position data.
|
||||
#'
|
||||
#' @param preset The preset to use which should be created using [preset()].
|
||||
#' @param progress A function to be called for progress information. The
|
||||
#' function should accept a number between 0.0 and 1.0 for the current
|
||||
#' progress.
|
||||
#' progress. If no function is provided, a simple text progress bar will be
|
||||
#' shown.
|
||||
#'
|
||||
#' @returns An object containing the results of the analysis with the following
|
||||
#' items:
|
||||
#' \describe{
|
||||
#' \item{`preset`}{The preset that was used.}
|
||||
#' \item{`weights`}{The optimal weights for ranking the reference genes.}
|
||||
#' \item{`ranking`}{The optimal ranking created using the weights.}
|
||||
#' \item{`scores`}{Table containing all scores for each gene.}
|
||||
#' \item{`results`}{Results from the different methods including details.}
|
||||
#' }
|
||||
#'
|
||||
#' @export
|
||||
|
|
@ -19,80 +20,69 @@ analyze <- function(preset, progress = NULL) {
|
|||
stop("Preset is invalid. Use geposan::preset() to create one.")
|
||||
}
|
||||
|
||||
# Available methods by ID.
|
||||
#
|
||||
# A method describes a way to perform a computation on gene distance data
|
||||
# that results in a single score per gene. The function should accept the
|
||||
# preset to apply (see [preset()]) and an optional progress function (that
|
||||
# may be called with a number between 0.0 and 1.0) as its parameters.
|
||||
#
|
||||
# The function should return a [data.table] with the following columns:
|
||||
#
|
||||
# - `gene` Gene ID of the processed gene.
|
||||
# - `score` Score for the gene between 0.0 and 1.0.
|
||||
methods <- list(
|
||||
"clusteriness" = clusteriness,
|
||||
"correlation" = correlation,
|
||||
"neural" = neural,
|
||||
"adjacency" = adjacency,
|
||||
"proximity" = proximity
|
||||
)
|
||||
if (is.null(progress)) {
|
||||
progress_bar <- progress::progress_bar$new()
|
||||
progress_bar$update(0.0)
|
||||
|
||||
analysis <- cached("analysis", preset, {
|
||||
total_progress <- 0.0
|
||||
progress <- function(progress_value) {
|
||||
if (!progress_bar$finished) {
|
||||
progress_bar$update(progress_value)
|
||||
if (progress_value >= 1.0) {
|
||||
progress_bar$terminate()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
progress_buffer <- 0.0
|
||||
method_count <- length(preset$methods)
|
||||
results <- data.table(gene = preset$gene_ids)
|
||||
|
||||
for (method_id in preset$methods) {
|
||||
method_progress <- if (!is.null(progress)) {
|
||||
function(p) {
|
||||
progress(total_progress + p / method_count)
|
||||
}
|
||||
method_progress <- function(progress_value) {
|
||||
progress(progress_buffer + progress_value / method_count)
|
||||
}
|
||||
|
||||
method_results <- methods[[method_id]](
|
||||
preset,
|
||||
progress = method_progress
|
||||
)$results
|
||||
scores <- data.table(gene = preset$gene_id)
|
||||
results <- list()
|
||||
|
||||
setnames(method_results, "score", method_id)
|
||||
for (method in preset$methods) {
|
||||
method_results <- method$func(preset, method_progress)
|
||||
|
||||
results <- merge(
|
||||
results,
|
||||
method_results,
|
||||
by = "gene"
|
||||
)
|
||||
scores <- merge(scores, method_results$scores)
|
||||
setnames(scores, "score", method$id)
|
||||
|
||||
total_progress <- total_progress + 1 / method_count
|
||||
results <- c(results, list(method_results))
|
||||
|
||||
progress_buffer <- progress_buffer + 1 / method_count
|
||||
progress(progress_buffer)
|
||||
}
|
||||
|
||||
results <- structure(
|
||||
results,
|
||||
class = c("geposan_results", class(results))
|
||||
)
|
||||
|
||||
weights <- optimal_weights(
|
||||
results,
|
||||
preset$methods,
|
||||
preset$reference_gene_ids,
|
||||
target = preset$optimization_target
|
||||
)
|
||||
|
||||
ranking <- ranking(results, weights)
|
||||
|
||||
structure(
|
||||
list(
|
||||
preset = preset,
|
||||
weights = weights,
|
||||
ranking = ranking
|
||||
scores = scores,
|
||||
results = results
|
||||
),
|
||||
class = "geposan_analysis"
|
||||
)
|
||||
})
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress(1.0)
|
||||
}
|
||||
|
||||
analysis
|
||||
#' Print an analysis object.
#'
#' Writes a heading, then the preset of the analysis, then every entry of
#' `x$results`. Each printed object is followed by an empty line.
#'
#' @param x The analysis to print.
#' @param ... Other parameters. They are not used by this method.
#'
#' @return `x`, invisibly.
#'
#' @seealso [analyze()]
#'
#' @export
print.geposan_analysis <- function(x, ...) {
    # Print one component using its own print method and separate it from
    # whatever follows with an empty line.
    print_with_gap <- function(object) {
        print(object)
        cat("\n")
    }

    cat("geposan analysis:\n\n")
    print_with_gap(x$preset)

    for (index in seq_along(x$results)) {
        print_with_gap(x$results[[index]])
    }

    invisible(x)
}
|
||||
|
|
|
|||
|
|
@ -1,84 +0,0 @@
|
|||
# Perform a cluster analysis.
|
||||
#
|
||||
# This function will cluster the data using `hclust` and `cutree` (with the
|
||||
# specified height). Every cluster with at least two members qualifies for
|
||||
# further analysis. Clusters are then ranked based on their size in relation
|
||||
# to the number of values. The return value is a final score between zero and
|
||||
# one. Lower ranking clusters contribute less to this score.
|
||||
#
|
||||
# @param data The values that should be scored.
|
||||
# @param height The maximum span of values considered to be in one cluster.
|
||||
# @param weight The weight that will be given to the next largest cluster in
|
||||
# relation to the previous one. For example, if `weight` is 0.7 (the default),
|
||||
# the first cluster will weigh 1.0, the second 0.7, the third 0.49 etc.
|
||||
clusteriness_priv <- function(data, height = 1000000, weight = 0.7) {
|
||||
n <- length(data)
|
||||
|
||||
# Return a score of 0.0 if there is just one or no value at all.
|
||||
if (n < 2) {
|
||||
return(0.0)
|
||||
}
|
||||
|
||||
# Cluster the data and compute the cluster sizes.
|
||||
|
||||
tree <- stats::hclust(stats::dist(data))
|
||||
clusters <- stats::cutree(tree, h = height)
|
||||
cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)
|
||||
|
||||
# Compute the "clusteriness" score.
|
||||
|
||||
score <- 0.0
|
||||
|
||||
for (i in seq_along(cluster_sizes)) {
|
||||
cluster_size <- cluster_sizes[i]
|
||||
|
||||
if (cluster_size >= 2) {
|
||||
cluster_score <- cluster_size / n
|
||||
score <- score + weight ^ (i - 1) * cluster_score
|
||||
}
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
# Process genes clustering their distance to telomeres.
|
||||
clusteriness <- function(preset, progress = NULL) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
|
||||
cached("clusteriness", c(species_ids, gene_ids), {
|
||||
results <- data.table(gene = gene_ids)
|
||||
|
||||
# Prefilter the input data by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
|
||||
# Add an index for quickly accessing data per gene.
|
||||
setkey(distances, gene)
|
||||
|
||||
genes_done <- 0
|
||||
genes_total <- length(gene_ids)
|
||||
|
||||
# Perform the cluster analysis for one gene.
|
||||
compute <- function(gene_id) {
|
||||
data <- distances[gene_id, distance]
|
||||
score <- clusteriness_priv(data)
|
||||
|
||||
if (!is.null(progress)) {
|
||||
genes_done <<- genes_done + 1
|
||||
progress(genes_done / genes_total)
|
||||
}
|
||||
|
||||
score
|
||||
}
|
||||
|
||||
structure(
|
||||
list(
|
||||
results = results[,
|
||||
score := compute(gene),
|
||||
by = gene
|
||||
]
|
||||
),
|
||||
class = "geposan_method_results"
|
||||
)
|
||||
})
|
||||
}
|
||||
93
R/clustering.R
Normal file
93
R/clustering.R
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
#' Perform a cluster analysis.
#'
#' This function will cluster the data using [stats::hclust()] and
#' [stats::cutree()]. Every cluster with at least two members qualifies for
#' further analysis. Clusters are then ranked based on their size in relation
#' to the total number of values. Lower ranking clusters contribute less to
#' the final score.
#'
#' @param data The values that should be scored.
#' @param span The maximum span of values considered to be in one cluster.
#'   It is passed to [stats::cutree()] as the height `h`.
#' @param weight The weight that will be given to the next largest cluster in
#'   relation to the previous one. For example, if `weight` is 0.7 (the
#'   default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
#'   etc.
#'
#' @return A single number. It is 0.0 if `data` has fewer than two values or
#'   if no cluster has at least two members. For a `weight` between 0.0 and
#'   1.0 the score does not exceed 1.0.
clusteriness <- function(data, span = 1000000, weight = 0.7) {
    n <- length(data)

    # Return a score of 0.0 if there is just one or no value at all.
    if (n < 2) {
        return(0.0)
    }

    # Cluster the data and compute the cluster sizes, largest first.
    tree <- stats::hclust(stats::dist(data))
    clusters <- stats::cutree(tree, h = span)
    cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE)

    # Only clusters with at least two members count. As the sizes are sorted
    # in decreasing order, the position of a cluster is also its rank and
    # determines its weight.
    qualifying <- cluster_sizes >= 2
    rank_weights <- weight^(seq_along(cluster_sizes) - 1)

    sum(rank_weights[qualifying] * cluster_sizes[qualifying] / n)
}
|
||||
|
||||
#' Process genes clustering their distance to telomeres.
#'
#' Every gene of the preset is scored by applying [clusteriness()] to its
#' distance values within the included species.
#'
#' The result will be cached and can be reused for different presets, because
#' it is independent of the reference genes in use.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [clusteriness()]
#'
#' @export
clustering <- function() {
    # The function that applies the method to a preset. It reports its
    # progress once per processed gene.
    run_clustering <- function(preset, progress) {
        species_ids <- preset$species_ids
        gene_ids <- preset$gene_ids

        cached("clustering", c(species_ids, gene_ids), {
            # Only data for the included species is relevant.
            species_distances <- geposan::distances[
                species %chin% species_ids
            ]

            n_total <- length(gene_ids)
            n_done <- 0

            # Score a single gene and report the progress afterwards. The
            # counter lives in the enclosing environment, which is why it
            # has to be updated using `<<-`.
            score_gene <- function(gene_id) {
                gene_score <- clusteriness(
                    species_distances[gene == gene_id, distance]
                )

                n_done <<- n_done + 1
                progress(n_done / n_total)

                gene_score
            }

            scores <- data.table(gene = gene_ids)
            scores[, score := score_gene(gene), by = gene]

            result(method = "clustering", scores = scores)
        })
    }

    method(
        id = "clustering",
        name = "Clustering",
        description = "Clustering of genes",
        func = run_clustering
    )
}
|
||||
|
|
@ -1,18 +1,29 @@
|
|||
# Compute the mean correlation coefficient comparing gene distances with a set
|
||||
# of reference genes.
|
||||
correlation <- function(preset, progress = NULL) {
|
||||
#' Compute the mean correlation coefficient comparing gene distances with a set
|
||||
#' of reference genes.
|
||||
#'
|
||||
#' @return An object of class `geposan_method`.
|
||||
#'
|
||||
#' @export
|
||||
correlation <- function() {
|
||||
method(
|
||||
id = "correlation",
|
||||
name = "Correlation",
|
||||
description = "Correlation with reference genes",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
||||
cached(
|
||||
"correlation", c(species_ids, gene_ids, reference_gene_ids), {
|
||||
"correlation",
|
||||
c(species_ids, gene_ids, reference_gene_ids),
|
||||
{ # nolint
|
||||
# Prefilter distances by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
|
||||
# Tranform data to get species as rows and genes as columns. We
|
||||
# construct columns per species, because it requires fewer
|
||||
# iterations, and transpose the table afterwards.
|
||||
# Transform data to get species as rows and genes as columns.
|
||||
# We construct columns per species, because it requires
|
||||
# fewer iterations, and transpose the table afterwards.
|
||||
|
||||
data <- data.table(gene = gene_ids)
|
||||
|
||||
|
|
@ -30,7 +41,7 @@ correlation <- function(preset, progress = NULL) {
|
|||
# Transpose to the desired format.
|
||||
data <- transpose(data, make.names = "gene")
|
||||
|
||||
if (!is.null(progress)) progress(0.33)
|
||||
progress(0.33)
|
||||
|
||||
# Take the reference data.
|
||||
reference_data <- data[, ..reference_gene_ids]
|
||||
|
|
@ -46,18 +57,20 @@ correlation <- function(preset, progress = NULL) {
|
|||
results <- data.table(results, keep.rownames = TRUE)
|
||||
setnames(results, "rn", "gene")
|
||||
|
||||
# Remove correlations between the reference genes themselves.
|
||||
# Remove correlations between the reference genes
|
||||
# themselves.
|
||||
for (reference_gene_id in reference_gene_ids) {
|
||||
column <- quote(reference_gene_id)
|
||||
results[gene == reference_gene_id, eval(column) := NA]
|
||||
}
|
||||
|
||||
if (!is.null(progress)) progress(0.66)
|
||||
progress(0.66)
|
||||
|
||||
# Compute the final score as the mean of known correlation scores.
|
||||
# Negative correlations will correctly lessen the score, which will
|
||||
# be clamped to zero as its lower bound. Genes with no possible
|
||||
# correlations at all will be assumed to have a score of 0.0.
|
||||
# Compute the final score as the mean of known correlation
|
||||
# scores. Negative correlations will correctly lessen the
|
||||
# score, which will be clamped to zero as its lower bound.
|
||||
# Genes with no possible correlations at all will be assumed
|
||||
# to have a score of 0.0.
|
||||
|
||||
compute_score <- function(scores) {
|
||||
score <- mean(scores, na.rm = TRUE)
|
||||
|
|
@ -77,12 +90,12 @@ correlation <- function(preset, progress = NULL) {
|
|||
|
||||
results[, .(gene, score)]
|
||||
|
||||
structure(
|
||||
list(
|
||||
results = results[, .(gene, score)],
|
||||
all_correlations = results
|
||||
),
|
||||
class = "geposan_method_results"
|
||||
result(
|
||||
method = "correlation",
|
||||
scores = results[, .(gene, score)],
|
||||
details = list(all_correlations = results)
|
||||
)
|
||||
}
|
||||
)
|
||||
}
|
||||
)
|
||||
|
|
|
|||
67
R/method.R
Normal file
67
R/method.R
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
#' Describe a new method for analyzing gene position data.
#'
#' @param id Unique identifier for the method. Has to be a single,
#'   non-missing string.
#' @param name Human readable name. Has to be a single, non-missing string.
#' @param description Slightly longer description. Has to be a single,
#'   non-missing string.
#' @param func Function to apply the method. The function should accept two
#'   parameters: an object of class `geposan_preset` as input and a function to
#'   report progress information to as a numeric value. The return value should
#'   be an object of class `geposan_result`.
#'
#' @return An object of class `geposan_method`. This is a list with the items
#'   `id`, `name`, `description` and `func`.
#'
#' @export
method <- function(id, name, description, func) {
    # Scalar checks use `&&`, which short-circuits and never yields a vector.
    is_string <- function(value) {
        is.character(value) && length(value) == 1 && !is.na(value)
    }

    stopifnot(
        "`id` has to be a single string" = is_string(id),
        "`name` has to be a single string" = is_string(name),
        "`description` has to be a single string" = is_string(description),
        "`func` has to be a function" = is.function(func)
    )

    structure(
        list(
            id = id,
            name = name,
            description = description,
            func = func
        ),
        class = "geposan_method"
    )
}
|
||||
|
||||
#' Get a list of all available methods.
#'
#' Every method is created by calling its constructor without arguments.
#'
#' @return An unnamed list holding the return values of [clustering()],
#'   [correlation()], [neural()], [adjacency()] and [proximity()], in this
#'   order.
#'
#' @export
all_methods <- function() {
    constructors <- list(
        clustering,
        correlation,
        neural,
        adjacency,
        proximity
    )

    lapply(constructors, function(construct) construct())
}
|
||||
|
||||
#' Print a method object.
#'
#' Writes a heading followed by the ID, the name and the description of the
#' method, each on its own line.
#'
#' @param x The method to print.
#' @param ... Other parameters. They are not used by this method.
#'
#' @return `x`, invisibly.
#'
#' @seealso [method()]
#'
#' @export
print.geposan_method <- function(x, ...) {
    # One entry per output line. The empty last entry yields the trailing
    # newline.
    template <- paste(
        "geposan method:",
        " Method ID: %s",
        " Name: %s",
        " Description: %s",
        "",
        sep = "\n"
    )

    cat(sprintf(template, x$id, x$name, x$description))

    invisible(x)
}
|
||||
106
R/neural.R
106
R/neural.R
|
|
@ -1,12 +1,22 @@
|
|||
# Find genes by training and applying a neural network.
|
||||
#
|
||||
# @param seed The seed will be used to make the results reproducible.
|
||||
# @param n_models This number specifies how many sets of training data should
|
||||
# be created. For each set, there will be a model trained on the remaining
|
||||
# training data and validated using this set. For non-training genes, the
|
||||
# final score will be the mean of the result of applying the different
|
||||
# models.
|
||||
neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
||||
#' Find genes by training and applying a neural network.
|
||||
#'
|
||||
#' @param seed The seed will be used to make the results reproducible.
|
||||
#' @param n_models This number specifies how many sets of training data should
|
||||
#' be created. For each set, there will be a model trained on the remaining
|
||||
#' training data and validated using this set. For non-training genes, the
|
||||
#' final score will be the mean of the result of applying the different
|
||||
#' models. There should be at least two training sets. The analysis will only
|
||||
#' work, if there is at least one reference gene per training set.
|
||||
#'
|
||||
#' @return An object of class `geposan_method`.
|
||||
#'
|
||||
#' @export
|
||||
neural <- function(seed = 180199, n_models = 5) {
|
||||
method(
|
||||
id = "neural",
|
||||
name = "Neural",
|
||||
description = "Assessment by neural network",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
reference_gene_ids <- preset$reference_gene_ids
|
||||
|
|
@ -16,12 +26,7 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
c(species_ids, gene_ids, reference_gene_ids, seed, n_models),
|
||||
{ # nolint
|
||||
reference_count <- length(reference_gene_ids)
|
||||
if (!n_models %in% 2:reference_count) {
|
||||
stop(paste0(
|
||||
"n_models has to be between 2 and the number of reference ",
|
||||
"genes."
|
||||
))
|
||||
}
|
||||
stopifnot(n_models %in% 2:reference_count)
|
||||
|
||||
# Make results reproducible.
|
||||
tensorflow::set_random_seed(seed)
|
||||
|
|
@ -32,9 +37,9 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
# Prefilter distances by species.
|
||||
distances <- geposan::distances[species %chin% species_ids]
|
||||
|
||||
# Input data for the network. This contains the gene ID as an
|
||||
# identifier as well as the per-species gene distances as input
|
||||
# variables.
|
||||
# Input data for the network. This contains the gene ID as
|
||||
# an identifier as well as the per-species gene distances as
|
||||
# input variables.
|
||||
data <- data.table(gene = gene_ids)
|
||||
|
||||
# Buffer to keep track of the names of the input variables.
|
||||
|
|
@ -48,24 +53,27 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
.(gene, distance)
|
||||
]
|
||||
|
||||
# Only include species with at least 25% known values. As
|
||||
# positions and distances always coexist, we don't loose any
|
||||
# data here.
|
||||
# Only include species with at least 25% known values.
|
||||
# As positions and distances always coexist, we don't
|
||||
# lose any data here.
|
||||
|
||||
species_data <- stats::na.omit(species_data)
|
||||
|
||||
if (nrow(species_data) >= 0.25 * length(gene_ids)) {
|
||||
data <- merge(data, species_data, all.x = TRUE)
|
||||
|
||||
# Replace missing data with mean values. The neural network
|
||||
# can't handle NAs in a meaningful way. Choosing extreme
|
||||
# values here would result in heavily biased results.
|
||||
# Therefore, the mean value is chosen as a compromise.
|
||||
# However, this will of course lessen the significance of
|
||||
# the results.
|
||||
# Replace missing data with mean values. The neural
|
||||
# network can't handle NAs in a meaningful way.
|
||||
# Choosing extreme values here would result in
|
||||
# heavily biased results. Therefore, the mean value
|
||||
# is chosen as a compromise. However, this will of
|
||||
# course lessen the significance of the results.
|
||||
|
||||
mean_distance <- round(species_data[, mean(distance)])
|
||||
data[is.na(distance), `:=`(distance = mean_distance)]
|
||||
mean_distance <- round(
|
||||
species_data[, mean(distance)]
|
||||
)
|
||||
|
||||
data[is.na(distance), distance := mean_distance]
|
||||
|
||||
# Name the new column after the species.
|
||||
setnames(data, "distance", species_id)
|
||||
|
|
@ -75,9 +83,7 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
}
|
||||
}
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress(0.1)
|
||||
}
|
||||
|
||||
# Step 2: Prepare training data.
|
||||
# ------------------------------
|
||||
|
|
@ -87,13 +93,14 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
reference_data <- data[gene %chin% reference_gene_ids]
|
||||
reference_data[, score := 1.0]
|
||||
|
||||
# Take out random samples from the remaining genes. This is another
|
||||
# compromise with a negative impact on significance. Because there
|
||||
# is no information on genes with are explicitely *not* TPE-OLD
|
||||
# genes, we have to assume that a random sample of genes has a low
|
||||
# probability of including TPE-OLD genes.
|
||||
# Take out random samples from the remaining genes. This is
|
||||
# another compromise with a negative impact on
|
||||
# significance. We assume that a random gene is not likely
|
||||
# to match the reference genes.
|
||||
|
||||
without_reference_data <- data[!gene %chin% reference_gene_ids]
|
||||
without_reference_data <- data[
|
||||
!gene %chin% reference_gene_ids
|
||||
]
|
||||
|
||||
control_data <- without_reference_data[
|
||||
sample(
|
||||
|
|
@ -104,8 +111,8 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
|
||||
control_data[, score := 0.0]
|
||||
|
||||
# Split the training data into random sets to have validation data
|
||||
# for each model.
|
||||
# Split the training data into random sets to have
|
||||
# validation data for each model.
|
||||
|
||||
# Scramble the source tables.
|
||||
reference_data <- reference_data[sample(reference_count)]
|
||||
|
|
@ -153,8 +160,9 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
output_vars <- NULL
|
||||
|
||||
for (i in seq_along(networks)) {
|
||||
# Create a new model for each training session, because the
|
||||
# model would keep its state across training sessions otherwise.
|
||||
# Create a new model for each training session, because
|
||||
# the model would keep its state across training
|
||||
# sessions otherwise.
|
||||
model <- keras::keras_model_sequential() |>
|
||||
keras::layer_dense(
|
||||
units = layer1,
|
||||
|
|
@ -218,10 +226,8 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
networks[[i]]$model <- keras::serialize_model(model)
|
||||
networks[[i]]$fit <- fit
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress(0.1 + i * (0.9 / n_models))
|
||||
}
|
||||
}
|
||||
|
||||
# Compute the final score as the mean score.
|
||||
data[,
|
||||
|
|
@ -230,19 +236,19 @@ neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) {
|
|||
by = gene
|
||||
]
|
||||
|
||||
if (!is.null(progress)) {
|
||||
progress(1.0)
|
||||
}
|
||||
|
||||
structure(
|
||||
list(
|
||||
results = data[, .(gene, score)],
|
||||
result(
|
||||
method = "neural",
|
||||
scores = data[, .(gene, score)],
|
||||
details = list(
|
||||
seed = seed,
|
||||
n_models = n_models,
|
||||
all_results = data[, !..input_vars],
|
||||
networks = networks
|
||||
),
|
||||
class = "geposan_method_results"
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
}
|
||||
)
|
||||
|
|
|
|||
70
R/preset.R
70
R/preset.R
|
|
@ -5,46 +5,22 @@
|
|||
#' reference genes to be able to assess the results later. The genes will be
|
||||
#' filtered based on how many species have data for them. Genes which only have
|
||||
#' orthologs for less than 25% of the input species will be excluded from the
|
||||
#' preset and the analyis.
|
||||
#' preset and the analysis. See the different method functions for the available
|
||||
#' methods: [clustering()], [correlation()], [neural()], [adjacency()] and
|
||||
#' [proximity()].
|
||||
#'
|
||||
#' Available methods are:
|
||||
#'
|
||||
#' - `clusteriness` How much the gene distances to the nearest telomere
|
||||
#' cluster across species.
|
||||
#' - `correlation` The mean correlation of gene distances to the nearest
|
||||
#' telomere across species.
|
||||
#' - `neural` Assessment by neural network trained on the reference genes.
|
||||
#' - `adjacency` Proximity to reference genes.
|
||||
#' - `proximity` Mean proximity to telomeres.
|
||||
#'
|
||||
#' Available optimization targets are:
|
||||
#'
|
||||
#' - `mean` Mean rank of the reference genes.
|
||||
#' - `median` Median rank of the reference genes.
|
||||
#' - `max` First rank of the reference genes.
|
||||
#' - `min` Last rank of the reference genes.
|
||||
#'
|
||||
#' @param methods Methods to apply.
|
||||
#' @param methods List of methods to apply.
|
||||
#' @param species_ids IDs of species to include.
|
||||
#' @param gene_ids IDs of genes to screen.
|
||||
#' @param reference_gene_ids IDs of reference genes to compare to.
|
||||
#' @param optimization_target Parameter of the reference genes that the ranking
|
||||
#' should be optimized for.
|
||||
#'
|
||||
#' @return The preset to use with [analyze()].
|
||||
#'
|
||||
#' @export
|
||||
preset <- function(methods = c(
|
||||
"clusteriness",
|
||||
"correlation",
|
||||
"neural",
|
||||
"adjacency",
|
||||
"proximity"
|
||||
),
|
||||
species_ids = NULL,
|
||||
gene_ids = NULL,
|
||||
reference_gene_ids = NULL,
|
||||
optimization_target = "mean_rank") {
|
||||
preset <- function(methods = all_methods(),
|
||||
species_ids = geposan::species$id,
|
||||
gene_ids = geposan::genes$id,
|
||||
reference_gene_ids) {
|
||||
# Count included species per gene.
|
||||
genes_n_species <- geposan::distances[
|
||||
species %chin% species_ids,
|
||||
|
|
@ -63,11 +39,10 @@ preset <- function(methods = c(
|
|||
# for the object later.
|
||||
structure(
|
||||
list(
|
||||
methods = sort(methods),
|
||||
methods = methods,
|
||||
species_ids = sort(species_ids),
|
||||
gene_ids = sort(gene_ids_filtered),
|
||||
reference_gene_ids = sort(reference_gene_ids),
|
||||
optimization_target = optimization_target
|
||||
reference_gene_ids = sort(reference_gene_ids)
|
||||
),
|
||||
class = "geposan_preset"
|
||||
)
|
||||
|
|
@ -82,25 +57,20 @@ preset <- function(methods = c(
|
|||
#'
|
||||
#' @export
|
||||
print.geposan_preset <- function(x, ...) {
|
||||
cat("geposan preset:")
|
||||
cat("\n Included methods: ")
|
||||
cat(x$methods, sep = ", ")
|
||||
|
||||
cat(sprintf(
|
||||
"\n Input data: %i species, %i genes",
|
||||
paste0(
|
||||
"geposan preset:",
|
||||
"\n Included methods: %s",
|
||||
"\n Number of species: %i",
|
||||
"\n Number of genes: %i",
|
||||
"\n Reference genes: %i",
|
||||
"\n"
|
||||
),
|
||||
paste(sapply(x$methods, function(m) m$id), collapse = ", "),
|
||||
length(x$species_ids),
|
||||
length(x$gene_ids)
|
||||
))
|
||||
|
||||
cat(sprintf(
|
||||
"\n Comparison data: %i reference genes",
|
||||
length(x$gene_ids),
|
||||
length(x$reference_gene_ids)
|
||||
))
|
||||
|
||||
cat(sprintf(
|
||||
"\n Optimization target: %s\n",
|
||||
x$optimization_target
|
||||
))
|
||||
|
||||
invisible(x)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,15 +1,25 @@
|
|||
# Score the mean distance of genes to the telomeres across species.
|
||||
#
|
||||
# A score will be given to each gene such that 0.0 corresponds to the maximal
|
||||
# mean distance across all genes and 1.0 corresponds to a distance of 0.
|
||||
proximity <- function(preset, progress = NULL) {
|
||||
#' Score the mean distance of genes to the telomeres across species.
|
||||
#'
|
||||
#' A score will be given to each gene such that 0.0 corresponds to the maximal
|
||||
#' mean distance across all genes and 1.0 corresponds to a distance of 0.
|
||||
#'
|
||||
#' @return An object of class `geposan_method`.
|
||||
#'
|
||||
#' @export
|
||||
proximity <- function() {
|
||||
method(
|
||||
id = "proximity",
|
||||
name = "Proximity",
|
||||
description = "Proximity to telomeres",
|
||||
function(preset, progress) {
|
||||
species_ids <- preset$species_ids
|
||||
gene_ids <- preset$gene_ids
|
||||
|
||||
cached("proximity", c(species_ids, gene_ids), {
|
||||
# Prefilter distances by species and gene.
|
||||
data <- geposan::distances[
|
||||
species %chin% preset$species_ids & gene %chin% preset$gene_ids
|
||||
species %chin% preset$species_ids &
|
||||
gene %chin% preset$gene_ids
|
||||
]
|
||||
|
||||
# Compute the score as described above.
|
||||
|
|
@ -17,18 +27,13 @@ proximity <- function(preset, progress = NULL) {
|
|||
max_distance <- data[, max(mean_distance)]
|
||||
data[, score := 1 - mean_distance / max_distance]
|
||||
|
||||
if (!is.null(progress)) {
|
||||
# We do everything in one go, so it's not possible to report
|
||||
# detailed progress information. As the method is relatively quick,
|
||||
# this should not be a problem.
|
||||
progress(1.0)
|
||||
}
|
||||
|
||||
structure(
|
||||
list(
|
||||
results = data[, .(gene, score)]
|
||||
),
|
||||
class = "geposan_method_results"
|
||||
result(
|
||||
method = "proximity",
|
||||
scores = data[, .(gene, score)]
|
||||
)
|
||||
})
|
||||
}
|
||||
)
|
||||
}
|
||||
|
|
|
|||
12
R/ranking.R
12
R/ranking.R
|
|
@ -13,10 +13,10 @@
|
|||
#'
|
||||
#' @export
|
||||
ranking <- function(analysis, weights) {
|
||||
if (inherits(analysis, "geposan_analysis")) {
|
||||
ranking <- copy(analysis$ranking)
|
||||
} else if (inherits(analysis, "geposan_results")) {
|
||||
ranking <- copy(analysis)
|
||||
ranking <- if (inherits(analysis, "geposan_analysis")) {
|
||||
copy(analysis$scores)
|
||||
} else if (inherits(analysis, "geposan_ranking")) {
|
||||
copy(analysis)
|
||||
} else {
|
||||
stop("Invalid analyis. Use geposan::analyze().")
|
||||
}
|
||||
|
|
@ -39,7 +39,7 @@ ranking <- function(analysis, weights) {
|
|||
|
||||
structure(
|
||||
ranking,
|
||||
class = c("geposan_ranking", "geposan_results", class(ranking))
|
||||
class = c("geposan_ranking", class(ranking))
|
||||
)
|
||||
}
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ ranking <- function(analysis, weights) {
|
|||
#' @export
|
||||
optimal_weights <- function(analysis, methods, reference_gene_ids,
|
||||
target = "mean") {
|
||||
if (!inherits(analysis, c("geposan_analysis", "geposan_results"))) {
|
||||
if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) {
|
||||
stop("Invalid analyis. Use geposan::analyze().")
|
||||
}
|
||||
|
||||
|
|
|
|||
50
R/result.R
Normal file
50
R/result.R
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
#' Result of applying a method on gene position data.
#'
#' Validates the inputs and bundles them into an S3 object. Only the type of
#' `scores` and `details` and the presence of the required columns are
#' checked; the value range of the scores is not verified.
#'
#' @param method_id ID of the method that produced this result.
#' @param scores A `data.frame` mapping gene IDs (`gene`) to computed scores
#'   between 0.0 and 1.0 (`score`).
#' @param details Optional details that may contain intermediate results as
#'   well as other information on the method application.
#'
#' @return An object of class `geposan_result`: a list with the items
#'   `method_id`, `scores` and `details`.
#'
#' @export
result <- function(method_id, scores, details = list()) {
  # Check every requirement on its own so that the error message names the
  # requirement that was actually violated.
  if (!is.data.frame(scores)) {
    stop("`scores` must be a data.frame.", call. = FALSE)
  }

  missing_columns <- setdiff(c("gene", "score"), colnames(scores))

  if (length(missing_columns) > 0) {
    stop(
      "`scores` is missing required column(s): ",
      paste(missing_columns, collapse = ", "),
      call. = FALSE
    )
  }

  if (!is.list(details)) {
    stop("`details` must be a list.", call. = FALSE)
  }

  structure(
    list(
      method_id = method_id,
      scores = scores,
      details = details
    ),
    class = "geposan_result"
  )
}
|
||||
|
||||
#' Print a result object.
#'
#' Writes a short summary (method ID, number of scored genes and the names of
#' the available details) to the console.
#'
#' @param x The result to print.
#' @param ... Other parameters (currently unused).
#'
#' @return `x`, invisibly.
#'
#' @seealso [result()]
#'
#' @export
print.geposan_result <- function(x, ...) {
  detail_names <- names(x$details)

  # `details` may be an empty list, in which case there are no names to show.
  # Print an explicit placeholder instead of a dangling label.
  details_text <- if (length(detail_names) > 0) {
    paste(detail_names, collapse = ", ")
  } else {
    "none"
  }

  cat(sprintf(
    paste0(
      "geposan result:",
      "\n Method: %s",
      "\n Number of genes: %i",
      "\n Available details: %s",
      "\n"
    ),
    x$method_id,
    nrow(x$scores),
    details_text
  ))

  invisible(x)
}
|
||||
16
man/adjacency.Rd
Normal file
16
man/adjacency.Rd
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/adjacency.R
|
||||
\name{adjacency}
|
||||
\alias{adjacency}
|
||||
\title{Score genes based on their proximity to the reference genes.}
|
||||
\usage{
|
||||
adjacency()
|
||||
}
|
||||
\value{
|
||||
An object of class \code{geposan_method}.
|
||||
}
|
||||
\description{
|
||||
This method finds the distance value with the maximum density for each gene
|
||||
(i.e. the mode of its estimated distribution). Genes are scored by comparing
|
||||
those distance values with the values of the reference genes.
|
||||
}
|
||||
11
man/all_methods.Rd
Normal file
11
man/all_methods.Rd
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/method.R
|
||||
\name{all_methods}
|
||||
\alias{all_methods}
|
||||
\title{Get a list of all available methods.}
|
||||
\usage{
|
||||
all_methods()
|
||||
}
|
||||
\description{
|
||||
Get a list of all available methods.
|
||||
}
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
% Please edit documentation in R/analyze.R
|
||||
\name{analyze}
|
||||
\alias{analyze}
|
||||
\title{Analyze by applying the specified preset.}
|
||||
\title{Analyze genes based on position data.}
|
||||
\usage{
|
||||
analyze(preset, progress = NULL)
|
||||
}
|
||||
|
|
@ -11,17 +11,18 @@ analyze(preset, progress = NULL)
|
|||
|
||||
\item{progress}{A function to be called for progress information. The
|
||||
function should accept a number between 0.0 and 1.0 for the current
|
||||
progress.}
|
||||
progress. If no function is provided, a simple text progress bar will be
|
||||
shown.}
|
||||
}
|
||||
\value{
|
||||
An object containing the results of the analysis with the following
|
||||
items:
|
||||
\describe{
|
||||
\item{\code{preset}}{The preset that was used.}
|
||||
\item{\code{weights}}{The optimal weights for ranking the reference genes.}
|
||||
\item{\code{ranking}}{The optimal ranking created using the weights.}
|
||||
\item{\code{scores}}{Table containing all scores for each gene.}
|
||||
\item{\code{results}}{Results from the different methods including details.}
|
||||
}
|
||||
}
|
||||
\description{
|
||||
Analyze by applying the specified preset.
|
||||
Analyze genes based on position data.
|
||||
}
|
||||
|
|
|
|||
25
man/clusteriness.Rd
Normal file
25
man/clusteriness.Rd
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/clustering.R
|
||||
\name{clusteriness}
|
||||
\alias{clusteriness}
|
||||
\title{Perform a cluster analysis.}
|
||||
\usage{
|
||||
clusteriness(data, span = 1e+06, weight = 0.7)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{The values that should be scored.}
|
||||
|
||||
\item{span}{The maximum span of values considered to be in one cluster.}
|
||||
|
||||
\item{weight}{The weight that will be given to the next largest cluster in
|
||||
relation to the previous one. For example, if \code{weight} is 0.7 (the
|
||||
default), the first cluster will weigh 1.0, the second 0.7, the third 0.49
|
||||
etc.}
|
||||
}
|
||||
\description{
|
||||
This function will cluster the data using \code{\link[stats:hclust]{stats::hclust()}} and
|
||||
\code{\link[stats:cutree]{stats::cutree()}}. Every cluster with at least two members qualifies for
|
||||
further analysis. Clusters are then ranked based on their size in relation
|
||||
to the total number of values. The return value is a final score between
|
||||
0.0 and 1.0. Lower ranking clusters contribute less to this score.
|
||||
}
|
||||
18
man/clustering.Rd
Normal file
18
man/clustering.Rd
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/clustering.R
|
||||
\name{clustering}
|
||||
\alias{clustering}
|
||||
\title{Process genes by clustering their distances to telomeres.}
|
||||
\usage{
|
||||
clustering()
|
||||
}
|
||||
\value{
|
||||
An object of class \code{geposan_method}.
|
||||
}
|
||||
\description{
|
||||
The result will be cached and can be reused for different presets, because
|
||||
it is independent of the reference genes in use.
|
||||
}
|
||||
\seealso{
|
||||
\code{\link[=clusteriness]{clusteriness()}}
|
||||
}
|
||||
16
man/correlation.Rd
Normal file
16
man/correlation.Rd
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/correlation.R
|
||||
\name{correlation}
|
||||
\alias{correlation}
|
||||
\title{Compute the mean correlation coefficient comparing gene distances with a set
|
||||
of reference genes.}
|
||||
\usage{
|
||||
correlation()
|
||||
}
|
||||
\value{
|
||||
An object of class \code{geposan_method}.
|
||||
}
|
||||
\description{
|
||||
Compute the mean correlation coefficient comparing gene distances with a set
|
||||
of reference genes.
|
||||
}
|
||||
26
man/method.Rd
Normal file
26
man/method.Rd
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/method.R
|
||||
\name{method}
|
||||
\alias{method}
|
||||
\title{Describe a new method for analyzing gene position data.}
|
||||
\usage{
|
||||
method(id, name, description, func)
|
||||
}
|
||||
\arguments{
|
||||
\item{id}{Unique identifier for the method.}
|
||||
|
||||
\item{name}{Human readable name.}
|
||||
|
||||
\item{description}{Slightly longer description.}
|
||||
|
||||
\item{func}{Function to apply the method. The function should accept two
|
||||
parameters: an object of class \code{geposan_preset} as input and a function to
|
||||
report progress information to as a numeric value. The return value should
|
||||
be an object of class \code{geposan_result}.}
|
||||
}
|
||||
\value{
|
||||
An object of class \code{geposan_method}.
|
||||
}
|
||||
\description{
|
||||
Describe a new method for analyzing gene position data.
|
||||
}
|
||||
24
man/neural.Rd
Normal file
24
man/neural.Rd
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/neural.R
|
||||
\name{neural}
|
||||
\alias{neural}
|
||||
\title{Find genes by training and applying a neural network.}
|
||||
\usage{
|
||||
neural(seed = 180199, n_models = 5)
|
||||
}
|
||||
\arguments{
|
||||
\item{seed}{The seed will be used to make the results reproducible.}
|
||||
|
||||
\item{n_models}{This number specifies how many sets of training data should
|
||||
be created. For each set, there will be a model trained on the remaining
|
||||
training data and validated using this set. For non-training genes, the
|
||||
final score will be the mean of the result of applying the different
|
||||
models. There should be at least two training sets. The analysis will only
|
||||
work, if there is at least one reference gene per training set.}
|
||||
}
|
||||
\value{
|
||||
An object of class \code{geposan_method}.
|
||||
}
|
||||
\description{
|
||||
Find genes by training and applying a neural network.
|
||||
}
|
||||
|
|
@ -5,24 +5,20 @@
|
|||
\title{Create a new preset.}
|
||||
\usage{
|
||||
preset(
|
||||
methods = c("clusteriness", "correlation", "neural", "adjacency", "proximity"),
|
||||
species_ids = NULL,
|
||||
gene_ids = NULL,
|
||||
reference_gene_ids = NULL,
|
||||
optimization_target = "mean_rank"
|
||||
methods = all_methods(),
|
||||
species_ids = geposan::species$id,
|
||||
gene_ids = geposan::genes$id,
|
||||
reference_gene_ids
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{methods}{Methods to apply.}
|
||||
\item{methods}{List of methods to apply.}
|
||||
|
||||
\item{species_ids}{IDs of species to include.}
|
||||
|
||||
\item{gene_ids}{IDs of genes to screen.}
|
||||
|
||||
\item{reference_gene_ids}{IDs of reference genes to compare to.}
|
||||
|
||||
\item{optimization_target}{Parameter of the reference genes that the ranking
|
||||
should be optimized for.}
|
||||
}
|
||||
\value{
|
||||
The preset to use with \code{\link[=analyze]{analyze()}}.
|
||||
|
|
@ -33,25 +29,7 @@ analysis. Note that the genes to process should normally include the
|
|||
reference genes to be able to assess the results later. The genes will be
|
||||
filtered based on how many species have data for them. Genes which only have
|
||||
orthologs for less than 25\% of the input species will be excluded from the
|
||||
preset and the analysis.
|
||||
}
|
||||
\details{
|
||||
Available methods are:
|
||||
\itemize{
|
||||
\item \code{clusteriness} How much the gene distances to the nearest telomere
|
||||
cluster across species.
|
||||
\item \code{correlation} The mean correlation of gene distances to the nearest
|
||||
telomere across species.
|
||||
\item \code{neural} Assessment by neural network trained on the reference genes.
|
||||
\item \code{adjacency} Proximity to reference genes.
|
||||
\item \code{proximity} Mean proximity to telomeres.
|
||||
}
|
||||
|
||||
Available optimization targets are:
|
||||
\itemize{
|
||||
\item \code{mean} Mean rank of the reference genes.
|
||||
\item \code{median} Median rank of the reference genes.
|
||||
\item \code{max} First rank of the reference genes.
|
||||
\item \code{min} Last rank of the reference genes.
|
||||
}
|
||||
preset and the analysis. See the different method functions for the available
|
||||
methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and
|
||||
\code{\link[=proximity]{proximity()}}.
|
||||
}
|
||||
|
|
|
|||
19
man/print.geposan_analysis.Rd
Normal file
19
man/print.geposan_analysis.Rd
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/analyze.R
|
||||
\name{print.geposan_analysis}
|
||||
\alias{print.geposan_analysis}
|
||||
\title{Print an analysis object.}
|
||||
\usage{
|
||||
\method{print}{geposan_analysis}(x, ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{The analysis to print.}
|
||||
|
||||
\item{...}{Other parameters.}
|
||||
}
|
||||
\description{
|
||||
Print an analysis object.
|
||||
}
|
||||
\seealso{
|
||||
\code{\link[=analyze]{analyze()}}
|
||||
}
|
||||
19
man/print.geposan_method.Rd
Normal file
19
man/print.geposan_method.Rd
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/method.R
|
||||
\name{print.geposan_method}
|
||||
\alias{print.geposan_method}
|
||||
\title{Print a method object.}
|
||||
\usage{
|
||||
\method{print}{geposan_method}(x, ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{The method to print.}
|
||||
|
||||
\item{...}{Other parameters.}
|
||||
}
|
||||
\description{
|
||||
Print a method object.
|
||||
}
|
||||
\seealso{
|
||||
\code{\link[=method]{method()}}
|
||||
}
|
||||
19
man/print.geposan_result.Rd
Normal file
19
man/print.geposan_result.Rd
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/result.R
|
||||
\name{print.geposan_result}
|
||||
\alias{print.geposan_result}
|
||||
\title{Print a result object.}
|
||||
\usage{
|
||||
\method{print}{geposan_result}(x, ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{The result to print.}
|
||||
|
||||
\item{...}{Other parameters.}
|
||||
}
|
||||
\description{
|
||||
Print a result object.
|
||||
}
|
||||
\seealso{
|
||||
\code{\link[=result]{result()}}
|
||||
}
|
||||
15
man/proximity.Rd
Normal file
15
man/proximity.Rd
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/proximity.R
|
||||
\name{proximity}
|
||||
\alias{proximity}
|
||||
\title{Score the mean distance of genes to the telomeres across species.}
|
||||
\usage{
|
||||
proximity()
|
||||
}
|
||||
\value{
|
||||
An object of class \code{geposan_method}.
|
||||
}
|
||||
\description{
|
||||
A score will be given to each gene such that 0.0 corresponds to the maximal
|
||||
mean distance across all genes and 1.0 corresponds to a distance of 0.
|
||||
}
|
||||
23
man/result.Rd
Normal file
23
man/result.Rd
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/result.R
|
||||
\name{result}
|
||||
\alias{result}
|
||||
\title{Result of applying a method on gene position data.}
|
||||
\usage{
|
||||
result(method_id, scores, details = list())
|
||||
}
|
||||
\arguments{
|
||||
\item{method_id}{ID of the method that produced this result.}
|
||||
|
||||
\item{scores}{A \code{data.frame} mapping gene IDs (\code{gene}) to computed scores
|
||||
between 0.0 and 1.0 (\code{score}).}
|
||||
|
||||
\item{details}{Optional details that may contain intermediate results as
|
||||
well as other information on the method application.}
|
||||
}
|
||||
\value{
|
||||
An object of class \code{geposan_result}.
|
||||
}
|
||||
\description{
|
||||
Result of applying a method on gene position data.
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue