diff --git a/DESCRIPTION b/DESCRIPTION index 00a2214..62a79aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,6 +24,7 @@ Imports: data.table, keras, rlang, + progress, tensorflow Suggests: biomaRt, diff --git a/NAMESPACE b/NAMESPACE index 47e903c..cc7f389 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,19 @@ # Generated by roxygen2: do not edit by hand +S3method(print,geposan_analysis) S3method(print,geposan_comparison) +S3method(print,geposan_method) S3method(print,geposan_preset) +S3method(print,geposan_result) S3method(print,geposan_validation) +export(adjacency) +export(all_methods) export(analyze) +export(clustering) export(compare) +export(correlation) +export(method) +export(neural) export(optimal_weights) export(plot_boxplot) export(plot_chromosomes) @@ -12,6 +21,8 @@ export(plot_positions) export(plot_rankings) export(plot_scores) export(preset) +export(proximity) export(ranking) +export(result) export(validate) import(data.table) diff --git a/R/adjacency.R b/R/adjacency.R index 849133d..180974e 100644 --- a/R/adjacency.R +++ b/R/adjacency.R @@ -1,81 +1,89 @@ -# Score genes based on their proximity to the reference genes. -# -# This method finds the distance value with the maximum density for each gene -# (i.e. the mode of its estimated distribution). Genes are scored by comparing -# those distance values with the values of the reference genes. -adjacency <- function(preset, progress = NULL) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - reference_gene_ids <- preset$reference_gene_ids +#' Score genes based on their proximity to the reference genes. +#' +#' This method finds the distance value with the maximum density for each gene +#' (i.e. the mode of its estimated distribution). Genes are scored by comparing +#' those distance values with the values of the reference genes. +#' +#' @return An object of class `geposan_method`. +#' +#' @export +adjacency <- function() { + method( + id = "adjacency", + name = "Adjacency", + description = "Adjacency to reference genes", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + reference_gene_ids <- preset$reference_gene_ids - cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), { - # Get the virtual distance value with the highest density. - compute_densest_distance <- function(distances) { - if (length(distances) <= 2) { - mean(distances) - } else { - d <- stats::density(distances) - d$x[which.max(d$y)] - } + cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), { + # Get the virtual distance value with the highest density. + compute_densest_distance <- function(distances) { + if (length(distances) <= 2) { + mean(distances) + } else { + d <- stats::density(distances) + d$x[which.max(d$y)] + } + } + + # Filter distances by species and gene and find the distance + # with the highest density of values for each gene. + data <- geposan::distances[ + species %chin% species_ids & gene %chin% gene_ids, + .(densest_distance = compute_densest_distance(distance)), + by = gene + ] + + # Compute the absolute value of the difference between the + # provided densest distance value in comparison to the mean of + # the densest distances of the comparison genes. + compute_difference <- function(densest_distance, + comparison_ids) { + # Get the mean of the densest distances of the reference + # genes. + mean_densest_distance <- data[ + gene %chin% comparison_ids, + mean(densest_distance) + ] + + abs(densest_distance - mean_densest_distance) + } + + # Compute the differences to the reference genes. + data[ + !gene %chin% reference_gene_ids, + difference := compute_difference( + densest_distance, + reference_gene_ids + ) + ] + + progress(0.5) + + # Exclude the reference gene itself when computing its + # difference. + data[ + gene %chin% reference_gene_ids, + difference := compute_difference( + densest_distance, + reference_gene_ids[reference_gene_ids != gene] + ), + by = gene + ] + + # Compute the final score by normalizing the difference. + data[, score := 1 - difference / max(difference)] + + progress(1.0) + + result( + method = "adjacency", + scores = data[, .(gene, score)], + details = list(data = data) + ) + }) } - - # Filter distances by species and gene and find the distance with the - # highest density of values for each gene. - data <- geposan::distances[ - species %chin% species_ids & gene %chin% gene_ids, - .(densest_distance = compute_densest_distance(distance)), - by = gene - ] - - # Compute the absolute value of the difference between the provided - # densest distance value in comparison to the mean of the densest - # distances of the comparison genes. - compute_difference <- function(densest_distance, comparison_ids) { - # Get the mean of the densest distances of the reference genes. - mean_densest_distance <- data[ - gene %chin% comparison_ids, - mean(densest_distance) - ] - - abs(densest_distance - mean_densest_distance) - } - - # Compute the differences to the reference genes. - data[ - !gene %chin% reference_gene_ids, - difference := compute_difference( - densest_distance, - reference_gene_ids - ) - ] - - if (!is.null(progress)) { - progress(0.5) - } - - # Exclude the reference gene itself when computing its difference. - data[ - gene %chin% reference_gene_ids, - difference := compute_difference( - densest_distance, - reference_gene_ids[reference_gene_ids != gene] - ), - by = gene - ] - - # Compute the final score by normalizing the difference. - data[, score := 1 - difference / max(difference)] - - if (!is.null(progress)) { - progress(1.0) - } - - structure( - list( - results = data[, .(gene, score)], - details = data - ), - class = "geposan_method_results" - ) - }) + ) } diff --git a/R/analyze.R b/R/analyze.R index 3b637c7..f046909 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -1,16 +1,17 @@ -#' Analyze by applying the specified preset. +#' Analyze genes based on position data. #' #' @param preset The preset to use which should be created using [preset()]. #' @param progress A function to be called for progress information. The #' function should accept a number between 0.0 and 1.0 for the current -#' progress. +#' progress. If no function is provided, a simple text progress bar will be +#' shown. #' #' @returns An object containing the results of the analysis with the following #' items: #' \describe{ #' \item{`preset`}{The preset that was used.} -#' \item{`weights`}{The optimal weights for ranking the reference genes.} -#' \item{`ranking`}{The optimal ranking created using the weights.} +#' \item{`scores`}{Table containing all scores for each gene.} +#' \item{`results`}{Results from the different methods including details.} #' } #' #' @export @@ -19,80 +20,69 @@ analyze <- function(preset, progress = NULL) { stop("Preset is invalid. Use geposan::preset() to create one.") } - # Available methods by ID. - # - # A method describes a way to perform a computation on gene distance data - # that results in a single score per gene. The function should accept the - # preset to apply (see [preset()]) and an optional progress function (that - # may be called with a number between 0.0 and 1.0) as its parameters. - # - # The function should return a [data.table] with the following columns: - # - # - `gene` Gene ID of the processed gene. - # - `score` Score for the gene between 0.0 and 1.0. - methods <- list( - "clusteriness" = clusteriness, - "correlation" = correlation, - "neural" = neural, - "adjacency" = adjacency, - "proximity" = proximity - ) + if (is.null(progress)) { + progress_bar <- progress::progress_bar$new() + progress_bar$update(0.0) - analysis <- cached("analysis", preset, { - total_progress <- 0.0 - method_count <- length(preset$methods) - results <- data.table(gene = preset$gene_ids) - - for (method_id in preset$methods) { - method_progress <- if (!is.null(progress)) { - function(p) { - progress(total_progress + p / method_count) + progress <- function(progress_value) { + if (!progress_bar$finished) { + progress_bar$update(progress_value) + if (progress_value >= 1.0) { + progress_bar$terminate() } } - - method_results <- methods[[method_id]]( - preset, - progress = method_progress - )$results - - setnames(method_results, "score", method_id) - - results <- merge( - results, - method_results, - by = "gene" - ) - - total_progress <- total_progress + 1 / method_count } - - results <- structure( - results, - class = c("geposan_results", class(results)) - ) - - weights <- optimal_weights( - results, - preset$methods, - preset$reference_gene_ids, - target = preset$optimization_target - ) - - ranking <- ranking(results, weights) - - structure( - list( - preset = preset, - weights = weights, - ranking = ranking - ), - class = "geposan_analysis" - ) - }) - - if (!is.null(progress)) { - progress(1.0) } - analysis + progress_buffer <- 0.0 + method_count <- length(preset$methods) + + method_progress <- function(progress_value) { + progress(progress_buffer + progress_value / method_count) + } + + scores <- data.table(gene = preset$gene_id) + results <- list() + + for (method in preset$methods) { + method_results <- method$func(preset, method_progress) + + scores <- merge(scores, method_results$scores) + setnames(scores, "score", method$id) + + results <- c(results, list(method_results)) + + progress_buffer <- progress_buffer + 1 / method_count + progress(progress_buffer) + } + + structure( + list( + preset = preset, + scores = scores, + results = results + ), + class = "geposan_analysis" + ) +} + +#' Print an analysis object. +#' +#' @param x The analysis to print. +#' @param ... Other parameters. +#' +#' @seealso [analyze()] +#' +#' @export +print.geposan_analysis <- function(x, ...) { + cat("geposan analysis:\n\n") + print(x$preset) + cat("\n") + + for (result in x$results) { + print(result) + cat("\n") + } + + invisible(x) } diff --git a/R/clusteriness.R b/R/clusteriness.R deleted file mode 100644 index 9396d19..0000000 --- a/R/clusteriness.R +++ /dev/null @@ -1,84 +0,0 @@ -# Perform a cluster analysis. -# -# This function will cluster the data using `hclust` and `cutree` (with the -# specified height). Every cluster with at least two members qualifies for -# further analysis. Clusters are then ranked based on their size in relation -# to the number of values. The return value is a final score between zero and -# one. Lower ranking clusters contribute less to this score. -# -# @param data The values that should be scored. -# @param height The maximum span of values considered to be in one cluster. -# @param weight The weight that will be given to the next largest cluster in -# relation to the previous one. For example, if `weight` is 0.7 (the default), -# the first cluster will weigh 1.0, the second 0.7, the third 0.49 etc. -clusteriness_priv <- function(data, height = 1000000, weight = 0.7) { - n <- length(data) - - # Return a score of 0.0 if there is just one or no value at all. - if (n < 2) { - return(0.0) - } - - # Cluster the data and compute the cluster sizes. - - tree <- stats::hclust(stats::dist(data)) - clusters <- stats::cutree(tree, h = height) - cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) - - # Compute the "clusteriness" score. - - score <- 0.0 - - for (i in seq_along(cluster_sizes)) { - cluster_size <- cluster_sizes[i] - - if (cluster_size >= 2) { - cluster_score <- cluster_size / n - score <- score + weight ^ (i - 1) * cluster_score - } - } - - score -} - -# Process genes clustering their distance to telomeres. -clusteriness <- function(preset, progress = NULL) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - - cached("clusteriness", c(species_ids, gene_ids), { - results <- data.table(gene = gene_ids) - - # Prefilter the input data by species. - distances <- geposan::distances[species %chin% species_ids] - - # Add an index for quickly accessing data per gene. - setkey(distances, gene) - - genes_done <- 0 - genes_total <- length(gene_ids) - - # Perform the cluster analysis for one gene. - compute <- function(gene_id) { - data <- distances[gene_id, distance] - score <- clusteriness_priv(data) - - if (!is.null(progress)) { - genes_done <<- genes_done + 1 - progress(genes_done / genes_total) - } - - score - } - - structure( - list( - results = results[, - score := compute(gene), - by = gene - ] - ), - class = "geposan_method_results" - ) - }) -} diff --git a/R/clustering.R b/R/clustering.R new file mode 100644 index 0000000..50e828b --- /dev/null +++ b/R/clustering.R @@ -0,0 +1,93 @@ +#' Perform a cluster analysis. +#' +#' This function will cluster the data using [stats::hclust()] and +#' [stats::cutree()]. Every cluster with at least two members qualifies for +#' further analysis. Clusters are then ranked based on their size in relation +#' to the total number of values. The return value is a final score between +#' 0.0 and 1.0. Lower ranking clusters contribute less to this score. +#' +#' @param data The values that should be scored. +#' @param span The maximum span of values considered to be in one cluster. +#' @param weight The weight that will be given to the next largest cluster in +#' relation to the previous one. For example, if `weight` is 0.7 (the +#' default), the first cluster will weigh 1.0, the second 0.7, the third 0.49 +#' etc. +clusteriness <- function(data, span = 1000000, weight = 0.7) { + n <- length(data) + + # Return a score of 0.0 if there is just one or no value at all. + if (n < 2) { + return(0.0) + } + + # Cluster the data and compute the cluster sizes. + + tree <- stats::hclust(stats::dist(data)) + clusters <- stats::cutree(tree, h = span) + cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) + + # Compute the "clusteriness" score. + + score <- 0.0 + + for (i in seq_along(cluster_sizes)) { + cluster_size <- cluster_sizes[i] + + if (cluster_size >= 2) { + cluster_score <- cluster_size / n + score <- score + weight^(i - 1) * cluster_score + } + } + + score +} + +#' Process genes clustering their distance to telomeres. +#' +#' The result will be cached and can be reused for different presets, because +#' it is independent of the reference genes in use. +#' +#' @return An object of class `geposan_method`. +#' +#' @seealso [clusteriness()] +#' +#' @export +clustering <- function() { + method( + id = "clustering", + name = "Clustering", + description = "Clustering of genes", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + + cached("clustering", c(species_ids, gene_ids), { + scores <- data.table(gene = gene_ids) + + # Prefilter the input data by species. + distances <- geposan::distances[species %chin% species_ids] + + genes_done <- 0 + genes_total <- length(gene_ids) + + # Perform the cluster analysis for one gene. + compute <- function(gene_id) { + data <- distances[gene == gene_id, distance] + score <- clusteriness(data) + + genes_done <<- genes_done + 1 + progress(genes_done / genes_total) + + score + } + + scores[, score := compute(gene), by = gene] + + result( + method = "clustering", + scores = scores + ) + }) + } + ) +} diff --git a/R/correlation.R b/R/correlation.R index 45677cf..36a1503 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -1,88 +1,101 @@ -# Compute the mean correlation coefficient comparing gene distances with a set -# of reference genes. -correlation <- function(preset, progress = NULL) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - reference_gene_ids <- preset$reference_gene_ids +#' Compute the mean correlation coefficient comparing gene distances with a set +#' of reference genes. +#' +#' @return An object of class `geposan_method`. +#' +#' @export +correlation <- function() { + method( + id = "correlation", + name = "Correlation", + description = "Correlation with reference genes", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + reference_gene_ids <- preset$reference_gene_ids - cached( - "correlation", c(species_ids, gene_ids, reference_gene_ids), { - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + cached( + "correlation", + c(species_ids, gene_ids, reference_gene_ids), + { # nolint + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Tranform data to get species as rows and genes as columns. We - # construct columns per species, because it requires fewer - # iterations, and transpose the table afterwards. + # Tranform data to get species as rows and genes as columns. + # We construct columns per species, because it requires + # fewer iterations, and transpose the table afterwards. - data <- data.table(gene = gene_ids) + data <- data.table(gene = gene_ids) - # Make a column containing distance data for each species. - for (species_id in species_ids) { - species_data <- distances[ - species == species_id, - .(gene, distance) - ] + # Make a column containing distance data for each species. + for (species_id in species_ids) { + species_data <- distances[ + species == species_id, + .(gene, distance) + ] - data <- merge(data, species_data, all.x = TRUE) - setnames(data, "distance", species_id) - } + data <- merge(data, species_data, all.x = TRUE) + setnames(data, "distance", species_id) + } - # Transpose to the desired format. - data <- transpose(data, make.names = "gene") + # Transpose to the desired format. + data <- transpose(data, make.names = "gene") - if (!is.null(progress)) progress(0.33) + progress(0.33) - # Take the reference data. - reference_data <- data[, ..reference_gene_ids] + # Take the reference data. + reference_data <- data[, ..reference_gene_ids] - # Perform the correlation between all possible pairs. - results <- stats::cor( - data[, ..gene_ids], - reference_data, - use = "pairwise.complete.obs", - method = "spearman" - ) + # Perform the correlation between all possible pairs. + results <- stats::cor( + data[, ..gene_ids], + reference_data, + use = "pairwise.complete.obs", + method = "spearman" + ) - results <- data.table(results, keep.rownames = TRUE) - setnames(results, "rn", "gene") + results <- data.table(results, keep.rownames = TRUE) + setnames(results, "rn", "gene") - # Remove correlations between the reference genes themselves. - for (reference_gene_id in reference_gene_ids) { - column <- quote(reference_gene_id) - results[gene == reference_gene_id, eval(column) := NA] - } + # Remove correlations between the reference genes + # themselves. + for (reference_gene_id in reference_gene_ids) { + column <- quote(reference_gene_id) + results[gene == reference_gene_id, eval(column) := NA] + } - if (!is.null(progress)) progress(0.66) + progress(0.66) - # Compute the final score as the mean of known correlation scores. - # Negative correlations will correctly lessen the score, which will - # be clamped to zero as its lower bound. Genes with no possible - # correlations at all will be assumed to have a score of 0.0. + # Compute the final score as the mean of known correlation + # scores. Negative correlations will correctly lessen the + # score, which will be clamped to zero as its lower bound. + # Genes with no possible correlations at all will be assumed + # to have a score of 0.0. - compute_score <- function(scores) { - score <- mean(scores, na.rm = TRUE) + compute_score <- function(scores) { + score <- mean(scores, na.rm = TRUE) - if (is.na(score) | score < 0.0) { - score <- 0.0 + if (is.na(score) | score < 0.0) { + score <- 0.0 + } + + score + } + + results[, + score := compute_score(as.matrix(.SD)), + .SDcols = reference_gene_ids, + by = gene + ] + + results[, .(gene, score)] + + result( + method = "correlation", + scores = results[, .(gene, score)], + details = list(all_correlations = results) + ) } - - score - } - - results[, - score := compute_score(as.matrix(.SD)), - .SDcols = reference_gene_ids, - by = gene - ] - - results[, .(gene, score)] - - structure( - list( - results = results[, .(gene, score)], - all_correlations = results - ), - class = "geposan_method_results" ) } ) diff --git a/R/method.R b/R/method.R new file mode 100644 index 0000000..a1257f4 --- /dev/null +++ b/R/method.R @@ -0,0 +1,67 @@ +#' Describe a new method for analyzing gene position data. +#' +#' @param id Unique identifier for the method. +#' @param name Human readable name. +#' @param description Slightly longer description. +#' @param func Function to apply the method. The function should accept two +#' parameters: an object of class `geposan_preset` as input and a function to +#' report progress information to as a numeric value. The return value should +#' be an object of class `geposan_result`. +#' +#' @return An object of class `geposan_method`. +#' +#' @export +method <- function(id, name, description, func) { + stopifnot(is.character(id) & length(id) == 1) + stopifnot(is.character(name) & length(name) == 1) + stopifnot(is.character(description) & length(description) == 1) + stopifnot(is.function(func)) + + structure( + list( + id = id, + name = name, + description = description, + func = func + ), + class = "geposan_method" + ) +} + +#' Get a list of all available methods. +#' +#' @export +all_methods <- function() { + list( + clustering(), + correlation(), + neural(), + adjacency(), + proximity() + ) +} + +#' Print a method object. +#' +#' @param x The method to print. +#' @param ... Other parameters. +#' +#' @seealso [method()] +#' +#' @export +print.geposan_method <- function(x, ...) { + cat(sprintf( + paste0( + "geposan method:", + "\n Method ID: %s", + "\n Name: %s", + "\n Description: %s", + "\n" + ), + x$id, + x$name, + x$description + )) + + invisible(x) +} diff --git a/R/neural.R b/R/neural.R index 061102f..2b3323c 100644 --- a/R/neural.R +++ b/R/neural.R @@ -1,248 +1,254 @@ -# Find genes by training and applying a neural network. -# -# @param seed The seed will be used to make the results reproducible. -# @param n_models This number specifies how many sets of training data should -# be created. For each set, there will be a model trained on the remaining -# training data and validated using this set. For non-training genes, the -# final score will be the mean of the result of applying the different -# models. -neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids - reference_gene_ids <- preset$reference_gene_ids +#' Find genes by training and applying a neural network. +#' +#' @param seed The seed will be used to make the results reproducible. +#' @param n_models This number specifies how many sets of training data should +#' be created. For each set, there will be a model trained on the remaining +#' training data and validated using this set. For non-training genes, the +#' final score will be the mean of the result of applying the different +#' models. There should be at least two training sets. The analysis will only +#' work, if there is at least one reference gene per training set. +#' +#' @return An object of class `geposan_method`. +#' +#' @export +neural <- function(seed = 180199, n_models = 5) { + method( + id = "neural", + name = "Neural", + description = "Assessment by neural network", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids + reference_gene_ids <- preset$reference_gene_ids - cached( - "neural", - c(species_ids, gene_ids, reference_gene_ids, seed, n_models), - { # nolint - reference_count <- length(reference_gene_ids) - if (!n_models %in% 2:reference_count) { - stop(paste0( - "n_models has to be between 2 and the number of reference ", - "genes." - )) - } + cached( + "neural", + c(species_ids, gene_ids, reference_gene_ids, seed, n_models), + { # nolint + reference_count <- length(reference_gene_ids) + stopifnot(n_models %in% 2:reference_count) - # Make results reproducible. - tensorflow::set_random_seed(seed) + # Make results reproducible. + tensorflow::set_random_seed(seed) - # Step 1: Prepare input data. - # --------------------------- + # Step 1: Prepare input data. + # --------------------------- - # Prefilter distances by species. - distances <- geposan::distances[species %chin% species_ids] + # Prefilter distances by species. + distances <- geposan::distances[species %chin% species_ids] - # Input data for the network. This contains the gene ID as an - # identifier as well as the per-species gene distances as input - # variables. - data <- data.table(gene = gene_ids) + # Input data for the network. This contains the gene ID as + # an identifier as well as the per-species gene distances as + # input variables. + data <- data.table(gene = gene_ids) - # Buffer to keep track of the names of the input variables. - input_vars <- NULL + # Buffer to keep track of the names of the input variables. + input_vars <- NULL - # Make a columns containing positions and distances for each - # species. - for (species_id in species_ids) { - species_data <- distances[ - species == species_id, - .(gene, distance) - ] + # Make a columns containing positions and distances for each + # species. + for (species_id in species_ids) { + species_data <- distances[ + species == species_id, + .(gene, distance) + ] - # Only include species with at least 25% known values. As - # positions and distances always coexist, we don't loose any - # data here. + # Only include species with at least 25% known values. + # As positions and distances always coexist, we don't + # loose any data here. - species_data <- stats::na.omit(species_data) + species_data <- stats::na.omit(species_data) - if (nrow(species_data) >= 0.25 * length(gene_ids)) { - data <- merge(data, species_data, all.x = TRUE) + if (nrow(species_data) >= 0.25 * length(gene_ids)) { + data <- merge(data, species_data, all.x = TRUE) - # Replace missing data with mean values. The neural network - # can't handle NAs in a meaningful way. Choosing extreme - # values here would result in heavily biased results. - # Therefore, the mean value is chosen as a compromise. - # However, this will of course lessen the significance of - # the results. + # Replace missing data with mean values. The neural + # network can't handle NAs in a meaningful way. + # Choosing extreme values here would result in + # heavily biased results. Therefore, the mean value + # is chosen as a compromise. However, this will of + # course lessen the significance of the results. - mean_distance <- round(species_data[, mean(distance)]) - data[is.na(distance), `:=`(distance = mean_distance)] + mean_distance <- round( + species_data[, mean(distance)] + ) - # Name the new column after the species. - setnames(data, "distance", species_id) + data[is.na(distance), distance := mean_distance] - # Add the input variable to the buffer. - input_vars <- c(input_vars, species_id) - } - } + # Name the new column after the species. + setnames(data, "distance", species_id) - if (!is.null(progress)) { - progress(0.1) - } + # Add the input variable to the buffer. + input_vars <- c(input_vars, species_id) + } + } - # Step 2: Prepare training data. - # ------------------------------ + progress(0.1) - # Take out the reference data. + # Step 2: Prepare training data. + # ------------------------------ - reference_data <- data[gene %chin% reference_gene_ids] - reference_data[, score := 1.0] + # Take out the reference data. - # Take out random samples from the remaining genes. This is another - # compromise with a negative impact on significance. Because there - # is no information on genes with are explicitely *not* TPE-OLD - # genes, we have to assume that a random sample of genes has a low - # probability of including TPE-OLD genes. + reference_data <- data[gene %chin% reference_gene_ids] + reference_data[, score := 1.0] - without_reference_data <- data[!gene %chin% reference_gene_ids] + # Take out random samples from the remaining genes. This is + # another compromise with a negative impact on + # significance. We assume that a random gene is not likely + # to match the reference genes. - control_data <- without_reference_data[ - sample( - nrow(without_reference_data), - reference_count - ) - ] + without_reference_data <- data[ + !gene %chin% reference_gene_ids + ] - control_data[, score := 0.0] + control_data <- without_reference_data[ + sample( + nrow(without_reference_data), + reference_count + ) + ] - # Split the training data into random sets to have validation data - # for each model. + control_data[, score := 0.0] - # Scramble the source tables. - reference_data <- reference_data[sample(reference_count)] - control_data <- control_data[sample(reference_count)] + # Split the training data into random sets to have + # validation data for each model. - networks <- list() + # Scramble the source tables. + reference_data <- reference_data[sample(reference_count)] + control_data <- control_data[sample(reference_count)] - indices <- seq_len(reference_count) - indices_split <- split(indices, indices %% n_models) + networks <- list() - for (i in seq_len(n_models)) { - training_data <- rbindlist(list( - reference_data[!indices_split[[i]]], - control_data[!indices_split[[i]]] - )) + indices <- seq_len(reference_count) + indices_split <- split(indices, indices %% n_models) - validation_data <- rbindlist(list( - reference_data[indices_split[[i]]], - control_data[indices_split[[i]]] - )) + for (i in seq_len(n_models)) { + training_data <- rbindlist(list( + reference_data[!indices_split[[i]]], + control_data[!indices_split[[i]]] + )) - networks[[i]] <- list( - training_data = training_data, - validation_data = validation_data - ) - } + validation_data <- rbindlist(list( + reference_data[indices_split[[i]]], + control_data[indices_split[[i]]] + )) - # Step 3: Create, train and apply neural network. - # ----------------------------------------------- + networks[[i]] <- list( + training_data = training_data, + validation_data = validation_data + ) + } - # Layers for the neural network. - input_layer <- length(input_vars) - layer1 <- input_layer - layer2 <- 0.5 * input_layer - layer3 <- 0.5 * layer2 + # Step 3: Create, train and apply neural network. + # ----------------------------------------------- - # Convert data to matrix and normalize it. - to_matrix <- function(data) { - data_matrix <- as.matrix(data[, ..input_vars]) - colnames(data_matrix) <- NULL - keras::normalize(data_matrix) - } + # Layers for the neural network. + input_layer <- length(input_vars) + layer1 <- input_layer + layer2 <- 0.5 * input_layer + layer3 <- 0.5 * layer2 - data_matrix <- to_matrix(data) - output_vars <- NULL + # Convert data to matrix and normalize it. + to_matrix <- function(data) { + data_matrix <- as.matrix(data[, ..input_vars]) + colnames(data_matrix) <- NULL + keras::normalize(data_matrix) + } - for (i in seq_along(networks)) { - # Create a new model for each training session, because the - # model would keep its state across training sessions otherwise. - model <- keras::keras_model_sequential() |> - keras::layer_dense( - units = layer1, - activation = "relu", - input_shape = input_layer, - ) |> - keras::layer_dense( - units = layer2, - activation = "relu", - kernel_regularizer = keras::regularizer_l2() - ) |> - keras::layer_dense( - units = layer3, - activation = "relu", - kernel_regularizer = keras::regularizer_l2() - ) |> - keras::layer_dense( - units = 1, - activation = "sigmoid" - ) |> - keras::compile( - loss = keras::loss_mean_absolute_error(), - optimizer = keras::optimizer_adam() + data_matrix <- to_matrix(data) + output_vars <- NULL + + for (i in seq_along(networks)) { + # Create a new model for each training session, because + # the model would keep its state across training + # sessions otherwise. + model <- keras::keras_model_sequential() |> + keras::layer_dense( + units = layer1, + activation = "relu", + input_shape = input_layer, + ) |> + keras::layer_dense( + units = layer2, + activation = "relu", + kernel_regularizer = keras::regularizer_l2() + ) |> + keras::layer_dense( + units = layer3, + activation = "relu", + kernel_regularizer = keras::regularizer_l2() + ) |> + keras::layer_dense( + units = 1, + activation = "sigmoid" + ) |> + keras::compile( + loss = keras::loss_mean_absolute_error(), + optimizer = keras::optimizer_adam() + ) + + # Train the model. + + network <- networks[[i]] + + training_data <- network$training_data + training_matrix <- to_matrix(training_data) + validation_data <- network$validation_data + validation_matrix <- to_matrix(validation_data) + + fit <- keras::fit( + model, + x = training_matrix, + y = training_data$score, + validation_data = list( + x_val = validation_matrix, + y_val = validation_data$score + ), + epochs = 500, + verbose = FALSE + ) + + # Apply the model. + + data[, new_score := stats::predict(model, data_matrix)] + + # Remove the values of the training data itself. + data[gene %chin% training_data$gene, new_score := NA] + + output_var <- sprintf("score%i", i) + setnames(data, "new_score", output_var) + output_vars <- c(output_vars, output_var) + + + # Store the details. + + networks[[i]]$model <- keras::serialize_model(model) + networks[[i]]$fit <- fit + + progress(0.1 + i * (0.9 / n_models)) + } + + # Compute the final score as the mean score. + data[, + score := mean(as.numeric(.SD), na.rm = TRUE), + .SDcols = output_vars, + by = gene + ] + + progress(1.0) + + result( + method = "neural", + scores = data[, .(gene, score)], + details = list( + seed = seed, + n_models = n_models, + all_results = data[, !..input_vars], + networks = networks + ) ) - - # Train the model. - - network <- networks[[i]] - - training_data <- network$training_data - training_matrix <- to_matrix(training_data) - validation_data <- network$validation_data - validation_matrix <- to_matrix(validation_data) - - fit <- keras::fit( - model, - x = training_matrix, - y = training_data$score, - validation_data = list( - x_val = validation_matrix, - y_val = validation_data$score - ), - epochs = 500, - verbose = FALSE - ) - - # Apply the model. - - data[, new_score := stats::predict(model, data_matrix)] - - # Remove the values of the training data itself. - data[gene %chin% training_data$gene, new_score := NA] - - output_var <- sprintf("score%i", i) - setnames(data, "new_score", output_var) - output_vars <- c(output_vars, output_var) - - - # Store the details. - - networks[[i]]$model <- keras::serialize_model(model) - networks[[i]]$fit <- fit - - if (!is.null(progress)) { - progress(0.1 + i * (0.9 / n_models)) } - } - - # Compute the final score as the mean score. - data[, - score := mean(as.numeric(.SD), na.rm = TRUE), - .SDcols = output_vars, - by = gene - ] - - if (!is.null(progress)) { - progress(1.0) - } - - structure( - list( - results = data[, .(gene, score)], - seed = seed, - n_models = n_models, - all_results = data[, !..input_vars], - networks = networks - ), - class = "geposan_method_results" ) } ) diff --git a/R/preset.R b/R/preset.R index 46f4f8f..c391b3b 100644 --- a/R/preset.R +++ b/R/preset.R @@ -5,46 +5,22 @@ #' reference genes to be able to assess the results later. The genes will be #' filtered based on how many species have data for them. Genes which only have #' orthologs for less than 25% of the input species will be excluded from the -#' preset and the analyis. +#' preset and the analyis. See the different method functions for the available +#' methods: [clustering()], [correlation()], [neural()], [adjacency()] and +#' [proximity()]. #' -#' Available methods are: -#' -#' - `clusteriness` How much the gene distances to the nearest telomere -#' cluster across species. -#' - `correlation` The mean correlation of gene distances to the nearest -#' telomere across species. -#' - `neural` Assessment by neural network trained on the reference genes. -#' - `adjacency` Proximity to reference genes. -#' - `proximity` Mean proximity to telomeres. -#' -#' Available optimization targets are: -#' -#' - `mean` Mean rank of the reference genes. -#' - `median` Median rank of the reference genes. -#' - `max` First rank of the reference genes. -#' - `min` Last rank of the reference genes. -#' -#' @param methods Methods to apply. +#' @param methods List of methods to apply. #' @param species_ids IDs of species to include. #' @param gene_ids IDs of genes to screen. #' @param reference_gene_ids IDs of reference genes to compare to. -#' @param optimization_target Parameter of the reference genes that the ranking -#' should be optimized for. #' #' @return The preset to use with [analyze()]. #' #' @export -preset <- function(methods = c( - "clusteriness", - "correlation", - "neural", - "adjacency", - "proximity" - ), - species_ids = NULL, - gene_ids = NULL, - reference_gene_ids = NULL, - optimization_target = "mean_rank") { +preset <- function(methods = all_methods(), + species_ids = geposan::species$id, + gene_ids = geposan::genes$id, + reference_gene_ids) { # Count included species per gene. genes_n_species <- geposan::distances[ species %chin% species_ids, @@ -63,11 +39,10 @@ preset <- function(methods = c( # for the object later. structure( list( - methods = sort(methods), + methods = methods, species_ids = sort(species_ids), gene_ids = sort(gene_ids_filtered), - reference_gene_ids = sort(reference_gene_ids), - optimization_target = optimization_target + reference_gene_ids = sort(reference_gene_ids) ), class = "geposan_preset" ) @@ -82,25 +57,20 @@ preset <- function(methods = c( #' #' @export print.geposan_preset <- function(x, ...) { - cat("geposan preset:") - cat("\n Included methods: ") - cat(x$methods, sep = ", ") - cat(sprintf( - "\n Input data: %i species, %i genes", + paste0( + "geposan preset:", + "\n Included methods: %s", + "\n Number of species: %i", + "\n Number of genes: %i", + "\n Reference genes: %i", + "\n" + ), + paste(sapply(x$methods, function(m) m$id), collapse = ", "), length(x$species_ids), - length(x$gene_ids) - )) - - cat(sprintf( - "\n Comparison data: %i reference genes", + length(x$gene_ids), length(x$reference_gene_ids) )) - cat(sprintf( - "\n Optimization target: %s\n", - x$optimization_target - )) - invisible(x) } diff --git a/R/proximity.R b/R/proximity.R index 78b24a2..70c6d92 100644 --- a/R/proximity.R +++ b/R/proximity.R @@ -1,34 +1,39 @@ -# Score the mean distance of genes to the telomeres across species. -# -# A score will be given to each gene such that 0.0 corresponds to the maximal -# mean distance across all genes and 1.0 corresponds to a distance of 0. -proximity <- function(preset, progress = NULL) { - species_ids <- preset$species_ids - gene_ids <- preset$gene_ids +#' Score the mean distance of genes to the telomeres across species. +#' +#' A score will be given to each gene such that 0.0 corresponds to the maximal +#' mean distance across all genes and 1.0 corresponds to a distance of 0. +#' +#' @return An object of class `geposan_method`. +#' +#' @export +proximity <- function() { + method( + id = "proximity", + name = "Proximity", + description = "Proximity to telomeres", + function(preset, progress) { + species_ids <- preset$species_ids + gene_ids <- preset$gene_ids - cached("proximity", c(species_ids, gene_ids), { - # Prefilter distances by species and gene. - data <- geposan::distances[ - species %chin% preset$species_ids & gene %chin% preset$gene_ids - ] + cached("proximity", c(species_ids, gene_ids), { + # Prefilter distances by species and gene. + data <- geposan::distances[ + species %chin% preset$species_ids & + gene %chin% preset$gene_ids + ] - # Compute the score as described above. - data <- data[, .(mean_distance = mean(distance)), by = "gene"] - max_distance <- data[, max(mean_distance)] - data[, score := 1 - mean_distance / max_distance] + # Compute the score as described above. + data <- data[, .(mean_distance = mean(distance)), by = "gene"] + max_distance <- data[, max(mean_distance)] + data[, score := 1 - mean_distance / max_distance] - if (!is.null(progress)) { - # We do everything in one go, so it's not possible to report - # detailed progress information. As the method is relatively quick, - # this should not be a problem. - progress(1.0) + progress(1.0) + + result( + method = "proximity", + scores = data[, .(gene, score)] + ) + }) } - - structure( - list( - results = data[, .(gene, score)] - ), - class = "geposan_method_results" - ) - }) + ) } diff --git a/R/ranking.R b/R/ranking.R index a4d9816..bd2d1a3 100644 --- a/R/ranking.R +++ b/R/ranking.R @@ -13,10 +13,10 @@ #' #' @export ranking <- function(analysis, weights) { - if (inherits(analysis, "geposan_analysis")) { - ranking <- copy(analysis$ranking) - } else if (inherits(analysis, "geposan_results")) { - ranking <- copy(analysis) + ranking <- if (inherits(analysis, "geposan_analysis")) { + copy(analysis$scores) + } else if (inherits(analysis, "geposan_ranking")) { + copy(analysis) } else { stop("Invalid analyis. Use geposan::analyze().") } @@ -39,7 +39,7 @@ ranking <- function(analysis, weights) { structure( ranking, - class = c("geposan_ranking", "geposan_results", class(ranking)) + class = c("geposan_ranking", class(ranking)) ) } @@ -60,7 +60,7 @@ ranking <- function(analysis, weights) { #' @export optimal_weights <- function(analysis, methods, reference_gene_ids, target = "mean") { - if (!inherits(analysis, c("geposan_analysis", "geposan_results"))) { + if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) { stop("Invalid analyis. Use geposan::analyze().") } diff --git a/R/result.R b/R/result.R new file mode 100644 index 0000000..c16ffc0 --- /dev/null +++ b/R/result.R @@ -0,0 +1,50 @@ +#' Result of applying a method on gene position data. +#' +#' @param method_id ID of the method that produced this result. +#' @param scores A `data.frame` mapping gene IDs (`gene`) to computed scores +#' between 0.0 and 1.0 (`score`). +#' @param details Optional details that may contain intermediate results as +#' well as other information on the method application. +#' +#' @return An object of class `geposan_result`. +#' +#' @export +result <- function(method_id, scores, details = list()) { + stopifnot(is.data.frame(scores) & + c("gene", "score") %chin% colnames(scores)) + stopifnot(is.list(details)) + + structure( + list( + method_id = method_id, + scores = scores, + details = details + ), + class = "geposan_result" + ) +} + +#' Print a result object. +#' +#' @param x The result to print. +#' @param ... Other parameters. +#' +#' @seealso [result()] +#' +#' @export +print.geposan_result <- function(x, ...) { + cat(sprintf( + paste0( + "geposan result:", + "\n Method: %s", + "\n Number of genes: %i", + "\n Available details: %s", + "\n" + ), + x$method_id, + nrow(x$scores), + paste(names(x$details), collapse = ", ") + )) + + invisible(x) +} diff --git a/man/adjacency.Rd b/man/adjacency.Rd new file mode 100644 index 0000000..45cc6e0 --- /dev/null +++ b/man/adjacency.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/adjacency.R +\name{adjacency} +\alias{adjacency} +\title{Score genes based on their proximity to the reference genes.} +\usage{ +adjacency() +} +\value{ +An object of class \code{geposan_method}. +} +\description{ +This method finds the distance value with the maximum density for each gene +(i.e. the mode of its estimated distribution). Genes are scored by comparing +those distance values with the values of the reference genes. +} diff --git a/man/all_methods.Rd b/man/all_methods.Rd new file mode 100644 index 0000000..ce871d2 --- /dev/null +++ b/man/all_methods.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method.R +\name{all_methods} +\alias{all_methods} +\title{Get a list of all available methods.} +\usage{ +all_methods() +} +\description{ +Get a list of all available methods. +} diff --git a/man/analyze.Rd b/man/analyze.Rd index 699e063..600eb04 100644 --- a/man/analyze.Rd +++ b/man/analyze.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/analyze.R \name{analyze} \alias{analyze} -\title{Analyze by applying the specified preset.} +\title{Analyze genes based on position data.} \usage{ analyze(preset, progress = NULL) } @@ -11,17 +11,18 @@ analyze(preset, progress = NULL) \item{progress}{A function to be called for progress information. The function should accept a number between 0.0 and 1.0 for the current -progress.} +progress. If no function is provided, a simple text progress bar will be +shown.} } \value{ An object containing the results of the analysis with the following items: \describe{ \item{\code{preset}}{The preset that was used.} -\item{\code{weights}}{The optimal weights for ranking the reference genes.} -\item{\code{ranking}}{The optimal ranking created using the weights.} +\item{\code{scores}}{Table containing all scores for each gene.} +\item{\code{results}}{Results from the different methods including details.} } } \description{ -Analyze by applying the specified preset. +Analyze genes based on position data. } diff --git a/man/clusteriness.Rd b/man/clusteriness.Rd new file mode 100644 index 0000000..a92f4ae --- /dev/null +++ b/man/clusteriness.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{clusteriness} +\alias{clusteriness} +\title{Perform a cluster analysis.} +\usage{ +clusteriness(data, span = 1e+06, weight = 0.7) +} +\arguments{ +\item{data}{The values that should be scored.} + +\item{span}{The maximum span of values considered to be in one cluster.} + +\item{weight}{The weight that will be given to the next largest cluster in +relation to the previous one. For example, if \code{weight} is 0.7 (the +default), the first cluster will weigh 1.0, the second 0.7, the third 0.49 +etc.} +} +\description{ +This function will cluster the data using \code{\link[stats:hclust]{stats::hclust()}} and +\code{\link[stats:cutree]{stats::cutree()}}. Every cluster with at least two members qualifies for +further analysis. Clusters are then ranked based on their size in relation +to the total number of values. The return value is a final score between +0.0 and 1.0. Lower ranking clusters contribute less to this score. +} diff --git a/man/clustering.Rd b/man/clustering.Rd new file mode 100644 index 0000000..a0c04a6 --- /dev/null +++ b/man/clustering.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{clustering} +\alias{clustering} +\title{Process genes clustering their distance to telomeres.} +\usage{ +clustering() +} +\value{ +An object of class \code{geposan_method}. +} +\description{ +The result will be cached and can be reused for different presets, because +it is independent of the reference genes in use. +} +\seealso{ +\code{\link[=clusteriness]{clusteriness()}} +} diff --git a/man/correlation.Rd b/man/correlation.Rd new file mode 100644 index 0000000..d207915 --- /dev/null +++ b/man/correlation.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlation.R +\name{correlation} +\alias{correlation} +\title{Compute the mean correlation coefficient comparing gene distances with a set +of reference genes.} +\usage{ +correlation() +} +\value{ +An object of class \code{geposan_method}. +} +\description{ +Compute the mean correlation coefficient comparing gene distances with a set +of reference genes. +} diff --git a/man/method.Rd b/man/method.Rd new file mode 100644 index 0000000..39e0867 --- /dev/null +++ b/man/method.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method.R +\name{method} +\alias{method} +\title{Describe a new method for analyzing gene position data.} +\usage{ +method(id, name, description, func) +} +\arguments{ +\item{id}{Unique identifier for the method.} + +\item{name}{Human readable name.} + +\item{description}{Slightly longer description.} + +\item{func}{Function to apply the method. The function should accept two +parameters: an object of class \code{geposan_preset} as input and a function to +report progress information to as a numeric value. The return value should +be an object of class \code{geposan_result}.} +} +\value{ +An object of class \code{geposan_method}. +} +\description{ +Describe a new method for analyzing gene position data. +} diff --git a/man/neural.Rd b/man/neural.Rd new file mode 100644 index 0000000..8b7e040 --- /dev/null +++ b/man/neural.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/neural.R +\name{neural} +\alias{neural} +\title{Find genes by training and applying a neural network.} +\usage{ +neural(seed = 180199, n_models = 5) +} +\arguments{ +\item{seed}{The seed will be used to make the results reproducible.} + +\item{n_models}{This number specifies how many sets of training data should +be created. For each set, there will be a model trained on the remaining +training data and validated using this set. For non-training genes, the +final score will be the mean of the result of applying the different +models. There should be at least two training sets. The analysis will only +work, if there is at least one reference gene per training set.} +} +\value{ +An object of class \code{geposan_method}. +} +\description{ +Find genes by training and applying a neural network. +} diff --git a/man/preset.Rd b/man/preset.Rd index 4d2c420..3daf4ae 100644 --- a/man/preset.Rd +++ b/man/preset.Rd @@ -5,24 +5,20 @@ \title{Create a new preset.} \usage{ preset( - methods = c("clusteriness", "correlation", "neural", "adjacency", "proximity"), - species_ids = NULL, - gene_ids = NULL, - reference_gene_ids = NULL, - optimization_target = "mean_rank" + methods = all_methods(), + species_ids = geposan::species$id, + gene_ids = geposan::genes$id, + reference_gene_ids ) } \arguments{ -\item{methods}{Methods to apply.} +\item{methods}{List of methods to apply.} \item{species_ids}{IDs of species to include.} \item{gene_ids}{IDs of genes to screen.} \item{reference_gene_ids}{IDs of reference genes to compare to.} - -\item{optimization_target}{Parameter of the reference genes that the ranking -should be optimized for.} } \value{ The preset to use with \code{\link[=analyze]{analyze()}}. @@ -33,25 +29,7 @@ analysis. Note that the genes to process should normally include the reference genes to be able to assess the results later. The genes will be filtered based on how many species have data for them. Genes which only have orthologs for less than 25\% of the input species will be excluded from the -preset and the analyis. -} -\details{ -Available methods are: -\itemize{ -\item \code{clusteriness} How much the gene distances to the nearest telomere -cluster across species. -\item \code{correlation} The mean correlation of gene distances to the nearest -telomere across species. -\item \code{neural} Assessment by neural network trained on the reference genes. -\item \code{adjacency} Proximity to reference genes. -\item \code{proximity} Mean proximity to telomeres. -} - -Available optimization targets are: -\itemize{ -\item \code{mean} Mean rank of the reference genes. -\item \code{median} Median rank of the reference genes. -\item \code{max} First rank of the reference genes. -\item \code{min} Last rank of the reference genes. -} +preset and the analyis. See the different method functions for the available +methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and +\code{\link[=proximity]{proximity()}}. } diff --git a/man/print.geposan_analysis.Rd b/man/print.geposan_analysis.Rd new file mode 100644 index 0000000..68dc8e3 --- /dev/null +++ b/man/print.geposan_analysis.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/analyze.R +\name{print.geposan_analysis} +\alias{print.geposan_analysis} +\title{Print an analysis object.} +\usage{ +\method{print}{geposan_analysis}(x, ...) +} +\arguments{ +\item{x}{The analysis to print.} + +\item{...}{Other parameters.} +} +\description{ +Print an analysis object. +} +\seealso{ +\code{\link[=analyze]{analyze()}} +} diff --git a/man/print.geposan_method.Rd b/man/print.geposan_method.Rd new file mode 100644 index 0000000..8165118 --- /dev/null +++ b/man/print.geposan_method.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method.R +\name{print.geposan_method} +\alias{print.geposan_method} +\title{Print a method object.} +\usage{ +\method{print}{geposan_method}(x, ...) +} +\arguments{ +\item{x}{The method to print.} + +\item{...}{Other parameters.} +} +\description{ +Print a method object. +} +\seealso{ +\code{\link[=method]{method()}} +} diff --git a/man/print.geposan_result.Rd b/man/print.geposan_result.Rd new file mode 100644 index 0000000..5d78554 --- /dev/null +++ b/man/print.geposan_result.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/result.R +\name{print.geposan_result} +\alias{print.geposan_result} +\title{Print a result object.} +\usage{ +\method{print}{geposan_result}(x, ...) +} +\arguments{ +\item{x}{The result to print.} + +\item{...}{Other parameters.} +} +\description{ +Print a result object. +} +\seealso{ +\code{\link[=result]{result()}} +} diff --git a/man/proximity.Rd b/man/proximity.Rd new file mode 100644 index 0000000..fa8d822 --- /dev/null +++ b/man/proximity.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/proximity.R +\name{proximity} +\alias{proximity} +\title{Score the mean distance of genes to the telomeres across species.} +\usage{ +proximity() +} +\value{ +An object of class \code{geposan_method}. +} +\description{ +A score will be given to each gene such that 0.0 corresponds to the maximal +mean distance across all genes and 1.0 corresponds to a distance of 0. +} diff --git a/man/result.Rd b/man/result.Rd new file mode 100644 index 0000000..ad8a7ad --- /dev/null +++ b/man/result.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/result.R +\name{result} +\alias{result} +\title{Result of applying a method on gene position data.} +\usage{ +result(method_id, scores, details = list()) +} +\arguments{ +\item{method_id}{ID of the method that produced this result.} + +\item{scores}{A \code{data.frame} mapping gene IDs (\code{gene}) to computed scores +between 0.0 and 1.0 (\code{score}).} + +\item{details}{Optional details that may contain intermediate results as +well as other information on the method application.} +} +\value{ +An object of class \code{geposan_result}. +} +\description{ +Result of applying a method on gene position data. +}