mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	Restructure classes and their responsibilities
This commit is contained in:
		
							parent
							
								
									01ec301d6d
								
							
						
					
					
						commit
						e2b93babe5
					
				
					 27 changed files with 974 additions and 634 deletions
				
			
		|  | @ -24,6 +24,7 @@ Imports: | |||
|     data.table, | ||||
|     keras, | ||||
|     rlang, | ||||
|     progress, | ||||
|     tensorflow | ||||
| Suggests: | ||||
|     biomaRt, | ||||
|  |  | |||
							
								
								
									
										11
									
								
								NAMESPACE
									
										
									
									
									
								
							
							
						
						
									
										11
									
								
								NAMESPACE
									
										
									
									
									
								
							|  | @ -1,10 +1,19 @@ | |||
| # Generated by roxygen2: do not edit by hand | ||||
| 
 | ||||
| S3method(print,geposan_analysis) | ||||
| S3method(print,geposan_comparison) | ||||
| S3method(print,geposan_method) | ||||
| S3method(print,geposan_preset) | ||||
| S3method(print,geposan_result) | ||||
| S3method(print,geposan_validation) | ||||
| export(adjacency) | ||||
| export(all_methods) | ||||
| export(analyze) | ||||
| export(clustering) | ||||
| export(compare) | ||||
| export(correlation) | ||||
| export(method) | ||||
| export(neural) | ||||
| export(optimal_weights) | ||||
| export(plot_boxplot) | ||||
| export(plot_chromosomes) | ||||
|  | @ -12,6 +21,8 @@ export(plot_positions) | |||
| export(plot_rankings) | ||||
| export(plot_scores) | ||||
| export(preset) | ||||
| export(proximity) | ||||
| export(ranking) | ||||
| export(result) | ||||
| export(validate) | ||||
| import(data.table) | ||||
|  |  | |||
							
								
								
									
										164
									
								
								R/adjacency.R
									
										
									
									
									
								
							
							
						
						
									
										164
									
								
								R/adjacency.R
									
										
									
									
									
								
							|  | @ -1,81 +1,89 @@ | |||
| # Score genes based on their proximity to the reference genes. | ||||
| # | ||||
| # This method finds the distance value with the maximum density for each gene | ||||
| # (i.e. the mode of its estimated distribution). Genes are scored by comparing | ||||
| # those distance values with the values of the reference genes. | ||||
| adjacency <- function(preset, progress = NULL) { | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
|     reference_gene_ids <- preset$reference_gene_ids | ||||
| #' Score genes based on their proximity to the reference genes. | ||||
| #' | ||||
| #' This method finds the distance value with the maximum density for each gene | ||||
| #' (i.e. the mode of its estimated distribution). Genes are scored by comparing | ||||
| #' those distance values with the values of the reference genes. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @export | ||||
| adjacency <- function() { | ||||
|     method( | ||||
|         id = "adjacency", | ||||
|         name = "Adjacency", | ||||
|         description = "Adjacency to reference genes", | ||||
|         function(preset, progress) { | ||||
|             species_ids <- preset$species_ids | ||||
|             gene_ids <- preset$gene_ids | ||||
|             reference_gene_ids <- preset$reference_gene_ids | ||||
| 
 | ||||
|     cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), { | ||||
|         # Get the virtual distance value with the highest density. | ||||
|         compute_densest_distance <- function(distances) { | ||||
|             if (length(distances) <= 2) { | ||||
|                 mean(distances) | ||||
|             } else { | ||||
|                 d <- stats::density(distances) | ||||
|                 d$x[which.max(d$y)] | ||||
|             } | ||||
|             cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), { | ||||
|                 # Get the virtual distance value with the highest density. | ||||
|                 compute_densest_distance <- function(distances) { | ||||
|                     if (length(distances) <= 2) { | ||||
|                         mean(distances) | ||||
|                     } else { | ||||
|                         d <- stats::density(distances) | ||||
|                         d$x[which.max(d$y)] | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 # Filter distances by species and gene and find the distance | ||||
|                 # with the highest density of values for each gene. | ||||
|                 data <- geposan::distances[ | ||||
|                     species %chin% species_ids & gene %chin% gene_ids, | ||||
|                     .(densest_distance = compute_densest_distance(distance)), | ||||
|                     by = gene | ||||
|                 ] | ||||
| 
 | ||||
|                 # Compute the absolute difference between the provided | ||||
|                 # densest distance value and the mean of the densest | ||||
|                 # distances of the comparison genes. | ||||
|                 compute_difference <- function(densest_distance, | ||||
|                                                comparison_ids) { | ||||
|                     # Get the mean of the densest distances of the reference | ||||
|                     # genes. | ||||
|                     mean_densest_distance <- data[ | ||||
|                         gene %chin% comparison_ids, | ||||
|                         mean(densest_distance) | ||||
|                     ] | ||||
| 
 | ||||
|                     abs(densest_distance - mean_densest_distance) | ||||
|                 } | ||||
| 
 | ||||
|                 # Compute the differences to the reference genes. | ||||
|                 data[ | ||||
|                     !gene %chin% reference_gene_ids, | ||||
|                     difference := compute_difference( | ||||
|                         densest_distance, | ||||
|                         reference_gene_ids | ||||
|                     ) | ||||
|                 ] | ||||
| 
 | ||||
|                 progress(0.5) | ||||
| 
 | ||||
|                 # Exclude the reference gene itself when computing its | ||||
|                 # difference. | ||||
|                 data[ | ||||
|                     gene %chin% reference_gene_ids, | ||||
|                     difference := compute_difference( | ||||
|                         densest_distance, | ||||
|                         reference_gene_ids[reference_gene_ids != gene] | ||||
|                     ), | ||||
|                     by = gene | ||||
|                 ] | ||||
| 
 | ||||
|                 # Compute the final score by normalizing the difference. | ||||
|                 data[, score := 1 - difference / max(difference)] | ||||
| 
 | ||||
|                 progress(1.0) | ||||
| 
 | ||||
|                 result( | ||||
|                     method = "adjacency", | ||||
|                     scores = data[, .(gene, score)], | ||||
|                     details = list(data = data) | ||||
|                 ) | ||||
|             }) | ||||
|         } | ||||
| 
 | ||||
|         # Filter distances by species and gene and find the distance with the | ||||
|         # highest density of values for each gene. | ||||
|         data <- geposan::distances[ | ||||
|             species %chin% species_ids & gene %chin% gene_ids, | ||||
|             .(densest_distance = compute_densest_distance(distance)), | ||||
|             by = gene | ||||
|         ] | ||||
| 
 | ||||
|         # Compute the absolute difference between the provided densest | ||||
|         # distance value and the mean of the densest distances of the | ||||
|         # comparison genes. | ||||
|         compute_difference <- function(densest_distance, comparison_ids) { | ||||
|             # Get the mean of the densest distances of the reference genes. | ||||
|             mean_densest_distance <- data[ | ||||
|                 gene %chin% comparison_ids, | ||||
|                 mean(densest_distance) | ||||
|             ] | ||||
| 
 | ||||
|             abs(densest_distance - mean_densest_distance) | ||||
|         } | ||||
| 
 | ||||
|         # Compute the differences to the reference genes. | ||||
|         data[ | ||||
|             !gene %chin% reference_gene_ids, | ||||
|             difference := compute_difference( | ||||
|                 densest_distance, | ||||
|                 reference_gene_ids | ||||
|             ) | ||||
|         ] | ||||
| 
 | ||||
|         if (!is.null(progress)) { | ||||
|             progress(0.5) | ||||
|         } | ||||
| 
 | ||||
|         # Exclude the reference gene itself when computing its difference. | ||||
|         data[ | ||||
|             gene %chin% reference_gene_ids, | ||||
|             difference := compute_difference( | ||||
|                 densest_distance, | ||||
|                 reference_gene_ids[reference_gene_ids != gene] | ||||
|             ), | ||||
|             by = gene | ||||
|         ] | ||||
| 
 | ||||
|         # Compute the final score by normalizing the difference. | ||||
|         data[, score := 1 - difference / max(difference)] | ||||
| 
 | ||||
|         if (!is.null(progress)) { | ||||
|             progress(1.0) | ||||
|         } | ||||
| 
 | ||||
|         structure( | ||||
|             list( | ||||
|                 results = data[, .(gene, score)], | ||||
|                 details = data | ||||
|             ), | ||||
|             class = "geposan_method_results" | ||||
|         ) | ||||
|     }) | ||||
|     ) | ||||
| } | ||||
|  |  | |||
							
								
								
									
										138
									
								
								R/analyze.R
									
										
									
									
									
								
							
							
						
						
									
										138
									
								
								R/analyze.R
									
										
									
									
									
								
							|  | @ -1,16 +1,17 @@ | |||
| #' Analyze by applying the specified preset. | ||||
| #' Analyze genes based on position data. | ||||
| #' | ||||
| #' @param preset The preset to use which should be created using [preset()]. | ||||
| #' @param progress A function to be called for progress information. The | ||||
| #'   function should accept a number between 0.0 and 1.0 for the current | ||||
| #'   progress. | ||||
| #'   progress. If no function is provided, a simple text progress bar will be | ||||
| #'   shown. | ||||
| #' | ||||
| #' @returns An object containing the results of the analysis with the following | ||||
| #'   items: | ||||
| #'   \describe{ | ||||
| #'     \item{`preset`}{The preset that was used.} | ||||
| #'     \item{`weights`}{The optimal weights for ranking the reference genes.} | ||||
| #'     \item{`ranking`}{The optimal ranking created using the weights.} | ||||
| #'     \item{`scores`}{Table containing all scores for each gene.} | ||||
| #'     \item{`results`}{Results from the different methods including details.} | ||||
| #'   } | ||||
| #' | ||||
| #' @export | ||||
|  | @ -19,80 +20,69 @@ analyze <- function(preset, progress = NULL) { | |||
|         stop("Preset is invalid. Use geposan::preset() to create one.") | ||||
|     } | ||||
| 
 | ||||
|     # Available methods by ID. | ||||
|     # | ||||
|     # A method describes a way to perform a computation on gene distance data | ||||
|     # that results in a single score per gene. The function should accept the | ||||
|     # preset to apply (see [preset()]) and an optional progress function (that | ||||
|     # may be called with a number between 0.0 and 1.0) as its parameters. | ||||
|     # | ||||
|     # The function should return a [data.table] with the following columns: | ||||
|     # | ||||
|     #  - `gene` Gene ID of the processed gene. | ||||
|     #  - `score` Score for the gene between 0.0 and 1.0. | ||||
|     methods <- list( | ||||
|         "clusteriness" = clusteriness, | ||||
|         "correlation" = correlation, | ||||
|         "neural" = neural, | ||||
|         "adjacency" = adjacency, | ||||
|         "proximity" = proximity | ||||
|     ) | ||||
|     if (is.null(progress)) { | ||||
|         progress_bar <- progress::progress_bar$new() | ||||
|         progress_bar$update(0.0) | ||||
| 
 | ||||
|     analysis <- cached("analysis", preset, { | ||||
|         total_progress <- 0.0 | ||||
|         method_count <- length(preset$methods) | ||||
|         results <- data.table(gene = preset$gene_ids) | ||||
| 
 | ||||
|         for (method_id in preset$methods) { | ||||
|             method_progress <- if (!is.null(progress)) { | ||||
|                 function(p) { | ||||
|                     progress(total_progress + p / method_count) | ||||
|         progress <- function(progress_value) { | ||||
|             if (!progress_bar$finished) { | ||||
|                 progress_bar$update(progress_value) | ||||
|                 if (progress_value >= 1.0) { | ||||
|                     progress_bar$terminate() | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             method_results <- methods[[method_id]]( | ||||
|                 preset, | ||||
|                 progress = method_progress | ||||
|             )$results | ||||
| 
 | ||||
|             setnames(method_results, "score", method_id) | ||||
| 
 | ||||
|             results <- merge( | ||||
|                 results, | ||||
|                 method_results, | ||||
|                 by = "gene" | ||||
|             ) | ||||
| 
 | ||||
|             total_progress <- total_progress + 1 / method_count | ||||
|         } | ||||
| 
 | ||||
|         results <- structure( | ||||
|             results, | ||||
|             class = c("geposan_results", class(results)) | ||||
|         ) | ||||
| 
 | ||||
|         weights <- optimal_weights( | ||||
|             results, | ||||
|             preset$methods, | ||||
|             preset$reference_gene_ids, | ||||
|             target = preset$optimization_target | ||||
|         ) | ||||
| 
 | ||||
|         ranking <- ranking(results, weights) | ||||
| 
 | ||||
|         structure( | ||||
|             list( | ||||
|                 preset = preset, | ||||
|                 weights = weights, | ||||
|                 ranking = ranking | ||||
|             ), | ||||
|             class = "geposan_analysis" | ||||
|         ) | ||||
|     }) | ||||
| 
 | ||||
|     if (!is.null(progress)) { | ||||
|         progress(1.0) | ||||
|     } | ||||
| 
 | ||||
|     analysis | ||||
|     progress_buffer <- 0.0 | ||||
|     method_count <- length(preset$methods) | ||||
| 
 | ||||
|     method_progress <- function(progress_value) { | ||||
|         progress(progress_buffer + progress_value / method_count) | ||||
|     } | ||||
| 
 | ||||
|     scores <- data.table(gene = preset$gene_id) | ||||
|     results <- list() | ||||
| 
 | ||||
|     for (method in preset$methods) { | ||||
|         method_results <- method$func(preset, method_progress) | ||||
| 
 | ||||
|         scores <- merge(scores, method_results$scores) | ||||
|         setnames(scores, "score", method$id) | ||||
| 
 | ||||
|         results <- c(results, list(method_results)) | ||||
| 
 | ||||
|         progress_buffer <- progress_buffer + 1 / method_count | ||||
|         progress(progress_buffer) | ||||
|     } | ||||
| 
 | ||||
|     structure( | ||||
|         list( | ||||
|             preset = preset, | ||||
|             scores = scores, | ||||
|             results = results | ||||
|         ), | ||||
|         class = "geposan_analysis" | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| #' Print an analysis object. | ||||
| #' | ||||
| #' @param x The analysis to print. | ||||
| #' @param ... Other parameters. | ||||
| #' | ||||
| #' @seealso [analyze()] | ||||
| #' | ||||
| #' @export | ||||
| print.geposan_analysis <- function(x, ...) { | ||||
|     cat("geposan analysis:\n\n") | ||||
|     print(x$preset) | ||||
|     cat("\n") | ||||
| 
 | ||||
|     for (result in x$results) { | ||||
|         print(result) | ||||
|         cat("\n") | ||||
|     } | ||||
| 
 | ||||
|     invisible(x) | ||||
| } | ||||
|  |  | |||
|  | @ -1,84 +0,0 @@ | |||
| # Perform a cluster analysis. | ||||
| # | ||||
| # This function will cluster the data using `hclust` and `cutree` (with the | ||||
| # specified height). Every cluster with at least two members qualifies for | ||||
| # further analysis. Clusters are then ranked based on their size in relation | ||||
| # to the number of values. The return value is a final score between zero and | ||||
| # one. Lower ranking clusters contribute less to this score. | ||||
| # | ||||
| # @param data The values that should be scored. | ||||
| # @param height The maximum span of values considered to be in one cluster. | ||||
| # @param weight The weight that will be given to the next largest cluster in | ||||
| #   relation to the previous one. For example, if `weight` is 0.7 (the default), | ||||
| #   the first cluster will weigh 1.0, the second 0.7, the third 0.49 etc. | ||||
| clusteriness_priv <- function(data, height = 1000000, weight = 0.7) { | ||||
|     n <- length(data) | ||||
| 
 | ||||
|     # Return a score of 0.0 if there is just one or no value at all. | ||||
|     if (n < 2) { | ||||
|         return(0.0) | ||||
|     } | ||||
| 
 | ||||
|     # Cluster the data and compute the cluster sizes. | ||||
| 
 | ||||
|     tree <- stats::hclust(stats::dist(data)) | ||||
|     clusters <- stats::cutree(tree, h = height) | ||||
|     cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) | ||||
| 
 | ||||
|     # Compute the "clusteriness" score. | ||||
| 
 | ||||
|     score <- 0.0 | ||||
| 
 | ||||
|     for (i in seq_along(cluster_sizes)) { | ||||
|         cluster_size <- cluster_sizes[i] | ||||
| 
 | ||||
|         if (cluster_size >= 2) { | ||||
|             cluster_score <- cluster_size / n | ||||
|             score <- score + weight ^ (i - 1) * cluster_score | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     score | ||||
| } | ||||
| 
 | ||||
| # Process genes by clustering their distances to telomeres. | ||||
| clusteriness <- function(preset, progress = NULL) { | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
| 
 | ||||
|     cached("clusteriness", c(species_ids, gene_ids), { | ||||
|         results <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|         # Prefilter the input data by species. | ||||
|         distances <- geposan::distances[species %chin% species_ids] | ||||
| 
 | ||||
|         # Add an index for quickly accessing data per gene. | ||||
|         setkey(distances, gene) | ||||
| 
 | ||||
|         genes_done <- 0 | ||||
|         genes_total <- length(gene_ids) | ||||
| 
 | ||||
|         # Perform the cluster analysis for one gene. | ||||
|         compute <- function(gene_id) { | ||||
|             data <- distances[gene_id, distance] | ||||
|             score <- clusteriness_priv(data) | ||||
| 
 | ||||
|             if (!is.null(progress)) { | ||||
|                 genes_done <<- genes_done + 1 | ||||
|                 progress(genes_done / genes_total) | ||||
|             } | ||||
| 
 | ||||
|             score | ||||
|         } | ||||
| 
 | ||||
|         structure( | ||||
|             list( | ||||
|                 results = results[, | ||||
|                     score := compute(gene), | ||||
|                     by = gene | ||||
|                 ] | ||||
|             ), | ||||
|             class = "geposan_method_results" | ||||
|         ) | ||||
|     }) | ||||
| } | ||||
							
								
								
									
										93
									
								
								R/clustering.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										93
									
								
								R/clustering.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,93 @@ | |||
| #' Perform a cluster analysis. | ||||
| #' | ||||
| #' This function will cluster the data using [stats::hclust()] and | ||||
| #' [stats::cutree()]. Every cluster with at least two members qualifies for | ||||
| #' further analysis. Clusters are then ranked based on their size in relation | ||||
| #' to the total number of values. The return value is a final score between | ||||
| #' 0.0 and 1.0. Lower ranking clusters contribute less to this score. | ||||
| #' | ||||
| #' @param data The values that should be scored. | ||||
| #' @param span The maximum span of values considered to be in one cluster. | ||||
| #' @param weight The weight that will be given to the next largest cluster in | ||||
| #'   relation to the previous one. For example, if `weight` is 0.7 (the | ||||
| #'   default), the first cluster will weigh 1.0, the second 0.7, the third 0.49 | ||||
| #'   etc. | ||||
| clusteriness <- function(data, span = 1000000, weight = 0.7) { | ||||
|     n <- length(data) | ||||
| 
 | ||||
|     # Return a score of 0.0 if there is just one or no value at all. | ||||
|     if (n < 2) { | ||||
|         return(0.0) | ||||
|     } | ||||
| 
 | ||||
|     # Cluster the data and compute the cluster sizes. | ||||
| 
 | ||||
|     tree <- stats::hclust(stats::dist(data)) | ||||
|     clusters <- stats::cutree(tree, h = span) | ||||
|     cluster_sizes <- sort(tabulate(clusters), decreasing = TRUE) | ||||
| 
 | ||||
|     # Compute the "clusteriness" score. | ||||
| 
 | ||||
|     score <- 0.0 | ||||
| 
 | ||||
|     for (i in seq_along(cluster_sizes)) { | ||||
|         cluster_size <- cluster_sizes[i] | ||||
| 
 | ||||
|         if (cluster_size >= 2) { | ||||
|             cluster_score <- cluster_size / n | ||||
|             score <- score + weight^(i - 1) * cluster_score | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     score | ||||
| } | ||||
| 
 | ||||
| #' Process genes by clustering their distances to telomeres. | ||||
| #' | ||||
| #' The result will be cached and can be reused for different presets, because | ||||
| #' it is independent of the reference genes in use. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @seealso [clusteriness()] | ||||
| #' | ||||
| #' @export | ||||
| clustering <- function() { | ||||
|     method( | ||||
|         id = "clustering", | ||||
|         name = "Clustering", | ||||
|         description = "Clustering of genes", | ||||
|         function(preset, progress) { | ||||
|             species_ids <- preset$species_ids | ||||
|             gene_ids <- preset$gene_ids | ||||
| 
 | ||||
|             cached("clustering", c(species_ids, gene_ids), { | ||||
|                 scores <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|                 # Prefilter the input data by species. | ||||
|                 distances <- geposan::distances[species %chin% species_ids] | ||||
| 
 | ||||
|                 genes_done <- 0 | ||||
|                 genes_total <- length(gene_ids) | ||||
| 
 | ||||
|                 # Perform the cluster analysis for one gene. | ||||
|                 compute <- function(gene_id) { | ||||
|                     data <- distances[gene == gene_id, distance] | ||||
|                     score <- clusteriness(data) | ||||
| 
 | ||||
|                     genes_done <<- genes_done + 1 | ||||
|                     progress(genes_done / genes_total) | ||||
| 
 | ||||
|                     score | ||||
|                 } | ||||
| 
 | ||||
|                 scores[, score := compute(gene), by = gene] | ||||
| 
 | ||||
|                 result( | ||||
|                     method = "clustering", | ||||
|                     scores = scores | ||||
|                 ) | ||||
|             }) | ||||
|         } | ||||
|     ) | ||||
| } | ||||
							
								
								
									
										151
									
								
								R/correlation.R
									
										
									
									
									
								
							
							
						
						
									
										151
									
								
								R/correlation.R
									
										
									
									
									
								
							|  | @ -1,88 +1,101 @@ | |||
| # Compute the mean correlation coefficient comparing gene distances with a set | ||||
| # of reference genes. | ||||
| correlation <- function(preset, progress = NULL) { | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
|     reference_gene_ids <- preset$reference_gene_ids | ||||
| #' Compute the mean correlation coefficient comparing gene distances with a set | ||||
| #' of reference genes. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @export | ||||
| correlation <- function() { | ||||
|     method( | ||||
|         id = "correlation", | ||||
|         name = "Correlation", | ||||
|         description = "Correlation with reference genes", | ||||
|         function(preset, progress) { | ||||
|             species_ids <- preset$species_ids | ||||
|             gene_ids <- preset$gene_ids | ||||
|             reference_gene_ids <- preset$reference_gene_ids | ||||
| 
 | ||||
|     cached( | ||||
|         "correlation", c(species_ids, gene_ids, reference_gene_ids), { | ||||
|             # Prefilter distances by species. | ||||
|             distances <- geposan::distances[species %chin% species_ids] | ||||
|             cached( | ||||
|                 "correlation", | ||||
|                 c(species_ids, gene_ids, reference_gene_ids), | ||||
|                 { # nolint | ||||
|                     # Prefilter distances by species. | ||||
|                     distances <- geposan::distances[species %chin% species_ids] | ||||
| 
 | ||||
|             # Transform data to get species as rows and genes as columns. We | ||||
|             # construct columns per species, because it requires fewer | ||||
|             # iterations, and transpose the table afterwards. | ||||
|                     # Transform data to get species as rows and genes as columns. | ||||
|                     # We construct columns per species, because it requires | ||||
|                     # fewer iterations, and transpose the table afterwards. | ||||
| 
 | ||||
|             data <- data.table(gene = gene_ids) | ||||
|                     data <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|             # Make a column containing distance data for each species. | ||||
|             for (species_id in species_ids) { | ||||
|                 species_data <- distances[ | ||||
|                     species == species_id, | ||||
|                     .(gene, distance) | ||||
|                 ] | ||||
|                     # Make a column containing distance data for each species. | ||||
|                     for (species_id in species_ids) { | ||||
|                         species_data <- distances[ | ||||
|                             species == species_id, | ||||
|                             .(gene, distance) | ||||
|                         ] | ||||
| 
 | ||||
|                 data <- merge(data, species_data, all.x = TRUE) | ||||
|                 setnames(data, "distance", species_id) | ||||
|             } | ||||
|                         data <- merge(data, species_data, all.x = TRUE) | ||||
|                         setnames(data, "distance", species_id) | ||||
|                     } | ||||
| 
 | ||||
|             # Transpose to the desired format. | ||||
|             data <- transpose(data, make.names = "gene") | ||||
|                     # Transpose to the desired format. | ||||
|                     data <- transpose(data, make.names = "gene") | ||||
| 
 | ||||
|             if (!is.null(progress)) progress(0.33) | ||||
|                     progress(0.33) | ||||
| 
 | ||||
|             # Take the reference data. | ||||
|             reference_data <- data[, ..reference_gene_ids] | ||||
|                     # Take the reference data. | ||||
|                     reference_data <- data[, ..reference_gene_ids] | ||||
| 
 | ||||
|             # Perform the correlation between all possible pairs. | ||||
|             results <- stats::cor( | ||||
|                 data[, ..gene_ids], | ||||
|                 reference_data, | ||||
|                 use = "pairwise.complete.obs", | ||||
|                 method = "spearman" | ||||
|             ) | ||||
|                     # Perform the correlation between all possible pairs. | ||||
|                     results <- stats::cor( | ||||
|                         data[, ..gene_ids], | ||||
|                         reference_data, | ||||
|                         use = "pairwise.complete.obs", | ||||
|                         method = "spearman" | ||||
|                     ) | ||||
| 
 | ||||
|             results <- data.table(results, keep.rownames = TRUE) | ||||
|             setnames(results, "rn", "gene") | ||||
|                     results <- data.table(results, keep.rownames = TRUE) | ||||
|                     setnames(results, "rn", "gene") | ||||
| 
 | ||||
|             # Remove correlations between the reference genes themselves. | ||||
|             for (reference_gene_id in reference_gene_ids) { | ||||
|                 column <- quote(reference_gene_id) | ||||
|                 results[gene == reference_gene_id, eval(column) := NA] | ||||
|             } | ||||
|                     # Remove correlations between the reference genes | ||||
|                     # themselves. | ||||
|                     for (reference_gene_id in reference_gene_ids) { | ||||
|                         column <- quote(reference_gene_id) | ||||
|                         results[gene == reference_gene_id, eval(column) := NA] | ||||
|                     } | ||||
| 
 | ||||
|             if (!is.null(progress)) progress(0.66) | ||||
|                     progress(0.66) | ||||
| 
 | ||||
|             # Compute the final score as the mean of known correlation scores. | ||||
|             # Negative correlations will correctly lessen the score, which will | ||||
|             # be clamped to zero as its lower bound. Genes with no possible | ||||
|             # correlations at all will be assumed to have a score of 0.0. | ||||
|                     # Compute the final score as the mean of known correlation | ||||
|                     # scores. Negative correlations will correctly lessen the | ||||
|                     # score, which will be clamped to zero as its lower bound. | ||||
|                     # Genes with no possible correlations at all will be assumed | ||||
|                     # to have a score of 0.0. | ||||
| 
 | ||||
|             compute_score <- function(scores) { | ||||
|                 score <- mean(scores, na.rm = TRUE) | ||||
|                     compute_score <- function(scores) { | ||||
|                         score <- mean(scores, na.rm = TRUE) | ||||
| 
 | ||||
|                 if (is.na(score) | score < 0.0) { | ||||
|                     score <- 0.0 | ||||
|                         if (is.na(score) | score < 0.0) { | ||||
|                             score <- 0.0 | ||||
|                         } | ||||
| 
 | ||||
|                         score | ||||
|                     } | ||||
| 
 | ||||
|                     results[, | ||||
|                         score := compute_score(as.matrix(.SD)), | ||||
|                         .SDcols = reference_gene_ids, | ||||
|                         by = gene | ||||
|                     ] | ||||
| 
 | ||||
|                     results[, .(gene, score)] | ||||
| 
 | ||||
|                     result( | ||||
|                         method = "correlation", | ||||
|                         scores = results[, .(gene, score)], | ||||
|                         details = list(all_correlations = results) | ||||
|                     ) | ||||
|                 } | ||||
| 
 | ||||
|                 score | ||||
|             } | ||||
| 
 | ||||
|             results[, | ||||
|                 score := compute_score(as.matrix(.SD)), | ||||
|                 .SDcols = reference_gene_ids, | ||||
|                 by = gene | ||||
|             ] | ||||
| 
 | ||||
|             results[, .(gene, score)] | ||||
| 
 | ||||
|             structure( | ||||
|                 list( | ||||
|                     results = results[, .(gene, score)], | ||||
|                     all_correlations = results | ||||
|                 ), | ||||
|                 class = "geposan_method_results" | ||||
|             ) | ||||
|         } | ||||
|     ) | ||||
|  |  | |||
							
								
								
									
										67
									
								
								R/method.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								R/method.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,67 @@ | |||
| #' Describe a new method for analyzing gene position data. | ||||
| #' | ||||
| #' @param id Unique identifier for the method. | ||||
| #' @param name Human readable name. | ||||
| #' @param description Slightly longer description. | ||||
| #' @param func Function to apply the method. The function should accept two | ||||
| #'   parameters: an object of class `geposan_preset` as input and a function to | ||||
| #'   report progress information to as a numeric value. The return value should | ||||
| #'   be an object of class `geposan_result`. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @export | ||||
| method <- function(id, name, description, func) { | ||||
|     stopifnot(is.character(id) & length(id) == 1) | ||||
|     stopifnot(is.character(name) & length(name) == 1) | ||||
|     stopifnot(is.character(description) & length(description) == 1) | ||||
|     stopifnot(is.function(func)) | ||||
| 
 | ||||
|     structure( | ||||
|         list( | ||||
|             id = id, | ||||
|             name = name, | ||||
|             description = description, | ||||
|             func = func | ||||
|         ), | ||||
|         class = "geposan_method" | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| #' Get a list of all available methods. | ||||
| #' | ||||
| #' @export | ||||
| all_methods <- function() { | ||||
|     list( | ||||
|         clustering(), | ||||
|         correlation(), | ||||
|         neural(), | ||||
|         adjacency(), | ||||
|         proximity() | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| #' Print a method object. | ||||
| #' | ||||
| #' @param x The method to print. | ||||
| #' @param ... Other parameters. | ||||
| #' | ||||
| #' @seealso [method()] | ||||
| #' | ||||
| #' @export | ||||
| print.geposan_method <- function(x, ...) { | ||||
|     cat(sprintf( | ||||
|         paste0( | ||||
|             "geposan method:", | ||||
|             "\n  Method ID: %s", | ||||
|             "\n  Name: %s", | ||||
|             "\n  Description: %s", | ||||
|             "\n" | ||||
|         ), | ||||
|         x$id, | ||||
|         x$name, | ||||
|         x$description | ||||
|     )) | ||||
| 
 | ||||
|     invisible(x) | ||||
| } | ||||
							
								
								
									
										424
									
								
								R/neural.R
									
										
									
									
									
								
							
							
						
						
									
										424
									
								
								R/neural.R
									
										
									
									
									
								
							|  | @ -1,248 +1,254 @@ | |||
| # Find genes by training and applying a neural network. | ||||
| # | ||||
| # @param seed The seed will be used to make the results reproducible. | ||||
| # @param n_models This number specifies how many sets of training data should | ||||
| #   be created. For each set, there will be a model trained on the remaining | ||||
| #   training data and validated using this set. For non-training genes, the | ||||
| #   final score will be the mean of the result of applying the different | ||||
| #   models. | ||||
| neural <- function(preset, progress = NULL, seed = 751833, n_models = 5) { | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
|     reference_gene_ids <- preset$reference_gene_ids | ||||
| #' Find genes by training and applying a neural network. | ||||
| #' | ||||
| #' @param seed The seed will be used to make the results reproducible. | ||||
| #' @param n_models This number specifies how many sets of training data should | ||||
| #'   be created. For each set, there will be a model trained on the remaining | ||||
| #'   training data and validated using this set. For non-training genes, the | ||||
| #'   final score will be the mean of the result of applying the different | ||||
| #'   models. There should be at least two training sets. The analysis will only | ||||
| #'   work if there is at least one reference gene per training set. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @export | ||||
| neural <- function(seed = 180199, n_models = 5) { | ||||
|     method( | ||||
|         id = "neural", | ||||
|         name = "Neural", | ||||
|         description = "Assessment by neural network", | ||||
|         function(preset, progress) { | ||||
|             species_ids <- preset$species_ids | ||||
|             gene_ids <- preset$gene_ids | ||||
|             reference_gene_ids <- preset$reference_gene_ids | ||||
| 
 | ||||
|     cached( | ||||
|         "neural", | ||||
|         c(species_ids, gene_ids, reference_gene_ids, seed, n_models), | ||||
|         { # nolint | ||||
|             reference_count <- length(reference_gene_ids) | ||||
|             if (!n_models %in% 2:reference_count) { | ||||
|                 stop(paste0( | ||||
|                     "n_models has to be between 2 and the number of reference ", | ||||
|                     "genes." | ||||
|                 )) | ||||
|             } | ||||
|             cached( | ||||
|                 "neural", | ||||
|                 c(species_ids, gene_ids, reference_gene_ids, seed, n_models), | ||||
|                 { # nolint | ||||
|                     reference_count <- length(reference_gene_ids) | ||||
|                     stopifnot(n_models %in% 2:reference_count) | ||||
| 
 | ||||
|             # Make results reproducible. | ||||
|             tensorflow::set_random_seed(seed) | ||||
|                     # Make results reproducible. | ||||
|                     tensorflow::set_random_seed(seed) | ||||
| 
 | ||||
|             # Step 1: Prepare input data. | ||||
|             # --------------------------- | ||||
|                     # Step 1: Prepare input data. | ||||
|                     # --------------------------- | ||||
| 
 | ||||
|             # Prefilter distances by species. | ||||
|             distances <- geposan::distances[species %chin% species_ids] | ||||
|                     # Prefilter distances by species. | ||||
|                     distances <- geposan::distances[species %chin% species_ids] | ||||
| 
 | ||||
|             # Input data for the network. This contains the gene ID as an | ||||
|             # identifier as well as the per-species gene distances as input | ||||
|             # variables. | ||||
|             data <- data.table(gene = gene_ids) | ||||
|                     # Input data for the network. This contains the gene ID as | ||||
|                     # an identifier as well as the per-species gene distances as | ||||
|                     # input variables. | ||||
|                     data <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|             # Buffer to keep track of the names of the input variables. | ||||
|             input_vars <- NULL | ||||
|                     # Buffer to keep track of the names of the input variables. | ||||
|                     input_vars <- NULL | ||||
| 
 | ||||
|             # Make a columns containing positions and distances for each | ||||
|             # species. | ||||
|             for (species_id in species_ids) { | ||||
|                 species_data <- distances[ | ||||
|                     species == species_id, | ||||
|                     .(gene, distance) | ||||
|                 ] | ||||
|                     # Make a column containing positions and distances for each | ||||
|                     # species. | ||||
|                     for (species_id in species_ids) { | ||||
|                         species_data <- distances[ | ||||
|                             species == species_id, | ||||
|                             .(gene, distance) | ||||
|                         ] | ||||
| 
 | ||||
|                 # Only include species with at least 25% known values. As | ||||
|                 # positions and distances always coexist, we don't loose any | ||||
|                 # data here. | ||||
|                         # Only include species with at least 25% known values. | ||||
|                         # As positions and distances always coexist, we don't | ||||
|                         # lose any data here. | ||||
| 
 | ||||
|                 species_data <- stats::na.omit(species_data) | ||||
|                         species_data <- stats::na.omit(species_data) | ||||
| 
 | ||||
|                 if (nrow(species_data) >= 0.25 * length(gene_ids)) { | ||||
|                     data <- merge(data, species_data, all.x = TRUE) | ||||
|                         if (nrow(species_data) >= 0.25 * length(gene_ids)) { | ||||
|                             data <- merge(data, species_data, all.x = TRUE) | ||||
| 
 | ||||
|                     # Replace missing data with mean values. The neural network | ||||
|                     # can't handle NAs in a meaningful way. Choosing extreme | ||||
|                     # values here would result in heavily biased results. | ||||
|                     # Therefore, the mean value is chosen as a compromise. | ||||
|                     # However, this will of course lessen the significance of | ||||
|                     # the results. | ||||
|                             # Replace missing data with mean values. The neural | ||||
|                             # network can't handle NAs in a meaningful way. | ||||
|                             # Choosing extreme values here would result in | ||||
|                             # heavily biased results. Therefore, the mean value | ||||
|                             # is chosen as a compromise. However, this will of | ||||
|                             # course lessen the significance of the results. | ||||
| 
 | ||||
|                     mean_distance <- round(species_data[, mean(distance)]) | ||||
|                     data[is.na(distance), `:=`(distance = mean_distance)] | ||||
|                             mean_distance <- round( | ||||
|                                 species_data[, mean(distance)] | ||||
|                             ) | ||||
| 
 | ||||
|                     # Name the new column after the species. | ||||
|                     setnames(data, "distance", species_id) | ||||
|                             data[is.na(distance), distance := mean_distance] | ||||
| 
 | ||||
|                     # Add the input variable to the buffer. | ||||
|                     input_vars <- c(input_vars, species_id) | ||||
|                 } | ||||
|             } | ||||
|                             # Name the new column after the species. | ||||
|                             setnames(data, "distance", species_id) | ||||
| 
 | ||||
|             if (!is.null(progress)) { | ||||
|                 progress(0.1) | ||||
|             } | ||||
|                             # Add the input variable to the buffer. | ||||
|                             input_vars <- c(input_vars, species_id) | ||||
|                         } | ||||
|                     } | ||||
| 
 | ||||
|             # Step 2: Prepare training data. | ||||
|             # ------------------------------ | ||||
|                     progress(0.1) | ||||
| 
 | ||||
|             # Take out the reference data. | ||||
|                     # Step 2: Prepare training data. | ||||
|                     # ------------------------------ | ||||
| 
 | ||||
|             reference_data <- data[gene %chin% reference_gene_ids] | ||||
|             reference_data[, score := 1.0] | ||||
|                     # Take out the reference data. | ||||
| 
 | ||||
|             # Take out random samples from the remaining genes. This is another | ||||
|             # compromise with a negative impact on significance. Because there | ||||
|             # is no information on genes with are explicitely *not* TPE-OLD | ||||
|             # genes, we have to assume that a random sample of genes has a low | ||||
|             # probability of including TPE-OLD genes. | ||||
|                     reference_data <- data[gene %chin% reference_gene_ids] | ||||
|                     reference_data[, score := 1.0] | ||||
| 
 | ||||
|             without_reference_data <- data[!gene %chin% reference_gene_ids] | ||||
|                     # Take out random samples from the remaining genes. This is | ||||
|                     # another compromise with a negative impact on | ||||
|                     # significance. We assume that a random gene is not likely | ||||
|                     # to match the reference genes. | ||||
| 
 | ||||
|             control_data <- without_reference_data[ | ||||
|                 sample( | ||||
|                     nrow(without_reference_data), | ||||
|                     reference_count | ||||
|                 ) | ||||
|             ] | ||||
|                     without_reference_data <- data[ | ||||
|                         !gene %chin% reference_gene_ids | ||||
|                     ] | ||||
| 
 | ||||
|             control_data[, score := 0.0] | ||||
|                     control_data <- without_reference_data[ | ||||
|                         sample( | ||||
|                             nrow(without_reference_data), | ||||
|                             reference_count | ||||
|                         ) | ||||
|                     ] | ||||
| 
 | ||||
|             # Split the training data into random sets to have validation data | ||||
|             # for each model. | ||||
|                     control_data[, score := 0.0] | ||||
| 
 | ||||
|             # Scramble the source tables. | ||||
|             reference_data <- reference_data[sample(reference_count)] | ||||
|             control_data <- control_data[sample(reference_count)] | ||||
|                     # Split the training data into random sets to have | ||||
|                     # validation data for each model. | ||||
| 
 | ||||
|             networks <- list() | ||||
|                     # Scramble the source tables. | ||||
|                     reference_data <- reference_data[sample(reference_count)] | ||||
|                     control_data <- control_data[sample(reference_count)] | ||||
| 
 | ||||
|             indices <- seq_len(reference_count) | ||||
|             indices_split <- split(indices, indices %% n_models) | ||||
|                     networks <- list() | ||||
| 
 | ||||
|             for (i in seq_len(n_models)) { | ||||
|                 training_data <- rbindlist(list( | ||||
|                     reference_data[!indices_split[[i]]], | ||||
|                     control_data[!indices_split[[i]]] | ||||
|                 )) | ||||
|                     indices <- seq_len(reference_count) | ||||
|                     indices_split <- split(indices, indices %% n_models) | ||||
| 
 | ||||
|                 validation_data <- rbindlist(list( | ||||
|                     reference_data[indices_split[[i]]], | ||||
|                     control_data[indices_split[[i]]] | ||||
|                 )) | ||||
|                     for (i in seq_len(n_models)) { | ||||
|                         training_data <- rbindlist(list( | ||||
|                             reference_data[!indices_split[[i]]], | ||||
|                             control_data[!indices_split[[i]]] | ||||
|                         )) | ||||
| 
 | ||||
|                 networks[[i]] <- list( | ||||
|                     training_data = training_data, | ||||
|                     validation_data = validation_data | ||||
|                 ) | ||||
|             } | ||||
|                         validation_data <- rbindlist(list( | ||||
|                             reference_data[indices_split[[i]]], | ||||
|                             control_data[indices_split[[i]]] | ||||
|                         )) | ||||
| 
 | ||||
|             # Step 3: Create, train and apply neural network. | ||||
|             # ----------------------------------------------- | ||||
|                         networks[[i]] <- list( | ||||
|                             training_data = training_data, | ||||
|                             validation_data = validation_data | ||||
|                         ) | ||||
|                     } | ||||
| 
 | ||||
|             # Layers for the neural network. | ||||
|             input_layer <- length(input_vars) | ||||
|             layer1 <- input_layer | ||||
|             layer2 <- 0.5 * input_layer | ||||
|             layer3 <- 0.5 * layer2 | ||||
|                     # Step 3: Create, train and apply neural network. | ||||
|                     # ----------------------------------------------- | ||||
| 
 | ||||
|             # Convert data to matrix and normalize it. | ||||
|             to_matrix <- function(data) { | ||||
|                 data_matrix <- as.matrix(data[, ..input_vars]) | ||||
|                 colnames(data_matrix) <- NULL | ||||
|                 keras::normalize(data_matrix) | ||||
|             } | ||||
|                     # Layers for the neural network. | ||||
|                     input_layer <- length(input_vars) | ||||
|                     layer1 <- input_layer | ||||
|                     layer2 <- 0.5 * input_layer | ||||
|                     layer3 <- 0.5 * layer2 | ||||
| 
 | ||||
|             data_matrix <- to_matrix(data) | ||||
|             output_vars <- NULL | ||||
|                     # Convert data to matrix and normalize it. | ||||
|                     to_matrix <- function(data) { | ||||
|                         data_matrix <- as.matrix(data[, ..input_vars]) | ||||
|                         colnames(data_matrix) <- NULL | ||||
|                         keras::normalize(data_matrix) | ||||
|                     } | ||||
| 
 | ||||
|             for (i in seq_along(networks)) { | ||||
|                 # Create a new model for each training session, because the | ||||
|                 # model would keep its state across training sessions otherwise. | ||||
|                 model <- keras::keras_model_sequential() |> | ||||
|                     keras::layer_dense( | ||||
|                         units = layer1, | ||||
|                         activation = "relu", | ||||
|                         input_shape = input_layer, | ||||
|                     ) |> | ||||
|                     keras::layer_dense( | ||||
|                         units = layer2, | ||||
|                         activation = "relu", | ||||
|                         kernel_regularizer = keras::regularizer_l2() | ||||
|                     ) |> | ||||
|                     keras::layer_dense( | ||||
|                         units = layer3, | ||||
|                         activation = "relu", | ||||
|                         kernel_regularizer = keras::regularizer_l2() | ||||
|                     ) |> | ||||
|                     keras::layer_dense( | ||||
|                         units = 1, | ||||
|                         activation = "sigmoid" | ||||
|                     ) |> | ||||
|                     keras::compile( | ||||
|                         loss = keras::loss_mean_absolute_error(), | ||||
|                         optimizer = keras::optimizer_adam() | ||||
|                     data_matrix <- to_matrix(data) | ||||
|                     output_vars <- NULL | ||||
| 
 | ||||
|                     for (i in seq_along(networks)) { | ||||
|                         # Create a new model for each training session, because | ||||
|                         # the model would keep its state across training | ||||
|                         # sessions otherwise. | ||||
|                         model <- keras::keras_model_sequential() |> | ||||
|                             keras::layer_dense( | ||||
|                                 units = layer1, | ||||
|                                 activation = "relu", | ||||
|                                 input_shape = input_layer, | ||||
|                             ) |> | ||||
|                             keras::layer_dense( | ||||
|                                 units = layer2, | ||||
|                                 activation = "relu", | ||||
|                                 kernel_regularizer = keras::regularizer_l2() | ||||
|                             ) |> | ||||
|                             keras::layer_dense( | ||||
|                                 units = layer3, | ||||
|                                 activation = "relu", | ||||
|                                 kernel_regularizer = keras::regularizer_l2() | ||||
|                             ) |> | ||||
|                             keras::layer_dense( | ||||
|                                 units = 1, | ||||
|                                 activation = "sigmoid" | ||||
|                             ) |> | ||||
|                             keras::compile( | ||||
|                                 loss = keras::loss_mean_absolute_error(), | ||||
|                                 optimizer = keras::optimizer_adam() | ||||
|                             ) | ||||
| 
 | ||||
|                         # Train the model. | ||||
| 
 | ||||
|                         network <- networks[[i]] | ||||
| 
 | ||||
|                         training_data <- network$training_data | ||||
|                         training_matrix <- to_matrix(training_data) | ||||
|                         validation_data <- network$validation_data | ||||
|                         validation_matrix <- to_matrix(validation_data) | ||||
| 
 | ||||
|                         fit <- keras::fit( | ||||
|                             model, | ||||
|                             x = training_matrix, | ||||
|                             y = training_data$score, | ||||
|                             validation_data = list( | ||||
|                                 x_val = validation_matrix, | ||||
|                                 y_val = validation_data$score | ||||
|                             ), | ||||
|                             epochs = 500, | ||||
|                             verbose = FALSE | ||||
|                         ) | ||||
| 
 | ||||
|                         # Apply the model. | ||||
| 
 | ||||
|                         data[, new_score := stats::predict(model, data_matrix)] | ||||
| 
 | ||||
|                         # Remove the values of the training data itself. | ||||
|                         data[gene %chin% training_data$gene, new_score := NA] | ||||
| 
 | ||||
|                         output_var <- sprintf("score%i", i) | ||||
|                         setnames(data, "new_score", output_var) | ||||
|                         output_vars <- c(output_vars, output_var) | ||||
| 
 | ||||
| 
 | ||||
|                         # Store the details. | ||||
| 
 | ||||
|                         networks[[i]]$model <- keras::serialize_model(model) | ||||
|                         networks[[i]]$fit <- fit | ||||
| 
 | ||||
|                         progress(0.1 + i * (0.9 / n_models)) | ||||
|                     } | ||||
| 
 | ||||
|                     # Compute the final score as the mean score. | ||||
|                     data[, | ||||
|                         score := mean(as.numeric(.SD), na.rm = TRUE), | ||||
|                         .SDcols = output_vars, | ||||
|                         by = gene | ||||
|                     ] | ||||
| 
 | ||||
|                     progress(1.0) | ||||
| 
 | ||||
|                     result( | ||||
|                         method = "neural", | ||||
|                         scores = data[, .(gene, score)], | ||||
|                         details = list( | ||||
|                             seed = seed, | ||||
|                             n_models = n_models, | ||||
|                             all_results = data[, !..input_vars], | ||||
|                             networks = networks | ||||
|                         ) | ||||
|                     ) | ||||
| 
 | ||||
|                 # Train the model. | ||||
| 
 | ||||
|                 network <- networks[[i]] | ||||
| 
 | ||||
|                 training_data <- network$training_data | ||||
|                 training_matrix <- to_matrix(training_data) | ||||
|                 validation_data <- network$validation_data | ||||
|                 validation_matrix <- to_matrix(validation_data) | ||||
| 
 | ||||
|                 fit <- keras::fit( | ||||
|                     model, | ||||
|                     x = training_matrix, | ||||
|                     y = training_data$score, | ||||
|                     validation_data = list( | ||||
|                         x_val = validation_matrix, | ||||
|                         y_val = validation_data$score | ||||
|                     ), | ||||
|                     epochs = 500, | ||||
|                     verbose = FALSE | ||||
|                 ) | ||||
| 
 | ||||
|                 # Apply the model. | ||||
| 
 | ||||
|                 data[, new_score := stats::predict(model, data_matrix)] | ||||
| 
 | ||||
|                 # Remove the values of the training data itself. | ||||
|                 data[gene %chin% training_data$gene, new_score := NA] | ||||
| 
 | ||||
|                 output_var <- sprintf("score%i", i) | ||||
|                 setnames(data, "new_score", output_var) | ||||
|                 output_vars <- c(output_vars, output_var) | ||||
| 
 | ||||
| 
 | ||||
|                 # Store the details. | ||||
| 
 | ||||
|                 networks[[i]]$model <- keras::serialize_model(model) | ||||
|                 networks[[i]]$fit <- fit | ||||
| 
 | ||||
|                 if (!is.null(progress)) { | ||||
|                     progress(0.1 + i * (0.9 / n_models)) | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             # Compute the final score as the mean score. | ||||
|             data[, | ||||
|                 score := mean(as.numeric(.SD), na.rm = TRUE), | ||||
|                 .SDcols = output_vars, | ||||
|                 by = gene | ||||
|             ] | ||||
| 
 | ||||
|             if (!is.null(progress)) { | ||||
|                 progress(1.0) | ||||
|             } | ||||
| 
 | ||||
|             structure( | ||||
|                 list( | ||||
|                     results = data[, .(gene, score)], | ||||
|                     seed = seed, | ||||
|                     n_models = n_models, | ||||
|                     all_results = data[, !..input_vars], | ||||
|                     networks = networks | ||||
|                 ), | ||||
|                 class = "geposan_method_results" | ||||
|             ) | ||||
|         } | ||||
|     ) | ||||
|  |  | |||
							
								
								
									
										70
									
								
								R/preset.R
									
										
									
									
									
								
							
							
						
						
									
										70
									
								
								R/preset.R
									
										
									
									
									
								
							|  | @ -5,46 +5,22 @@ | |||
| #' reference genes to be able to assess the results later. The genes will be | ||||
| #' filtered based on how many species have data for them. Genes which only have | ||||
| #' orthologs for less than 25% of the input species will be excluded from the | ||||
| #' preset and the analyis. | ||||
| #' preset and the analysis. See the different method functions for the available | ||||
| #' methods: [clustering()], [correlation()], [neural()], [adjacency()] and | ||||
| #' [proximity()]. | ||||
| #' | ||||
| #' Available methods are: | ||||
| #' | ||||
| #'  - `clusteriness` How much the gene distances to the nearest telomere | ||||
| #'    cluster across species. | ||||
| #'  - `correlation` The mean correlation of gene distances to the nearest | ||||
| #'    telomere across species. | ||||
| #'  - `neural` Assessment by neural network trained on the reference genes. | ||||
| #'  - `adjacency` Proximity to reference genes. | ||||
| #'  - `proximity` Mean proximity to telomeres. | ||||
| #' | ||||
| #' Available optimization targets are: | ||||
| #' | ||||
| #'  - `mean` Mean rank of the reference genes. | ||||
| #'  - `median` Median rank of the reference genes. | ||||
| #'  - `max` First rank of the reference genes. | ||||
| #'  - `min` Last rank of the reference genes. | ||||
| #' | ||||
| #' @param methods Methods to apply. | ||||
| #' @param methods List of methods to apply. | ||||
| #' @param species_ids IDs of species to include. | ||||
| #' @param gene_ids IDs of genes to screen. | ||||
| #' @param reference_gene_ids IDs of reference genes to compare to. | ||||
| #' @param optimization_target Parameter of the reference genes that the ranking | ||||
| #'   should be optimized for. | ||||
| #' | ||||
| #' @return The preset to use with [analyze()]. | ||||
| #' | ||||
| #' @export | ||||
| preset <- function(methods = c( | ||||
|                        "clusteriness", | ||||
|                        "correlation", | ||||
|                        "neural", | ||||
|                        "adjacency", | ||||
|                        "proximity" | ||||
|                    ), | ||||
|                    species_ids = NULL, | ||||
|                    gene_ids = NULL, | ||||
|                    reference_gene_ids = NULL, | ||||
|                    optimization_target = "mean_rank") { | ||||
| preset <- function(methods = all_methods(), | ||||
|                    species_ids = geposan::species$id, | ||||
|                    gene_ids = geposan::genes$id, | ||||
|                    reference_gene_ids) { | ||||
|     # Count included species per gene. | ||||
|     genes_n_species <- geposan::distances[ | ||||
|         species %chin% species_ids, | ||||
|  | @ -63,11 +39,10 @@ preset <- function(methods = c( | |||
|     # for the object later. | ||||
|     structure( | ||||
|         list( | ||||
|             methods = sort(methods), | ||||
|             methods = methods, | ||||
|             species_ids = sort(species_ids), | ||||
|             gene_ids = sort(gene_ids_filtered), | ||||
|             reference_gene_ids = sort(reference_gene_ids), | ||||
|             optimization_target = optimization_target | ||||
|             reference_gene_ids = sort(reference_gene_ids) | ||||
|         ), | ||||
|         class = "geposan_preset" | ||||
|     ) | ||||
|  | @ -82,25 +57,20 @@ preset <- function(methods = c( | |||
| #' | ||||
| #' @export | ||||
| print.geposan_preset <- function(x, ...) { | ||||
|     cat("geposan preset:") | ||||
|     cat("\n  Included methods: ") | ||||
|     cat(x$methods, sep = ", ") | ||||
| 
 | ||||
|     cat(sprintf( | ||||
|         "\n  Input data: %i species, %i genes", | ||||
|         paste0( | ||||
|             "geposan preset:", | ||||
|             "\n  Included methods: %s", | ||||
|             "\n  Number of species: %i", | ||||
|             "\n  Number of genes: %i", | ||||
|             "\n  Reference genes: %i", | ||||
|             "\n" | ||||
|         ), | ||||
|         paste(sapply(x$methods, function(m) m$id), collapse = ", "), | ||||
|         length(x$species_ids), | ||||
|         length(x$gene_ids) | ||||
|     )) | ||||
| 
 | ||||
|     cat(sprintf( | ||||
|         "\n  Comparison data: %i reference genes", | ||||
|         length(x$gene_ids), | ||||
|         length(x$reference_gene_ids) | ||||
|     )) | ||||
| 
 | ||||
|     cat(sprintf( | ||||
|         "\n  Optimization target: %s\n", | ||||
|         x$optimization_target | ||||
|     )) | ||||
| 
 | ||||
|     invisible(x) | ||||
| } | ||||
|  |  | |||
|  | @ -1,34 +1,39 @@ | |||
| # Score the mean distance of genes to the telomeres across species. | ||||
| # | ||||
| # A score will be given to each gene such that 0.0 corresponds to the maximal | ||||
| # mean distance across all genes and 1.0 corresponds to a distance of 0. | ||||
| proximity <- function(preset, progress = NULL) { | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
| #' Score the mean distance of genes to the telomeres across species. | ||||
| #' | ||||
| #' A score will be given to each gene such that 0.0 corresponds to the maximal | ||||
| #' mean distance across all genes and 1.0 corresponds to a distance of 0. | ||||
| #' | ||||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @export | ||||
| proximity <- function() { | ||||
|     method( | ||||
|         id = "proximity", | ||||
|         name = "Proximity", | ||||
|         description = "Proximity to telomeres", | ||||
|         function(preset, progress) { | ||||
|             species_ids <- preset$species_ids | ||||
|             gene_ids <- preset$gene_ids | ||||
| 
 | ||||
|     cached("proximity", c(species_ids, gene_ids), { | ||||
|         # Prefilter distances by species and gene. | ||||
|         data <- geposan::distances[ | ||||
|             species %chin% preset$species_ids & gene %chin% preset$gene_ids | ||||
|         ] | ||||
|             cached("proximity", c(species_ids, gene_ids), { | ||||
|                 # Prefilter distances by species and gene. | ||||
|                 data <- geposan::distances[ | ||||
|                     species %chin% preset$species_ids & | ||||
|                         gene %chin% preset$gene_ids | ||||
|                 ] | ||||
| 
 | ||||
|         # Compute the score as described above. | ||||
|         data <- data[, .(mean_distance = mean(distance)), by = "gene"] | ||||
|         max_distance <- data[, max(mean_distance)] | ||||
|         data[, score := 1 - mean_distance / max_distance] | ||||
|                 # Compute the score as described above. | ||||
|                 data <- data[, .(mean_distance = mean(distance)), by = "gene"] | ||||
|                 max_distance <- data[, max(mean_distance)] | ||||
|                 data[, score := 1 - mean_distance / max_distance] | ||||
| 
 | ||||
|         if (!is.null(progress)) { | ||||
|             # We do everything in one go, so it's not possible to report | ||||
|             # detailed progress information. As the method is relatively quick, | ||||
|             # this should not be a problem. | ||||
|             progress(1.0) | ||||
|                 progress(1.0) | ||||
| 
 | ||||
|                 result( | ||||
|                     method = "proximity", | ||||
|                     scores = data[, .(gene, score)] | ||||
|                 ) | ||||
|             }) | ||||
|         } | ||||
| 
 | ||||
|         structure( | ||||
|             list( | ||||
|                 results = data[, .(gene, score)] | ||||
|             ), | ||||
|             class = "geposan_method_results" | ||||
|         ) | ||||
|     }) | ||||
|     ) | ||||
| } | ||||
|  |  | |||
							
								
								
									
										12
									
								
								R/ranking.R
									
										
									
									
									
								
							
							
						
						
									
										12
									
								
								R/ranking.R
									
										
									
									
									
								
							|  | @ -13,10 +13,10 @@ | |||
| #' | ||||
| #' @export | ||||
| ranking <- function(analysis, weights) { | ||||
|     if (inherits(analysis, "geposan_analysis")) { | ||||
|         ranking <- copy(analysis$ranking) | ||||
|     } else if (inherits(analysis, "geposan_results")) { | ||||
|         ranking <- copy(analysis) | ||||
|     ranking <- if (inherits(analysis, "geposan_analysis")) { | ||||
|         copy(analysis$scores) | ||||
|     } else if (inherits(analysis, "geposan_ranking")) { | ||||
|         copy(analysis) | ||||
|     } else { | ||||
|         stop("Invalid analyis. Use geposan::analyze().") | ||||
|     } | ||||
|  | @ -39,7 +39,7 @@ ranking <- function(analysis, weights) { | |||
| 
 | ||||
|     structure( | ||||
|         ranking, | ||||
|         class = c("geposan_ranking", "geposan_results", class(ranking)) | ||||
|         class = c("geposan_ranking", class(ranking)) | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
|  | @ -60,7 +60,7 @@ ranking <- function(analysis, weights) { | |||
| #' @export | ||||
| optimal_weights <- function(analysis, methods, reference_gene_ids, | ||||
|                             target = "mean") { | ||||
|     if (!inherits(analysis, c("geposan_analysis", "geposan_results"))) { | ||||
|     if (!inherits(analysis, c("geposan_analysis", "geposan_ranking"))) { | ||||
|         stop("Invalid analyis. Use geposan::analyze().") | ||||
|     } | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										50
									
								
								R/result.R
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								R/result.R
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,50 @@ | |||
| #' Result of applying a method on gene position data. | ||||
| #' | ||||
| #' @param method_id ID of the method that produced this result. | ||||
| #' @param scores A `data.frame` mapping gene IDs (`gene`) to computed scores | ||||
| #'   between 0.0 and 1.0 (`score`). | ||||
| #' @param details Optional details that may contain intermediate results as | ||||
| #'   well as other information on the method application. | ||||
| #' | ||||
| #' @return An object of class `geposan_result`. | ||||
| #' | ||||
| #' @export | ||||
| result <- function(method_id, scores, details = list()) { | ||||
|     stopifnot(is.data.frame(scores) & | ||||
|         c("gene", "score") %chin% colnames(scores)) | ||||
|     stopifnot(is.list(details)) | ||||
| 
 | ||||
|     structure( | ||||
|         list( | ||||
|             method_id = method_id, | ||||
|             scores = scores, | ||||
|             details = details | ||||
|         ), | ||||
|         class = "geposan_result" | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| #' Print a result object. | ||||
| #' | ||||
| #' @param x The result to print. | ||||
| #' @param ... Other parameters. | ||||
| #' | ||||
| #' @seealso [result()] | ||||
| #' | ||||
| #' @export | ||||
| print.geposan_result <- function(x, ...) { | ||||
|     cat(sprintf( | ||||
|         paste0( | ||||
|             "geposan result:", | ||||
|             "\n  Method: %s", | ||||
|             "\n  Number of genes: %i", | ||||
|             "\n  Available details: %s", | ||||
|             "\n" | ||||
|         ), | ||||
|         x$method_id, | ||||
|         nrow(x$scores), | ||||
|         paste(names(x$details), collapse = ", ") | ||||
|     )) | ||||
| 
 | ||||
|     invisible(x) | ||||
| } | ||||
							
								
								
									
										16
									
								
								man/adjacency.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								man/adjacency.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,16 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/adjacency.R | ||||
| \name{adjacency} | ||||
| \alias{adjacency} | ||||
| \title{Score genes based on their proximity to the reference genes.} | ||||
| \usage{ | ||||
| adjacency() | ||||
| } | ||||
| \value{ | ||||
| An object of class \code{geposan_method}. | ||||
| } | ||||
| \description{ | ||||
| This method finds the distance value with the maximum density for each gene | ||||
| (i.e. the mode of its estimated distribution). Genes are scored by comparing | ||||
| those distance values with the values of the reference genes. | ||||
| } | ||||
							
								
								
									
										11
									
								
								man/all_methods.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								man/all_methods.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,11 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/method.R | ||||
| \name{all_methods} | ||||
| \alias{all_methods} | ||||
| \title{Get a list of all available methods.} | ||||
| \usage{ | ||||
| all_methods() | ||||
| } | ||||
| \description{ | ||||
| Get a list of all available methods. | ||||
| } | ||||
|  | @ -2,7 +2,7 @@ | |||
| % Please edit documentation in R/analyze.R | ||||
| \name{analyze} | ||||
| \alias{analyze} | ||||
| \title{Analyze by applying the specified preset.} | ||||
| \title{Analyze genes based on position data.} | ||||
| \usage{ | ||||
| analyze(preset, progress = NULL) | ||||
| } | ||||
|  | @ -11,17 +11,18 @@ analyze(preset, progress = NULL) | |||
| 
 | ||||
| \item{progress}{A function to be called for progress information. The | ||||
| function should accept a number between 0.0 and 1.0 for the current | ||||
| progress.} | ||||
| progress. If no function is provided, a simple text progress bar will be | ||||
| shown.} | ||||
| } | ||||
| \value{ | ||||
| An object containing the results of the analysis with the following | ||||
| items: | ||||
| \describe{ | ||||
| \item{\code{preset}}{The preset that was used.} | ||||
| \item{\code{weights}}{The optimal weights for ranking the reference genes.} | ||||
| \item{\code{ranking}}{The optimal ranking created using the weights.} | ||||
| \item{\code{scores}}{Table containing all scores for each gene.} | ||||
| \item{\code{results}}{Results from the different methods including details.} | ||||
| } | ||||
| } | ||||
| \description{ | ||||
| Analyze by applying the specified preset. | ||||
| Analyze genes based on position data. | ||||
| } | ||||
|  |  | |||
							
								
								
									
										25
									
								
								man/clusteriness.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								man/clusteriness.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,25 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/clustering.R | ||||
| \name{clusteriness} | ||||
| \alias{clusteriness} | ||||
| \title{Perform a cluster analysis.} | ||||
| \usage{ | ||||
| clusteriness(data, span = 1e+06, weight = 0.7) | ||||
| } | ||||
| \arguments{ | ||||
| \item{data}{The values that should be scored.} | ||||
| 
 | ||||
| \item{span}{The maximum span of values considered to be in one cluster.} | ||||
| 
 | ||||
| \item{weight}{The weight that will be given to the next largest cluster in | ||||
| relation to the previous one. For example, if \code{weight} is 0.7 (the | ||||
| default), the first cluster will weigh 1.0, the second 0.7, the third 0.49 | ||||
| etc.} | ||||
| } | ||||
| \description{ | ||||
| This function will cluster the data using \code{\link[stats:hclust]{stats::hclust()}} and | ||||
| \code{\link[stats:cutree]{stats::cutree()}}. Every cluster with at least two members qualifies for | ||||
| further analysis. Clusters are then ranked based on their size in relation | ||||
| to the total number of values. The return value is a final score between | ||||
| 0.0 and 1.0. Lower ranking clusters contribute less to this score. | ||||
| } | ||||
							
								
								
									
										18
									
								
								man/clustering.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								man/clustering.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,18 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/clustering.R | ||||
| \name{clustering} | ||||
| \alias{clustering} | ||||
| \title{Process genes by clustering their distances to telomeres.} | ||||
| \usage{ | ||||
| clustering() | ||||
| } | ||||
| \value{ | ||||
| An object of class \code{geposan_method}. | ||||
| } | ||||
| \description{ | ||||
| The result will be cached and can be reused for different presets, because | ||||
| it is independent of the reference genes in use. | ||||
| } | ||||
| \seealso{ | ||||
| \code{\link[=clusteriness]{clusteriness()}} | ||||
| } | ||||
							
								
								
									
										16
									
								
								man/correlation.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								man/correlation.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,16 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/correlation.R | ||||
| \name{correlation} | ||||
| \alias{correlation} | ||||
| \title{Compute the mean correlation coefficient comparing gene distances with a set | ||||
| of reference genes.} | ||||
| \usage{ | ||||
| correlation() | ||||
| } | ||||
| \value{ | ||||
| An object of class \code{geposan_method}. | ||||
| } | ||||
| \description{ | ||||
| Compute the mean correlation coefficient comparing gene distances with a set | ||||
| of reference genes. | ||||
| } | ||||
							
								
								
									
										26
									
								
								man/method.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								man/method.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,26 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/method.R | ||||
| \name{method} | ||||
| \alias{method} | ||||
| \title{Describe a new method for analyzing gene position data.} | ||||
| \usage{ | ||||
| method(id, name, description, func) | ||||
| } | ||||
| \arguments{ | ||||
| \item{id}{Unique identifier for the method.} | ||||
| 
 | ||||
| \item{name}{Human readable name.} | ||||
| 
 | ||||
| \item{description}{Slightly longer description.} | ||||
| 
 | ||||
| \item{func}{Function to apply the method. The function should accept two | ||||
| parameters: an object of class \code{geposan_preset} as input and a function to | ||||
| report progress information to as a numeric value. The return value should | ||||
| be an object of class \code{geposan_result}.} | ||||
| } | ||||
| \value{ | ||||
| An object of class \code{geposan_method}. | ||||
| } | ||||
| \description{ | ||||
| Describe a new method for analyzing gene position data. | ||||
| } | ||||
							
								
								
									
										24
									
								
								man/neural.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								man/neural.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,24 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/neural.R | ||||
| \name{neural} | ||||
| \alias{neural} | ||||
| \title{Find genes by training and applying a neural network.} | ||||
| \usage{ | ||||
| neural(seed = 180199, n_models = 5) | ||||
| } | ||||
| \arguments{ | ||||
| \item{seed}{The seed will be used to make the results reproducible.} | ||||
| 
 | ||||
| \item{n_models}{This number specifies how many sets of training data should | ||||
| be created. For each set, there will be a model trained on the remaining | ||||
| training data and validated using this set. For non-training genes, the | ||||
| final score will be the mean of the result of applying the different | ||||
| models. There should be at least two training sets. The analysis will only | ||||
| work if there is at least one reference gene per training set.} | ||||
| } | ||||
| \value{ | ||||
| An object of class \code{geposan_method}. | ||||
| } | ||||
| \description{ | ||||
| Find genes by training and applying a neural network. | ||||
| } | ||||
|  | @ -5,24 +5,20 @@ | |||
| \title{Create a new preset.} | ||||
| \usage{ | ||||
| preset( | ||||
|   methods = c("clusteriness", "correlation", "neural", "adjacency", "proximity"), | ||||
|   species_ids = NULL, | ||||
|   gene_ids = NULL, | ||||
|   reference_gene_ids = NULL, | ||||
|   optimization_target = "mean_rank" | ||||
|   methods = all_methods(), | ||||
|   species_ids = geposan::species$id, | ||||
|   gene_ids = geposan::genes$id, | ||||
|   reference_gene_ids | ||||
| ) | ||||
| } | ||||
| \arguments{ | ||||
| \item{methods}{Methods to apply.} | ||||
| \item{methods}{List of methods to apply.} | ||||
| 
 | ||||
| \item{species_ids}{IDs of species to include.} | ||||
| 
 | ||||
| \item{gene_ids}{IDs of genes to screen.} | ||||
| 
 | ||||
| \item{reference_gene_ids}{IDs of reference genes to compare to.} | ||||
| 
 | ||||
| \item{optimization_target}{Parameter of the reference genes that the ranking | ||||
| should be optimized for.} | ||||
| } | ||||
| \value{ | ||||
| The preset to use with \code{\link[=analyze]{analyze()}}. | ||||
|  | @ -33,25 +29,7 @@ analysis. Note that the genes to process should normally include the | |||
| reference genes to be able to assess the results later. The genes will be | ||||
| filtered based on how many species have data for them. Genes which only have | ||||
| orthologs for less than 25\% of the input species will be excluded from the | ||||
| preset and the analysis. | ||||
| } | ||||
| \details{ | ||||
| Available methods are: | ||||
| \itemize{ | ||||
| \item \code{clusteriness} How much the gene distances to the nearest telomere | ||||
| cluster across species. | ||||
| \item \code{correlation} The mean correlation of gene distances to the nearest | ||||
| telomere across species. | ||||
| \item \code{neural} Assessment by neural network trained on the reference genes. | ||||
| \item \code{adjacency} Proximity to reference genes. | ||||
| \item \code{proximity} Mean proximity to telomeres. | ||||
| } | ||||
| 
 | ||||
| Available optimization targets are: | ||||
| \itemize{ | ||||
| \item \code{mean} Mean rank of the reference genes. | ||||
| \item \code{median} Median rank of the reference genes. | ||||
| \item \code{max} First rank of the reference genes. | ||||
| \item \code{min} Last rank of the reference genes. | ||||
| } | ||||
| preset and the analysis. See the different method functions for the available | ||||
| methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and | ||||
| \code{\link[=proximity]{proximity()}}. | ||||
| } | ||||
|  |  | |||
							
								
								
									
										19
									
								
								man/print.geposan_analysis.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								man/print.geposan_analysis.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/analyze.R | ||||
| \name{print.geposan_analysis} | ||||
| \alias{print.geposan_analysis} | ||||
| \title{Print an analysis object.} | ||||
| \usage{ | ||||
| \method{print}{geposan_analysis}(x, ...) | ||||
| } | ||||
| \arguments{ | ||||
| \item{x}{The analysis to print.} | ||||
| 
 | ||||
| \item{...}{Other parameters.} | ||||
| } | ||||
| \description{ | ||||
| Print an analysis object. | ||||
| } | ||||
| \seealso{ | ||||
| \code{\link[=analyze]{analyze()}} | ||||
| } | ||||
							
								
								
									
										19
									
								
								man/print.geposan_method.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								man/print.geposan_method.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/method.R | ||||
| \name{print.geposan_method} | ||||
| \alias{print.geposan_method} | ||||
| \title{Print a method object.} | ||||
| \usage{ | ||||
| \method{print}{geposan_method}(x, ...) | ||||
| } | ||||
| \arguments{ | ||||
| \item{x}{The method to print.} | ||||
| 
 | ||||
| \item{...}{Other parameters.} | ||||
| } | ||||
| \description{ | ||||
| Print a method object. | ||||
| } | ||||
| \seealso{ | ||||
| \code{\link[=method]{method()}} | ||||
| } | ||||
							
								
								
									
										19
									
								
								man/print.geposan_result.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								man/print.geposan_result.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/result.R | ||||
| \name{print.geposan_result} | ||||
| \alias{print.geposan_result} | ||||
| \title{Print a result object.} | ||||
| \usage{ | ||||
| \method{print}{geposan_result}(x, ...) | ||||
| } | ||||
| \arguments{ | ||||
| \item{x}{The result to print.} | ||||
| 
 | ||||
| \item{...}{Other parameters.} | ||||
| } | ||||
| \description{ | ||||
| Print a result object. | ||||
| } | ||||
| \seealso{ | ||||
| \code{\link[=result]{result()}} | ||||
| } | ||||
							
								
								
									
										15
									
								
								man/proximity.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								man/proximity.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/proximity.R | ||||
| \name{proximity} | ||||
| \alias{proximity} | ||||
| \title{Score the mean distance of genes to the telomeres across species.} | ||||
| \usage{ | ||||
| proximity() | ||||
| } | ||||
| \value{ | ||||
| An object of class \code{geposan_method}. | ||||
| } | ||||
| \description{ | ||||
| A score will be given to each gene such that 0.0 corresponds to the maximal | ||||
| mean distance across all genes and 1.0 corresponds to a distance of 0. | ||||
| } | ||||
							
								
								
									
										23
									
								
								man/result.Rd
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								man/result.Rd
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,23 @@ | |||
| % Generated by roxygen2: do not edit by hand | ||||
| % Please edit documentation in R/result.R | ||||
| \name{result} | ||||
| \alias{result} | ||||
| \title{Result of applying a method on gene position data.} | ||||
| \usage{ | ||||
| result(method_id, scores, details = list()) | ||||
| } | ||||
| \arguments{ | ||||
| \item{method_id}{ID of the method that produced this result.} | ||||
| 
 | ||||
| \item{scores}{A \code{data.frame} mapping gene IDs (\code{gene}) to computed scores | ||||
| between 0.0 and 1.0 (\code{score}).} | ||||
| 
 | ||||
| \item{details}{Optional details that may contain intermediate results as | ||||
| well as other information on the method application.} | ||||
| } | ||||
| \value{ | ||||
| An object of class \code{geposan_result}. | ||||
| } | ||||
| \description{ | ||||
| Result of applying a method on gene position data. | ||||
| } | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue