mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-26 10:47:25 +01:00 
			
		
		
		
	Handle caching
This commit is contained in:
		
							parent
							
								
									b8365e0efb
								
							
						
					
					
						commit
						df6e23d219
					
				
					 7 changed files with 247 additions and 191 deletions
				
			
		
							
								
								
									
										46
									
								
								R/analyze.R
									
										
									
									
									
								
							
							
						
						
									
										46
									
								
								R/analyze.R
									
										
									
									
									
								
							|  | @ -59,30 +59,32 @@ analyze <- function(preset, progress = NULL) { | |||
|         "neural" = neural | ||||
|     ) | ||||
| 
 | ||||
|     total_progress <- 0.0 | ||||
|     method_count <- length(preset$method_ids) | ||||
|     results <- data.table(gene = preset$gene_ids) | ||||
|     cached("results", preset, { | ||||
|         total_progress <- 0.0 | ||||
|         method_count <- length(preset$method_ids) | ||||
|         results <- data.table(gene = preset$gene_ids) | ||||
| 
 | ||||
|     for (method_id in preset$method_ids) { | ||||
|         method_progress <- if (!is.null(progress)) function(p) { | ||||
|             progress(total_progress + p / method_count) | ||||
|         for (method_id in preset$method_ids) { | ||||
|             method_progress <- if (!is.null(progress)) function(p) { | ||||
|                 progress(total_progress + p / method_count) | ||||
|             } | ||||
| 
 | ||||
|             method_results <- methods[[method_id]](preset, method_progress) | ||||
|             setnames(method_results, "score", method_id) | ||||
| 
 | ||||
|             results <- merge( | ||||
|                 results, | ||||
|                 method_results, | ||||
|                 by = "gene" | ||||
|             ) | ||||
| 
 | ||||
|             total_progress <- total_progress + 1 / method_count | ||||
|         } | ||||
| 
 | ||||
|         method_results <- methods[[method_id]](preset, method_progress) | ||||
|         setnames(method_results, "score", method_id) | ||||
|         if (!is.null(progress)) { | ||||
|             progress(1.0) | ||||
|         } | ||||
| 
 | ||||
|         results <- merge( | ||||
|             results, | ||||
|             method_results, | ||||
|             by = "gene" | ||||
|         ) | ||||
| 
 | ||||
|         total_progress <- total_progress + 1 / method_count | ||||
|     } | ||||
| 
 | ||||
|     if (!is.null(progress)) { | ||||
|         progress(1.0) | ||||
|     } | ||||
| 
 | ||||
|     results | ||||
|         results | ||||
|     }) | ||||
| } | ||||
|  |  | |||
|  | @ -37,28 +37,33 @@ clusteriness_priv <- function(data, height = 1000000) { | |||
| 
 | ||||
| # Process genes clustering their distance to telomeres. | ||||
| clusteriness <- function(preset, progress = NULL) { | ||||
|     results <- data.table(gene = preset$gene_ids) | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
| 
 | ||||
|     # Prefilter the input data by species. | ||||
|     distances <- geposan::distances[species %chin% preset$species_ids] | ||||
|     cached("clusteriness", c(species_ids, gene_ids), { | ||||
|         results <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|     # Add an index for quickly accessing data per gene. | ||||
|     setkey(distances, gene) | ||||
|         # Prefilter the input data by species. | ||||
|         distances <- geposan::distances[species %chin% species_ids] | ||||
| 
 | ||||
|     genes_done <- 0 | ||||
|     genes_total <- length(preset$gene_ids) | ||||
|         # Add an index for quickly accessing data per gene. | ||||
|         setkey(distances, gene) | ||||
| 
 | ||||
|     # Perform the cluster analysis for one gene. | ||||
|     compute <- function(gene_id) { | ||||
|         score <- clusteriness_priv(distances[gene_id, distance]) | ||||
|         genes_done <- 0 | ||||
|         genes_total <- length(gene_ids) | ||||
| 
 | ||||
|         if (!is.null(progress)) { | ||||
|             genes_done <<- genes_done + 1 | ||||
|             progress(genes_done / genes_total) | ||||
|         # Perform the cluster analysis for one gene. | ||||
|         compute <- function(gene_id) { | ||||
|             score <- clusteriness_priv(distances[gene_id, distance]) | ||||
| 
 | ||||
|             if (!is.null(progress)) { | ||||
|                 genes_done <<- genes_done + 1 | ||||
|                 progress(genes_done / genes_total) | ||||
|             } | ||||
| 
 | ||||
|             score | ||||
|         } | ||||
| 
 | ||||
|         score | ||||
|     } | ||||
| 
 | ||||
|     results[, score := compute(gene), by = 1:nrow(results)] | ||||
|         results[, score := compute(gene), by = 1:nrow(results)] | ||||
|     }) | ||||
| } | ||||
|  |  | |||
							
								
								
									
										120
									
								
								R/correlation.R
									
										
									
									
									
								
							
							
						
						
									
										120
									
								
								R/correlation.R
									
										
									
									
									
								
							|  | @ -5,69 +5,75 @@ correlation <- function(preset, progress = NULL) { | |||
|     gene_ids <- preset$gene_ids | ||||
|     reference_gene_ids <- preset$reference_gene_ids | ||||
| 
 | ||||
|     # Prefilter distances by species. | ||||
|     distances <- geposan::distances[species %chin% species_ids] | ||||
|     cached("correlation", c(species_ids, gene_ids, reference_gene_ids), { | ||||
|         # Prefilter distances by species. | ||||
|         distances <- geposan::distances[species %chin% species_ids] | ||||
| 
 | ||||
|     # Tranform data to get species as rows and genes as columns. We construct | ||||
|     # columns per species, because it requires fewer iterations, and transpose | ||||
|     # the table afterwards. | ||||
|         # Tranform data to get species as rows and genes as columns. We | ||||
|         # construct columns per species, because it requires fewer iterations, | ||||
|         # and transpose the table afterwards. | ||||
| 
 | ||||
|     data <- data.table(gene = gene_ids) | ||||
|         data <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|     # Make a column containing distance data for each species. | ||||
|     for (species_id in species_ids) { | ||||
|         species_distances <- distances[species == species_id, .(gene, distance)] | ||||
|         data <- merge(data, species_distances, all.x = TRUE) | ||||
|         setnames(data, "distance", species_id) | ||||
|     } | ||||
|         # Make a column containing distance data for each species. | ||||
|         for (species_id in species_ids) { | ||||
|             species_distances <- distances[ | ||||
|                 species == species_id, | ||||
|                 .(gene, distance) | ||||
|             ] | ||||
| 
 | ||||
|     # Transpose to the desired format. | ||||
|     data <- transpose(data, make.names = "gene") | ||||
| 
 | ||||
|     if (!is.null(progress)) progress(0.33) | ||||
| 
 | ||||
|     # Take the reference data. | ||||
|     reference_data <- data[, ..reference_gene_ids] | ||||
| 
 | ||||
|     # Perform the correlation between all possible pairs. | ||||
|     results <- stats::cor( | ||||
|         data[, ..gene_ids], | ||||
|         reference_data, | ||||
|         use = "pairwise.complete.obs", | ||||
|         method = "spearman" | ||||
|     ) | ||||
| 
 | ||||
|     results <- data.table(results, keep.rownames = TRUE) | ||||
|     setnames(results, "rn", "gene") | ||||
| 
 | ||||
|     # Remove correlations between the reference genes themselves. | ||||
|     for (reference_gene_id in reference_gene_ids) { | ||||
|         column <- quote(reference_gene_id) | ||||
|         results[gene == reference_gene_id, eval(column) := NA] | ||||
|     } | ||||
| 
 | ||||
|     if (!is.null(progress)) progress(0.66) | ||||
| 
 | ||||
|     # Compute the final score as the mean of known correlation scores. Negative | ||||
|     # correlations will correctly lessen the score, which will be clamped to | ||||
|     # zero as its lower bound. Genes with no possible correlations at all will | ||||
|     # be assumed to have a score of 0.0. | ||||
| 
 | ||||
|     compute_score <- function(scores) { | ||||
|         score <- mean(scores, na.rm = TRUE) | ||||
| 
 | ||||
|         if (is.na(score) | score < 0.0) { | ||||
|             score <- 0.0 | ||||
|             data <- merge(data, species_distances, all.x = TRUE) | ||||
|             setnames(data, "distance", species_id) | ||||
|         } | ||||
| 
 | ||||
|         score | ||||
|     } | ||||
|         # Transpose to the desired format. | ||||
|         data <- transpose(data, make.names = "gene") | ||||
| 
 | ||||
|     results[, | ||||
|         score := compute_score(as.matrix(.SD)), | ||||
|         .SDcols = reference_gene_ids, | ||||
|         by = gene | ||||
|     ] | ||||
|         if (!is.null(progress)) progress(0.33) | ||||
| 
 | ||||
|     results[, .(gene, score)] | ||||
|         # Take the reference data. | ||||
|         reference_data <- data[, ..reference_gene_ids] | ||||
| 
 | ||||
|         # Perform the correlation between all possible pairs. | ||||
|         results <- stats::cor( | ||||
|             data[, ..gene_ids], | ||||
|             reference_data, | ||||
|             use = "pairwise.complete.obs", | ||||
|             method = "spearman" | ||||
|         ) | ||||
| 
 | ||||
|         results <- data.table(results, keep.rownames = TRUE) | ||||
|         setnames(results, "rn", "gene") | ||||
| 
 | ||||
|         # Remove correlations between the reference genes themselves. | ||||
|         for (reference_gene_id in reference_gene_ids) { | ||||
|             column <- quote(reference_gene_id) | ||||
|             results[gene == reference_gene_id, eval(column) := NA] | ||||
|         } | ||||
| 
 | ||||
|         if (!is.null(progress)) progress(0.66) | ||||
| 
 | ||||
|         # Compute the final score as the mean of known correlation scores. | ||||
|         # Negative correlations will correctly lessen the score, which will be | ||||
|         # clamped to zero as its lower bound. Genes with no possible | ||||
|         # correlations at all will be assumed to have a score of 0.0. | ||||
| 
 | ||||
|         compute_score <- function(scores) { | ||||
|             score <- mean(scores, na.rm = TRUE) | ||||
| 
 | ||||
|             if (is.na(score) | score < 0.0) { | ||||
|                 score <- 0.0 | ||||
|             } | ||||
| 
 | ||||
|             score | ||||
|         } | ||||
| 
 | ||||
|         results[, | ||||
|             score := compute_score(as.matrix(.SD)), | ||||
|             .SDcols = reference_gene_ids, | ||||
|             by = gene | ||||
|         ] | ||||
| 
 | ||||
|         results[, .(gene, score)] | ||||
|     }) | ||||
| } | ||||
|  |  | |||
							
								
								
									
										164
									
								
								R/neural.R
									
										
									
									
									
								
							
							
						
						
									
										164
									
								
								R/neural.R
									
										
									
									
									
								
							|  | @ -3,106 +3,112 @@ | |||
| # @param seed A seed to get reproducible results. | ||||
| neural <- function(preset, progress = NULL, seed = 448077) { | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
|     reference_gene_ids <- preset$reference_gene_ids | ||||
| 
 | ||||
|     set.seed(seed) | ||||
|     gene_count <- length(preset$gene_ids) | ||||
|     cached("neural", c(species_ids, gene_ids, reference_gene_ids), { | ||||
|         set.seed(seed) | ||||
|         gene_count <- length(gene_ids) | ||||
| 
 | ||||
|     # Prefilter distances by species. | ||||
|     distances <- geposan::distances[species %chin% species_ids] | ||||
|         # Prefilter distances by species. | ||||
|         distances <- geposan::distances[species %chin% species_ids] | ||||
| 
 | ||||
|     # Input data for the network. This contains the gene ID as an identifier | ||||
|     # as well as the per-species gene distances as input variables. | ||||
|     data <- data.table(gene = preset$gene_ids) | ||||
|         # Input data for the network. This contains the gene ID as an identifier | ||||
|         # as well as the per-species gene distances as input variables. | ||||
|         data <- data.table(gene = gene_ids) | ||||
| 
 | ||||
|     # Buffer to keep track of species included in the computation. Species | ||||
|     # from `species_ids` may be excluded if they don't have enough data. | ||||
|     species_ids_included <- NULL | ||||
|         # Buffer to keep track of species included in the computation. Species | ||||
|         # from `species_ids` may be excluded if they don't have enough data. | ||||
|         species_ids_included <- NULL | ||||
| 
 | ||||
|     # Make a column containing distance data for each species. | ||||
|     for (species_id in species_ids) { | ||||
|         species_distances <- distances[species == species_id, .(gene, distance)] | ||||
|         # Make a column containing distance data for each species. | ||||
|         for (species_id in species_ids) { | ||||
|             species_distances <- distances[ | ||||
|                 species == species_id, | ||||
|                 .(gene, distance) | ||||
|             ] | ||||
| 
 | ||||
|         # Only include species with at least 25% known values. | ||||
|             # Only include species with at least 25% known values. | ||||
| 
 | ||||
|         species_distances <- stats::na.omit(species_distances) | ||||
|             species_distances <- stats::na.omit(species_distances) | ||||
| 
 | ||||
|         if (nrow(species_distances) >= 0.25 * gene_count) { | ||||
|             species_ids_included <- c(species_ids_included, species_id) | ||||
|             data <- merge(data, species_distances, all.x = TRUE) | ||||
|             if (nrow(species_distances) >= 0.25 * gene_count) { | ||||
|                 species_ids_included <- c(species_ids_included, species_id) | ||||
|                 data <- merge(data, species_distances, all.x = TRUE) | ||||
| 
 | ||||
|             # Replace missing data with mean values. The neural network can't | ||||
|             # handle NAs in a meaningful way. Choosing extreme values here | ||||
|             # would result in heavily biased results. Therefore, the mean value | ||||
|             # is chosen as a compromise. However, this will of course lessen the | ||||
|             # significance of the results. | ||||
|                 # Replace missing data with mean values. The neural network | ||||
|                 # can't handle NAs in a meaningful way. Choosing extreme values | ||||
|                 # here would result in heavily biased results. Therefore, the | ||||
|                 # mean value is chosen as a compromise. However, this will of | ||||
|                 # course lessen the significance of the results. | ||||
| 
 | ||||
|             mean_distance <- round(species_distances[, mean(distance)]) | ||||
|             data[is.na(distance), distance := mean_distance] | ||||
|                 mean_distance <- round(species_distances[, mean(distance)]) | ||||
|                 data[is.na(distance), distance := mean_distance] | ||||
| 
 | ||||
|             # Name the new column after the species. | ||||
|             setnames(data, "distance", species_id) | ||||
|                 # Name the new column after the species. | ||||
|                 setnames(data, "distance", species_id) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     # Extract the reference genes. | ||||
|         # Extract the reference genes. | ||||
| 
 | ||||
|     reference_data <- data[gene %chin% reference_gene_ids] | ||||
|     reference_data[, neural := 1.0] | ||||
|         reference_data <- data[gene %chin% reference_gene_ids] | ||||
|         reference_data[, neural := 1.0] | ||||
| 
 | ||||
|     # Take out random samples from the remaining genes. This is another | ||||
|     # compromise with a negative impact on significance. Because there is no | ||||
|     # information on genes with are explicitely *not* TPE-OLD genes, we have to | ||||
|     # assume that a random sample of genes has a low probability of including | ||||
|     # TPE-OLD genes. | ||||
|         # Take out random samples from the remaining genes. This is another | ||||
|         # compromise with a negative impact on significance. Because there is | ||||
|         # no information on genes with are explicitely *not* TPE-OLD genes, we | ||||
|         # have to assume that a random sample of genes has a low probability of | ||||
|         # including TPE-OLD genes. | ||||
| 
 | ||||
|     without_reference_data <- data[!gene %chin% reference_gene_ids] | ||||
|         without_reference_data <- data[!gene %chin% reference_gene_ids] | ||||
| 
 | ||||
|     reference_samples <- without_reference_data[ | ||||
|         sample( | ||||
|             nrow(without_reference_data), | ||||
|             nrow(reference_data) | ||||
|         reference_samples <- without_reference_data[ | ||||
|             sample( | ||||
|                 nrow(without_reference_data), | ||||
|                 nrow(reference_data) | ||||
|             ) | ||||
|         ] | ||||
| 
 | ||||
|         reference_samples[, neural := 0.0] | ||||
| 
 | ||||
|         # Merge training data. The training data includes all reference genes as | ||||
|         # well as an equal number of random sample genes. | ||||
|         training_data <- rbindlist(list(reference_data, reference_samples)) | ||||
| 
 | ||||
|         # Construct and train the neural network. | ||||
| 
 | ||||
|         nn_formula <- stats::as.formula(sprintf( | ||||
|             "neural~%s", | ||||
|             paste(species_ids_included, collapse = "+") | ||||
|         )) | ||||
| 
 | ||||
|         layer1 <- length(species_ids) * 0.66 | ||||
|         layer2 <- layer1 * 0.66 | ||||
|         layer3 <- layer2 * 0.66 | ||||
| 
 | ||||
|         nn <- neuralnet::neuralnet( | ||||
|             nn_formula, | ||||
|             training_data, | ||||
|             hidden = c(layer1, layer2, layer3), | ||||
|             linear.output = FALSE | ||||
|         ) | ||||
|     ] | ||||
| 
 | ||||
|     reference_samples[, neural := 0.0] | ||||
|         if (!is.null(progress)) { | ||||
|             # We do everything in one go, so it's not possible to report | ||||
|             # detailed progress information. As the method is relatively quick, | ||||
|             # this should not be a problem. | ||||
|             progress(0.5) | ||||
|         } | ||||
| 
 | ||||
|     # Merge training data. The training data includes all reference genes as | ||||
|     # well as an equal number of random sample genes. | ||||
|     training_data <- rbindlist(list(reference_data, reference_samples)) | ||||
|         # Apply the neural network. | ||||
|         data[, score := neuralnet::compute(nn, data)$net.result] | ||||
| 
 | ||||
|     # Construct and train the neural network. | ||||
|         if (!is.null(progress)) { | ||||
|             # See above. | ||||
|             progress(1.0) | ||||
|         } | ||||
| 
 | ||||
|     nn_formula <- stats::as.formula(sprintf( | ||||
|         "neural~%s", | ||||
|         paste(species_ids_included, collapse = "+") | ||||
|     )) | ||||
| 
 | ||||
|     layer1 <- length(species_ids) * 0.66 | ||||
|     layer2 <- layer1 * 0.66 | ||||
|     layer3 <- layer2 * 0.66 | ||||
| 
 | ||||
|     nn <- neuralnet::neuralnet( | ||||
|         nn_formula, | ||||
|         training_data, | ||||
|         hidden = c(layer1, layer2, layer3), | ||||
|         linear.output = FALSE | ||||
|     ) | ||||
| 
 | ||||
|     if (!is.null(progress)) { | ||||
|         # We do everything in one go, so it's not possible to report detailed | ||||
|         # progress information. As the method is relatively quick, this should | ||||
|         # not be a problem. | ||||
|         progress(0.5) | ||||
|     } | ||||
| 
 | ||||
|     # Apply the neural network. | ||||
|     data[, score := neuralnet::compute(nn, data)$net.result] | ||||
| 
 | ||||
|     if (!is.null(progress)) { | ||||
|         # See above. | ||||
|         progress(1.0) | ||||
|     } | ||||
| 
 | ||||
|     data[, .(gene, score)] | ||||
|         data[, .(gene, score)] | ||||
|     }) | ||||
| } | ||||
|  |  | |||
|  | @ -3,23 +3,28 @@ | |||
| # A score will be given to each gene such that 0.0 corresponds to the maximal | ||||
| # mean distance across all genes and 1.0 corresponds to a distance of 0. | ||||
| proximity <- function(preset, progress = NULL) { | ||||
|     # Prefilter distances by species and gene. | ||||
|     distances <- geposan::distances[ | ||||
|         species %chin% preset$species_ids & gene %chin% preset$gene_ids | ||||
|     ] | ||||
|     species_ids <- preset$species_ids | ||||
|     gene_ids <- preset$gene_ids | ||||
| 
 | ||||
|     # Compute the score as described above. | ||||
|     cached("proximity", c(species_ids, gene_ids), { | ||||
|         # Prefilter distances by species and gene. | ||||
|         distances <- geposan::distances[ | ||||
|             species %chin% preset$species_ids & gene %chin% preset$gene_ids | ||||
|         ] | ||||
| 
 | ||||
|     distances <- distances[, .(mean_distance = mean(distance)), by = "gene"] | ||||
|     max_distance <- distances[, max(mean_distance)] | ||||
|     distances[, score := 1 - mean_distance / max_distance] | ||||
|         # Compute the score as described above. | ||||
| 
 | ||||
|     if (!is.null(progress)) { | ||||
|         # We do everything in one go, so it's not possible to report detailed | ||||
|         # progress information. As the method is relatively quick, this should | ||||
|         # not be a problem. | ||||
|         progress(1.0) | ||||
|     } | ||||
|         distances <- distances[, .(mean_distance = mean(distance)), by = "gene"] | ||||
|         max_distance <- distances[, max(mean_distance)] | ||||
|         distances[, score := 1 - mean_distance / max_distance] | ||||
| 
 | ||||
|     distances[, .(gene, score)] | ||||
|         if (!is.null(progress)) { | ||||
|             # We do everything in one go, so it's not possible to report | ||||
|             # detailed progress information. As the method is relatively quick, | ||||
|             # this should not be a problem. | ||||
|             progress(1.0) | ||||
|         } | ||||
| 
 | ||||
|         distances[, .(gene, score)] | ||||
|     }) | ||||
| } | ||||
|  |  | |||
							
								
								
									
										31
									
								
								R/utils.R
									
										
									
									
									
								
							
							
						
						
									
										31
									
								
								R/utils.R
									
										
									
									
									
								
							|  | @ -1,3 +1,34 @@ | |||
| # Cache the value of an expression on the file system. | ||||
| # | ||||
| # The expression will be evaluated if there is no matching cache file found. | ||||
| # The cache files will be located in a directory "cache" located in the current | ||||
| # working directory. | ||||
| # | ||||
| # @param name Human readable part of the cache file name. | ||||
| # @param objects A vector of objects that this expression depends on. The hash | ||||
| #   of those objects will be used for identifying the cache file. | ||||
| cached <- function(name, objects, expr) { | ||||
|     if (!dir.exists("cache")) { | ||||
|         dir.create("cache") | ||||
|     } | ||||
| 
 | ||||
|     id <- rlang::hash(objects) | ||||
|     cache_file <- sprintf("cache/%s_%s.rda", name, id) | ||||
| 
 | ||||
|     if (file.exists(cache_file)) { | ||||
|         # If the cache file exists, we restore the data from it. | ||||
|         load(cache_file) | ||||
|     } else { | ||||
|         # If the cache file doesn't exist, we have to do the computation. | ||||
|         data <- expr | ||||
| 
 | ||||
|         # The results are cached for the next run. | ||||
|         save(data, file = cache_file, compress = "xz") | ||||
|     } | ||||
| 
 | ||||
|     data | ||||
| } | ||||
| 
 | ||||
| # This is needed to make data.table's symbols available within the package. | ||||
| #' @import data.table | ||||
| NULL | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue