| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #' Create a new preset. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' A preset is used to specify which methods and inputs should be used for an | 
					
						
							|  |  |  | #' analysis. Note that the genes to process should normally include the | 
					
						
							| 
									
										
										
										
											2021-11-18 12:30:19 +01:00
										 |  |  | #' reference genes to be able to assess the results later. The genes will be | 
					
						
							|  |  |  | #' filtered based on how many species have data for them. Genes which only have | 
					
						
							|  |  |  | #' orthologs for less than 25% of the input species will be excluded from the | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | #' preset and the analyis. See the different method functions for the available | 
					
						
							|  |  |  | #' methods: [clustering()], [correlation()], [neural()], [adjacency()] and | 
					
						
							| 
									
										
										
										
											2022-02-24 14:37:33 +01:00
										 |  |  | #' [species_adjacency()]. | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #' | 
					
						
							| 
									
										
										
										
											2022-01-17 19:52:51 +01:00
										 |  |  | #' @param reference_gene_ids IDs of reference genes to compare to. | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  | #' @param methods List of methods to apply. | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #' @param species_ids IDs of species to include. | 
					
						
							|  |  |  | #' @param gene_ids IDs of genes to screen. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @return The preset to use with [analyze()]. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @export | 
					
						
							| 
									
										
										
										
											2022-01-17 19:52:51 +01:00
										 |  |  | preset <- function(reference_gene_ids, | 
					
						
							|  |  |  |                    methods = all_methods(), | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  |                    species_ids = geposan::species$id, | 
					
						
							| 
									
										
										
										
											2022-01-17 19:52:51 +01:00
										 |  |  |                    gene_ids = geposan::genes$id) { | 
					
						
							| 
									
										
										
										
											2021-11-18 12:30:19 +01:00
										 |  |  |     # Count included species per gene. | 
					
						
							|  |  |  |     genes_n_species <- geposan::distances[ | 
					
						
							|  |  |  |         species %chin% species_ids, | 
					
						
							|  |  |  |         .(n_species = .N), | 
					
						
							|  |  |  |         by = "gene" | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Filter out genes with less than 25% existing orthologs. | 
					
						
							|  |  |  |     gene_ids_filtered <- genes_n_species[ | 
					
						
							| 
									
										
										
										
											2021-11-26 11:41:49 +01:00
										 |  |  |         gene %chin% gene_ids & | 
					
						
							|  |  |  |             n_species >= 0.25 * length(species_ids), | 
					
						
							| 
									
										
										
										
											2021-11-18 12:30:19 +01:00
										 |  |  |         gene | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-17 20:22:50 +01:00
										 |  |  |     reference_gene_ids_excluded <- reference_gene_ids[ | 
					
						
							|  |  |  |         !reference_gene_ids %chin% gene_ids_filtered | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (length(reference_gene_ids_excluded > 0)) { | 
					
						
							|  |  |  |         warning(paste0( | 
					
						
							|  |  |  |             "The following reference gene IDs are excluded from the preset ", | 
					
						
							|  |  |  |             "because they don't have enough data: ", | 
					
						
							|  |  |  |             paste(reference_gene_ids_excluded, collapse = ", ") | 
					
						
							|  |  |  |         )) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     reference_gene_ids_included <- reference_gene_ids[ | 
					
						
							|  |  |  |         reference_gene_ids %chin% gene_ids_filtered | 
					
						
							|  |  |  |     ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (length(reference_gene_ids_included) < 1) { | 
					
						
							|  |  |  |         stop(paste0( | 
					
						
							|  |  |  |             "There has to be at least one reference gene for the preset to be ", | 
					
						
							|  |  |  |             "valid. Please note that some methods may require more reference ", | 
					
						
							|  |  |  |             "genes." | 
					
						
							|  |  |  |         )) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  |     # The included data gets sorted to be able to produce predictable hashes | 
					
						
							|  |  |  |     # for the object later. | 
					
						
							|  |  |  |     structure( | 
					
						
							|  |  |  |         list( | 
					
						
							| 
									
										
										
										
											2022-01-17 20:22:50 +01:00
										 |  |  |             reference_gene_ids = sort(reference_gene_ids_included), | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  |             methods = methods, | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  |             species_ids = sort(species_ids), | 
					
						
							| 
									
										
										
										
											2022-01-17 19:52:51 +01:00
										 |  |  |             gene_ids = sort(gene_ids_filtered) | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  |         ), | 
					
						
							|  |  |  |         class = "geposan_preset" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' S3 method to print a preset object. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-11-06 13:22:57 +01:00
										 |  |  | #' @param x The preset to print. | 
					
						
							|  |  |  | #' @param ... Other parameters. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #' @seealso [preset()] | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @export | 
					
						
							| 
									
										
										
										
											2021-11-06 13:22:57 +01:00
										 |  |  | print.geposan_preset <- function(x, ...) { | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  |     cat(sprintf( | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  |         paste0( | 
					
						
							|  |  |  |             "geposan preset:", | 
					
						
							| 
									
										
										
										
											2022-01-17 19:52:51 +01:00
										 |  |  |             "\n  Reference genes: %i", | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  |             "\n  Included methods: %s", | 
					
						
							|  |  |  |             "\n  Number of species: %i", | 
					
						
							|  |  |  |             "\n  Number of genes: %i", | 
					
						
							|  |  |  |             "\n" | 
					
						
							|  |  |  |         ), | 
					
						
							| 
									
										
										
										
											2022-01-17 19:52:51 +01:00
										 |  |  |         length(x$reference_gene_ids), | 
					
						
							| 
									
										
										
										
											2021-12-16 13:01:44 +01:00
										 |  |  |         paste(sapply(x$methods, function(m) m$id), collapse = ", "), | 
					
						
							| 
									
										
										
										
											2021-11-06 13:22:57 +01:00
										 |  |  |         length(x$species_ids), | 
					
						
							| 
									
										
										
										
											2022-01-17 19:52:51 +01:00
										 |  |  |         length(x$gene_ids) | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  |     )) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-06 13:22:57 +01:00
										 |  |  |     invisible(x) | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | } |