| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #' Create a new preset. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' A preset is used to specify which methods and inputs should be used for an | 
					
						
							|  |  |  | #' analysis. Note that the genes to process should normally include the | 
					
						
							| 
									
										
										
										
											2021-11-18 12:30:19 +01:00
										 |  |  | #' reference genes to be able to assess the results later. The genes will be | 
					
						
							|  |  |  | #' filtered based on how many species have data for them. Genes which only have | 
					
						
							|  |  |  | #' orthologs for less than 25% of the input species will be excluded from the | 
					
						
							|  |  |  | #' preset and the analysis. | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #' | 
					
						
							|  |  |  | #' Available methods are: | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-11-18 12:30:19 +01:00
										 |  |  | #'  - `clusteriness` How much the gene distances to the nearest telomere | 
					
						
							|  |  |  | #'    cluster across species. | 
					
						
							|  |  |  | #'  - `clusteriness_positions` The same as `clusteriness` but using absolute | 
					
						
							|  |  |  | #'    gene positions instead of distances. | 
					
						
							|  |  |  | #'  - `correlation` The mean correlation of gene distances to the nearest | 
					
						
							|  |  |  | #'    telomere across species. | 
					
						
							|  |  |  | #'  - `correlation_positions` Correlation using position data. | 
					
						
							|  |  |  | #'  - `neural` Assessment by neural network trained using distances. | 
					
						
							|  |  |  | #'  - `neural_positions` Assessment by neural network trained using absolute | 
					
						
							|  |  |  | #'    position data. | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #'  - `proximity` Mean proximity to telomeres. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @param methods Methods to apply. | 
					
						
							|  |  |  | #' @param species_ids IDs of species to include. | 
					
						
							|  |  |  | #' @param gene_ids IDs of genes to screen. | 
					
						
							|  |  |  | #' @param reference_gene_ids IDs of reference genes to compare to. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @return The preset to use with [analyze()]. | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @export | 
					
						
							preset <- function(methods = c(
                       "clusteriness",
                       "clusteriness_positions",
                       "correlation",
                       "correlation_positions",
                       "neural",
                       "neural_positions",
                       "proximity"
                   ),
                   species_ids = NULL,
                   gene_ids = NULL,
                   reference_gene_ids = NULL) {
    # Builds a `geposan_preset` object: a classed list holding the sorted
    # method names, species IDs, (filtered) gene IDs and reference gene IDs.

    # Count the rows of the distance data per gene, restricted to the
    # requested species. This equals the number of included species per gene
    # as long as the data holds one row per gene and species (assumed here,
    # not checked).
    genes_n_species <- geposan::distances[
        species %chin% species_ids,
        .(n_species = .N),
        by = "gene"
    ]

    # Filter out genes with less than 25% existing orthologs.
    gene_ids_filtered <- genes_n_species[
        n_species >= 0.25 * length(species_ids),
        gene
    ]

    # Only keep the genes that were actually requested. Previously `gene_ids`
    # was never read, so every gene passing the species filter ended up in the
    # preset. `NULL` (the default) keeps meaning "no restriction", which is
    # what callers relying on the default got before.
    if (!is.null(gene_ids)) {
        gene_ids_filtered <- gene_ids_filtered[
            gene_ids_filtered %in% gene_ids
        ]
    }

    # The included data gets sorted to be able to produce predictable hashes
    # for the object later.
    structure(
        list(
            methods = sort(methods),
            species_ids = sort(species_ids),
            gene_ids = sort(gene_ids_filtered),
            reference_gene_ids = sort(reference_gene_ids)
        ),
        class = "geposan_preset"
    )
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #' S3 method to print a preset object. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-11-06 13:22:57 +01:00
										 |  |  | #' @param x The preset to print. | 
					
						
							|  |  |  | #' @param ... Other parameters. | 
					
						
							|  |  |  | #' | 
					
						
							| 
									
										
										
										
											2021-11-03 14:17:39 +01:00
										 |  |  | #' @seealso [preset()] | 
					
						
							|  |  |  | #' | 
					
						
							|  |  |  | #' @export | 
					
						
							| 
									
										
										
										
											2021-11-06 13:22:57 +01:00
										 print.geposan_preset <- function(x, ...) {
    # Sizes of the individual preset components shown in the summary.
    n_species <- length(x$species_ids)
    n_genes <- length(x$gene_ids)
    n_reference_genes <- length(x$reference_gene_ids)

    # Header followed by the comma separated method names.
    cat("geposan preset:\n  Included methods: ")
    cat(x$methods, sep = ", ")

    # Remaining summary lines, emitted in one go.
    summary_text <- sprintf(
        paste0(
            "\n  Input data: %i species, %i genes",
            "\n  Comparison data: %i reference genes\n"
        ),
        n_species,
        n_genes,
        n_reference_genes
    )
    cat(summary_text)

    # Return the preset invisibly, as is customary for print methods.
    invisible(x)
}