mirror of
				https://github.com/johrpan/geposan.git
				synced 2025-10-25 19:37:23 +02:00 
			
		
		
		
	preset: Filter species in addition to genes
This commit is contained in:
		
							parent
							
								
									9e96c54f23
								
							
						
					
					
						commit
						3217c9bd29
					
				
					 4 changed files with 49 additions and 48 deletions
				
			
		|  | @ -7,8 +7,6 @@ | |||
| #'   final score will be the mean of the result of applying the different | ||||
| #'   models. There should be at least two training sets. The analysis will only | ||||
| #'   work if there is at least one reference gene per training set. | ||||
| #' @param gene_requirement Minimum proportion of genes from the preset that a | ||||
| #'   species has to have in order to be included in the models. | ||||
| #' @param control_ratio The proportion of random control genes that is included | ||||
| #'   in the training data sets in addition to the reference genes. This should | ||||
| #'   be a numeric value between 0.0 and 1.0. | ||||
|  | @ -16,10 +14,7 @@ | |||
| #' @return An object of class `geposan_method`. | ||||
| #' | ||||
| #' @export | ||||
| neural <- function(seed = 180199, | ||||
|                        n_models = 5, | ||||
|                        gene_requirement = 0.5, | ||||
|                        control_ratio = 0.5) { | ||||
| neural <- function(seed = 180199, n_models = 5, control_ratio = 0.5) { | ||||
|   method( | ||||
|     id = "neural", | ||||
|     name = "Neural", | ||||
|  | @ -37,7 +32,6 @@ neural <- function(seed = 180199, | |||
|           reference_gene_ids, | ||||
|           seed, | ||||
|           n_models, | ||||
|           gene_requirement, | ||||
|           control_ratio | ||||
|         ), | ||||
|         { # nolint | ||||
|  | @ -57,12 +51,6 @@ neural <- function(seed = 180199, | |||
|           distances <- geposan::distances[species %chin% species_ids & | ||||
|             gene %chin% gene_ids] | ||||
| 
 | ||||
|           # Only include species that have at least 25% of the included genes. | ||||
|           distances[, species_n_genes := .N, by = species] | ||||
|           distances <- distances[species_n_genes >= | ||||
|             gene_requirement * length(gene_ids)] | ||||
|           included_species <- distances[, unique(species)] | ||||
| 
 | ||||
|           # Reshape data to put species into columns. | ||||
|           data <- dcast( | ||||
|             distances, | ||||
|  | @ -72,7 +60,7 @@ neural <- function(seed = 180199, | |||
| 
 | ||||
|           # Replace values that are still missing with mean values for the | ||||
|           # species in question. | ||||
|           data[, (included_species) := lapply(included_species, \(species) { | ||||
|           data[, (species_ids) := lapply(species_ids, \(species) { | ||||
|             species <- get(species) | ||||
|             species[is.na(species)] <- mean(species, na.rm = TRUE) | ||||
|             species | ||||
|  | @ -129,7 +117,7 @@ neural <- function(seed = 180199, | |||
|           # Step 3: Create, train and apply neural network. | ||||
|           # ----------------------------------------------- | ||||
| 
 | ||||
|           data_matrix <- prepare_data(data, included_species) | ||||
|           data_matrix <- prepare_data(data, species_ids) | ||||
|           output_vars <- NULL | ||||
| 
 | ||||
|           for (i in seq_along(networks)) { | ||||
|  | @ -138,14 +126,14 @@ neural <- function(seed = 180199, | |||
|             # Create a new model for each training session, because | ||||
|             # the model would keep its state across training | ||||
|             # sessions otherwise. | ||||
|             model <- create_model(length(included_species)) | ||||
|             model <- create_model(length(species_ids)) | ||||
| 
 | ||||
|             # Train the model. | ||||
|             fit <- train_model( | ||||
|               model, | ||||
|               network$training_data, | ||||
|               network$validation_data, | ||||
|               included_species | ||||
|               species_ids | ||||
|             ) | ||||
| 
 | ||||
|             # Apply the model. | ||||
|  | @ -180,7 +168,7 @@ neural <- function(seed = 180199, | |||
|             details = list( | ||||
|               seed = seed, | ||||
|               n_models = n_models, | ||||
|               all_results = data[, !..included_species], | ||||
|               all_results = data[, !..species_ids], | ||||
|               networks = networks | ||||
|             ) | ||||
|           ) | ||||
|  |  | |||
							
								
								
									
										44
									
								
								R/preset.R
									
										
									
									
									
								
							
							
						
						
									
										44
									
								
								R/preset.R
									
										
									
									
									
								
							|  | @ -3,16 +3,19 @@ | |||
| #' A preset is used to specify which methods and inputs should be used for an | ||||
| #' analysis. Note that the genes to process should normally include the | ||||
| #' reference genes to be able to assess the results later. The genes will be | ||||
| #' filtered based on how many species have data for them. Genes which only have | ||||
| #' orthologs for less than 25% of the input species will be excluded from the | ||||
| #' preset and the analyis. See the different method functions for the available | ||||
| #' methods: [clustering()], [correlation()], [neural()], [adjacency()] and | ||||
| #' [species_adjacency()]. | ||||
| #' filtered based on how many species have data for them. Afterwards, species | ||||
| #' that still have many missing genes will also be excluded. See the different | ||||
| #' method functions for the available methods: [clustering()], [correlation()], | ||||
| #' [neural()], [adjacency()] and [species_adjacency()]. | ||||
| #' | ||||
| #' @param reference_gene_ids IDs of reference genes to compare to. | ||||
| #' @param methods List of methods to apply. | ||||
| #' @param species_ids IDs of species to include. | ||||
| #' @param gene_ids IDs of genes to screen. | ||||
| #' @param species_requirement The minimum proportion of species in which a | ||||
| #'  gene has to have orthologs in order for the gene to qualify. | ||||
| #' @param gene_requirement The proportion of genes that a species has to have | ||||
| #'  in order for the species to be included in the analysis. | ||||
| #' | ||||
| #' @return The preset to use with [analyze()]. | ||||
| #' | ||||
|  | @ -20,21 +23,32 @@ | |||
| preset <- function(reference_gene_ids, | ||||
|                    methods = all_methods(), | ||||
|                    species_ids = geposan::species$id, | ||||
|                    gene_ids = geposan::genes$id) { | ||||
|   # Count included species per gene. | ||||
|   genes_n_species <- geposan::distances[ | ||||
|     species %chin% species_ids, | ||||
|     .(n_species = .N), | ||||
|     by = "gene" | ||||
|                    gene_ids = geposan::genes$id, | ||||
|                    species_requirement = 0.25, | ||||
|                    gene_requirement = 0.5) { | ||||
|   # Prefilter distances. | ||||
|   distances <- geposan::distances[ | ||||
|     species %chin% species_ids & gene %chin% gene_ids | ||||
|   ] | ||||
| 
 | ||||
|   # Filter out genes with less than 25% existing orthologs. | ||||
|   # Count included species per gene. | ||||
|   genes_n_species <- distances[, .(n_species = .N), by = "gene"] | ||||
| 
 | ||||
|   # Filter out genes with too few existing orthologs. | ||||
|   gene_ids_filtered <- genes_n_species[ | ||||
|     gene %chin% gene_ids & | ||||
|       n_species >= 0.25 * length(species_ids), | ||||
|     n_species >= species_requirement * length(species_ids), | ||||
|     gene | ||||
|   ] | ||||
| 
 | ||||
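|   # Count genes per species. NOTE(review): this counts over the unfiltered geposan::distances, not the prefiltered `distances` restricted to `gene_ids_filtered` - confirm this is intended. | ||||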
|   species_n_genes <- geposan::distances[, .(n_genes = .N), by = "species"] | ||||
| 
 | ||||
|   # Filter out species that have too few of the genes. | ||||
|   species_ids_filtered <- species_n_genes[ | ||||
|     n_genes >= gene_requirement * length(gene_ids_filtered), | ||||
|     species | ||||
|   ] | ||||
| 
 | ||||
|   reference_gene_ids_excluded <- reference_gene_ids[ | ||||
|     !reference_gene_ids %chin% gene_ids_filtered | ||||
|   ] | ||||
|  | @ -65,7 +79,7 @@ preset <- function(reference_gene_ids, | |||
|     list( | ||||
|       reference_gene_ids = sort(reference_gene_ids_included), | ||||
|       methods = methods, | ||||
|       species_ids = sort(species_ids), | ||||
|       species_ids = sort(species_ids_filtered), | ||||
|       gene_ids = sort(gene_ids_filtered) | ||||
|     ), | ||||
|     class = "geposan_preset" | ||||
|  |  | |||
|  | @ -4,12 +4,7 @@ | |||
| \alias{neural} | ||||
| \title{Find genes by training and applying a neural network.} | ||||
| \usage{ | ||||
| neural( | ||||
|   seed = 180199, | ||||
|   n_models = 5, | ||||
|   gene_requirement = 0.5, | ||||
|   control_ratio = 0.5 | ||||
| ) | ||||
| neural(seed = 180199, n_models = 5, control_ratio = 0.5) | ||||
| } | ||||
| \arguments{ | ||||
| \item{seed}{The seed will be used to make the results reproducible.} | ||||
|  | @ -21,9 +16,6 @@ final score will be the mean of the result of applying the different | |||
| models. There should be at least two training sets. The analysis will only | ||||
| work if there is at least one reference gene per training set.} | ||||
| 
 | ||||
| \item{gene_requirement}{Minimum proportion of genes from the preset that a | ||||
| species has to have in order to be included in the models.} | ||||
| 
 | ||||
| \item{control_ratio}{The proportion of random control genes that is included | ||||
| in the training data sets in addition to the reference genes. This should | ||||
| be a numeric value between 0.0 and 1.0.} | ||||
|  |  | |||
|  | @ -8,7 +8,9 @@ preset( | |||
|   reference_gene_ids, | ||||
|   methods = all_methods(), | ||||
|   species_ids = geposan::species$id, | ||||
|   gene_ids = geposan::genes$id | ||||
|   gene_ids = geposan::genes$id, | ||||
|   species_requirement = 0.25, | ||||
|   gene_requirement = 0.5 | ||||
| ) | ||||
| } | ||||
| \arguments{ | ||||
|  | @ -19,6 +21,12 @@ preset( | |||
| \item{species_ids}{IDs of species to include.} | ||||
| 
 | ||||
| \item{gene_ids}{IDs of genes to screen.} | ||||
| 
 | ||||
| \item{species_requirement}{The minimum proportion of species in which a | ||||
| gene has to have orthologs in order for the gene to qualify.} | ||||
| 
 | ||||
| \item{gene_requirement}{The proportion of genes that a species has to have | ||||
| in order for the species to be included in the analysis.} | ||||
| } | ||||
| \value{ | ||||
| The preset to use with \code{\link[=analyze]{analyze()}}. | ||||
|  | @ -27,9 +35,8 @@ The preset to use with \code{\link[=analyze]{analyze()}}. | |||
| A preset is used to specify which methods and inputs should be used for an | ||||
| analysis. Note that the genes to process should normally include the | ||||
| reference genes to be able to assess the results later. The genes will be | ||||
| filtered based on how many species have data for them. Genes which only have | ||||
| orthologs for less than 25\% of the input species will be excluded from the | ||||
| preset and the analyis. See the different method functions for the available | ||||
| methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and | ||||
| \code{\link[=species_adjacency]{species_adjacency()}}. | ||||
| filtered based on how many species have data for them. Afterwards, species | ||||
| that still have many missing genes will also be excluded. See the different | ||||
| method functions for the available methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, | ||||
| \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and \code{\link[=species_adjacency]{species_adjacency()}}. | ||||
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue