preset: Filter species in addition to genes

This commit is contained in:
Elias Projahn 2022-05-30 13:49:52 +02:00
parent 9e96c54f23
commit 3217c9bd29
4 changed files with 49 additions and 48 deletions

View file

@ -7,8 +7,6 @@
#' final score will be the mean of the result of applying the different #' final score will be the mean of the result of applying the different
#' models. There should be at least two training sets. The analysis will only #' models. There should be at least two training sets. The analysis will only
#' work, if there is at least one reference gene per training set. #' work, if there is at least one reference gene per training set.
#' @param gene_requirement Minimum proportion of genes from the preset that a
#' species has to have in order to be included in the models.
#' @param control_ratio The proportion of random control genes that is included #' @param control_ratio The proportion of random control genes that is included
#' in the training data sets in addition to the reference genes. This should #' in the training data sets in addition to the reference genes. This should
#' be a numeric value between 0.0 and 1.0. #' be a numeric value between 0.0 and 1.0.
@ -16,10 +14,7 @@
#' @return An object of class `geposan_method`. #' @return An object of class `geposan_method`.
#' #'
#' @export #' @export
neural <- function(seed = 180199, neural <- function(seed = 180199, n_models = 5, control_ratio = 0.5) {
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5) {
method( method(
id = "neural", id = "neural",
name = "Neural", name = "Neural",
@ -37,7 +32,6 @@ neural <- function(seed = 180199,
reference_gene_ids, reference_gene_ids,
seed, seed,
n_models, n_models,
gene_requirement,
control_ratio control_ratio
), ),
{ # nolint { # nolint
@ -57,12 +51,6 @@ neural <- function(seed = 180199,
distances <- geposan::distances[species %chin% species_ids & distances <- geposan::distances[species %chin% species_ids &
gene %chin% gene_ids] gene %chin% gene_ids]
# Only include species that have at least 25% of the included genes.
distances[, species_n_genes := .N, by = species]
distances <- distances[species_n_genes >=
gene_requirement * length(gene_ids)]
included_species <- distances[, unique(species)]
# Reshape data to put species into columns. # Reshape data to put species into columns.
data <- dcast( data <- dcast(
distances, distances,
@ -72,7 +60,7 @@ neural <- function(seed = 180199,
# Replace values that are still missing with mean values for the # Replace values that are still missing with mean values for the
# species in question. # species in question.
data[, (included_species) := lapply(included_species, \(species) { data[, (species_ids) := lapply(species_ids, \(species) {
species <- get(species) species <- get(species)
species[is.na(species)] <- mean(species, na.rm = TRUE) species[is.na(species)] <- mean(species, na.rm = TRUE)
species species
@ -129,7 +117,7 @@ neural <- function(seed = 180199,
# Step 3: Create, train and apply neural network. # Step 3: Create, train and apply neural network.
# ----------------------------------------------- # -----------------------------------------------
data_matrix <- prepare_data(data, included_species) data_matrix <- prepare_data(data, species_ids)
output_vars <- NULL output_vars <- NULL
for (i in seq_along(networks)) { for (i in seq_along(networks)) {
@ -138,14 +126,14 @@ neural <- function(seed = 180199,
# Create a new model for each training session, because # Create a new model for each training session, because
# the model would keep its state across training # the model would keep its state across training
# sessions otherwise. # sessions otherwise.
model <- create_model(length(included_species)) model <- create_model(length(species_ids))
# Train the model. # Train the model.
fit <- train_model( fit <- train_model(
model, model,
network$training_data, network$training_data,
network$validation_data, network$validation_data,
included_species species_ids
) )
# Apply the model. # Apply the model.
@ -180,7 +168,7 @@ neural <- function(seed = 180199,
details = list( details = list(
seed = seed, seed = seed,
n_models = n_models, n_models = n_models,
all_results = data[, !..included_species], all_results = data[, !..species_ids],
networks = networks networks = networks
) )
) )

View file

@ -3,16 +3,19 @@
#' A preset is used to specify which methods and inputs should be used for an #' A preset is used to specify which methods and inputs should be used for an
#' analysis. Note that the genes to process should normally include the #' analysis. Note that the genes to process should normally include the
#' reference genes to be able to assess the results later. The genes will be #' reference genes to be able to assess the results later. The genes will be
#' filtered based on how many species have data for them. Genes which only have #' filtered based on how many species have data for them. Afterwards, species
#' orthologs for less than 25% of the input species will be excluded from the #' that still have many missing genes will also be excluded. See the different
#' preset and the analysis. See the different method functions for the available #' method functions for the available methods: [clustering()], [correlation()],
#' methods: [clustering()], [correlation()], [neural()], [adjacency()] and #' [neural()], [adjacency()] and [species_adjacency()].
#' [species_adjacency()].
#' #'
#' @param reference_gene_ids IDs of reference genes to compare to. #' @param reference_gene_ids IDs of reference genes to compare to.
#' @param methods List of methods to apply. #' @param methods List of methods to apply.
#' @param species_ids IDs of species to include. #' @param species_ids IDs of species to include.
#' @param gene_ids IDs of genes to screen. #' @param gene_ids IDs of genes to screen.
#' @param species_requirement The proportion of species a gene has to have
#' orthologs in in order for the gene to qualify.
#' @param gene_requirement The proportion of genes that a species has to have
#' in order for the species to be included in the analysis.
#' #'
#' @return The preset to use with [analyze()]. #' @return The preset to use with [analyze()].
#' #'
@ -20,21 +23,32 @@
preset <- function(reference_gene_ids, preset <- function(reference_gene_ids,
methods = all_methods(), methods = all_methods(),
species_ids = geposan::species$id, species_ids = geposan::species$id,
gene_ids = geposan::genes$id) { gene_ids = geposan::genes$id,
# Count included species per gene. species_requirement = 0.25,
genes_n_species <- geposan::distances[ gene_requirement = 0.5) {
species %chin% species_ids, # Prefilter distances.
.(n_species = .N), distances <- geposan::distances[
by = "gene" species %chin% species_ids & gene %chin% gene_ids
] ]
# Filter out genes with less than 25% existing orthologs. # Count included species per gene.
genes_n_species <- distances[, .(n_species = .N), by = "gene"]
# Filter out genes with too few existing orthologs.
gene_ids_filtered <- genes_n_species[ gene_ids_filtered <- genes_n_species[
gene %chin% gene_ids & n_species >= species_requirement * length(species_ids),
n_species >= 0.25 * length(species_ids),
gene gene
] ]
# Count included genes per species.
species_n_genes <- geposan::distances[, .(n_genes = .N), by = "species"]
# Filter out species that have too few of the genes.
species_ids_filtered <- species_n_genes[
n_genes >= gene_requirement * length(gene_ids_filtered),
species
]
reference_gene_ids_excluded <- reference_gene_ids[ reference_gene_ids_excluded <- reference_gene_ids[
!reference_gene_ids %chin% gene_ids_filtered !reference_gene_ids %chin% gene_ids_filtered
] ]
@ -65,7 +79,7 @@ preset <- function(reference_gene_ids,
list( list(
reference_gene_ids = sort(reference_gene_ids_included), reference_gene_ids = sort(reference_gene_ids_included),
methods = methods, methods = methods,
species_ids = sort(species_ids), species_ids = sort(species_ids_filtered),
gene_ids = sort(gene_ids_filtered) gene_ids = sort(gene_ids_filtered)
), ),
class = "geposan_preset" class = "geposan_preset"

View file

@ -4,12 +4,7 @@
\alias{neural} \alias{neural}
\title{Find genes by training and applying a neural network.} \title{Find genes by training and applying a neural network.}
\usage{ \usage{
neural( neural(seed = 180199, n_models = 5, control_ratio = 0.5)
seed = 180199,
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5
)
} }
\arguments{ \arguments{
\item{seed}{The seed will be used to make the results reproducible.} \item{seed}{The seed will be used to make the results reproducible.}
@ -21,9 +16,6 @@ final score will be the mean of the result of applying the different
models. There should be at least two training sets. The analysis will only models. There should be at least two training sets. The analysis will only
work, if there is at least one reference gene per training set.} work, if there is at least one reference gene per training set.}
\item{gene_requirement}{Minimum proportion of genes from the preset that a
species has to have in order to be included in the models.}
\item{control_ratio}{The proportion of random control genes that is included \item{control_ratio}{The proportion of random control genes that is included
in the training data sets in addition to the reference genes. This should in the training data sets in addition to the reference genes. This should
be a numeric value between 0.0 and 1.0.} be a numeric value between 0.0 and 1.0.}

View file

@ -8,7 +8,9 @@ preset(
reference_gene_ids, reference_gene_ids,
methods = all_methods(), methods = all_methods(),
species_ids = geposan::species$id, species_ids = geposan::species$id,
gene_ids = geposan::genes$id gene_ids = geposan::genes$id,
species_requirement = 0.25,
gene_requirement = 0.5
) )
} }
\arguments{ \arguments{
@ -19,6 +21,12 @@ preset(
\item{species_ids}{IDs of species to include.} \item{species_ids}{IDs of species to include.}
\item{gene_ids}{IDs of genes to screen.} \item{gene_ids}{IDs of genes to screen.}
\item{species_requirement}{The proportion of species a gene has to have
orthologs in in order for the gene to qualify.}
\item{gene_requirement}{The proportion of genes that a species has to have
in order for the species to be included in the analysis.}
} }
\value{ \value{
The preset to use with \code{\link[=analyze]{analyze()}}. The preset to use with \code{\link[=analyze]{analyze()}}.
@ -27,9 +35,8 @@ The preset to use with \code{\link[=analyze]{analyze()}}.
A preset is used to specify which methods and inputs should be used for an A preset is used to specify which methods and inputs should be used for an
analysis. Note that the genes to process should normally include the analysis. Note that the genes to process should normally include the
reference genes to be able to assess the results later. The genes will be reference genes to be able to assess the results later. The genes will be
filtered based on how many species have data for them. Genes which only have filtered based on how many species have data for them. Afterwards, species
orthologs for less than 25\% of the input species will be excluded from the that still have many missing genes will also be excluded. See the different
preset and the analysis. See the different method functions for the available method functions for the available methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}},
methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and \code{\link[=species_adjacency]{species_adjacency()}}.
\code{\link[=species_adjacency]{species_adjacency()}}.
} }