preset: Filter species in addition to genes

This commit is contained in:
Elias Projahn 2022-05-30 13:49:52 +02:00
parent 9e96c54f23
commit 3217c9bd29
4 changed files with 49 additions and 48 deletions

View file

@ -7,8 +7,6 @@
#' final score will be the mean of the result of applying the different
#' models. There should be at least two training sets. The analysis will only
#' work, if there is at least one reference gene per training set.
#' @param gene_requirement Minimum proportion of genes from the preset that a
#' species has to have in order to be included in the models.
#' @param control_ratio The proportion of random control genes that is included
#' in the training data sets in addition to the reference genes. This should
#' be a numeric value between 0.0 and 1.0.
@ -16,10 +14,7 @@
#' @return An object of class `geposan_method`.
#'
#' @export
neural <- function(seed = 180199,
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5) {
neural <- function(seed = 180199, n_models = 5, control_ratio = 0.5) {
method(
id = "neural",
name = "Neural",
@ -37,7 +32,6 @@ neural <- function(seed = 180199,
reference_gene_ids,
seed,
n_models,
gene_requirement,
control_ratio
),
{ # nolint
@ -57,12 +51,6 @@ neural <- function(seed = 180199,
distances <- geposan::distances[species %chin% species_ids &
gene %chin% gene_ids]
# Only include species that have at least 25% of the included genes.
distances[, species_n_genes := .N, by = species]
distances <- distances[species_n_genes >=
gene_requirement * length(gene_ids)]
included_species <- distances[, unique(species)]
# Reshape data to put species into columns.
data <- dcast(
distances,
@ -72,7 +60,7 @@ neural <- function(seed = 180199,
# Replace values that are still missing with mean values for the
# species in question.
data[, (included_species) := lapply(included_species, \(species) {
data[, (species_ids) := lapply(species_ids, \(species) {
species <- get(species)
species[is.na(species)] <- mean(species, na.rm = TRUE)
species
@ -129,7 +117,7 @@ neural <- function(seed = 180199,
# Step 3: Create, train and apply neural network.
# -----------------------------------------------
data_matrix <- prepare_data(data, included_species)
data_matrix <- prepare_data(data, species_ids)
output_vars <- NULL
for (i in seq_along(networks)) {
@ -138,14 +126,14 @@ neural <- function(seed = 180199,
# Create a new model for each training session, because
# the model would keep its state across training
# sessions otherwise.
model <- create_model(length(included_species))
model <- create_model(length(species_ids))
# Train the model.
fit <- train_model(
model,
network$training_data,
network$validation_data,
included_species
species_ids
)
# Apply the model.
@ -180,7 +168,7 @@ neural <- function(seed = 180199,
details = list(
seed = seed,
n_models = n_models,
all_results = data[, !..included_species],
all_results = data[, !..species_ids],
networks = networks
)
)

View file

@ -3,16 +3,19 @@
#' A preset is used to specify which methods and inputs should be used for an
#' analysis. Note that the genes to process should normally include the
#' reference genes to be able to assess the results later. The genes will be
#' filtered based on how many species have data for them. Genes which only have
#' orthologs for less than 25% of the input species will be excluded from the
#' preset and the analysis. See the different method functions for the available
#' methods: [clustering()], [correlation()], [neural()], [adjacency()] and
#' [species_adjacency()].
#' filtered based on how many species have data for them. Afterwards, species
#' that still have many missing genes will also be excluded. See the different
#' method functions for the available methods: [clustering()], [correlation()],
#' [neural()], [adjacency()] and [species_adjacency()].
#'
#' @param reference_gene_ids IDs of reference genes to compare to.
#' @param methods List of methods to apply.
#' @param species_ids IDs of species to include.
#' @param gene_ids IDs of genes to screen.
#' @param species_requirement The minimum proportion of species in which a
#' gene has to have orthologs in order for the gene to qualify.
#' @param gene_requirement The proportion of genes that a species has to have
#' in order for the species to be included in the analysis.
#'
#' @return The preset to use with [analyze()].
#'
@ -20,21 +23,32 @@
preset <- function(reference_gene_ids,
methods = all_methods(),
species_ids = geposan::species$id,
gene_ids = geposan::genes$id) {
# Count included species per gene.
genes_n_species <- geposan::distances[
species %chin% species_ids,
.(n_species = .N),
by = "gene"
gene_ids = geposan::genes$id,
species_requirement = 0.25,
gene_requirement = 0.5) {
# Prefilter distances.
distances <- geposan::distances[
species %chin% species_ids & gene %chin% gene_ids
]
# Filter out genes with less than 25% existing orthologs.
# Count included species per gene.
genes_n_species <- distances[, .(n_species = .N), by = "gene"]
# Filter out genes with too few existing orthologs.
gene_ids_filtered <- genes_n_species[
gene %chin% gene_ids &
n_species >= 0.25 * length(species_ids),
n_species >= species_requirement * length(species_ids),
gene
]
# Count included genes per species. The count is taken from the
# prefiltered `distances` and restricted to the genes that passed the
# filter above. This keeps `n_genes` comparable to
# `length(gene_ids_filtered)` and ensures that species outside of
# `species_ids` can never end up in the preset.
species_n_genes <- distances[
    gene %chin% gene_ids_filtered,
    .(n_genes = .N),
    by = "species"
]
# Filter out species that have too few of the genes.
species_ids_filtered <- species_n_genes[
    n_genes >= gene_requirement * length(gene_ids_filtered),
    species
]
reference_gene_ids_excluded <- reference_gene_ids[
!reference_gene_ids %chin% gene_ids_filtered
]
@ -65,7 +79,7 @@ preset <- function(reference_gene_ids,
list(
reference_gene_ids = sort(reference_gene_ids_included),
methods = methods,
species_ids = sort(species_ids),
species_ids = sort(species_ids_filtered),
gene_ids = sort(gene_ids_filtered)
),
class = "geposan_preset"

View file

@ -4,12 +4,7 @@
\alias{neural}
\title{Find genes by training and applying a neural network.}
\usage{
neural(
seed = 180199,
n_models = 5,
gene_requirement = 0.5,
control_ratio = 0.5
)
neural(seed = 180199, n_models = 5, control_ratio = 0.5)
}
\arguments{
\item{seed}{The seed will be used to make the results reproducible.}
@ -21,9 +16,6 @@ final score will be the mean of the result of applying the different
models. There should be at least two training sets. The analysis will only
work, if there is at least one reference gene per training set.}
\item{gene_requirement}{Minimum proportion of genes from the preset that a
species has to have in order to be included in the models.}
\item{control_ratio}{The proportion of random control genes that is included
in the training data sets in addition to the reference genes. This should
be a numeric value between 0.0 and 1.0.}

View file

@ -8,7 +8,9 @@ preset(
reference_gene_ids,
methods = all_methods(),
species_ids = geposan::species$id,
gene_ids = geposan::genes$id
gene_ids = geposan::genes$id,
species_requirement = 0.25,
gene_requirement = 0.5
)
}
\arguments{
@ -19,6 +21,12 @@ preset(
\item{species_ids}{IDs of species to include.}
\item{gene_ids}{IDs of genes to screen.}
\item{species_requirement}{The minimum proportion of species in which a
gene has to have orthologs in order for the gene to qualify.}
\item{gene_requirement}{The proportion of genes that a species has to have
in order for the species to be included in the analysis.}
}
\value{
The preset to use with \code{\link[=analyze]{analyze()}}.
@ -27,9 +35,8 @@ The preset to use with \code{\link[=analyze]{analyze()}}.
A preset is used to specify which methods and inputs should be used for an
analysis. Note that the genes to process should normally include the
reference genes to be able to assess the results later. The genes will be
filtered based on how many species have data for them. Genes which only have
orthologs for less than 25\% of the input species will be excluded from the
preset and the analysis. See the different method functions for the available
methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}}, \code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and
\code{\link[=species_adjacency]{species_adjacency()}}.
filtered based on how many species have data for them. Afterwards, species
that still have many missing genes will also be excluded. See the different
method functions for the available methods: \code{\link[=clustering]{clustering()}}, \code{\link[=correlation]{correlation()}},
\code{\link[=neural]{neural()}}, \code{\link[=adjacency]{adjacency()}} and \code{\link[=species_adjacency]{species_adjacency()}}.
}