preset: Remove min_n_species customization

This commit is contained in:
Elias Projahn 2021-11-18 12:30:19 +01:00
parent 33056bfa40
commit de1c1ed40e
3 changed files with 48 additions and 31 deletions

View file

@ -75,17 +75,7 @@ analyze <- function(preset, progress = NULL) {
total_progress <- total_progress + 1 / method_count total_progress <- total_progress + 1 / method_count
} }
# Count included species from the preset per gene. results
genes_n_species <- geposan::distances[
species %chin% preset$species_ids,
.(n_species = .N),
by = "gene"
]
setkey(genes_n_species, "gene")
# Return the results for genes with enough species.
results[genes_n_species[gene, n_species] >= preset$min_n_species]
}) })
if (!is.null(progress)) { if (!is.null(progress)) {

View file

@ -2,20 +2,28 @@
#' #'
#' A preset is used to specify which methods and inputs should be used for an #' A preset is used to specify which methods and inputs should be used for an
#' analysis. Note that the genes to process should normally include the #' analysis. Note that the genes to process should normally include the
#' reference genes to be able to assess the results later. #' reference genes to be able to assess the results later. The genes will be
#' filtered based on how many species have data for them. Genes which only have
#' orthologs for less than 25% of the input species will be excluded from the
#' preset and the analysis.
#' #'
#' Available methods are: #' Available methods are:
#' #'
#' - `clusteriness` How much the gene distances cluster across species. #' - `clusteriness` How much the gene distances to the nearest telomere
#' - `correlation` The mean correlation with the reference genes. #' cluster across species.
#' - `clusteriness_positions` The same as `clusteriness` but using absolute
#' gene positions instead of distances.
#' - `correlation` The mean correlation of gene distances to the nearest
#' telomere across species.
#' - `correlation_positions` Correlation using position data.
#' - `neural` Assessment by neural network trained using distances.
#' - `neural_positions` Assessment by neural network trained using absolute
#' position data.
#' - `proximity` Mean proximity to telomeres. #' - `proximity` Mean proximity to telomeres.
#' - `neural` Assessment by neural network.
#' #'
#' @param methods Methods to apply. #' @param methods Methods to apply.
#' @param species_ids IDs of species to include. #' @param species_ids IDs of species to include.
#' @param gene_ids IDs of genes to screen. #' @param gene_ids IDs of genes to screen.
#' @param min_n_species Minimum number of orthologs that a gene should have to
#' be included in the analysis.
#' @param reference_gene_ids IDs of reference genes to compare to. #' @param reference_gene_ids IDs of reference genes to compare to.
#' #'
#' @return The preset to use with [analyze()]. #' @return The preset to use with [analyze()].
@ -23,22 +31,36 @@
#' @export #' @export
preset <- function(methods = c( preset <- function(methods = c(
"clusteriness", "clusteriness",
"clusteriness_positions",
"correlation", "correlation",
"correlation_positions",
"neural", "neural",
"neural_positions",
"proximity" "proximity"
), ),
species_ids = NULL, species_ids = NULL,
gene_ids = NULL, gene_ids = NULL,
min_n_species = 10,
reference_gene_ids = NULL) { reference_gene_ids = NULL) {
# Count included species per gene.
genes_n_species <- geposan::distances[
species %chin% species_ids,
.(n_species = .N),
by = "gene"
]
# Filter out genes with less than 25% existing orthologs.
gene_ids_filtered <- genes_n_species[
n_species >= 0.25 * length(species_ids),
gene
]
# The included data gets sorted to be able to produce predictable hashes # The included data gets sorted to be able to produce predictable hashes
# for the object later. # for the object later.
structure( structure(
list( list(
methods = sort(methods), methods = sort(methods),
species_ids = sort(species_ids), species_ids = sort(species_ids),
gene_ids = sort(gene_ids), gene_ids = sort(gene_ids_filtered),
min_n_species = as.numeric(min_n_species),
reference_gene_ids = sort(reference_gene_ids) reference_gene_ids = sort(reference_gene_ids)
), ),
class = "geposan_preset" class = "geposan_preset"
@ -64,8 +86,6 @@ print.geposan_preset <- function(x, ...) {
length(x$gene_ids) length(x$gene_ids)
)) ))
cat(sprintf("\n Species per gene: \u2265 %i", x$min_n_species))
cat(sprintf( cat(sprintf(
"\n Comparison data: %i reference genes\n", "\n Comparison data: %i reference genes\n",
length(x$reference_gene_ids) length(x$reference_gene_ids)

View file

@ -5,10 +5,10 @@
\title{Create a new preset.} \title{Create a new preset.}
\usage{ \usage{
preset( preset(
methods = c("clusteriness", "correlation", "neural", "proximity"), methods = c("clusteriness", "clusteriness_positions", "correlation",
"correlation_positions", "neural", "neural_positions", "proximity"),
species_ids = NULL, species_ids = NULL,
gene_ids = NULL, gene_ids = NULL,
min_n_species = 10,
reference_gene_ids = NULL reference_gene_ids = NULL
) )
} }
@ -19,9 +19,6 @@ preset(
\item{gene_ids}{IDs of genes to screen.} \item{gene_ids}{IDs of genes to screen.}
\item{min_n_species}{Minimum number of orthologs that a gene should have to
be included in the analysis.}
\item{reference_gene_ids}{IDs of reference genes to compare to.} \item{reference_gene_ids}{IDs of reference genes to compare to.}
} }
\value{ \value{
@ -30,14 +27,24 @@ The preset to use with \code{\link[=analyze]{analyze()}}.
\description{ \description{
A preset is used to specify which methods and inputs should be used for an A preset is used to specify which methods and inputs should be used for an
analysis. Note that the genes to process should normally include the analysis. Note that the genes to process should normally include the
reference genes to be able to assess the results later. reference genes to be able to assess the results later. The genes will be
filtered based on how many species have data for them. Genes which only have
orthologs for less than 25\% of the input species will be excluded from the
preset and the analysis.
} }
\details{ \details{
Available methods are: Available methods are:
\itemize{ \itemize{
\item \code{clusteriness} How much the gene distances cluster across species. \item \code{clusteriness} How much the gene distances to the nearest telomere
\item \code{correlation} The mean correlation with the reference genes. cluster across species.
\item \code{clusteriness_positions} The same as \code{clusteriness} but using absolute
gene positions instead of distances.
\item \code{correlation} The mean correlation of gene distances to the nearest
telomere across species.
\item \code{correlation_positions} Correlation using position data.
\item \code{neural} Assessment by neural network trained using distances.
\item \code{neural_positions} Assessment by neural network trained using absolute
position data.
\item \code{proximity} Mean proximity to telomeres. \item \code{proximity} Mean proximity to telomeres.
\item \code{neural} Assessment by neural network.
} }
} }