diff --git a/R/analyze.R b/R/analyze.R index 2278ded..8d9b2de 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -75,17 +75,7 @@ analyze <- function(preset, progress = NULL) { total_progress <- total_progress + 1 / method_count } - # Count included species from the preset per gene. - genes_n_species <- geposan::distances[ - species %chin% preset$species_ids, - .(n_species = .N), - by = "gene" - ] - - setkey(genes_n_species, "gene") - - # Return the results for genes with enough species. - results[genes_n_species[gene, n_species] >= preset$min_n_species] + results }) if (!is.null(progress)) { diff --git a/R/preset.R b/R/preset.R index 6fe8e7f..50ee098 100644 --- a/R/preset.R +++ b/R/preset.R @@ -2,20 +2,28 @@ #' #' A preset is used to specify which methods and inputs should be used for an #' analysis. Note that the genes to process should normally include the -#' reference genes to be able to assess the results later. +#' reference genes to be able to assess the results later. The genes will be +#' filtered based on how many species have data for them. Genes which only have +#' orthologs for less than 25% of the input species will be excluded from the +#' preset and the analysis. #' #' Available methods are: #' -#' - `clusteriness` How much the gene distances cluster across species. -#' - `correlation` The mean correlation with the reference genes. +#' - `clusteriness` How much the gene distances to the nearest telomere +#' cluster across species. +#' - `clusteriness_positions` The same as `clusteriness` but using absolute +#' gene positions instead of distances. +#' - `correlation` The mean correlation of gene distances to the nearest +#' telomere across species. +#' - `correlation_positions` Correlation using position data. +#' - `neural` Assessment by neural network trained using distances. +#' - `neural_positions` Assessment by neural network trained using absolute +#' position data. #' - `proximity` Mean proximity to telomeres. 
-#' - `neural` Assessment by neural network. #' #' @param methods Methods to apply. #' @param species_ids IDs of species to include. #' @param gene_ids IDs of genes to screen. -#' @param min_n_species Minimum number of orthologs that a gene should have to -#' be included in the analysis. #' @param reference_gene_ids IDs of reference genes to compare to. #' #' @return The preset to use with [analyze()]. @@ -23,22 +31,36 @@ #' @export preset <- function(methods = c( "clusteriness", + "clusteriness_positions", "correlation", + "correlation_positions", "neural", + "neural_positions", "proximity" ), species_ids = NULL, gene_ids = NULL, - min_n_species = 10, reference_gene_ids = NULL) { + # Count included species per gene. + genes_n_species <- geposan::distances[ + species %chin% species_ids, + .(n_species = .N), + by = "gene" + ] + + # Filter out genes with less than 25% existing orthologs. + gene_ids_filtered <- genes_n_species[ + n_species >= 0.25 * length(species_ids), + gene + ] + # The included data gets sorted to be able to produce predictable hashes # for the object later. structure( list( methods = sort(methods), species_ids = sort(species_ids), - gene_ids = sort(gene_ids), - min_n_species = as.numeric(min_n_species), + gene_ids = sort(gene_ids_filtered), reference_gene_ids = sort(reference_gene_ids) ), class = "geposan_preset" @@ -64,8 +86,6 @@ print.geposan_preset <- function(x, ...) 
{ length(x$gene_ids) )) - cat(sprintf("\n Species per gene: \u2265 %i", x$min_n_species)) - cat(sprintf( "\n Comparison data: %i reference genes\n", length(x$reference_gene_ids) diff --git a/man/preset.Rd b/man/preset.Rd index 015900e..e6414c5 100644 --- a/man/preset.Rd +++ b/man/preset.Rd @@ -5,10 +5,10 @@ \title{Create a new preset.} \usage{ preset( - methods = c("clusteriness", "correlation", "neural", "proximity"), + methods = c("clusteriness", "clusteriness_positions", "correlation", + "correlation_positions", "neural", "neural_positions", "proximity"), species_ids = NULL, gene_ids = NULL, - min_n_species = 10, reference_gene_ids = NULL ) } @@ -19,9 +19,6 @@ preset( \item{gene_ids}{IDs of genes to screen.} -\item{min_n_species}{Minimum number of orthologs that a gene should have to -be included in the analysis.} - \item{reference_gene_ids}{IDs of reference genes to compare to.} } \value{ @@ -30,14 +27,24 @@ The preset to use with \code{\link[=analyze]{analyze()}}. \description{ A preset is used to specify which methods and inputs should be used for an analysis. Note that the genes to process should normally include the -reference genes to be able to assess the results later. +reference genes to be able to assess the results later. The genes will be +filtered based on how many species have data for them. Genes which only have +orthologs for less than 25\% of the input species will be excluded from the +preset and the analysis. } \details{ Available methods are: \itemize{ -\item \code{clusteriness} How much the gene distances cluster across species. -\item \code{correlation} The mean correlation with the reference genes. +\item \code{clusteriness} How much the gene distances to the nearest telomere +cluster across species. +\item \code{clusteriness_positions} The same as \code{clusteriness} but using absolute +gene positions instead of distances. +\item \code{correlation} The mean correlation of gene distances to the nearest +telomere across species. 
+\item \code{correlation_positions} Correlation using position data. +\item \code{neural} Assessment by neural network trained using distances. +\item \code{neural_positions} Assessment by neural network trained using absolute +position data. \item \code{proximity} Mean proximity to telomeres. -\item \code{neural} Assessment by neural network. } }