mirror of
https://github.com/johrpan/geposan.git
synced 2025-10-26 10:47:25 +01:00
preset: Filter species in addition to genes
This commit is contained in:
parent
9e96c54f23
commit
3217c9bd29
4 changed files with 49 additions and 48 deletions
44
R/preset.R
44
R/preset.R
|
|
@ -3,16 +3,19 @@
|
|||
#' A preset is used to specify which methods and inputs should be used for an
|
||||
#' analysis. Note that the genes to process should normally include the
|
||||
#' reference genes to be able to assess the results later. The genes will be
|
||||
#' filtered based on how many species have data for them. Genes which only have
|
||||
#' orthologs for less than 25% of the input species will be excluded from the
|
||||
#' preset and the analysis. See the different method functions for the available
|
||||
#' methods: [clustering()], [correlation()], [neural()], [adjacency()] and
|
||||
#' [species_adjacency()].
|
||||
#' filtered based on how many species have data for them. Afterwards, species
|
||||
#' that still have many missing genes will also be excluded. See the different
|
||||
#' method functions for the available methods: [clustering()], [correlation()],
|
||||
#' [neural()], [adjacency()] and [species_adjacency()].
|
||||
#'
|
||||
#' @param reference_gene_ids IDs of reference genes to compare to.
|
||||
#' @param methods List of methods to apply.
|
||||
#' @param species_ids IDs of species to include.
|
||||
#' @param gene_ids IDs of genes to screen.
|
||||
#' @param species_requirement The proportion of species a gene has to have
|
||||
#' orthologs in, in order for the gene to qualify.
|
||||
#' @param gene_requirement The proportion of genes that a species has to have
|
||||
#' in order for the species to be included in the analysis.
|
||||
#'
|
||||
#' @return The preset to use with [analyze()].
|
||||
#'
|
||||
|
|
@ -20,21 +23,32 @@
|
|||
preset <- function(reference_gene_ids,
|
||||
methods = all_methods(),
|
||||
species_ids = geposan::species$id,
|
||||
gene_ids = geposan::genes$id) {
|
||||
# Count included species per gene.
|
||||
genes_n_species <- geposan::distances[
|
||||
species %chin% species_ids,
|
||||
.(n_species = .N),
|
||||
by = "gene"
|
||||
gene_ids = geposan::genes$id,
|
||||
species_requirement = 0.25,
|
||||
gene_requirement = 0.5) {
|
||||
# Prefilter distances.
|
||||
distances <- geposan::distances[
|
||||
species %chin% species_ids & gene %chin% gene_ids
|
||||
]
|
||||
|
||||
# Filter out genes with less than 25% existing orthologs.
|
||||
# Count included species per gene.
|
||||
genes_n_species <- distances[, .(n_species = .N), by = "gene"]
|
||||
|
||||
# Filter out genes with too few existing orthologs.
|
||||
gene_ids_filtered <- genes_n_species[
|
||||
gene %chin% gene_ids &
|
||||
n_species >= 0.25 * length(species_ids),
|
||||
n_species >= species_requirement * length(species_ids),
|
||||
gene
|
||||
]
|
||||
|
||||
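# Count genes per species. NOTE(review): this counts over the unfiltered
# geposan::distances table, not the prefiltered `distances` restricted to
# gene_ids_filtered, although the threshold below uses
# length(gene_ids_filtered) -- confirm whether this is intended.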
|
||||
species_n_genes <- geposan::distances[, .(n_genes = .N), by = "species"]
|
||||
|
||||
# Filter out species that have too few of the genes.
|
||||
species_ids_filtered <- species_n_genes[
|
||||
n_genes >= gene_requirement * length(gene_ids_filtered),
|
||||
species
|
||||
]
|
||||
|
||||
reference_gene_ids_excluded <- reference_gene_ids[
|
||||
!reference_gene_ids %chin% gene_ids_filtered
|
||||
]
|
||||
|
|
@ -65,7 +79,7 @@ preset <- function(reference_gene_ids,
|
|||
list(
|
||||
reference_gene_ids = sort(reference_gene_ids_included),
|
||||
methods = methods,
|
||||
species_ids = sort(species_ids),
|
||||
species_ids = sort(species_ids_filtered),
|
||||
gene_ids = sort(gene_ids_filtered)
|
||||
),
|
||||
class = "geposan_preset"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue