diff --git a/NAMESPACE b/NAMESPACE
index 49d5bdf..685d468 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,6 +12,7 @@ export(analyze)
 export(clustering)
 export(compare)
 export(correlation)
+export(densest)
 export(method)
 export(neural)
 export(optimal_weights)
diff --git a/R/adjacency.R b/R/adjacency.R
index 180974e..f688206 100644
--- a/R/adjacency.R
+++ b/R/adjacency.R
@@ -1,13 +1,36 @@
+#' Find the densest value in the data.
+#'
+#' This function assumes that data represents a continuous variable and finds
+#' a single value with the highest estimated density. This can be used to
+#' estimate the mode of the data. If there is only one value, that value is
+#' returned. If multiple density maxima with the same density exist, their mean
+#' is returned.
+#'
+#' @param data The input data.
+#'
+#' @return The densest value of data.
+#'
+#' @export
+densest <- function(data) {
+    as.numeric(if (length(data) <= 0) {
+        NULL
+    } else if (length(data) == 1) {
+        data
+    } else {
+        density <- stats::density(data)
+        mean(density$x[density$y == max(density$y)])
+    })
+}
+
 #' Score genes based on their proximity to the reference genes.
 #'
-#' This method finds the distance value with the maximum density for each gene
-#' (i.e. the mode of its estimated distribution). Genes are scored by comparing
-#' those distance values with the values of the reference genes.
+#' @param estimate A function that will be used to summarize the distance
+#'   values for each gene. See [densest()] for the default implementation.
 #'
 #' @return An object of class `geposan_method`.
 #'
 #' @export
-adjacency <- function() {
+adjacency <- function(estimate = densest) {
     method(
         id = "adjacency",
         name = "Adjacency",
@@ -17,73 +40,67 @@ adjacency <- function() {
             gene_ids <- preset$gene_ids
             reference_gene_ids <- preset$reference_gene_ids
 
-            cached("adjacency", c(species_ids, gene_ids, reference_gene_ids), {
-                # Get the virtual distance value with the highest density.
-                compute_densest_distance <- function(distances) {
-                    if (length(distances) <= 2) {
-                        mean(distances)
-                    } else {
-                        d <- stats::density(distances)
-                        d$x[which.max(d$y)]
-                    }
-                }
-
-                # Filter distances by species and gene and find the distance
-                # with the highest density of values for each gene.
-                data <- geposan::distances[
-                    species %chin% species_ids & gene %chin% gene_ids,
-                    .(densest_distance = compute_densest_distance(distance)),
-                    by = gene
-                ]
-
-                # Compute the absolute value of the difference between the
-                # provided densest distance value in comparison to the mean of
-                # the densest distances of the comparison genes.
-                compute_difference <- function(densest_distance,
-                                               comparison_ids) {
-                    # Get the mean of the densest distances of the reference
-                    # genes.
-                    mean_densest_distance <- data[
-                        gene %chin% comparison_ids,
-                        mean(densest_distance)
+            cached(
+                "adjacency",
+                c(species_ids, gene_ids, reference_gene_ids, estimate),
+                { # nolint
+                    # Filter distances by species and gene and summarize each
+                    # gene's distance values using the estimation function.
+                    data <- geposan::distances[
+                        species %chin% species_ids & gene %chin% gene_ids,
+                        .(distance = estimate(distance)),
+                        by = gene
                     ]
 
-                    abs(densest_distance - mean_densest_distance)
-                }
+                    # Compute the absolute value of the difference between the
+                    # estimated distances of each gene to the reference genes.
+                    compute_difference <- function(distance,
+                                                   comparison_ids) {
+                        reference_distance <- data[
+                            gene %chin% comparison_ids,
+                            mean(distance)
+                        ]
 
-                # Compute the differences to the reference genes.
-                data[
-                    !gene %chin% reference_gene_ids,
-                    difference := compute_difference(
-                        densest_distance,
-                        reference_gene_ids
+                        abs(distance - reference_distance)
+                    }
+
+                    # Compute the differences to the reference genes.
+                    data[
+                        !gene %chin% reference_gene_ids,
+                        difference := compute_difference(
+                            distance,
+                            reference_gene_ids
+                        )
+                    ]
+
+                    progress(0.5)
+
+                    # Exclude the reference gene itself when computing its
+                    # difference.
+                    data[
+                        gene %chin% reference_gene_ids,
+                        difference := compute_difference(
+                            distance,
+                            reference_gene_ids[reference_gene_ids != gene]
+                        ),
+                        by = gene
+                    ]
+
+                    # Compute the final score by normalizing the difference.
+                    # Missing differences (e.g. a sole reference gene that has
+                    # nothing to be compared to) must not poison the maximum.
+                    max_difference <- max(data[["difference"]], na.rm = TRUE)
+                    data[, score := 1 - difference / max_difference]
+
+                    progress(1.0)
+
+                    result(
+                        method = "adjacency",
+                        scores = data[, .(gene, score)],
+                        details = list(data = data)
                     )
-                ]
-
-                progress(0.5)
-
-                # Exclude the reference gene itself when computing its
-                # difference.
-                data[
-                    gene %chin% reference_gene_ids,
-                    difference := compute_difference(
-                        densest_distance,
-                        reference_gene_ids[reference_gene_ids != gene]
-                    ),
-                    by = gene
-                ]
-
-                # Compute the final score by normalizing the difference.
-                data[, score := 1 - difference / max(difference)]
-
-                progress(1.0)
-
-                result(
-                    method = "adjacency",
-                    scores = data[, .(gene, score)],
-                    details = list(data = data)
-                )
-            })
+                }
+            )
         }
     )
 }
diff --git a/man/adjacency.Rd b/man/adjacency.Rd
index 45cc6e0..742123b 100644
--- a/man/adjacency.Rd
+++ b/man/adjacency.Rd
@@ -4,13 +4,15 @@
 \alias{adjacency}
 \title{Score genes based on their proximity to the reference genes.}
 \usage{
-adjacency()
+adjacency(estimate = densest)
+}
+\arguments{
+\item{estimate}{A function that will be used to summarize the distance
+values for each gene. See \code{\link[=densest]{densest()}} for the default implementation.}
 }
 \value{
 An object of class \code{geposan_method}.
 }
 \description{
-This method finds the distance value with the maximum density for each gene
-(i.e. the mode of its estimated distribution). Genes are scored by comparing
-those distance values with the values of the reference genes.
+Score genes based on their proximity to the reference genes.
 }
diff --git a/man/densest.Rd b/man/densest.Rd
new file mode 100644
index 0000000..252c6f1
--- /dev/null
+++ b/man/densest.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/adjacency.R
+\name{densest}
+\alias{densest}
+\title{Find the densest value in the data.}
+\usage{
+densest(data)
+}
+\arguments{
+\item{data}{The input data.}
+}
+\value{
+The densest value of data.
+}
+\description{
+This function assumes that data represents a continuous variable and finds
+a single value with the highest estimated density. This can be used to
+estimate the mode of the data. If there is only one value, that value is
+returned. If multiple density maxima with the same density exist, their mean
+is returned.
+}