geposan/R/adjacency.R

#' Estimate the mode of continuous data.
#'
#' The data is treated as a sample of a continuous variable. A density
#' estimate is computed using [stats::density()] and the position at which the
#' estimated density is highest is reported. Inputs that are too short for a
#' density estimate are handled separately: empty input yields an empty
#' numeric vector and a single value is returned as is (converted to numeric).
#' Should several positions share the maximum density, their mean is returned.
#'
#' @param data The input data.
#'
#' @return The densest value of data as a numeric value, or an empty numeric
#'   vector if `data` is empty.
#'
#' @export
densest <- function(data) {
    n <- length(data)

    # Nothing to estimate from.
    if (n == 0) {
        return(numeric(0))
    }

    # A single observation is its own mode.
    if (n == 1) {
        return(as.numeric(data))
    }

    estimate <- stats::density(data)
    peak <- max(estimate$y)

    # Average all grid positions that reach the peak density.
    as.numeric(mean(estimate$x[estimate$y == peak]))
}

#' Score genes by how close they are to the reference genes.
#'
#' For every gene, the available distance values are first condensed into a
#' single estimate using `distance_estimate`. The absolute differences between
#' that estimate and the estimates of the reference genes are then combined
#' using `summarize`. Finally, the combined differences are divided by their
#' maximum and subtracted from one to obtain the score.
#'
#' @param distance_estimate A function that will be used to summarize the
#'   distance values for each gene. See [densest()] for the default
#'   implementation.
#' @param summarize A function that will be used to combine the different
#'   distances to the reference genes. By default [min()] is used. That means
#'   the distance to the nearest reference gene will be scored.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [species_adjacency()]
#'
#' @export
adjacency <- function(distance_estimate = densest, summarize = min) {
    # The scoring routine that is wrapped into the method object below.
    compute_scores <- function(preset, progress) {
        species_ids <- preset$species_ids
        gene_ids <- preset$gene_ids
        reference_gene_ids <- preset$reference_gene_ids

        # Everything that influences the result takes part in the cache key.
        cache_inputs <- c(
            species_ids,
            gene_ids,
            reference_gene_ids,
            distance_estimate,
            summarize
        )

        cached("adjacency", cache_inputs, { # nolint
            # Restrict the distances to the requested species and genes and
            # condense each gene's values into one estimate.
            estimates <- geposan::distances[
                species %chin% species_ids & gene %chin% gene_ids,
                .(distance = as.numeric(distance_estimate(distance))),
                by = gene
            ]

            # Summarize the absolute differences between one estimated
            # distance and the estimated distances of the given genes.
            summarized_difference <- function(value, others) {
                reference_distances <- estimates[gene %chin% others, distance]
                summarize(abs(value - reference_distances))
            }

            # Non-reference genes are compared to all reference genes.
            estimates[
                !gene %chin% reference_gene_ids,
                difference := summarized_difference(
                    distance,
                    reference_gene_ids
                ),
                by = gene
            ]

            progress(0.5)

            # Reference genes are compared to all reference genes but
            # themselves.
            # NOTE(review): with a single reference gene the comparison set
            # is empty; the default `min` then yields `Inf` (with a warning),
            # which also ends up in the normalization below. Confirm whether
            # presets guarantee at least two reference genes.
            estimates[
                gene %chin% reference_gene_ids,
                difference := summarized_difference(
                    distance,
                    reference_gene_ids[reference_gene_ids != gene]
                ),
                by = gene
            ]

            # Normalize by the largest difference so that smaller differences
            # result in higher scores.
            estimates[, score := 1 - difference / max(difference)]

            progress(1.0)

            result(
                method = "adjacency",
                scores = estimates[, .(gene, score)],
                details = list(data = estimates)
            )
        })
    }

    method(
        id = "adjacency",
        name = "Adjacency",
        description = "Adjacency to reference genes",
        compute_scores
    )
}
adjacency: Revert to density estimate This reverts commit 23bb499d3a38a62cf8a7111cd0c2b5bc2784064c. 2022-01-13 18:35:02 +01:00			`#' Find the densest value in the data.`
			`#'`
			`#' This function assumes that data represents a continuous variable and finds`
			`#' a single value with the highest estimated density. This can be used to`
			`#' estimate the mode of the data. If there is only one value that value is`
			`#' returned. If multiple density maxima with the same density exist, their mean`
			`#' is returned.`
			`#'`
			`#' @param data The input data.`
			`#'`
			`#' @return The densest value of data.`
			`#'`
			`#' @export`
			`densest <- function(data) {`
			`as.numeric(if (length(data) <= 0) {`
			`NULL`
			`} else if (length(data) == 1) {`
			`data`
			`} else {`
			`density <- stats::density(data)`
			`mean(density$x[density$y == max(density$y)])`
			`})`
			`}`

Restructure classes and their responsibilities 2021-12-16 13:01:44 +01:00			`#' Score genes based on their proximity to the reference genes.`
			`#'`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`#' In this case, the distance data that is available for one gene is first`
			`#' combined. The resulting value is compared to the reference genes and`
			`#' determines the gene's score in relation to other genes.`
			`#'`
			`#' @param distance_estimate A function that will be used to summarize the`
			`#' distance values for each gene. See [densest()] for the default`
			`#' implementation.`
			`#' @param summarize A function that will be used to combine the different`
adjacency: Use minimum difference in distances 2022-01-09 20:26:42 +01:00			`#' distances to the reference genes. By default [min()] is used. That means`
			`#' the distance to the nearest reference gene will be scored.`
Restructure classes and their responsibilities 2021-12-16 13:01:44 +01:00			`#'`
			#' @return An object of class `geposan_method`.
			`#'`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`#' @seealso [species_adjacency()]`
			`#'`
Restructure classes and their responsibilities 2021-12-16 13:01:44 +01:00			`#' @export`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`adjacency <- function(distance_estimate = densest, summarize = min) {`
Restructure classes and their responsibilities 2021-12-16 13:01:44 +01:00			`method(`
			`id = "adjacency",`
			`name = "Adjacency",`
			`description = "Adjacency to reference genes",`
			`function(preset, progress) {`
			`species_ids <- preset$species_ids`
			`gene_ids <- preset$gene_ids`
			`reference_gene_ids <- preset$reference_gene_ids`
Add new method adjacency 2021-11-25 20:55:11 +01:00
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`cached(`
			`"adjacency",`
adjacency: Use minimum difference in distances 2022-01-09 20:26:42 +01:00			`c(`
			`species_ids,`
			`gene_ids,`
			`reference_gene_ids,`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`distance_estimate,`
			`summarize`
adjacency: Use minimum difference in distances 2022-01-09 20:26:42 +01:00			`),`
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`{ # nolint`
			`# Filter distances by species and gene and summarize each`
			`# gene's distance values using the estimation function.`
			`data <- geposan::distances[`
			`species %chin% species_ids & gene %chin% gene_ids,`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`.(distance = as.numeric(distance_estimate(distance))),`
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`by = gene`
Restructure classes and their responsibilities 2021-12-16 13:01:44 +01:00			`]`
Add new method adjacency 2021-11-25 20:55:11 +01:00
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`# Compute the absolute value of the difference between the`
			`# estimated distances of each gene to the reference genes.`
adjacency: Use minimum difference in distances 2022-01-09 20:26:42 +01:00			`compute_difference <- function(distance_value,`
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`comparison_ids) {`
adjacency: Use minimum difference in distances 2022-01-09 20:26:42 +01:00			`differences <- data[`
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`gene %chin% comparison_ids,`
adjacency: Use minimum difference in distances 2022-01-09 20:26:42 +01:00			`.(difference = abs(distance_value - distance))`
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`]`
Add new method adjacency 2021-11-25 20:55:11 +01:00
Add species adjacency method 2022-01-17 20:11:07 +01:00			`summarize(differences$difference)`
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`}`
Add new method adjacency 2021-11-25 20:55:11 +01:00
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`# Compute the differences to the reference genes.`
			`data[`
			`!gene %chin% reference_gene_ids,`
			`difference := compute_difference(`
			`distance,`
			`reference_gene_ids`
adjacency: Use minimum difference in distances 2022-01-09 20:26:42 +01:00			`),`
			`by = gene`
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`]`
Add new method adjacency 2021-11-25 20:55:11 +01:00
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`progress(0.5)`
Add new method adjacency 2021-11-25 20:55:11 +01:00
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`# Exclude the reference gene itself when computing its`
			`# difference.`
			`data[`
			`gene %chin% reference_gene_ids,`
			`difference := compute_difference(`
			`distance,`
			`reference_gene_ids[reference_gene_ids != gene]`
			`),`
			`by = gene`
			`]`

			`# Compute the final score by normalizing the difference.`
			`data[, score := 1 - difference / max(difference)]`
Add new method adjacency 2021-11-25 20:55:11 +01:00
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`progress(1.0)`
Add new method adjacency 2021-11-25 20:55:11 +01:00
adjacency: Make distance estimation customizable 2022-01-09 20:21:27 +01:00			`result(`
			`method = "adjacency",`
			`scores = data[, .(gene, score)],`
			`details = list(data = data)`
			`)`
			`}`
			`)`
Restructure classes and their responsibilities 2021-12-16 13:01:44 +01:00			`}`
			`)`
Add new method adjacency 2021-11-25 20:55:11 +01:00			`}`