geposan/R/method_species_adjacency.R

#' Score genes based on their adjacency to the reference genes within species.
#'
#' For each gene and species, the method first computes the absolute
#' difference between the gene's `distance` value (taken from
#' `geposan::distances`) and the `distance` value of each reference gene
#' within that species. These differences are combined into one value per gene
#' and species using `distance_estimate`. Afterwards, the combined values are
#' summarized across species using `summarize`. The final score is one minus
#' the summarized value divided by the largest summarized value of all genes.
#'
#' The difference of a reference gene to itself is ignored. Genes for which no
#' summarized value could be computed (e.g. the only reference gene, or genes
#' that only occur in species lacking all reference genes) receive a score of
#' `NA`. They do not affect the scores of the other genes.
#'
#' @param distance_estimate Function for combining the distance differences
#'   within one species. It is called with a (possibly empty) vector of the
#'   non-missing differences of one gene to the reference genes.
#' @param summarize Function for summarizing the distance values across
#'   species. It is called with a (possibly empty) vector of the non-missing
#'   combined values of one gene.
#'
#' @return An object of class `geposan_method`.
#'
#' @seealso [adjacency()]
#'
#' @export
species_adjacency <- function(distance_estimate = stats::median,
                              summarize = stats::median) {
  method(
    id = "species_adjacency",
    name = "Species adj.",
    description = "Species adjacency",
    function(preset, progress) {
      species_ids <- preset$species_ids
      gene_ids <- preset$gene_ids
      reference_gene_ids <- preset$reference_gene_ids

      cached(
        "species_adjacency",
        c(
          species_ids,
          gene_ids,
          reference_gene_ids,
          distance_estimate,
          summarize
        ),
        { # nolint
          # Prefilter distances. Subsetting yields a new table, so the
          # in-place updates below do not touch the package data. All
          # remaining rows belong to genes from `gene_ids`, which is why
          # no further filtering by gene is needed later on.
          data <- geposan::distances[
            species %chin% species_ids & gene %chin% gene_ids
          ]

          # The per-species work accounts for 90% of the progress.
          progress_state <- 0.0
          progress_step <- 0.9 / length(species_ids)

          # Iterate through all species and find the distance
          # estimates within that species.
          for (species_id in species_ids) {
            # For all genes, compute the distance to one reference
            # gene at a time in one go. The result is stored in a
            # column named after the reference gene.
            for (reference_gene_id in reference_gene_ids) {
              comparison_distance <- data[
                species == species_id & gene == reference_gene_id,
                distance
              ]

              if (length(comparison_distance) != 1) {
                # If we don't have a unique comparison distance, we
                # can't compute a difference. This happens, if the
                # species doesn't have the reference gene.
                data[
                  species == species_id,
                  (reference_gene_id) := NA_integer_
                ]
              } else {
                data[
                  species == species_id,
                  (reference_gene_id) := abs(distance - comparison_distance)
                ]
              }
            }

            # Combine the distances to the different reference genes
            # into one value using the provided function.
            data[
              species == species_id,
              combined_distance := as.numeric(
                distance_estimate(stats::na.omit(
                  # Convert the data.table subset into a
                  # vector to get the correct na.omit
                  # behavior.
                  as.matrix(.SD)[1, ]
                ))
              ),
              .SDcols = reference_gene_ids,
              by = gene
            ]

            progress_state <- progress_state + progress_step
            progress(progress_state)
          }

          progress(0.9)

          # Remove the distance of each reference gene to itself. It is
          # always zero and would bias the reference genes' scores.
          for (reference_gene_id in reference_gene_ids) {
            data[gene == reference_gene_id, (reference_gene_id) := NA]
          }

          # Recompute the combined distance for the reference genes.
          data[
            gene %chin% reference_gene_ids,
            combined_distance := as.numeric(
              distance_estimate(stats::na.omit(
                as.matrix(.SD)[1, ]
              ))
            ),
            .SDcols = reference_gene_ids,
            by = list(species, gene)
          ]

          # Combine the distances into one value.
          results <- data[,
            .(
              summarized_distances = as.numeric(
                summarize(stats::na.omit(combined_distance))
              )
            ),
            by = gene
          ]

          # Compute the final score by normalizing the difference. Genes
          # without any usable distance have a missing summarized value.
          # Those have to be ignored when determining the maximum,
          # because otherwise the scores of all genes would become NA.
          results[
            ,
            score := 1 - summarized_distances /
              max(summarized_distances, na.rm = TRUE)
          ]

          progress(1.0)

          result(
            method = "species_adjacency",
            scores = results[, .(gene, score)],
            details = list(
              data = data,
              results = results
            )
          )
        }
      )
    }
  )
}
Add species adjacency method 2022-01-17 20:11:07 +01:00			`#' Score genes based on their adjacency to the reference genes within species.`
			`#'`
			`#' For each gene and species, the method will first combine the gene's distances`
			`#' to the reference genes within that species. Afterwards, the results are`
			`#' summarized across species and determine the gene's score.`
			`#'`
			`#' @param distance_estimate Function for combining the distance differences`
			`#' within one species.`
			`#' @param summarize Function for summarizing the distance values across species.`
			`#'`
			#' @return An object of class `geposan_method`.
			`#'`
			`#' @seealso [adjacency()]`
			`#'`
			`#' @export`
Use median by default in adjacency methods 2022-02-24 15:36:07 +01:00			`species_adjacency <- function(distance_estimate = stats::median,`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`summarize = stats::median) {`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`method(`
			`id = "species_adjacency",`
			`name = "Species adj.",`
			`description = "Species adjacency",`
			`function(preset, progress) {`
			`species_ids <- preset$species_ids`
			`gene_ids <- preset$gene_ids`
			`reference_gene_ids <- preset$reference_gene_ids`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`cached(`
			`"species_adjacency",`
			`c(`
			`species_ids,`
			`gene_ids,`
			`reference_gene_ids,`
			`distance_estimate,`
			`summarize`
			`),`
			`{ # nolint`
			`# Prefilter distances.`
			`data <- geposan::distances[`
			`species %chin% species_ids & gene %chin% gene_ids`
			`]`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`progress_state <- 0.0`
			`progress_step <- 0.9 / length(species_ids)`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`# Iterate through all species and find the distance`
			`# estimates within that species.`
			`for (species_id in species_ids) {`
			`# For all genes, compute the distance to one reference`
			`# gene at a time in one go.`
			`for (reference_gene_id in reference_gene_ids) {`
			`comparison_distance <- data[`
			`species == species_id &`
			`gene == reference_gene_id,`
			`distance`
			`]`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`column <- quote(reference_gene_id)`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`if (length(comparison_distance) != 1) {`
			`# If we don't have a comparison distance, we`
			`# can't compute a difference. This happens, if`
			`# the species doesn't have the reference gene.`
			`data[`
			`species == species_id &`
			`gene %chin% gene_ids,`
			`eval(column) := NA_integer_`
			`]`
			`} else {`
			`data[`
			`species == species_id &`
			`gene %chin% gene_ids,`
			`eval(column) :=`
			`abs(distance - comparison_distance)`
			`]`
			`}`
			`}`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`# Combine the distances to the different reference genes`
			`# into one value using the provided function.`
			`data[`
			`species == species_id &`
			`gene %chin% gene_ids,`
			`combined_distance := as.numeric(`
			`distance_estimate(stats::na.omit(`
			`# Convert the data.table subset into a`
			`# vector to get the correct na.omit`
			`# behavior.`
			`as.matrix(.SD)[1, ]`
			`))`
			`),`
			`.SDcols = reference_gene_ids,`
			`by = gene`
			`]`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`progress_state <- progress_state + progress_step`
			`progress(progress_state)`
			`}`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`progress(0.9)`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`# Remove the distances between the reference genes.`
			`for (reference_gene_id in reference_gene_ids) {`
			`column <- quote(reference_gene_id)`
			`data[gene == reference_gene_id, eval(column) := NA]`
			`}`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`# Recompute the combined distance for the reference genes.`
			`data[`
			`gene %chin% reference_gene_ids,`
			`combined_distance := as.numeric(`
			`distance_estimate(stats::na.omit(`
			`as.matrix(.SD)[1, ]`
			`))`
			`),`
			`.SDcols = reference_gene_ids,`
			`by = list(species, gene)`
			`]`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`# Combine the distances into one value.`
			`results <- data[,`
			`.(`
			`summarized_distances = as.numeric(`
			`summarize(stats::na.omit(combined_distance))`
			`)`
			`),`
			`by = gene`
			`]`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`# Compute the final score by normalizing the difference.`
			`results[`
			`,`
			`score := 1 - summarized_distances /`
			`max(summarized_distances)`
			`]`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`progress(1.0)`
Add species adjacency method 2022-01-17 20:11:07 +01:00
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`result(`
			`method = "species_adjacency",`
			`scores = results[, .(gene, score)],`
			`details = list(`
			`data = data,`
			`results = results`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`)`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`)`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`}`
Reindent code to use just two spaces 2022-05-26 12:42:19 +02:00			`)`
			`}`
			`)`
Add species adjacency method 2022-01-17 20:11:07 +01:00			`}`