#' Score genes based on their adjacency to the reference genes within species. #' #' For each gene and species, the method will first combine the gene's distances #' to the reference genes within that species. Afterwards, the results are #' summarized across species and determine the gene's score. #' #' @param distance_estimate Function for combining the distance differences #' within one species. #' @param summarize Function for summarizing the distance values across species. #' #' @return An object of class `geposan_method`. #' #' @seealso [adjacency()] #' #' @export species_adjacency <- function(distance_estimate = min, summarize = stats::median) { method( id = "species_adjacency", name = "Species adj.", description = "Species adjacency", function(preset, progress) { species_ids <- preset$species_ids gene_ids <- preset$gene_ids reference_gene_ids <- preset$reference_gene_ids cached( "species_adjacency", c( species_ids, gene_ids, reference_gene_ids, distance_estimate, summarize ), { # nolint # Prefilter distances. data <- geposan::distances[ species %chin% species_ids & gene %chin% gene_ids ] progress_state <- 0.0 progress_step <- 0.9 / length(species_ids) # Iterate through all species and find the distance # estimates within that species. for (species_id in species_ids) { # For all genes, compute the distance to one reference # gene at a time in one go. for (reference_gene_id in reference_gene_ids) { comparison_distance <- data[ species == species_id & gene == reference_gene_id, distance ] column <- quote(reference_gene_id) if (length(comparison_distance) != 1) { # If we don't have a comparison distance, we # can't compute a difference. This happens, if # the species doesn't have the reference gene. data[ species == species_id & gene %chin% gene_ids, eval(column) := NA_integer_ ] } else { data[ species == species_id & gene %chin% gene_ids, eval(column) := abs(distance - comparison_distance) ] } } # Combine the distances to the different reference genes # into one value using the provided function. data[ species == species_id & gene %chin% gene_ids, combined_distance := as.numeric( distance_estimate(na.omit( # Convert the data.table subset into a # vector to get the correct na.omit # behavior. as.matrix(.SD)[1, ] )) ), .SDcols = reference_gene_ids, by = gene ] progress_state <- progress_state + progress_step progress(progress_state) } progress(0.9) # Remove the distances between the reference genes. for (reference_gene_id in reference_gene_ids) { column <- quote(reference_gene_id) data[gene == reference_gene_id, eval(column) := NA] } # Recompute the combined distance for the reference genes. data[ gene %chin% reference_gene_ids, combined_distance := as.numeric( distance_estimate(na.omit(as.matrix(.SD)[1, ])) ), .SDcols = reference_gene_ids, by = list(species, gene) ] # Combine the distances into one value. results <- data[, .( summarized_distances = as.numeric( summarize(na.omit(combined_distance)) ) ), by = gene ] # Compute the final score by normalizing the difference. results[ , score := 1 - summarized_distances / max(summarized_distances) ] progress(1.0) result( method = "species_adjacency", scores = results[, .(gene, score)], details = list( data = data, results = results ) ) } ) } ) }