Retrieve input data using biomaRt

2025-10-26 19:27:24 +01:00 · 2021-09-16 00:06:54 +02:00 · 2021-09-16 00:06:54 +02:00 · 1cea6c3631
commit 1cea6c3631
parent 040aabc610
205 changed files with 187 additions and 3296961 deletions
--- a/process.R
+++ b/process.R
@@ -6,28 +6,27 @@ library(rlog)
 #' The return value will be a table containing genes and data to take in
 #' account when regarding them as TPE-OLD candidates.
 #'
-#' @param input Data from [`load_input()`].
+#' @param distances Gene distance data to use.
 #' @param species_ids IDs of species to include in the analysis.
-process_input <- function(input, species_ids) {
-    results <- data.table(gene = input$genes$id)
-
-    gene_ids <- input$genes[, id]
+#' @param gene_ids Genes to include in the computation.
+process_input <- function(distances, species_ids, gene_ids) {
+    results <- data.table(gene = gene_ids)
    gene_count <- length(gene_ids)

    for (i in seq_along(gene_ids)) {
        gene_id <- gene_ids[i]
-        log_info(sprintf("Processing gene %i/%i (%i)", i, gene_count, gene_id))
+        log_info(sprintf("Processing gene %i/%i (%s)", i, gene_count, gene_id))

-        distances <- input$distances[
+        data <- distances[
            species %chin% species_ids & gene == gene_id,
            .(species, distance)
        ]

-        if (distances[, .N] < 12) {
+        if (data[, .N] < 12) {
            next
        }

-        clusters <- hclust(dist(distances[, distance]))
+        clusters <- hclust(dist(data[, distance]))
        clusters_cut <- cutree(clusters, h = 1000000)

        # Find the largest cluster
@@ -36,7 +35,7 @@ process_input <- function(input, species_ids) {
            which.max(tabulate(match(clusters_cut, cluster_indices)))
        ]

-        cluster <- distances[which(clusters_cut == cluster_index)]
+        cluster <- data[which(clusters_cut == cluster_index)]

        results[
            gene == gene_id,