From 7a21e15c0e0b41656e2ed222ea1e86463de92430 Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Sun, 1 Dec 2024 17:04:19 +0100
Subject: [PATCH] scripts: Add normal GSEA

---
 scripts/gsea.R         | 187 +++++++++++++++++++++++++++++++----------
 scripts/sliding_gsea.R |  59 +++++++++++++
 2 files changed, 201 insertions(+), 45 deletions(-)
 create mode 100644 scripts/sliding_gsea.R
diff --git a/scripts/gsea.R b/scripts/gsea.R
index 59bd8fc..80d04c4 100644
--- a/scripts/gsea.R
+++ b/scripts/gsea.R
@@ -1,59 +1,156 @@
-# This script performs a gene set enrichment analysis (GSEA) across the whole
-# ranking of ubiquituos genes using g:Profiler.
-
-# Size of each gene bucket. The GSEA is done once for each bucket within the
-# ranking.
-bucket_size <- 500
-
 library(data.table)
 library(here)
 
 i_am("scripts/gsea.R")
-file_path <- here("scripts", "output", "ubigen_gsea.Rds")
-image_path <- here("scripts", "output", "ubigen_gsea.svg")
 
-# The result will be saved in `file_path` to avoid unnecessary API calls.
-result <- if (file.exists(file_path)) {
-  readRDS(file_path)
-} else {
-  data <- copy(ubigen::genes)
-  data[, bucket := ceiling(rank / bucket_size)]
+ranking_gtex <- ubigen::rank_genes(ubigen::gtex_all)
+ranking_cmap <- ubigen::rank_genes(ubigen::cmap)
 
-  result <- data[, .(analysis = list(gprofiler2::gost(gene))), by = bucket]
-  saveRDS(result, file = file_path)
+data <- merge(
+  ranking_gtex[, .(gene, score, percentile)],
+  ranking_cmap[, .(gene, score, percentile)],
+  by = "gene",
+  suffixes = c(x = "_gtex", y = "_cmap")
+)
 
-  result
+data[, score := score_gtex * score_cmap]
+setorder(data, -score)
+data[, percentile := (.N - .I) / .N]
+
+gsea_1_0 <- gprofiler2::gost(
+  data[percentile_gtex >= 0.95 & percentile_cmap < 0.95, gene],
+  domain_scope = "custom_annotated",
+  custom_bg = data[, gene]
+)
+
+gsea_1_1 <- gprofiler2::gost(
+  data[percentile_gtex >= 0.95 & percentile_cmap >= 0.95, gene],
+  domain_scope = "custom_annotated",
+  custom_bg = data[, gene]
+)
+
+# This code is based on gostplot.R from the gprofiler2 package.
+
+gsea_sources <- c(
+  "GO:MF",
+  "GO:BP",
+  "GO:CC",
+  "KEGG",
+  "REAC",
+  "WP",
+  "TF",
+  "MIRNA",
+  "HPA",
+  "CORUM",
+  "HP"
+)
+
+gsea_source_colors <- data.table(
+  source = gsea_sources,
+  color = c(
+    "#dc3912",
+    "#ff9900",
+    "#109618",
+    "#dd4477",
+    "#3366cc",
+    "#0099c6",
+    "#5574a6",
+    "#22aa99",
+    "#6633cc",
+    "#66aa00",
+    "#990099"
+  )
+)
+
+lerp <- function(x) {
+  (x - min(x)) / (max(x) - min(x))
 }
 
-result[, result := lapply(analysis, function(a) a$result)]
-result <- result[, rbindlist(result), by = bucket]
+gsea_plot <- function(
+    gsea_result,
+    sources = c("GO:MF", "GO:BP", "GO:CC", "KEGG", "REAC", "WP", "TF", "HP")) {
+  source_data <- gsea_source_colors[source %chin% sources]
 
-data <- result[, .(count = .N), by = c("bucket", "source")]
-data[, total := sum(count), by = bucket]
+  source_data[,
+    width := gsea_result$meta$result_metadata[[source]]$number_of_terms,
+    by = source
+  ]
 
-smooth_model <- loess(total ~ bucket, data, span = 0.3)
-bucket_smoothed <- seq(1, nrow(data), 0.1)
-total_smoothed <- predict(smooth_model, bucket_smoothed)
+  source_data[seq_len(.N - 1), width := width + 2000]
+  source_data[, source_x := cumsum(width) - width]
+  source_data[, source_center := source_x + width / 2]
 
-fig <- plotly::plot_ly(data) |>
-  plotly::add_bars(
-    x = ~bucket,
-    y = ~count,
-    color = ~source
-  ) |>
-  plotly::add_lines(
-    x = bucket_smoothed,
-    y = total_smoothed,
-    name = "All (smoothed)"
-  ) |>
-  plotly::layout(
-    xaxis = list(title = glue::glue("Bucket of genes (n = {bucket_size})")),
-    yaxis = list(title = "Number of associated terms"),
-    barmode = "stack",
-    legend = list(title = list(text = "<b>Source of term</b>"))
-  )
+  data <- gsea_result$result |> as.data.table()
+  data <- merge(data, source_data, by = "source")
+  data[, x := source_x + source_order]
+  data[, y := -log10(p_value)]
+  data[y > 16, y := 17]
 
-plotly::save_image(fig, image_path, width = 1200, height = 800)
+  plotly::plot_ly() |>
+    plotly::add_markers(
+      data = data,
+      x = ~x,
+      y = ~y,
+      text = ~term_name,
+      marker = list(
+        size = ~ 4 + 6 * lerp(term_size),
+        color = ~color,
+        line = list(width = 0)
+      ),
+      cliponaxis = FALSE
+    ) |>
+    plotly::layout(
+      xaxis = list(
+        title = "",
+        range = c(0, source_data[.N, source_x + width]),
+        tickmode = "array",
+        tickvals = source_data[, source_center],
+        ticktext = source_data[, source],
+        showgrid = FALSE,
+        zeroline = FALSE
+      ),
+      yaxis = list(
+        title = "−log₁₀(p)",
+        range = c(0, 18),
+        tickmode = "array",
+        tickvals = c(2, 4, 6, 8, 10, 12, 14, 16),
+        ticktext = c("2", "4", "6", "8", "10", "12", "14", "≥ 16")
+      ),
+      font = list(size = 8),
+      margin = list(
+        pad = 2,
+        l = 0,
+        r = 0,
+        t = 0,
+        b = 0
+      )
+    )
+}
 
-gsea_plot_ranking <- fig
-usethis::use_data(gsea_plot_ranking, internal = TRUE, overwrite = TRUE)
+fig_gsea_1_0 <- gsea_plot(gsea_1_0)
+fig_gsea_1_1 <- gsea_plot(gsea_1_1)
+
+# Plotly specifies all sizes in pixels, including font size. Because of
+# that, we can actually think of these pixels as points. One point is defined as
+# 1/72 inch and SVG uses 96 DPI as the standard resolution.
+#
+# 1 plotly pixel = 1 point = 1/72 inch = 1 1/3 actual pixels
+#
+# So, we specify width and height in points (= plotly pixels) and scale up the
+# image by 96/72 to convert everything from points to pixels at 96 DPI.
+
+plotly::save_image(
+  fig_gsea_1_0,
+  file = here("scripts/output/gsea_1_0.svg"),
+  width = 6.27 * 72,
+  height = 3.135 * 72,
+  scale = 96 / 72
+)
+
+plotly::save_image(
+  fig_gsea_1_1,
+  file = here("scripts/output/gsea_1_1.svg"),
+  width = 6.27 * 72,
+  height = 3.135 * 72,
+  scale = 96 / 72
+)
diff --git a/scripts/sliding_gsea.R b/scripts/sliding_gsea.R
new file mode 100644
index 0000000..59bd8fc
--- /dev/null
+++ b/scripts/sliding_gsea.R
@@ -0,0 +1,59 @@
+# This script performs a gene set enrichment analysis (GSEA) across the whole
+# ranking of ubiquituos genes using g:Profiler.
+
+# Size of each gene bucket. The GSEA is done once for each bucket within the
+# ranking.
+bucket_size <- 500
+
+library(data.table)
+library(here)
+
+i_am("scripts/gsea.R")
+file_path <- here("scripts", "output", "ubigen_gsea.Rds")
+image_path <- here("scripts", "output", "ubigen_gsea.svg")
+
+# The result will be saved in `file_path` to avoid unnecessary API calls.
+result <- if (file.exists(file_path)) {
+  readRDS(file_path)
+} else {
+  data <- copy(ubigen::genes)
+  data[, bucket := ceiling(rank / bucket_size)]
+
+  result <- data[, .(analysis = list(gprofiler2::gost(gene))), by = bucket]
+  saveRDS(result, file = file_path)
+
+  result
+}
+
+result[, result := lapply(analysis, function(a) a$result)]
+result <- result[, rbindlist(result), by = bucket]
+
+data <- result[, .(count = .N), by = c("bucket", "source")]
+data[, total := sum(count), by = bucket]
+
+smooth_model <- loess(total ~ bucket, data, span = 0.3)
+bucket_smoothed <- seq(1, nrow(data), 0.1)
+total_smoothed <- predict(smooth_model, bucket_smoothed)
+
+fig <- plotly::plot_ly(data) |>
+  plotly::add_bars(
+    x = ~bucket,
+    y = ~count,
+    color = ~source
+  ) |>
+  plotly::add_lines(
+    x = bucket_smoothed,
+    y = total_smoothed,
+    name = "All (smoothed)"
+  ) |>
+  plotly::layout(
+    xaxis = list(title = glue::glue("Bucket of genes (n = {bucket_size})")),
+    yaxis = list(title = "Number of associated terms"),
+    barmode = "stack",
+    legend = list(title = list(text = "<b>Source of term</b>"))
+  )
+
+plotly::save_image(fig, image_path, width = 1200, height = 800)
+
+gsea_plot_ranking <- fig
+usethis::use_data(gsea_plot_ranking, internal = TRUE, overwrite = TRUE)