From 8358f7e5e71a59c09fb78e0469ccae8703acc032 Mon Sep 17 00:00:00 2001
From: Elias Projahn <elias@johrpan.de>
Date: Sun, 24 Nov 2024 11:01:50 +0100
Subject: [PATCH] scripts: Add analysis of drug effects

---
 scripts/cmap_drugs_analysis.R | 136 ++++++++++++++++++++++++++++++++++
 scripts/cmap_drugs_input.R    |  47 ++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 scripts/cmap_drugs_analysis.R
 create mode 100644 scripts/cmap_drugs_input.R

diff --git a/scripts/cmap_drugs_analysis.R b/scripts/cmap_drugs_analysis.R
new file mode 100644
index 0000000..b563a13
--- /dev/null
+++ b/scripts/cmap_drugs_analysis.R
@@ -0,0 +1,136 @@
+library(data.table)
+library(here)
+
+i_am("scripts/cmap_drugs_analysis.R")
+
+data <- fread(here("scripts/output/cmap_drugs.csv"))
+
+data[, c("drug", "concentration", "cell_line") :=
+  tstrsplit(drug, "_", fixed = TRUE)]
+
+data[, concentration := as.double(concentration)]
+
+data <- data[,
+  .(abs_mean_change = mean(abs(mean_change))),
+  by = .(drug, group)
+]
+
+# Source: PubChem ID list upload based on identifiers converted from CMap
+# drug names using the PubChem ID exchange.
+pubchem_data <- fread(here("scripts/input/pubchem_data.csv"))
+
+pubchem_data <- pubchem_data[, .(cid, cmpdname, annotation)]
+pubchem_data <- unique(pubchem_data, by = "cid")
+pubchem_data <- pubchem_data[,
+  .(
+    cmpdname,
+    annotation = strsplit(annotation, "|", fixed = TRUE) |> unlist()
+  ),
+  by = cid
+]
+
+# Filter for WHO ATC annotations
+pubchem_data <- pubchem_data[stringr::str_detect(annotation, "^[A-Z] - ")]
+
+# Extract ATC levels
+
+pubchem_data[, atc_1 := stringr::str_match(
+  annotation,
+  "^[A-Z] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+pubchem_data[, atc_2 := stringr::str_match(
+  annotation,
+  "> [A-Z][0-9][0-9] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+pubchem_data[, atc_3 := stringr::str_match(
+  annotation,
+  "> [A-Z][0-9][0-9][A-Z] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+# Source: PubChem ID exchange
+drugs_pubchem_mapping <- fread(here("scripts/input/drugs_pubchem.tsv")) |>
+  na.omit()
+
+data <- merge(data, drugs_pubchem_mapping, by = "drug", allow.cartesian = TRUE)
+data <- merge(data, pubchem_data, by = "cid", allow.cartesian = TRUE)
+data[, drug_category := atc_1]
+
+
+# Select top drug categories
+
+results_drug_categories <- data[,
+  .(score = mean(abs_mean_change)),
+  by = .(group, drug_category)
+]
+
+results_drug_categories <- results_drug_categories[,
+  .(mean_score = mean(score)),
+  by = drug_category
+]
+
+setorder(results_drug_categories, -mean_score)
+top_drug_categories <- results_drug_categories[1:7, drug_category]
+drug_categories <- c(top_drug_categories, "Other")
+
+# Merge other drug categories
+
+data[!(drug_category %chin% top_drug_categories), drug_category := "Other"]
+
+# Recompute results with new categories
+
+results <- data[,
+  .(score = mean(abs_mean_change)),
+  by = .(group, drug_category)
+]
+
+group_plots <- list()
+
+for (group_value in results[, unique(group)]) {
+  group_plot <- plotly::plot_ly() |>
+    plotly::add_bars(
+      data = results[group == group_value],
+      x = ~drug_category,
+      y = ~score,
+      color = ~drug_category
+    ) |>
+    plotly::layout(
+      xaxis = list(
+        categoryarray = drug_categories,
+        title = "",
+        showticklabels = FALSE
+      ),
+      yaxis = list(
+        range = c(0.0, 0.03),
+        nticks = 4,
+        title = ""
+      ),
+      font = list(size = 8),
+      margin = list(
+        pad = 2,
+        l = 48,
+        r = 0,
+        t = 0,
+        b = 36
+      )
+    )
+
+  plotly::save_image(
+    group_plot |> plotly::hide_legend(),
+    file = here(glue::glue("scripts/output/drug_categories_{group_value}.svg")),
+    width = 3 * 72,
+    height = 4 * 72,
+    scale = 96 / 72
+  )
+
+  group_plots <- c(group_plots, list(group_plot))
+}
+
+plotly::save_image(
+  group_plot,
+  file = here(glue::glue("scripts/output/drug_categories_legend.svg")),
+  width = 6.27 * 72,
+  height = 6.27 * 72,
+  scale = 96 / 72
+)
diff --git a/scripts/cmap_drugs_input.R b/scripts/cmap_drugs_input.R
new file mode 100644
index 0000000..2a01ffc
--- /dev/null
+++ b/scripts/cmap_drugs_input.R
@@ -0,0 +1,47 @@
+library(data.table)
+library(gprofiler2)
+library(here)
+
+i_am("scripts/cmap_drugs_input.R")
+
+# Source: custom
+load(here("scripts", "input", "CMap_20180808.RData"))
+
+data <- CMap$"HT_HG-U133A"
+rm(CMap)
+
+transcripts <- dimnames(data)$transcripts
+genes <- gconvert(
+  transcripts,
+  numeric_ns = "ENTREZGENE_ACC",
+  mthreshold = 1,
+  filter_na = FALSE
+)$target
+dimnames(data)[[1]] <- genes
+
+data_drugs <- as.data.table(data)
+data_drugs <- na.omit(data_drugs)
+data_drugs <- data_drugs[data == "logFoldChange", .(transcripts, drugs, value)]
+
+setnames(
+  data_drugs,
+  c("transcripts", "drugs", "value"),
+  c("gene", "drug", "change")
+)
+
+genes_0_0 <- scan(here("scripts/output/genes_0_0.txt"), what = character())
+genes_0_1 <- scan(here("scripts/output/genes_0_1.txt"), what = character())
+genes_1_0 <- scan(here("scripts/output/genes_1_0.txt"), what = character())
+genes_1_1 <- scan(here("scripts/output/genes_1_1.txt"), what = character())
+
+data_drugs[gene %chin% genes_0_0, group := "genes_0_0"]
+data_drugs[gene %chin% genes_0_1, group := "genes_0_1"]
+data_drugs[gene %chin% genes_1_0, group := "genes_1_0"]
+data_drugs[gene %chin% genes_1_1, group := "genes_1_1"]
+
+data_drugs <- na.omit(data_drugs)
+
+results <- data_drugs[, .(mean_change = mean(change)), by = .(drug, group)]
+fwrite(results, file = here("scripts/output/cmap_drugs.csv"))
+
+write(data_drugs[, unique(drug)], file = here("scripts/output/drugs.txt"))