Add drug plots

2025-10-26 19:57:24 +01:00 · 2025-02-16 10:36:54 +01:00 · 2025-02-16 10:36:54 +01:00 · 785b748ba4
commit 785b748ba4
parent cf8e9e79d5
8 changed files with 365 additions and 185 deletions
--- a/scripts/cmap_drugs_analysis.R
+++ b/scripts/cmap_drugs_analysis.R
@ -1,136 +0,0 @@
-library(data.table)
-library(here)
-
-i_am("scripts/cmap_drugs_analysis.R")
-
-data <- fread(here("scripts/output/cmap_drugs.csv"))
-
-data[, c("drug", "concentration", "cell_line") :=
-  tstrsplit(drug, "_", fixed = TRUE)]
-
-data[, concentration := as.double(concentration)]
-
-data <- data[,
-  .(abs_mean_change = mean(abs(mean_change))),
-  by = .(drug, group)
-]
-
-# Source: PubChem ID list upload based on identifiers converted from CMap
-# drug names using the PubChem ID exchange.
-pubchem_data <- fread(here("scripts/input/pubchem_data.csv"))
-
-pubchem_data <- pubchem_data[, .(cid, cmpdname, annotation)]
-pubchem_data <- unique(pubchem_data, by = "cid")
-pubchem_data <- pubchem_data[,
-  .(
-    cmpdname,
-    annotation = strsplit(annotation, "|", fixed = TRUE) |> unlist()
-  ),
-  by = cid
-]
-
-# Filter for WHO ATC annotations
-pubchem_data <- pubchem_data[stringr::str_detect(annotation, "^[A-Z] - ")]
-
-# Extract ATC levels
-
-pubchem_data[, atc_1 := stringr::str_match(
-  annotation,
-  "^[A-Z] - ([^>]*)"
-)[, 2] |> stringr::str_trim()]
-
-pubchem_data[, atc_2 := stringr::str_match(
-  annotation,
-  "> [A-Z][0-9][0-9] - ([^>]*)"
-)[, 2] |> stringr::str_trim()]
-
-pubchem_data[, atc_3 := stringr::str_match(
-  annotation,
-  "> [A-Z][0-9][0-9][A-Z] - ([^>]*)"
-)[, 2] |> stringr::str_trim()]
-
-# Source: PubChem ID exchange
-drugs_pubchem_mapping <- fread(here("scripts/input/drugs_pubchem.tsv")) |>
-  na.omit()
-
-data <- merge(data, drugs_pubchem_mapping, by = "drug", allow.cartesian = TRUE)
-data <- merge(data, pubchem_data, by = "cid", allow.cartesian = TRUE)
-data[, drug_category := atc_1]
-
-
-# Select top drug categories
-
-results_drug_categories <- data[,
-  .(score = mean(abs_mean_change)),
-  by = .(group, drug_category)
-]
-
-results_drug_categories <- results_drug_categories[,
-  .(mean_score = mean(score)),
-  by = drug_category
-]
-
-setorder(results_drug_categories, -mean_score)
-top_drug_categories <- results_drug_categories[1:7, drug_category]
-drug_categories <- c(top_drug_categories, "Other")
-
-# Merge other drug categories
-
-data[!(drug_category %chin% top_drug_categories), drug_category := "Other"]
-
-# Recompute results with new categories
-
-results <- data[,
-  .(score = mean(abs_mean_change)),
-  by = .(group, drug_category)
-]
-
-group_plots <- list()
-
-for (group_value in results[, unique(group)]) {
-  group_plot <- plotly::plot_ly() |>
-    plotly::add_bars(
-      data = results[group == group_value],
-      x = ~drug_category,
-      y = ~score,
-      color = ~drug_category
-    ) |>
-    plotly::layout(
-      xaxis = list(
-        categoryarray = drug_categories,
-        title = "",
-        showticklabels = FALSE
-      ),
-      yaxis = list(
-        range = c(0.0, 0.03),
-        nticks = 4,
-        title = ""
-      ),
-      font = list(size = 8),
-      margin = list(
-        pad = 2,
-        l = 48,
-        r = 0,
-        t = 0,
-        b = 36
-      )
-    )
-
-  plotly::save_image(
-    group_plot |> plotly::hide_legend(),
-    file = here(glue::glue("scripts/output/drug_categories_{group_value}.svg")),
-    width = 3 * 72,
-    height = 4 * 72,
-    scale = 96 / 72
-  )
-
-  group_plots <- c(group_plots, list(group_plot))
-}
-
-plotly::save_image(
-  group_plot,
-  file = here(glue::glue("scripts/output/drug_categories_legend.svg")),
-  width = 6.27 * 72,
-  height = 6.27 * 72,
-  scale = 96 / 72
-)
--- a/scripts/cmap_drugs_input.R
+++ b/scripts/cmap_drugs_input.R
@ -1,47 +0,0 @@
-library(data.table)
-library(gprofiler2)
-library(here)
-
-i_am("scripts/cmap_drugs_input.R")
-
-# Source: custom
-load(here("scripts", "input", "CMap_20180808.RData"))
-
-data <- CMap$"HT_HG-U133A"
-rm(CMap)
-
-transcripts <- dimnames(data)$transcripts
-genes <- gconvert(
-  transcripts,
-  numeric_ns = "ENTREZGENE_ACC",
-  mthreshold = 1,
-  filter_na = FALSE
-)$target
-dimnames(data)[[1]] <- genes
-
-data_drugs <- as.data.table(data)
-data_drugs <- na.omit(data_drugs)
-data_drugs <- data_drugs[data == "logFoldChange", .(transcripts, drugs, value)]
-
-setnames(
-  data_drugs,
-  c("transcripts", "drugs", "value"),
-  c("gene", "drug", "change")
-)
-
-genes_0_0 <- scan(here("scripts/output/genes_0_0.txt"), what = character())
-genes_0_1 <- scan(here("scripts/output/genes_0_1.txt"), what = character())
-genes_1_0 <- scan(here("scripts/output/genes_1_0.txt"), what = character())
-genes_1_1 <- scan(here("scripts/output/genes_1_1.txt"), what = character())
-
-data_drugs[gene %chin% genes_0_0, group := "genes_0_0"]
-data_drugs[gene %chin% genes_0_1, group := "genes_0_1"]
-data_drugs[gene %chin% genes_1_0, group := "genes_1_0"]
-data_drugs[gene %chin% genes_1_1, group := "genes_1_1"]
-
-data_drugs <- na.omit(data_drugs)
-
-results <- data_drugs[, .(mean_change = mean(change)), by = .(drug, group)]
-fwrite(results, file = here("scripts/output/cmap_drugs.csv"))
-
-write(data_drugs[, unique(drug)], file = here("scripts/output/drugs.txt"))
--- a/scripts/drugs_analysis.R
+++ b/scripts/drugs_analysis.R
@ -0,0 +1,237 @@
+library(data.table)
+library(here)
+
+i_am("scripts/drugs_analysis.R")
+
+drugs_cmap <- fread(here("scripts/output/drugs_cmap.csv"))
+
+# Only keep significant changes
+drugs_cmap <- drugs_cmap[p_value <= 0.05]
+
+# Keep one row per gene and drug, with the most significant change.
+setkey(drugs_cmap, gene, drug, p_value)
+drugs_cmap <- drugs_cmap[
+  rowid(gene, drug) == 1,
+  .(gene, drug, log_fold_change, p_value)
+]
+
+drugs_cmap[, negative_log_10_p := -log10(p_value)]
+
+ranking_data <- fread(here("scripts/output/gsea_vs_cmap_groups.csv"))
+n_ubiquitous <- ranking_data[percentile_gtex >= 0.95, .N]
+n_non_ubiquitous <- ranking_data[percentile_gtex < 0.95, .N]
+data <- merge(drugs_cmap, ranking_data, by = "gene")
+
+drugs <- fread(here("scripts/output/drugs.csv"), na.strings = "")
+data <- merge(data, drugs, by = "drug", all.x = TRUE, allow.cartesian = TRUE)
+
+# Use CMap names as fallback (for drugs not present in drugs.csv above)
+data[is.na(name), name := stringr::str_to_sentence(drug)]
+
+# Figures for single drugs
+
+results_drugs <- unique(data, by = c("drug", "gene"))
+results_drugs[,
+  `:=`(
+    proportion_ubiquitous =
+      .SD[percentile_gtex >= 0.95, .N / n_ubiquitous],
+    proportion_non_ubiquitous =
+      .SD[percentile_gtex < 0.95, .N / n_non_ubiquitous],
+    drug_score_gtex = weighted.mean(score_gtex, abs(log_fold_change)),
+    drug_score_cmap = weighted.mean(score_cmap, abs(log_fold_change))
+  ),
+  by = drug
+]
+
+results_drugs[, bias := proportion_ubiquitous / proportion_non_ubiquitous]
+setorder(results_drugs, -bias)
+
+results_drugs_unique <- unique(results_drugs, by = "drug")
+
+# Exclude some exotic drugs
+results_drugs_unique <- results_drugs_unique[!is.na(indication)]
+
+n_drugs <- nrow(results_drugs_unique)
+selected_drugs <- c(
+  results_drugs_unique[1:10, drug],
+  results_drugs_unique[(n_drugs - 9):n_drugs, drug]
+)
+
+fig_drug_scores_new <- plotly::plot_ly(results_drugs_unique) |>
+  plotly::add_markers(
+    x = ~drug_score_gtex,
+    y = ~drug_score_cmap,
+    text = ~name,
+    marker = list(size = 4)
+  ) |>
+  plotly::layout(
+    xaxis = list(
+      title = "Score based on GTEx (all)"
+    ),
+    yaxis = list(
+      title = "Score based on CMap"
+    )
+  )
+
+# To not overwrite other data:
+load(here("R/sysdata.rda"))
+fig_drug_scores <- fig_drug_scores_new
+
+usethis::use_data(
+  fig_drug_scores,
+  gsea_plot_ranking, # From R/sysdata.rda
+  internal = TRUE,
+  overwrite = TRUE
+)
+
+results_drugs_unique <- results_drugs_unique[drug %chin% selected_drugs]
+
+fig_drugs <- plotly::plot_ly(results_drugs_unique) |>
+  plotly::add_bars(
+    x = ~proportion_ubiquitous,
+    y = ~name
+  ) |>
+  plotly::add_bars(
+    x = ~ -proportion_non_ubiquitous,
+    y = ~name
+  ) |>
+  plotly::layout(
+    xaxis = list(
+      range = c(-0.8, 0.8),
+      title = "Proportion of genes that are influenced significantly",
+      tickformat = ".0%"
+    ),
+    yaxis = list(
+      categoryarray = rev(results_drugs_unique[, name]),
+      title = ""
+    ),
+    barmode = "relative",
+    showlegend = FALSE,
+    font = list(size = 8),
+    margin = list(
+      pad = 2,
+      l = 0,
+      r = 0,
+      t = 0,
+      b = 36
+    )
+  )
+
+# Figure for mechanisms of action
+
+results_moa <- unique(
+  data[!is.na(mechanism_of_action) & mechanism_of_action != "Unknown"],
+  by = c("drug", "gene", "mechanism_of_action")
+)
+
+results_moa <- results_moa[,
+  .(
+    percentile_gtex = percentile_gtex[1],
+    log_fold_change = mean(log_fold_change),
+    score_gtex = mean(score_gtex)
+  ),
+  by = c("mechanism_of_action", "gene")
+]
+
+results_moa[,
+  `:=`(
+    proportion_ubiquitous = .SD[percentile_gtex >= 0.95, .N / n_ubiquitous],
+    proportion_non_ubiquitous =
+      .SD[percentile_gtex < 0.95, .N / n_non_ubiquitous],
+    moa_score = weighted.mean(score_gtex, abs(log_fold_change))
+  ),
+  by = mechanism_of_action
+]
+
+results_moa[, bias := proportion_ubiquitous / proportion_non_ubiquitous]
+setorder(results_moa, -bias)
+
+results_moa_unique <- unique(results_moa, by = "mechanism_of_action")
+n_moa <- nrow(results_moa_unique)
+selected_moas <- c(
+  results_moa_unique[1:10, mechanism_of_action],
+  results_moa_unique[(n_moa - 9):n_moa, mechanism_of_action]
+)
+
+results_moa_unique <-
+  results_moa_unique[mechanism_of_action %chin% selected_moas]
+
+fig_moas <- plotly::plot_ly(results_moa_unique) |>
+  plotly::add_bars(
+    x = ~proportion_ubiquitous,
+    y = ~mechanism_of_action
+  ) |>
+  plotly::add_bars(
+    x = ~ -proportion_non_ubiquitous,
+    y = ~mechanism_of_action
+  ) |>
+  plotly::layout(
+    xaxis = list(
+      range = c(-0.8, 0.8),
+      title = "Proportion of genes that are influenced significantly",
+      tickformat = ".0%"
+    ),
+    yaxis = list(
+      categoryarray = rev(results_moa_unique[, mechanism_of_action]),
+      title = ""
+    ),
+    barmode = "relative",
+    showlegend = FALSE,
+    font = list(size = 8),
+    margin = list(
+      pad = 2,
+      l = 0,
+      r = 0,
+      t = 0,
+      b = 36
+    )
+  )
+
+plotly::save_image(
+  fig_drug_scores |> plotly::layout(
+    font = list(size = 8),
+    margin = list(
+      pad = 2,
+      l = 36,
+      r = 0,
+      t = 0,
+      b = 36
+    )
+  ),
+  file = here("scripts/output/drug_scores.svg"),
+  width = 6.27 * 72,
+  height = 6.27 * 72,
+  scale = 96 / 72
+)
+
+plotly::save_image(
+  fig_drugs,
+  file = here("scripts/output/drugs_labels.svg"),
+  width = 3.135 * 72,
+  height = 6.27 * 72,
+  scale = 96 / 72
+)
+
+plotly::save_image(
+  fig_drugs |> plotly::layout(yaxis = list(showticklabels = FALSE)),
+  file = here("scripts/output/drugs.svg"),
+  width = 3.135 * 72,
+  height = 6.27 * 72,
+  scale = 96 / 72
+)
+
+plotly::save_image(
+  fig_moas,
+  file = here("scripts/output/moas_labels.svg"),
+  width = 3.135 * 72,
+  height = 6.27 * 72,
+  scale = 96 / 72
+)
+
+plotly::save_image(
+  fig_moas |> plotly::layout(yaxis = list(showticklabels = FALSE)),
+  file = here("scripts/output/moas.svg"),
+  width = 3.135 * 72,
+  height = 6.27 * 72,
+  scale = 96 / 72
+)
--- a/scripts/drugs_input.R
+++ b/scripts/drugs_input.R
@ -0,0 +1,103 @@
+library(data.table)
+library(here)
+
+i_am("scripts/drugs_input.R")
+
+# Source: PubChem ID exchange based on CMap drug identifiers.
+drugs_cmap_pubchem <- fread(here("scripts/input/drugs_cmap_pubchem.tsv"))
+drugs_cmap_pubchem <- na.omit(drugs_cmap_pubchem)
+
+# Source: UniChem ID mapping
+drugs_chembl_pubchem <- fread(here("scripts/input/drugs_chembl_pubchem.tsv"))
+
+# Source: ChEMBL SQLite database
+# SELECT DISTINCT
+#   chembl_id,
+#   synonyms AS name,
+#   mesh_heading AS indication,
+#   mechanism_of_action
+# FROM molecule_dictionary
+#   LEFT JOIN drug_indication
+#     ON molecule_dictionary.molregno = drug_indication.molregno
+#   LEFT JOIN drug_mechanism
+#     ON molecule_dictionary.molregno = drug_mechanism.molregno
+#   LEFT JOIN (
+#       SELECT molregno, synonyms FROM molecule_synonyms WHERE syn_type == 'INN'
+#     ) AS molecule_synonyms
+#     ON molecule_dictionary.molregno = molecule_synonyms.molregno
+#   WHERE name IS NOT NULL
+#     OR indication IS NOT NULL
+#     OR mechanism_of_action IS NOT NULL;
+drugs_chembl <- fread(here("scripts/input/drugs_chembl.csv"))
+
+# Source: PubChem ID list upload based on identifiers converted from CMap
+# drug names using the PubChem ID exchange.
+drugs_pubchem <- fread(here("scripts/input/drugs_pubchem.csv"))
+
+drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, annotation)]
+drugs_pubchem <- unique(drugs_pubchem, by = "cid")
+drugs_pubchem <- drugs_pubchem[,
+  .(
+    cmpdname,
+    annotation = strsplit(annotation, "|", fixed = TRUE) |> unlist()
+  ),
+  by = cid
+]
+
+# Filter for WHO ATC annotations
+drugs_pubchem <- drugs_pubchem[stringr::str_detect(annotation, "^[A-Z] - ")]
+
+# Extract ATC levels
+
+drugs_pubchem[, atc_1 := stringr::str_match(
+  annotation,
+  "^[A-Z] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+drugs_pubchem[, atc_2 := stringr::str_match(
+  annotation,
+  "> [A-Z][0-9][0-9] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+drugs_pubchem[, atc_3 := stringr::str_match(
+  annotation,
+  "> [A-Z][0-9][0-9][A-Z] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, atc_1, atc_2, atc_3)]
+setnames(drugs_pubchem, c("cid", "cmpdname"), c("pubchem_cid", "pubchem_name"))
+
+drugs <- merge(
+  drugs_cmap_pubchem,
+  drugs_chembl_pubchem,
+  by = "pubchem_cid",
+  all.x = TRUE
+)
+
+drugs <- merge(
+  drugs,
+  drugs_chembl,
+  by = "chembl_id",
+  all.x = TRUE
+)
+
+drugs <- merge(
+  drugs,
+  drugs_pubchem,
+  by = "pubchem_cid",
+  all.x = TRUE,
+  allow.cartesian = TRUE
+)
+
+# Prefer INN name, then PubChem, then CMap:
+drugs[name == "", name := NA]
+drugs[is.na(name), name := pubchem_name]
+drugs[name == "", name := NA]
+drugs[is.na(name), name := stringr::str_to_sentence(drug)]
+drugs[, pubchem_name := NULL]
+
+# Clean up empty values:
+drugs[indication == "", indication := NA]
+drugs[mechanism_of_action == "", mechanism_of_action := NA]
+
+fwrite(drugs, file = here("scripts/output/drugs.csv"))
--- a/scripts/sliding_gsea_package.R
+++ b/scripts/sliding_gsea_package.R
@ -55,5 +55,13 @@ fig <- plotly::plot_ly(data) |>

 plotly::save_image(fig, image_path, width = 1200, height = 800)

+# To not overwrite other data:
+load(here("R/sysdata.rda"))
 gsea_plot_ranking <- fig
-usethis::use_data(gsea_plot_ranking, internal = TRUE, overwrite = TRUE)
+
+usethis::use_data(
+  gsea_plot_ranking,
+  fig_drug_scores, # From R/sysdata.rda
+  internal = TRUE,
+  overwrite = TRUE
+)