Add drug plots

2025-10-26 19:57:24 +01:00 · 2025-02-16 10:36:54 +01:00 · 2025-02-16 10:36:54 +01:00 · 785b748ba4
commit 785b748ba4
parent cf8e9e79d5
8 changed files with 365 additions and 185 deletions
--- a/scripts/drugs_input.R
+++ b/scripts/drugs_input.R
@ -0,0 +1,103 @@
+library(data.table)
+library(here)
+
+i_am("scripts/drugs_input.R")
+
+# Source: PubChem ID exchange based on CMap drug identifiers.
+drugs_cmap_pubchem <- fread(here("scripts/input/drugs_cmap_pubchem.tsv"))
+drugs_cmap_pubchem <- na.omit(drugs_cmap_pubchem)
+
+# Source: UniChem ID mapping
+drugs_chembl_pubchem <- fread(here("scripts/input/drugs_chembl_pubchem.tsv"))
+
+# Source: ChEMBL SQLite database
+# SELECT DISTINCT
+#   chembl_id,
+#   synonyms AS name,
+#   mesh_heading AS indication,
+#   mechanism_of_action
+# FROM molecule_dictionary
+#   LEFT JOIN drug_indication
+#     ON molecule_dictionary.molregno = drug_indication.molregno
+#   LEFT JOIN drug_mechanism
+#     ON molecule_dictionary.molregno = drug_mechanism.molregno
+#   LEFT JOIN (
+#       SELECT molregno, synonyms FROM molecule_synonyms WHERE syn_type == 'INN'
+#     ) AS molecule_synonyms
+#     ON molecule_dictionary.molregno = molecule_synonyms.molregno
+#   WHERE name IS NOT NULL
+#     OR indication IS NOT NULL
+#     OR mechanism_of_action IS NOT NULL;
+drugs_chembl <- fread(here("scripts/input/drugs_chembl.csv"))
+
+# Source: PubChem ID list upload based on identifiers converted from CMap
+# drug names using the PubChem ID exchange.
+drugs_pubchem <- fread(here("scripts/input/drugs_pubchem.csv"))
+
+drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, annotation)]
+drugs_pubchem <- unique(drugs_pubchem, by = "cid")
+drugs_pubchem <- drugs_pubchem[,
+  .(
+    cmpdname,
+    annotation = strsplit(annotation, "|", fixed = TRUE) |> unlist()
+  ),
+  by = cid
+]
+
+# Filter for WHO ATC annotations
+drugs_pubchem <- drugs_pubchem[stringr::str_detect(annotation, "^[A-Z] - ")]
+
+# Extract ATC levels
+
+drugs_pubchem[, atc_1 := stringr::str_match(
+  annotation,
+  "^[A-Z] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+drugs_pubchem[, atc_2 := stringr::str_match(
+  annotation,
+  "> [A-Z][0-9][0-9] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+drugs_pubchem[, atc_3 := stringr::str_match(
+  annotation,
+  "> [A-Z][0-9][0-9][A-Z] - ([^>]*)"
+)[, 2] |> stringr::str_trim()]
+
+drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, atc_1, atc_2, atc_3)]
+setnames(drugs_pubchem, c("cid", "cmpdname"), c("pubchem_cid", "pubchem_name"))
+
+drugs <- merge(
+  drugs_cmap_pubchem,
+  drugs_chembl_pubchem,
+  by = "pubchem_cid",
+  all.x = TRUE
+)
+
+drugs <- merge(
+  drugs,
+  drugs_chembl,
+  by = "chembl_id",
+  all.x = TRUE
+)
+
+drugs <- merge(
+  drugs,
+  drugs_pubchem,
+  by = "pubchem_cid",
+  all.x = TRUE,
+  allow.cartesian = TRUE
+)
+
+# Prefer INN name, then PubChem, then CMap:
+drugs[name == "", name := NA]
+drugs[is.na(name), name := pubchem_name]
+drugs[name == "", name := NA]
+drugs[is.na(name), name := stringr::str_to_sentence(drug)]
+drugs[, pubchem_name := NULL]
+
+# Clean up empty values:
+drugs[indication == "", indication := NA]
+drugs[mechanism_of_action == "", mechanism_of_action := NA]
+
+fwrite(drugs, file = here("scripts/output/drugs.csv"))