# ubigen/scripts/drugs_input.R

library(data.table)
library(here)

# Declare this script's location relative to the project root so that the
# `here()` calls below resolve against that root.
i_am("scripts/drugs_input.R")

# CMap drug identifiers converted with the PubChem ID exchange. Rows containing
# a missing value in any column are discarded right away.
drugs_cmap_pubchem <-
  fread(here("scripts/input/drugs_cmap_pubchem.tsv")) |>
  na.omit()

# ChEMBL <-> PubChem identifier mapping exported from UniChem.
drugs_chembl_pubchem <-
  fread(here("scripts/input/drugs_chembl_pubchem.tsv"))

# Drug names, indications and mechanisms of action exported from the ChEMBL
# SQLite database using the following query:
#
# SELECT DISTINCT
#   chembl_id,
#   synonyms AS name,
#   mesh_heading AS indication,
#   mechanism_of_action
# FROM molecule_dictionary
#   LEFT JOIN drug_indication
#     ON molecule_dictionary.molregno = drug_indication.molregno
#   LEFT JOIN drug_mechanism
#     ON molecule_dictionary.molregno = drug_mechanism.molregno
#   LEFT JOIN (
#       SELECT molregno, synonyms FROM molecule_synonyms WHERE syn_type == 'INN'
#     ) AS molecule_synonyms
#     ON molecule_dictionary.molregno = molecule_synonyms.molregno
#   WHERE name IS NOT NULL
#     OR indication IS NOT NULL
#     OR mechanism_of_action IS NOT NULL;
drugs_chembl <-
  fread(here("scripts/input/drugs_chembl.csv"))

# PubChem compound table obtained via the PubChem ID list upload. The uploaded
# identifiers were converted from CMap drug names using the PubChem ID
# exchange.
drugs_pubchem <-
  fread(here("scripts/input/drugs_pubchem.csv"))

# Build the PubChem lookup table that is later joined onto the drugs:
#   - one row per (compound, WHO ATC classification) for annotated compounds,
#   - exactly one row with missing ATC levels for every other compound.
# Resulting columns: pubchem_cid, pubchem_name, atc_1, atc_2, atc_3.

drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, annotation)]
drugs_pubchem <- unique(drugs_pubchem, by = "cid")

# Keep the name of *every* compound. Filtering the annotations below would
# otherwise also discard the names of all compounds without an ATC
# annotation, so the PubChem name could never serve as a fallback for them.
drugs_pubchem_names <- drugs_pubchem[, .(cid, cmpdname)]

# The annotation column holds several "|"-separated entries per compound;
# expand it to one row per entry. Compounds whose annotation splits into
# nothing simply contribute no rows here.
drugs_pubchem_atc <- drugs_pubchem[,
  .(annotation = strsplit(annotation, "|", fixed = TRUE) |> unlist()),
  by = cid
]

# Filter for WHO ATC annotations, which start with a single capital letter
# followed by " - ". Missing annotations do not match and are dropped.
drugs_pubchem_atc <- drugs_pubchem_atc[
  stringr::str_detect(annotation, "^[A-Z] - ")
]

# Extract ATC levels. Each level is the text after its code up to the next
# ">" separator; levels that are absent from an annotation become NA.

drugs_pubchem_atc[, atc_1 := stringr::str_match(
  annotation,
  "^[A-Z] - ([^>]*)"
)[, 2] |> stringr::str_trim()]

drugs_pubchem_atc[, atc_2 := stringr::str_match(
  annotation,
  "> [A-Z][0-9][0-9] - ([^>]*)"
)[, 2] |> stringr::str_trim()]

drugs_pubchem_atc[, atc_3 := stringr::str_match(
  annotation,
  "> [A-Z][0-9][0-9][A-Z] - ([^>]*)"
)[, 2] |> stringr::str_trim()]

drugs_pubchem_atc <- drugs_pubchem_atc[, .(cid, atc_1, atc_2, atc_3)]

# Left join: compounds without any ATC annotation are retained as a single
# row with NA in all ATC columns, so their name stays available downstream.
drugs_pubchem <- merge(
  drugs_pubchem_names,
  drugs_pubchem_atc,
  by = "cid",
  all.x = TRUE
)

setnames(drugs_pubchem, c("cid", "cmpdname"), c("pubchem_cid", "pubchem_name"))

# Assemble the combined drug table as a chain of left joins, so every row of
# the CMap/PubChem mapping is retained even if a later source has no match:
#   1. add ChEMBL identifiers via the shared PubChem CID,
#   2. add the ChEMBL export (name, indication, mechanism) via the ChEMBL ID,
#   3. add the PubChem name and ATC levels via the PubChem CID.
# The last join explicitly permits the result to grow beyond data.table's
# cartesian-join safety limit.
drugs <- drugs_cmap_pubchem |>
  merge(drugs_chembl_pubchem, by = "pubchem_cid", all.x = TRUE) |>
  merge(drugs_chembl, by = "chembl_id", all.x = TRUE) |>
  merge(
    drugs_pubchem,
    by = "pubchem_cid",
    all.x = TRUE,
    allow.cartesian = TRUE
  )

# Name priority: INN name first, PubChem name second, CMap name last.
# A name that is missing or blank is replaced by the next candidate in line;
# the CMap drug identifier is converted to sentence case when it is used.
drugs[is.na(name) | name == "", name := pubchem_name]
drugs[is.na(name) | name == "", name := stringr::str_to_sentence(drug)]

# The PubChem name only served as a fallback and is not part of the output.
drugs[, pubchem_name := NULL]

# Represent blank indications and mechanisms as missing values.
drugs[indication %in% "", indication := NA]
drugs[mechanism_of_action %in% "", mechanism_of_action := NA]

# Write the final table for downstream use.
fwrite(x = drugs, file = here("scripts/output/drugs.csv"))
Add drug plots 2025-02-16 10:36:54 +01:00			`library(data.table)`
			`library(here)`

			`i_am("scripts/drugs_input.R")`

			`# Source: PubChem ID exchange based on CMap drug identifiers.`
			`drugs_cmap_pubchem <- fread(here("scripts/input/drugs_cmap_pubchem.tsv"))`
			`drugs_cmap_pubchem <- na.omit(drugs_cmap_pubchem)`

			`# Source: UniChem ID mapping`
			`drugs_chembl_pubchem <- fread(here("scripts/input/drugs_chembl_pubchem.tsv"))`

			`# Source: ChEMBL SQLite database`
			`# SELECT DISTINCT`
			`# chembl_id,`
			`# synonyms AS name,`
			`# mesh_heading AS indication,`
			`# mechanism_of_action`
			`# FROM molecule_dictionary`
			`# LEFT JOIN drug_indication`
			`# ON molecule_dictionary.molregno = drug_indication.molregno`
			`# LEFT JOIN drug_mechanism`
			`# ON molecule_dictionary.molregno = drug_mechanism.molregno`
			`# LEFT JOIN (`
			`# SELECT molregno, synonyms FROM molecule_synonyms WHERE syn_type == 'INN'`
			`# ) AS molecule_synonyms`
			`# ON molecule_dictionary.molregno = molecule_synonyms.molregno`
			`# WHERE name IS NOT NULL`
			`# OR indication IS NOT NULL`
			`# OR mechanism_of_action IS NOT NULL;`
			`drugs_chembl <- fread(here("scripts/input/drugs_chembl.csv"))`

			`# Source: PubChem ID list upload based on identifiers converted from CMap`
			`# drug names using the PubChem ID exchange.`
			`drugs_pubchem <- fread(here("scripts/input/drugs_pubchem.csv"))`

			`drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, annotation)]`
			`drugs_pubchem <- unique(drugs_pubchem, by = "cid")`
			`drugs_pubchem <- drugs_pubchem[,`
			`.(`
			`cmpdname,`
			`annotation = strsplit(annotation, "\|", fixed = TRUE) \|> unlist()`
			`),`
			`by = cid`
			`]`

			`# Filter for WHO ATC annotations`
			`drugs_pubchem <- drugs_pubchem[stringr::str_detect(annotation, "^[A-Z] - ")]`

			`# Extract ATC levels`

			`drugs_pubchem[, atc_1 := stringr::str_match(`
			`annotation,`
			`"^[A-Z] - ([^>]*)"`
			`)[, 2] \|> stringr::str_trim()]`

			`drugs_pubchem[, atc_2 := stringr::str_match(`
			`annotation,`
			`"> [A-Z][0-9][0-9] - ([^>]*)"`
			`)[, 2] \|> stringr::str_trim()]`

			`drugs_pubchem[, atc_3 := stringr::str_match(`
			`annotation,`
			`"> [A-Z][0-9][0-9][A-Z] - ([^>]*)"`
			`)[, 2] \|> stringr::str_trim()]`

			`drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, atc_1, atc_2, atc_3)]`
			`setnames(drugs_pubchem, c("cid", "cmpdname"), c("pubchem_cid", "pubchem_name"))`

			`drugs <- merge(`
			`drugs_cmap_pubchem,`
			`drugs_chembl_pubchem,`
			`by = "pubchem_cid",`
			`all.x = TRUE`
			`)`

			`drugs <- merge(`
			`drugs,`
			`drugs_chembl,`
			`by = "chembl_id",`
			`all.x = TRUE`
			`)`

			`drugs <- merge(`
			`drugs,`
			`drugs_pubchem,`
			`by = "pubchem_cid",`
			`all.x = TRUE,`
			`allow.cartesian = TRUE`
			`)`

			`# Prefer INN name, then PubChem, then CMap:`
			`drugs[name == "", name := NA]`
			`drugs[is.na(name), name := pubchem_name]`
			`drugs[name == "", name := NA]`
			`drugs[is.na(name), name := stringr::str_to_sentence(drug)]`
			`drugs[, pubchem_name := NULL]`

			`# Clean up empty values:`
			`drugs[indication == "", indication := NA]`
			`drugs[mechanism_of_action == "", mechanism_of_action := NA]`

			`fwrite(drugs, file = here("scripts/output/drugs.csv"))`