# ubigen/scripts/drugs_input.R

library(data.table)
library(here)

# Declare this script's location so that `here()` resolves paths relative to
# the project root.
i_am("scripts/drugs_input.R")

# Mapping of CMap drug identifiers to PubChem compound IDs.
# Source: PubChem ID exchange based on CMap drug identifiers.
# Rows with any missing value are discarded.
drugs_cmap_pubchem <- here("scripts/input/drugs_cmap_pubchem.tsv") |>
  fread() |>
  na.omit()

# Mapping between ChEMBL and PubChem identifiers.
# Source: UniChem ID mapping
drugs_chembl_pubchem <- here("scripts/input/drugs_chembl_pubchem.tsv") |>
  fread()

# ChEMBL drug annotations (INN name, indication, mechanism of action), one row
# per distinct combination. The file was exported with the query below.
#
# Source: ChEMBL SQLite database
# SELECT DISTINCT
#   chembl_id,
#   synonyms AS name,
#   mesh_heading AS indication,
#   mechanism_of_action
# FROM molecule_dictionary
#   LEFT JOIN drug_indication
#     ON molecule_dictionary.molregno = drug_indication.molregno
#   LEFT JOIN drug_mechanism
#     ON molecule_dictionary.molregno = drug_mechanism.molregno
#   LEFT JOIN (
#       SELECT molregno, synonyms FROM molecule_synonyms WHERE syn_type == 'INN'
#     ) AS molecule_synonyms
#     ON molecule_dictionary.molregno = molecule_synonyms.molregno
#   WHERE name IS NOT NULL
#     OR indication IS NOT NULL
#     OR mechanism_of_action IS NOT NULL;
drugs_chembl <- here("scripts/input/drugs_chembl.csv") |>
  fread()

# PubChem compound records for the CMap drugs.
# Source: PubChem ID list upload based on identifiers converted from CMap
# drug names using the PubChem ID exchange.
drugs_pubchem <- here("scripts/input/drugs_pubchem.csv") |>
  fread()

# Build the PubChem lookup table: one row per compound and WHO ATC
# classification, carrying the PubChem compound name and the names of the
# first three ATC levels.
#
# Compounds WITHOUT any ATC annotation are kept (with missing ATC levels) so
# that their PubChem name remains available as a fallback name later on.
# Filtering the combined name/annotation table directly would drop those
# compounds together with their names.

# Keep only the columns used below; the first record per CID wins.
drugs_pubchem <- drugs_pubchem[, .(cid, cmpdname, annotation)]
drugs_pubchem <- unique(drugs_pubchem, by = "cid")

# Split the "|"-separated annotation field into one row per annotation.
# Only `annotation` is returned per group, so a compound whose annotation
# string is empty simply contributes no rows to this intermediate table.
drugs_pubchem_atc <- drugs_pubchem[,
  .(annotation = strsplit(annotation, "|", fixed = TRUE) |> unlist()),
  by = cid
]

# Filter for WHO ATC annotations. These start with the single-letter level 1
# code followed by " - "; missing annotations do not match and are dropped.
drugs_pubchem_atc <- drugs_pubchem_atc[
  stringr::str_detect(annotation, "^[A-Z] - ")
]

# Extract ATC levels. Levels are separated by ">" within one annotation; each
# pattern captures the text following the level's code up to the next ">".
# Levels that are absent from an annotation yield NA.

drugs_pubchem_atc[, atc_1 := stringr::str_match(
  annotation,
  "^[A-Z] - ([^>]*)"
)[, 2] |> stringr::str_trim()]

drugs_pubchem_atc[, atc_2 := stringr::str_match(
  annotation,
  "> [A-Z][0-9][0-9] - ([^>]*)"
)[, 2] |> stringr::str_trim()]

drugs_pubchem_atc[, atc_3 := stringr::str_match(
  annotation,
  "> [A-Z][0-9][0-9][A-Z] - ([^>]*)"
)[, 2] |> stringr::str_trim()]

# Attach the ATC levels to the full list of compounds. The left join keeps
# every compound: those with several ATC annotations get one row each, those
# with none get a single row with missing ATC levels.
drugs_pubchem <- merge(
  drugs_pubchem[, .(cid, cmpdname)],
  drugs_pubchem_atc[, .(cid, atc_1, atc_2, atc_3)],
  by = "cid",
  all.x = TRUE
)

# The intermediate table is not needed beyond this point.
rm(drugs_pubchem_atc)

setnames(drugs_pubchem, c("cid", "cmpdname"), c("pubchem_cid", "pubchem_name"))

# Assemble the combined drug table by successive left joins, always keeping
# every row accumulated so far:
#   1. CMap/PubChem mapping  + ChEMBL/PubChem mapping (by PubChem CID)
#   2.                       + ChEMBL annotations     (by ChEMBL ID)
#   3.                       + PubChem names and ATC  (by PubChem CID)
# The last join explicitly permits results larger than its inputs, since a
# compound can carry several ATC rows.
drugs <- drugs_cmap_pubchem |>
  merge(drugs_chembl_pubchem, by = "pubchem_cid", all.x = TRUE) |>
  merge(drugs_chembl, by = "chembl_id", all.x = TRUE) |>
  merge(
    drugs_pubchem,
    by = "pubchem_cid",
    all.x = TRUE,
    allow.cartesian = TRUE
  )

# Treat empty strings in the annotation columns as missing values.
drugs[!nzchar(mechanism_of_action), mechanism_of_action := NA]
drugs[!nzchar(indication), indication := NA]

# Resolve the final drug name in order of preference:
#   1. the INN name (already in `name`),
#   2. the PubChem name,
#   3. the CMap drug name converted to sentence case.
# Empty strings count as missing at every step.
drugs[!nzchar(name), name := NA]
drugs[is.na(name), name := pubchem_name]
drugs[!nzchar(name), name := NA]
drugs[is.na(name), name := stringr::str_to_sentence(drug)]

# The PubChem name only served as a fallback and is not part of the output.
drugs[, pubchem_name := NULL]

fwrite(x = drugs, file = here("scripts/output/drugs.csv"))