ubigen/scripts/drugs_input.R
2025-02-16 10:36:54 +01:00

103 lines
2.8 KiB
R

library(data.table)
library(here)
i_am("scripts/drugs_input.R")
# PubChem CIDs for the CMap drug identifiers, obtained from the PubChem ID
# exchange. Rows that contain a missing value are discarded.
drugs_cmap_pubchem <- here("scripts/input/drugs_cmap_pubchem.tsv") |>
  fread() |>
  na.omit()

# Identifier mapping between ChEMBL and PubChem, obtained from UniChem.
drugs_chembl_pubchem <- here("scripts/input/drugs_chembl_pubchem.tsv") |>
  fread()

# Drug annotations exported from the ChEMBL SQLite database with this query:
#
#   SELECT DISTINCT
#     chembl_id,
#     synonyms AS name,
#     mesh_heading AS indication,
#     mechanism_of_action
#   FROM molecule_dictionary
#   LEFT JOIN drug_indication
#     ON molecule_dictionary.molregno = drug_indication.molregno
#   LEFT JOIN drug_mechanism
#     ON molecule_dictionary.molregno = drug_mechanism.molregno
#   LEFT JOIN (
#     SELECT molregno, synonyms FROM molecule_synonyms WHERE syn_type == 'INN'
#   ) AS molecule_synonyms
#     ON molecule_dictionary.molregno = molecule_synonyms.molregno
#   WHERE name IS NOT NULL
#     OR indication IS NOT NULL
#     OR mechanism_of_action IS NOT NULL;
drugs_chembl <- here("scripts/input/drugs_chembl.csv") |>
  fread()
# Source: PubChem ID list upload based on identifiers converted from CMap
# drug names using the PubChem ID exchange.
#
# Result: `drugs_pubchem` with the columns `pubchem_cid`, `pubchem_name`,
# `atc_1`, `atc_2` and `atc_3`. Every compound of the input is retained; a
# compound has one row per distinct combination of ATC levels, or a single
# row with missing ATC levels if it carries no WHO ATC annotation.
drugs_pubchem <- fread(here("scripts/input/drugs_pubchem.csv"))
drugs_pubchem <- unique(
  drugs_pubchem[, .(cid, cmpdname, annotation)],
  by = "cid"
)

# Keep the compound names apart from the annotations. Filtering for ATC
# annotations below would otherwise also discard the name of every compound
# without such an annotation, although the name is still needed later on as
# a fallback for the drug name.
drugs_pubchem_names <- drugs_pubchem[, .(cid, cmpdname)]

# Split the "|"-separated annotation field into one row per annotation. The
# split is done on the whole column at once; compounds with an empty
# annotation field simply contribute no rows.
annotations <- strsplit(
  as.character(drugs_pubchem$annotation),
  "|",
  fixed = TRUE
)
drugs_pubchem_atc <- drugs_pubchem[
  rep(seq_len(nrow(drugs_pubchem)), lengths(annotations)),
  .(cid)
]
drugs_pubchem_atc[, annotation := as.character(unlist(annotations))]

# Filter for WHO ATC annotations
drugs_pubchem_atc <- drugs_pubchem_atc[
  stringr::str_detect(annotation, "^[A-Z] - ")
]

# Extract ATC levels (the captured group is the label following the code)
drugs_pubchem_atc[, atc_1 := stringr::str_trim(
  stringr::str_match(annotation, "^[A-Z] - ([^>]*)")[, 2]
)]
drugs_pubchem_atc[, atc_2 := stringr::str_trim(
  stringr::str_match(annotation, "> [A-Z][0-9][0-9] - ([^>]*)")[, 2]
)]
drugs_pubchem_atc[, atc_3 := stringr::str_trim(
  stringr::str_match(annotation, "> [A-Z][0-9][0-9][A-Z] - ([^>]*)")[, 2]
)]

# Annotations that differ only in the part that is not extracted above would
# result in identical rows, so keep each combination once per compound.
drugs_pubchem_atc <- unique(
  drugs_pubchem_atc[, .(cid, atc_1, atc_2, atc_3)]
)

# Attach the ATC levels to the names. The left join keeps compounds without
# ATC annotation (with missing ATC levels).
drugs_pubchem <- merge(
  drugs_pubchem_names,
  drugs_pubchem_atc,
  by = "cid",
  all.x = TRUE
)
setnames(drugs_pubchem, c("cid", "cmpdname"), c("pubchem_cid", "pubchem_name"))
# Assemble the combined drug table by a series of left joins, so that every
# row of `drugs_cmap_pubchem` is kept even if a later source has no match:
#   1. ChEMBL IDs via the PubChem CID
#   2. ChEMBL annotations via the ChEMBL ID
#   3. PubChem annotations via the PubChem CID
# `drugs_pubchem` can hold several rows for one CID, which is why the last
# join has to allow a cartesian result.
drugs <- drugs_cmap_pubchem |>
  merge(drugs_chembl_pubchem, by = "pubchem_cid", all.x = TRUE) |>
  merge(drugs_chembl, by = "chembl_id", all.x = TRUE) |>
  merge(
    drugs_pubchem,
    by = "pubchem_cid",
    all.x = TRUE,
    allow.cartesian = TRUE
  )
# Choose the drug name by priority: the INN name comes first, the PubChem
# name second and the sentence-cased CMap name last. An empty string is
# treated like a missing value at each step. The helper column with the
# PubChem name is removed afterwards.
drugs[
  !nzchar(name), name := NA
][
  is.na(name), name := pubchem_name
][
  !nzchar(name), name := NA
][
  is.na(name), name := stringr::str_to_sentence(drug)
][
  , pubchem_name := NULL
]

# Replace empty strings in the remaining annotation columns by missing values.
for (column in c("indication", "mechanism_of_action")) {
  set(drugs, i = which(drugs[[column]] == ""), j = column, value = NA)
}

# Write the final table.
fwrite(drugs, file = here("scripts/output/drugs.csv"))