mirror of
				https://github.com/johrpan/ubigen.git
				synced 2025-10-28 04:37:23 +01:00 
			
		
		
		
	
		
			
	
	
		
			104 lines
		
	
	
	
		
			2.8 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
		
		
			
		
	
	
			104 lines
		
	
	
	
		
			2.8 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
library(data.table)
library(here)

# Declare where this script lives inside the project, so that the `here()`
# calls below resolve paths relative to the project root.
i_am("scripts/drugs_input.R")

# Input: CMap drug identifiers converted to PubChem compound IDs using the
# PubChem ID exchange. Rows containing any missing value are dropped.
drugs_cmap_pubchem <- here("scripts/input/drugs_cmap_pubchem.tsv") |>
  fread() |>
  na.omit()

# Input: identifier mapping obtained from UniChem.
drugs_chembl_pubchem <- here("scripts/input/drugs_chembl_pubchem.tsv") |>
  fread()

# Input: export from the ChEMBL SQLite database, produced by this query:
#
# SELECT DISTINCT
#   chembl_id,
#   synonyms AS name,
#   mesh_heading AS indication,
#   mechanism_of_action
# FROM molecule_dictionary
#   LEFT JOIN drug_indication
#     ON molecule_dictionary.molregno = drug_indication.molregno
#   LEFT JOIN drug_mechanism
#     ON molecule_dictionary.molregno = drug_mechanism.molregno
#   LEFT JOIN (
#       SELECT molregno, synonyms FROM molecule_synonyms WHERE syn_type == 'INN'
#     ) AS molecule_synonyms
#     ON molecule_dictionary.molregno = molecule_synonyms.molregno
#   WHERE name IS NOT NULL
#     OR indication IS NOT NULL
#     OR mechanism_of_action IS NOT NULL;
drugs_chembl <- here("scripts/input/drugs_chembl.csv") |>
  fread()

# Input: result of a PubChem ID list upload. The uploaded identifiers were
# converted from CMap drug names using the PubChem ID exchange.
drugs_pubchem <- here("scripts/input/drugs_pubchem.csv") |>
  fread()

# Keep a single row per compound ID, then expand the "|"-separated annotation
# string so that every annotation ends up in a row of its own.
drugs_pubchem <- unique(
  drugs_pubchem[, .(cid, cmpdname, annotation)],
  by = "cid"
)
drugs_pubchem <- drugs_pubchem[,
  .(cmpdname, annotation = unlist(strsplit(annotation, "|", fixed = TRUE))),
  by = cid
]

# Retain only the WHO ATC annotations, recognized by a leading single capital
# letter followed by " - ".
drugs_pubchem <- drugs_pubchem[stringr::str_detect(annotation, "^[A-Z] - ")]

# Extract the first three ATC levels. Each pattern captures the text that
# follows the level code up to the next ">"; surrounding whitespace is
# trimmed afterwards. A level that is not matched yields NA.
drugs_pubchem[, `:=`(
  atc_1 = stringr::str_trim(
    stringr::str_match(annotation, "^[A-Z] - ([^>]*)")[, 2]
  ),
  atc_2 = stringr::str_trim(
    stringr::str_match(annotation, "> [A-Z][0-9][0-9] - ([^>]*)")[, 2]
  ),
  atc_3 = stringr::str_trim(
    stringr::str_match(annotation, "> [A-Z][0-9][0-9][A-Z] - ([^>]*)")[, 2]
  )
)]

# Drop the raw annotation and give the PubChem columns explicit prefixes.
drugs_pubchem <- drugs_pubchem[, .(
  pubchem_cid = cid,
  pubchem_name = cmpdname,
  atc_1,
  atc_2,
  atc_3
)]

# Assemble the combined drug table. Every step is a left join, so all rows of
# the CMap table are kept: first the ChEMBL IDs are attached via the PubChem
# CID, then the ChEMBL annotations via the ChEMBL ID, and finally the ATC
# levels via the PubChem CID. The last join is allowed to multiply rows
# (`allow.cartesian`).
drugs <- drugs_cmap_pubchem |>
  merge(drugs_chembl_pubchem, by = "pubchem_cid", all.x = TRUE) |>
  merge(drugs_chembl, by = "chembl_id", all.x = TRUE) |>
  merge(
    drugs_pubchem,
    by = "pubchem_cid",
    all.x = TRUE,
    allow.cartesian = TRUE
  )

# Choose the display name by preference: the INN name, then the PubChem name,
# then the CMap drug name in sentence case. An empty string counts as missing
# for the first two candidates. (`nzchar()` is TRUE for NA, so values that are
# already missing stay missing.)
drugs[, name := fcoalesce(
  fifelse(nzchar(name), name, NA_character_),
  fifelse(nzchar(pubchem_name), pubchem_name, NA_character_),
  stringr::str_to_sentence(drug)
)]
drugs[, pubchem_name := NULL]

# Turn empty strings in the remaining annotation columns into missing values.
drugs[,
  c("indication", "mechanism_of_action") := lapply(
    .SD,
    \(values) fifelse(values == "", NA_character_, values)
  ),
  .SDcols = c("indication", "mechanism_of_action")
]

fwrite(drugs, file = here("scripts/output/drugs.csv"))