From 1ee8c290c2e7253c8e2229eba034fd7aea49c8ed Mon Sep 17 00:00:00 2001
From: Dylan Welzel <dylanwelzel@gmail.com>
Date: Wed, 13 Mar 2024 09:56:13 -0700
Subject: [PATCH] pharmgkb parser fixes

https://github.com/biothings/mychem.info/issues/174
---
 .../sources/pharmgkb/pharmgkb_parser.py       | 77 +++++++++----------
 1 file changed, 36 insertions(+), 41 deletions(-)

diff --git a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
index 0a42bd91..4984ee53 100644
--- a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
+++ b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
@@ -1,23 +1,30 @@
 import csv
 import re
 import sys
+
 from biothings.utils.dataload import dict_sweep, unlist
 
 csv.field_size_limit(sys.maxsize)
 
+
 def load_data(tsv_file):
-    _file = open(tsv_file)
-    reader = csv.DictReader(_file,delimiter='\t')
-    _dict = {}
-    drug_list = []
-    for row in reader:
-        _id = row["PharmGKB Accession Id"]
-        _d = restr_dict(row)
-        _d = clean_up(_d)
-        _d = unlist(dict_sweep(_d))
-        _dict = {'_id':_id,'pharmgkb':_d}
-        yield _dict
+    """Yield one document per row of the tab-separated file `tsv_file`.
+
+    Each document has the form ``{'_id': <PharmGKB Accession Id>,
+    'pharmgkb': <restructured row>}``.
+    """
+    # The context manager closes the file once the generator is exhausted
+    # or closed, instead of leaking the handle.
+    with open(tsv_file) as _file:
+        reader = csv.DictReader(_file, delimiter='\t')
+        for row in reader:
+            _id = row["PharmGKB Accession Id"]
+            _d = restr_dict(row)
+            _d = clean_up(_d)
+            _d = unlist(dict_sweep(_d))
+            yield {'_id': _id, 'pharmgkb': _d}
 
+
 def restr_dict(d):
     def _restr_xrefs(xrefs):
         """Restructure field names related to the pharmgkb.xrefs field"""
@@ -26,56 +27,47 @@ def _restr_xrefs(xrefs):
             ('National Drug Code Directory', 'ndc'),
             ('Drugs Product Database (DPD)', 'dpd'),
             ('FDA Drug Label at DailyMed', 'dailymed.setid'),
-            ]
+        ]
         res = []
-        for v in xrefs:
+        for v in xrefs.split(','):
             for rf_orig, rf_new in rename_fields:
                 if rf_orig in v:
                     v = v.replace(rf_orig, rf_new)
             # Multiple replacements on the 'Web Resource' field
             if 'Web Resource' in v:
                 if 'http://en.wikipedia.org/wiki/' in v:
-                    v = v.replace('Web Resource', 'wikipedia.url_stub')
-                    v = v.replace('http://en.wikipedia.org/wiki/', '')
+                    v = v.replace('Web Resource', 'wikipedia.url_stub').replace(
+                        'http://en.wikipedia.org/wiki/', '')
             # Add 'CHEBI:' prefix if not there already
-            elif 'ChEBI:' in v:
-                if 'ChEBI:CHEBI' not in v:
-                    v = v.replace('ChEBI:', 'ChEBI:CHEBI:')
-            res.append(v)
+            elif 'ChEBI:' in v and 'ChEBI:CHEBI' not in v:
+                v = v.replace('ChEBI:', 'ChEBI:CHEBI:')
+            res.append(v.strip())
         return res
+
     _d = {}
-    _li2 = ["Trade Names","Generic Names","Brand Mixtures","Dosing Guideline"]
-    _li1 = ["SMILES","Name","Type","InChI"]
     for key, val in iter(d.items()):
-        if key in _li1:
-            _d.update({key.lower():val})
-        elif key in _li2:
-            val = val.split(',"')
-            val = list(map(lambda each:each.strip('"'), val))  #python 3 compatible
-            k = key.lower().replace(" ","_").replace('-','_').replace(".","_")
-            _d.update({k:val})
+        if key in ["SMILES", "Name", "Type", "InChI"]:
+            _d.update({key.lower(): val})
+        elif key in ["Trade Names", "Generic Names", "Brand Mixtures"]:
+            # Convert to list if not empty, otherwise default to empty list
+            _d.update({key.lower().replace(" ", "_"): val.split(', ') if val else []})
+        elif key == "Dosing Guideline":
+            # Convert to boolean: only the exact string "Yes" maps to True
+            _d.update({"dosing_guideline": val == "Yes"})
         elif key == "PharmGKB Accession Id":
-            k = 'id'
-            _d.update({k:val})
+            _d.update({'id': val})
         elif key == "Cross-references":
-            k = "xrefs"
-            val = val.split(',"')
-            val = list(map(lambda each:each.strip('"'), val))  #python 3 compatible
-            val = _restr_xrefs(val)
-            _d.update({k:val})
+            _d.update({"xrefs": _restr_xrefs(val)})
         elif key == "External Vocabulary":
-            # external_vocabulary - remove parenthesis and text within
-            k = "external_vocabulary"
-            # note:  regular expressions appear to be causing an error
-            # val = re.sub('\([^)]*\)', '', val)
-            val = val.split(',"')
-            val = list(map(lambda each:remove_paren(each.strip('"')), val))  #python 3 compatible
-            _d.update({k:val})
+            # Process and remove parentheses if present
+            val = [remove_paren(each.strip()) for each in val.split(',')]
+            _d.update({"external_vocabulary": val})
     return _d
 
+
 def clean_up(d):
-    _li = ['xrefs','external_vocabulary']
-    _d= {}
+    _li = ['xrefs', 'external_vocabulary']
+    _d = {}
     for key, val in iter(d.items()):
         if key in _li:
             for ele in val:
@@ -83,7 +75,7 @@ def clean_up(d):
                 # Note:  original pharmgkb keys do not have '.'
                 k = transform_xrefs_fieldnames(ele[0:idx])
                 v = ele[idx+1:]
-                if k in ["pubchem.cid","pubchem.sid"]:
+                if k in ["pubchem.cid", "pubchem.sid"]:
                     v = int(v)
                 # Handle nested elements (ex: 'wikipedia.url_stub') here
                 sub_d = sub_field(k, v)
@@ -91,9 +83,10 @@ def clean_up(d):
     # 'xrefs' and 'external_vocabulary' are merged
     if 'external_vocabulary' in d.keys():
         d.pop('external_vocabulary')
-    d.update({'xrefs':_d})
+    d.update({'xrefs': _d})
     return d
 
+
 def sub_field(k, v):
     """Return a nested dictionary with field keys k and value v."""
     res = {}
@@ -105,6 +98,7 @@ def sub_field(k, v):
     field_d[fields[-1]] = v
     return res
 
+
 def remove_paren(v):
-    """remove first occurance of trailing parentheses from a string"""
+    """remove first occurrence of trailing parentheses from a string"""
     idx = v.find('(')
@@ -112,16 +106,17 @@ def remove_paren(v):
         return v[0:idx]
     return v
 
+
 def transform_xrefs_fieldnames(k):
     fields = [
         ('Chemical Abstracts Service', 'cas'),
         ('Therapeutic Targets Database', 'ttd'),
         ('PubChem Substance', 'pubchem.sid'),
         ('PubChem Compound', 'pubchem.cid')
-        ]
+    ]
     for orig_f, new_f in fields:
         if orig_f in k:
             k = k.replace(orig_f, new_f)
             break
-    k = k.lower().replace(' ','_').replace('-','_')
+    k = k.lower().replace(' ', '_').replace('-', '_')
     return k