pharmgkb parser fixes

#174
biothings · Mar 13, 2024 · 1ee8c29 · 1ee8c29
1 parent e16c785
commit 1ee8c29
Showing 1 changed file with 36 additions and 41 deletions.
diff --git a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
@@ -1,23 +1,24 @@
 import csv
 import re
 import sys
+
 from biothings.utils.dataload import dict_sweep, unlist
 
 csv.field_size_limit(sys.maxsize)
 
+
 def load_data(tsv_file):
  _file = open(tsv_file)
- reader = csv.DictReader(_file,delimiter='\t')
- _dict = {}
- drug_list = []
+ reader = csv.DictReader(_file, delimiter='\t')
  for row in reader:
  _id = row["PharmGKB Accession Id"]
  _d = restr_dict(row)
  _d = clean_up(_d)
  _d = unlist(dict_sweep(_d))
- _dict = {'_id':_id,'pharmgkb':_d}
+ _dict = {'_id': _id, 'pharmgkb': _d}
  yield _dict
 
+
 def restr_dict(d):
  def _restr_xrefs(xrefs):
  """Restructure field names related to the pharmgkb.xrefs field"""
@@ -26,74 +27,66 @@ def _restr_xrefs(xrefs):
  ('National Drug Code Directory', 'ndc'),
  ('Drugs Product Database (DPD)', 'dpd'),
  ('FDA Drug Label at DailyMed', 'dailymed.setid'),
-  ]
+ ]
  res = []
- for v in xrefs:
+ for v in xrefs.split(','):
  for rf_orig, rf_new in rename_fields:
  if rf_orig in v:
  v = v.replace(rf_orig, rf_new)
  # Multiple replacements on the 'Web Resource' field
  if 'Web Resource' in v:
  if 'http://en.wikipedia.org/wiki/' in v:
- v = v.replace('Web Resource', 'wikipedia.url_stub')
- v = v.replace('http://en.wikipedia.org/wiki/', '')
+ v = v.replace('Web Resource', 'wikipedia.url_stub').replace(
+  'http://en.wikipedia.org/wiki/', '')
  # Add 'CHEBI:' prefix if not there already
- elif 'ChEBI:' in v:
- if 'ChEBI:CHEBI' not in v:
- v = v.replace('ChEBI:', 'ChEBI:CHEBI:')
- res.append(v)
+ elif 'ChEBI:' in v and 'ChEBI:CHEBI' not in v:
+ v = v.replace('ChEBI:', 'ChEBI:CHEBI:')
+ res.append(v.strip())
  return res
+
  _d = {}
- _li2 = ["Trade Names","Generic Names","Brand Mixtures","Dosing Guideline"]
- _li1 = ["SMILES","Name","Type","InChI"]
  for key, val in iter(d.items()):
- if key in _li1:
- _d.update({key.lower():val})
- elif key in _li2:
- val = val.split(',"')
- val = list(map(lambda each:each.strip('"'), val)) #python 3 compatible
- k = key.lower().replace(" ","_").replace('-','_').replace(".","_")
- _d.update({k:val})
+ if key in ["SMILES", "Name", "Type", "InChI"]:
+ _d.update({key.lower(): val})
+ elif key in ["Trade Names", "Generic Names", "Brand Mixtures"]:
+ # Convert to list if not empty, otherwise default to empty list
+ _d.update({key.lower().replace(" ", "_") : val.split(', ') if val else []})
+ elif key == "Dosing Guideline":
+ # Convert to boolean
+ _d.update({"dosing_guideline": True if val == "Yes" else False})
  elif key == "PharmGKB Accession Id":
- k = 'id'
- _d.update({k:val})
+ _d.update({'id': val})
  elif key == "Cross-references":
- k = "xrefs"
- val = val.split(',"')
- val = list(map(lambda each:each.strip('"'), val)) #python 3 compatible
- val = _restr_xrefs(val)
- _d.update({k:val})
+ _d.update({"xrefs": _restr_xrefs(val)})
  elif key == "External Vocabulary":
- # external_vocabulary - remove parenthesis and text within
- k = "external_vocabulary"
- # note: regular expressions appear to be causing an error
- # val = re.sub('\([^)]*\)', '', val)
- val = val.split(',"')
- val = list(map(lambda each:remove_paren(each.strip('"')), val)) #python 3 compatible
- _d.update({k:val})
+ # Process and remove parentheses if present
+ val = [remove_paren(each.strip()) for each in val.split(',')]
+ _d.update({"external_vocabulary": val})
  return _d
 
+
 def clean_up(d):
- _li = ['xrefs','external_vocabulary']
- _d= {}
+ _li = ['xrefs', 'external_vocabulary']
+ _d = {}
  for key, val in iter(d.items()):
  if key in _li:
  for ele in val:
  idx = ele.find(':')
  # Note: original pharmgkb keys do not have '.'
  k = transform_xrefs_fieldnames(ele[0:idx])
  v = ele[idx+1:]
- if k in ["pubchem.cid","pubchem.sid"]:
+ if k in ["pubchem.cid", "pubchem.sid"]:
  v = int(v)
  # Handle nested elements (ex: 'wikipedia.url_stub') here
  sub_d = sub_field(k, v)
  _d.update(sub_d)
  # 'xrefs' and 'external_vocabulary' are merged
  if 'external_vocabulary' in d.keys():
  d.pop('external_vocabulary')
- d.update({'xrefs':_d})
+ d.update({'xrefs': _d})
  return d
 
+
 def sub_field(k, v):
  """Return a nested dictionary with field keys k and value v."""
  res = {}
@@ -105,23 +98,25 @@ def sub_field(k, v):
  field_d[fields[-1]] = v
  return res
 
+
 def remove_paren(v):
  """remove first occurrence of trailing parentheses from a string"""
  idx = v.find('(')
  if idx != -1:
  return v[0:idx]
  return v
 
+
 def transform_xrefs_fieldnames(k):
  fields = [
  ('Chemical Abstracts Service', 'cas'),
  ('Therapeutic Targets Database', 'ttd'),
  ('PubChem Substance', 'pubchem.sid'),
  ('PubChem Compound', 'pubchem.cid')
-  ]
+ ]
  for orig_f, new_f in fields:
  if orig_f in k:
  k = k.replace(orig_f, new_f)
  break
- k = k.lower().replace(' ','_').replace('-','_')
+ k = k.lower().replace(' ', '_').replace('-', '_')
  return k