Skip to content

Commit

Permalink
pharmgkb parser fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
DylanWelzel committed Mar 13, 2024
1 parent e16c785 commit 1ee8c29
Showing 1 changed file with 36 additions and 41 deletions.
77 changes: 36 additions & 41 deletions src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
import csv
import re
import sys

from biothings.utils.dataload import dict_sweep, unlist

csv.field_size_limit(sys.maxsize)


def load_data(tsv_file):
    """Yield one document per row of a PharmGKB tab-separated file.

    Each row is restructured with ``restr_dict``, its cross-reference style
    fields are merged by ``clean_up``, and the result is passed through
    ``dict_sweep`` and ``unlist`` before being wrapped for output.

    Args:
        tsv_file: Path of the tab-delimited input file. Its header must
            contain a "PharmGKB Accession Id" column.

    Yields:
        dict: ``{'_id': <accession id>, 'pharmgkb': <cleaned record>}``.

    Raises:
        KeyError: If a row has no "PharmGKB Accession Id" column.
    """
    # The context manager closes the handle once the generator is exhausted
    # or discarded; newline='' is what the csv module asks for on file input.
    with open(tsv_file, newline='') as _file:
        reader = csv.DictReader(_file, delimiter='\t')
        for row in reader:
            _id = row["PharmGKB Accession Id"]
            _d = restr_dict(row)
            _d = clean_up(_d)
            _d = unlist(dict_sweep(_d))
            yield {'_id': _id, 'pharmgkb': _d}


def restr_dict(d):
    """Restructure one raw PharmGKB row into the field names used downstream.

    Only the columns handled below are kept; every other column is dropped.

    * "SMILES", "Name", "Type", "InChI" -> lower-cased key, value unchanged.
    * "Trade Names", "Generic Names", "Brand Mixtures" -> snake_case key,
      value split on ", " into a list (empty list when the cell is empty).
    * "Dosing Guideline" -> ``dosing_guideline``, True only for "Yes".
    * "PharmGKB Accession Id" -> ``id``.
    * "Cross-references" -> ``xrefs``, a list of "<field>:<value>" strings
      with some field names rewritten (see ``_restr_xrefs``).
    * "External Vocabulary" -> ``external_vocabulary``, a list of entries
      with everything from the first "(" removed.

    Args:
        d: Mapping of column name to cell text for a single row.

    Returns:
        dict: The restructured record.
    """
    def _restr_xrefs(xrefs):
        """Restructure field names related to the pharmgkb.xrefs field"""
        # An empty or missing cell has no cross-references; returning early
        # avoids emitting a single empty-string entry (or failing on None).
        if not xrefs:
            return []
        # NOTE(review): the diff view this block was recovered from hides two
        # lines directly above the first entry - confirm against the
        # repository that no mapping is missing from this list.
        rename_fields = [
            ('National Drug Code Directory', 'ndc'),
            ('Drugs Product Database (DPD)', 'dpd'),
            ('FDA Drug Label at DailyMed', 'dailymed.setid'),
        ]
        res = []
        for v in xrefs.split(','):
            for rf_orig, rf_new in rename_fields:
                if rf_orig in v:
                    v = v.replace(rf_orig, rf_new)
            # Multiple replacements on the 'Web Resource' field
            if 'Web Resource' in v:
                if 'http://en.wikipedia.org/wiki/' in v:
                    v = v.replace('Web Resource', 'wikipedia.url_stub').replace(
                        'http://en.wikipedia.org/wiki/', '')
            # Add 'CHEBI:' prefix if not there already
            elif 'ChEBI:' in v and 'ChEBI:CHEBI' not in v:
                v = v.replace('ChEBI:', 'ChEBI:CHEBI:')
            res.append(v.strip())
        return res

    _d = {}
    for key, val in d.items():
        if key in ("SMILES", "Name", "Type", "InChI"):
            _d[key.lower()] = val
        elif key in ("Trade Names", "Generic Names", "Brand Mixtures"):
            # Convert to list if not empty, otherwise default to empty list
            _d[key.lower().replace(" ", "_")] = val.split(', ') if val else []
        elif key == "Dosing Guideline":
            # Convert to boolean
            _d["dosing_guideline"] = val == "Yes"
        elif key == "PharmGKB Accession Id":
            _d['id'] = val
        elif key == "Cross-references":
            _d["xrefs"] = _restr_xrefs(val)
        elif key == "External Vocabulary":
            # Drop the parenthesised text that follows each entry, if any.
            if val:
                _d["external_vocabulary"] = [
                    remove_paren(each.strip()) for each in val.split(',')
                ]
            else:
                _d["external_vocabulary"] = []
    return _d


def clean_up(d):
    """Merge the 'xrefs' and 'external_vocabulary' lists into one nested dict.

    Every element of those two lists is split at its first ':' into a field
    name and a value. The field name is normalised with
    ``transform_xrefs_fieldnames``, the pair is expanded with ``sub_field``,
    and the result is folded into a single dict stored under ``d['xrefs']``.
    The 'external_vocabulary' key is removed afterwards.

    Args:
        d: Record as produced by ``restr_dict``; it is modified in place.

    Returns:
        dict: The same ``d`` object, with 'xrefs' replaced by the merged dict.

    Raises:
        ValueError: If a 'pubchem.cid' / 'pubchem.sid' value is not an integer.
    """
    _li = ('xrefs', 'external_vocabulary')
    _d = {}
    for key, val in d.items():
        if key in _li:
            for ele in val:
                idx = ele.find(':')
                # Note: original pharmgkb keys do not have '.'
                k = transform_xrefs_fieldnames(ele[0:idx])
                v = ele[idx+1:]
                if k in ("pubchem.cid", "pubchem.sid"):
                    v = int(v)
                # Handle nested elements (ex: 'wikipedia.url_stub') here
                sub_d = sub_field(k, v)
                # NOTE(review): update() replaces whole top-level keys, so two
                # elements that expand under the same top-level key appear to
                # leave only the later one - confirm this is intended.
                _d.update(sub_d)
    # 'xrefs' and 'external_vocabulary' are merged; this must happen after the
    # loop because d cannot change size while it is being iterated.
    d.pop('external_vocabulary', None)
    d['xrefs'] = _d
    return d


def sub_field(k, v):
    """Return a nested dictionary with field keys k and value v.

    The key is split on '.', and each part but the last becomes one level of
    nesting, e.g. ``sub_field('wikipedia.url_stub', 'X')`` gives
    ``{'wikipedia': {'url_stub': 'X'}}``. A key without '.' gives ``{k: v}``.

    Args:
        k: Field name, optionally dotted.
        v: Value stored at the innermost level.

    Returns:
        dict: A new dictionary holding ``v`` under the nested key path.
    """
    # NOTE(review): the diff view this block was recovered from hides the
    # lines between ``res = {}`` and the final assignment; the traversal below
    # is rebuilt from the visible names and the docstring - confirm against
    # the repository.
    res = {}
    field_d = res
    fields = k.split('.')
    for f in fields[:-1]:
        field_d[f] = {}
        field_d = field_d[f]
    field_d[fields[-1]] = v
    return res


def remove_paren(v):
    """Return ``v`` cut off at its first '('.

    Everything from the first opening parenthesis onward is dropped. Text
    before it is kept exactly as is, including any trailing whitespace. A
    string without '(' is returned unchanged.

    Args:
        v: String to truncate.

    Returns:
        str: The part of ``v`` that precedes the first '('.
    """
    # partition() yields the whole string as the head when '(' is absent.
    return v.partition('(')[0]


def transform_xrefs_fieldnames(k):
    """Normalise a cross-reference source name into a field name.

    At most one of the long source names listed below is replaced by its
    short form (the first one found in ``k``). The result is then lower-cased
    and its spaces and hyphens are turned into underscores.

    Args:
        k: Source name, i.e. the text before the first ':' of an xref entry.

    Returns:
        str: The normalised field name, e.g. 'pubchem.cid' or 'drug_bank'.
    """
    fields = [
        ('Chemical Abstracts Service', 'cas'),
        ('Therapeutic Targets Database', 'ttd'),
        ('PubChem Substance', 'pubchem.sid'),
        ('PubChem Compound', 'pubchem.cid'),
    ]
    for orig_f, new_f in fields:
        if orig_f in k:
            k = k.replace(orig_f, new_f)
            # Only the first matching source name is rewritten.
            break
    return k.lower().replace(' ', '_').replace('-', '_')

0 comments on commit 1ee8c29

Please sign in to comment.