From 1ee8c290c2e7253c8e2229eba034fd7aea49c8ed Mon Sep 17 00:00:00 2001 From: Dylan Welzel Date: Wed, 13 Mar 2024 09:56:13 -0700 Subject: [PATCH] pharmgkb parser fixes https://github.com/biothings/mychem.info/issues/174 --- .../sources/pharmgkb/pharmgkb_parser.py | 77 +++++++++---------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py index 0a42bd91..4984ee53 100644 --- a/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py +++ b/src/hub/dataload/sources/pharmgkb/pharmgkb_parser.py @@ -1,23 +1,24 @@ import csv import re import sys + from biothings.utils.dataload import dict_sweep, unlist csv.field_size_limit(sys.maxsize) + def load_data(tsv_file): _file = open(tsv_file) - reader = csv.DictReader(_file,delimiter='\t') - _dict = {} - drug_list = [] + reader = csv.DictReader(_file, delimiter='\t') for row in reader: _id = row["PharmGKB Accession Id"] _d = restr_dict(row) _d = clean_up(_d) _d = unlist(dict_sweep(_d)) - _dict = {'_id':_id,'pharmgkb':_d} + _dict = {'_id': _id, 'pharmgkb': _d} yield _dict + def restr_dict(d): def _restr_xrefs(xrefs): """Restructure field names related to the pharmgkb.xrefs field""" @@ -26,56 +27,47 @@ def _restr_xrefs(xrefs): ('National Drug Code Directory', 'ndc'), ('Drugs Product Database (DPD)', 'dpd'), ('FDA Drug Label at DailyMed', 'dailymed.setid'), - ] + ] res = [] - for v in xrefs: + for v in xrefs.split(','): for rf_orig, rf_new in rename_fields: if rf_orig in v: v = v.replace(rf_orig, rf_new) # Multiple replacements on the 'Web Resource' field if 'Web Resource' in v: if 'http://en.wikipedia.org/wiki/' in v: - v = v.replace('Web Resource', 'wikipedia.url_stub') - v = v.replace('http://en.wikipedia.org/wiki/', '') + v = v.replace('Web Resource', 'wikipedia.url_stub').replace( + 'http://en.wikipedia.org/wiki/', '') # Add 'CHEBI:' prefix if not there already - elif 'ChEBI:' in v: - if 'ChEBI:CHEBI' not in v: - v = v.replace('ChEBI:', 'ChEBI:CHEBI:') - res.append(v) + elif 'ChEBI:' in v and 'ChEBI:CHEBI' not in v: + v = v.replace('ChEBI:', 'ChEBI:CHEBI:') + res.append(v.strip()) return res + _d = {} - _li2 = ["Trade Names","Generic Names","Brand Mixtures","Dosing Guideline"] - _li1 = ["SMILES","Name","Type","InChI"] for key, val in iter(d.items()): - if key in _li1: - _d.update({key.lower():val}) - elif key in _li2: - val = val.split(',"') - val = list(map(lambda each:each.strip('"'), val)) #python 3 compatible - k = key.lower().replace(" ","_").replace('-','_').replace(".","_") - _d.update({k:val}) + if key in ["SMILES", "Name", "Type", "InChI"]: + _d.update({key.lower(): val}) + elif key in ["Trade Names", "Generic Names", "Brand Mixtures"]: + # Convert to list if not empty, otherwise default to empty list + _d.update({key.lower().replace(" ", "_") : val.split(', ') if val else []}) + elif key == "Dosing Guideline": + # Convert to boolean + _d.update({"dosing_guideline": True if val == "Yes" else False}) elif key == "PharmGKB Accession Id": - k = 'id' - _d.update({k:val}) + _d.update({'id': val}) elif key == "Cross-references": - k = "xrefs" - val = val.split(',"') - val = list(map(lambda each:each.strip('"'), val)) #python 3 compatible - val = _restr_xrefs(val) - _d.update({k:val}) + _d.update({"xrefs": _restr_xrefs(val)}) elif key == "External Vocabulary": - # external_vocabulary - remove parenthesis and text within - k = "external_vocabulary" - # note: regular expressions appear to be causing an error - # val = re.sub('\([^)]*\)', '', val) - val = val.split(',"') - val = list(map(lambda each:remove_paren(each.strip('"')), val)) #python 3 compatible - _d.update({k:val}) + # Process and remove parentheses if present + val = [remove_paren(each.strip()) for each in val.split(',')] + _d.update({"external_vocabulary": val}) return _d + def clean_up(d): - _li = ['xrefs','external_vocabulary'] - _d= {} + _li = ['xrefs', 'external_vocabulary'] + _d = {} for key, val in iter(d.items()): if key in _li: for ele in val: @@ -83,7 +75,7 @@ def clean_up(d): # Note: original pharmgkb keys do not have '.' k = transform_xrefs_fieldnames(ele[0:idx]) v = ele[idx+1:] - if k in ["pubchem.cid","pubchem.sid"]: + if k in ["pubchem.cid", "pubchem.sid"]: v = int(v) # Handle nested elements (ex: 'wikipedia.url_stub') here sub_d = sub_field(k, v) @@ -91,9 +83,10 @@ def clean_up(d): # 'xrefs' and 'external_vocabulary' are merged if 'external_vocabulary' in d.keys(): d.pop('external_vocabulary') - d.update({'xrefs':_d}) + d.update({'xrefs': _d}) return d + def sub_field(k, v): """Return a nested dictionary with field keys k and value v.""" res = {} @@ -105,6 +98,7 @@ def sub_field(k, v): field_d[fields[-1]] = v return res + def remove_paren(v): """remove first occurance of trailing parentheses from a string""" idx = v.find('(') @@ -112,16 +106,17 @@ def remove_paren(v): return v[0:idx] return v + def transform_xrefs_fieldnames(k): fields = [ ('Chemical Abstracts Service', 'cas'), ('Therapeutic Targets Database', 'ttd'), ('PubChem Substance', 'pubchem.sid'), ('PubChem Compound', 'pubchem.cid') - ] + ] for orig_f, new_f in fields: if orig_f in k: k = k.replace(orig_f, new_f) break - k = k.lower().replace(' ','_').replace('-','_') + k = k.lower().replace(' ', '_').replace('-', '_') return k