# --- src/unihan_etl/validator.py (new file) ---
"""Experimental pydantic models for unihan data."""
import typing as t

import pydantic

from unihan_etl.expansion import expand_kTGHZ2013


class UCNBaseModel(pydantic.BaseModel):
    """Core model for UCN data."""

    # Code point label as it appears in the first column of a UNIHAN row,
    # e.g. "U+3447".
    ucn: str


class kTGHZ2013Location(pydantic.BaseModel):
    """A single page/position location attached to a kTGHZ2013 reading."""

    page: int
    position: int
    entry_type: int = pydantic.Field(
        description=(
            "0 for a main entry and greater than 0 for a parenthesized or bracketed "
            "variant of the main entry"
        )
    )


class kTGHZ2013Reading(pydantic.BaseModel):
    """One kTGHZ2013 reading together with every location it is listed at."""

    reading: str
    locations: t.List[kTGHZ2013Location]


class kTGHZ2013(UCNBaseModel):
    """kTGHZ2013 model: all readings (with locations) for a single code point."""

    readings: t.List[kTGHZ2013Reading]

    model_config = pydantic.ConfigDict(
        validate_assignment=True,
        arbitrary_types_allowed=True,
    )

    @classmethod
    def from_string(
        cls,
        value: t.Union[str, t.Dict[str, t.Any]],
    ) -> "kTGHZ2013":
        r"""Build a model from a raw UNIHAN row or from an already-expanded mapping.

        Parameters
        ----------
        value : str or dict
            Either a tab-separated UNIHAN row of the form
            ``"<ucn>\t<field>\t<value>"`` (e.g.
            ``"U+4E07\tkTGHZ2013\t256.090:mò 379.160:wàn"``), or a mapping that
            already matches this model's fields.

        Returns
        -------
        kTGHZ2013
            The validated model.

        Raises
        ------
        ValueError
            If a string ``value`` does not split into exactly three
            tab-separated columns.
        pydantic.ValidationError
            If the data does not satisfy the model's field types.
        TypeError
            If ``value`` is neither a ``str`` nor a ``dict``.
        """
        if isinstance(value, str):
            # The middle column is the field name; only the code point and the
            # raw field value are needed to build the model.
            ucn, _field, val = value.split("\t")
            expanded = expand_kTGHZ2013(val.split(" "))

            return cls(
                ucn=ucn,
                readings=[
                    kTGHZ2013Reading(
                        reading=item["reading"],
                        locations=[
                            kTGHZ2013Location(
                                page=loc["page"],
                                position=loc["position"],
                                entry_type=loc["entry_type"],
                            )
                            for loc in item["locations"]
                        ],
                    )
                    for item in expanded
                ],
            )

        if isinstance(value, dict):
            # ``pydantic.parse_obj_as`` is deprecated in pydantic v2 (the API
            # this module targets via ``ConfigDict``); ``model_validate`` is
            # the supported equivalent for a ``BaseModel`` subclass.
            return cls.model_validate(value)

        # pydantic v2's ``ValidationError`` has no public constructor, so
        # ``pydantic.ValidationError("...")`` itself fails with an unrelated
        # ``TypeError``. Raise a ``TypeError`` carrying the intended message.
        raise TypeError("Invalid input for kTGHZ2013 model.")  # noqa: TRY003


# --- tests/test_validator.py (new file) ---
"""Test expansion of multi-value fields in UNIHAN."""
import typing as t

from unihan_etl import validator

if t.TYPE_CHECKING:
    pass

def test_kTGHZ2013() -> None:
    """Example of kTGHZ2013 being parsed via pydantic."""
    # ``from_string`` splits the row on tab characters, so the three columns
    # (code point, field name, value) are joined with explicit ``\t`` escapes.
    # The space inside the value column separates individual readings.
    model = validator.kTGHZ2013.from_string("U+3447\tkTGHZ2013\t482.140:zhòu")
    assert model.ucn == "U+3447"

    model = validator.kTGHZ2013.from_string(
        "U+4E07\tkTGHZ2013\t256.090:mò 379.160:wàn"
    )
    assert model.ucn == "U+4E07"
    assert len(model.readings) == 2
    assert model.readings[0].reading == "mò"
    assert model.readings[0].locations[0].page == 256
    assert model.readings[1].reading == "wàn"
    assert model.readings[1].locations[0] == validator.kTGHZ2013Location(
        page=379,
        position=16,
        entry_type=0,
    )

    print(f"\n{model}\n")