def getAdditionalPossibleReadings(hiragana: str) -> Optional[List[str]]:
    # Some kana are pronounced as a different kana when they appear inside a
    # word. Given a single kana (expected to already be in hiragana form),
    # return the extra readings it may take, or None when there are none.
    #
    # Right now this only covers the small ゕ and ゖ (the hiragana forms of
    # ヵ and ヶ), which can show up in readings as "か" (eg: ヶ月, ヵ国, etc).
    if hiragana not in ('ゕ', 'ゖ'):
        return None

    return ['か']
# Some kana characters take on a different reading when they are used inside
# a word (such as ヶ月 being read as かげつ). Make sure these are detected and
# handled.
def testKanaWithAdditionalReadings(self):
    # Each entry is (input sentence, expected output with furigana).
    cases = [
        # The small ヵ stands in for the full-sized か in readings, so it
        # should receive furigana of its own.
        ("彼はトルコを2ヵ月間訪問するつもりです", "彼[かれ]はトルコを2ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです"),

        # The small ヶ *also* stands in for か, and should likewise receive
        # furigana.
        ("彼はトルコを2ヶ月間訪問するつもりです", "彼[かれ]はトルコを2ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです"),

        # The same sentence written with the full-sized か and カ must still
        # be recognized, but neither of those should receive furigana.
        ("彼はトルコを2か月間訪問するつもりです", "彼[かれ]はトルコを2か月[げつ]間[かん]訪問[ほうもん]するつもりです"),
        ("彼はトルコを2カ月間訪問するつもりです", "彼[かれ]はトルコを2カ月[げつ]間[かん]訪問[ほうもん]するつもりです"),

        # Finally, ヶ and ヵ must not ALWAYS be given furigana whenever they
        # are encountered.
        ("ィヵ", "ィヵ"),
        ("ゥヶ", "ゥヶ"),
    ]

    for sentence, expected in cases:
        self.assertEqual(reading.mecab.reading(sentence), expected)