Merge pull request #28 from obynio/ahlec/kagetsu

Add support for small ヵ/ヶ being read as large か in words
obynio · Feb 21, 2023 · 545631b · 545631b
2 parents 07fbea6 + 455114c
commit 545631b
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 6 deletions.
diff --git a/reading.py b/reading.py
@@ -21,7 +21,7 @@
 import subprocess
 import platform
 
-from typing import Any, Mapping, Optional, Union
+from typing import Any, List, Mapping, Optional, Union
 
 mecabArgs = ['--node-format=%m[%f[7]] ', '--eos-format=\n',
  '--unk-format=%m[] ']
@@ -106,6 +106,13 @@ def __iter__(self):
 def convertToHiragana(expr: str) -> str:
  return expr.translate(translator)
 
+def getAdditionalPossibleReadings(hiragana: str) -> Optional[List[str]]:
+ # The little ヵ and ヶ can show up in readings as "か" (eg: ヶ月, ヵ国, etc)
+ if hiragana == 'ゕ' or hiragana == 'ゖ':
+ return ['か']
+
+ return None
+
 def isKana(char: str) -> bool:
  code = ord(char)
 
@@ -159,11 +166,28 @@ def kanjiToRegex(kanji: str):
  # Hiragana and Katakana characters are inlined into the Regex
  if isKana(kanji[index]):
  # The reading variable is ALWAYS in hiragana only
- regexPieces.append(convertToHiragana(kanji[index]))
-
- # Use kanji[index] here to retain original katakana/hiragana
- # (We convert to hiragana just to match against reading)
- definitions.append(RegexDefinition(kanji[index], None))
+ hiragana = convertToHiragana(kanji[index])
+
+ additional = getAdditionalPossibleReadings(hiragana)
+ if additional:
+ # If it's possible that this kana could be read as a totally different kana
+ # (e.g. "ヶ" being read as "か"), we want to give it furigana.
+ # We'll register it as a capture group -- both because we don't know
+ # for SURE which reading to expect (so we register multiple
+ # possibilities) and so that we can go down the furigana generation
+ # pathway that's normally reserved for kanji.
+ regexPieces.append("(" + "|".join([hiragana] + additional) + ")")
+
+ # Use kanji[index] here to retain original katakana/hiragana
+ # (We convert to hiragana just to match against reading)
+ definitions.append(RegexDefinition(kanji[index], numCaptureGroups))
+ numCaptureGroups += 1
+ else:
+ regexPieces.append(hiragana)
+
+ # Use kanji[index] here to retain original katakana/hiragana
+ # (We convert to hiragana just to match against reading)
+ definitions.append(RegexDefinition(kanji[index], None))
 
  # Advance to the next character
  index += 1

diff --git a/test/test_reading.py b/test/test_reading.py
@@ -88,6 +88,28 @@ def testSpacesRetained(self):
  self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この文[ぶん]に 空白[くうはく]が あります")
  self.assertEqual(reading.mecab.reading("hello world"), "hello world")
 
+ # Some kana characters are read differently when they appear inside words
+ # (such as ヶ月 being read as かげつ). Ensure that we can detect and handle these.
+ def testKanaWithAdditionalReadings(self):
+ # Check that ヵ (small) stands in for か (large) in readings
+ # This should generate furigana for the small ヵ
+ self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
+
+ # Check that ヶ *also* stands in for か in readings
+ # This should generate furigana for the small ヶ
+ self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
+
+ # For the same sentence, make sure that the full-sized か and カ
+ # are also recognized.
+ # However, neither of these should generate furigana.
+ self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か月[げつ]間[かん]訪問[ほうもん]するつもりです")
+ self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ月[げつ]間[かん]訪問[ほうもん]するつもりです")
+
+ # Finally, ensure that we're not just ALWAYS adding furigana to ヶ and ヵ
+ # whenever we encounter them
+ self.assertEqual(reading.mecab.reading("ィヵ"), "ィヵ")
+ self.assertEqual(reading.mecab.reading("ゥヶ"), "ゥヶ")
+
 class TestConvertToHiragana(unittest.TestCase):
  # ensure that if the function is called with an empty string, it will return
  # an empty string