Skip to content

Commit

Permalink
Merge pull request #28 from obynio/ahlec/kagetsu
Browse files Browse the repository at this point in the history
Add support for small ヵ/ヶ being read as large か in words
  • Loading branch information
obynio committed Feb 21, 2023
2 parents 07fbea6 + 455114c commit 545631b
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 6 deletions.
36 changes: 30 additions & 6 deletions reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import subprocess
import platform

from typing import Any, Mapping, Optional, Union
from typing import Any, List, Mapping, Optional, Union

# Extra command-line flags that set MeCab's output templates for regular nodes
# (--node-format), end of sentence (--eos-format) and unknown words (--unk-format).
# NOTE(review): presumably %m is the surface form and %f[7] a reading feature
# field, yielding "surface[reading] " tokens -- confirm against the MeCab docs.
mecabArgs = ['--node-format=%m[%f[7]] ', '--eos-format=\n',
'--unk-format=%m[] ']
Expand Down Expand Up @@ -106,6 +106,13 @@ def __iter__(self):
def convertToHiragana(expr: str) -> str:
    """Return ``expr`` with each character remapped through ``translator``.

    ``translator`` is the module-level translation table; characters it does
    not cover are left unchanged by ``str.translate``.
    NOTE(review): the table is presumably katakana -> hiragana; it is defined
    outside this view, so confirm there.
    """
    converted = expr.translate(translator)
    return converted

def getAdditionalPossibleReadings(hiragana: str) -> Optional[List[str]]:
    """Return extra kana that ``hiragana`` may be read as, or None if there are none.

    The small ヵ and ヶ (ゕ / ゖ once converted to hiragana) can show up in
    readings as the full-sized "か" (eg: ヶ月, ヵ国, etc).
    """
    # Guard clause: anything other than the two small kana has no alternatives
    if hiragana not in ('ゕ', 'ゖ'):
        return None

    # Build a new list on every call so callers can never share state
    return ['か']

def isKana(char: str) -> bool:
code = ord(char)

Expand Down Expand Up @@ -159,11 +166,28 @@ def kanjiToRegex(kanji: str):
# Hiragana and Katakana characters are inlined into the Regex
if isKana(kanji[index]):
# The reading variable is ALWAYS in hiragana only
regexPieces.append(convertToHiragana(kanji[index]))

# Use kanji[index] here to retain original katakana/hiragana
# (We convert to hiragana just to match against reading)
definitions.append(RegexDefinition(kanji[index], None))
hiragana = convertToHiragana(kanji[index])

additional = getAdditionalPossibleReadings(hiragana)
if additional:
# If it's possible that this kana could be read as a totally different kana
# (eg "ヶ" being read as "か"), we want to give it furigana.
# We'll register it as a capture group -- both because we don't know
# for SURE which reading we're expecting (so we'll register multiple
# possibilities), but ALSO so that we can go down the furigana generation
# pathway that's normally/usually reserved for kanji
regexPieces.append("(" + "|".join([hiragana] + additional) + ")")

# Use kanji[index] here to retain original katakana/hiragana
# (We convert to hiragana just to match against reading)
definitions.append(RegexDefinition(kanji[index], numCaptureGroups))
numCaptureGroups += 1
else:
regexPieces.append(hiragana)

# Use kanji[index] here to retain original katakana/hiragana
# (We convert to hiragana just to match against reading)
definitions.append(RegexDefinition(kanji[index], None))

# Advance to the next character
index += 1
Expand Down
22 changes: 22 additions & 0 deletions test/test_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,28 @@ def testSpacesRetained(self):
self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この文[ぶん]に 空白[くうはく]が あります")
self.assertEqual(reading.mecab.reading("hello world"), "hello world")

# Some kana characters are read as a different kana inside certain words
# (such as ヶ月 being read as かげつ). Ensure that we can detect and handle these.
def testKanaWithAdditionalReadings(self):
    """Small ヵ/ヶ get か furigana where they are read as か, and only there."""
    # Each entry is (input text, expected output of reading.mecab.reading).
    # Entries are checked in order and the first mismatch fails the test.
    cases = [
        # Small ヵ stands in for large か in the reading, so it gets furigana
        ("彼はトルコを2ヵ月間訪問するつもりです", "彼[かれ]はトルコを2ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです"),

        # Small ヶ *also* stands in for か, so it gets furigana as well
        ("彼はトルコを2ヶ月間訪問するつもりです", "彼[かれ]はトルコを2ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです"),

        # The full-sized か and カ are recognized in the same sentence,
        # but neither of them should be given furigana
        ("彼はトルコを2か月間訪問するつもりです", "彼[かれ]はトルコを2か月[げつ]間[かん]訪問[ほうもん]するつもりです"),
        ("彼はトルコを2カ月間訪問するつもりです", "彼[かれ]はトルコを2カ月[げつ]間[かん]訪問[ほうもん]するつもりです"),

        # ヵ and ヶ must not ALWAYS receive furigana whenever they appear
        ("ィヵ", "ィヵ"),
        ("ゥヶ", "ゥヶ"),
    ]

    for text, expected in cases:
        self.assertEqual(reading.mecab.reading(text), expected)

class TestConvertToHiragana(unittest.TestCase):
# ensure that if the function is called with an empty string, it will return
# an empty string
Expand Down

0 comments on commit 545631b

Please sign in to comment.