diff --git a/reading.py b/reading.py index 038977a..aa355cc 100644 --- a/reading.py +++ b/reading.py @@ -26,6 +26,16 @@ HTML_REPLACER = '▦' NEWLINE_REPLACER = '▧' +# Unicode character used to replace ASCII Space (0x20) in the expression before +# passing it to MeCab. MeCab separates kanji/reading nodes with ASCII spaces, +# so without this we wouldn't be able to tell apart a node separator from a +# space character in the original string. +# This applies only to ASCII Space (0x20) and not to any other whitespace +# character (e.g. CJK space). +# Codepoint chosen to be a Unicode noncharacter (U+FFFF), unlikely to ever feature in ANY +# Anki card. +ASCII_SPACE_TOKEN = u"\U0000FFFF" + def htmlReplace(text): pattern = r"(?:<[^<]+?>)" matches = re.findall(pattern, text) @@ -100,6 +110,7 @@ def ensureOpen(self): def reading(self, expr, ignoreNumbers = True, useRubyTags = False): self.ensureOpen() matches, expr = escapeText(expr) + expr = expr.replace(" ", ASCII_SPACE_TOKEN) self.mecab.stdin.write(expr.encode("utf-8", "ignore") + b'\n') self.mecab.stdin.flush() expr = self.mecab.stdout.readline().rstrip(b'\r\n').decode('utf-8', "ignore") @@ -181,6 +192,7 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False): fin = ''.join(node.format(useRubyTags) for node in nodes) # Finalize formatting + fin = fin.replace(ASCII_SPACE_TOKEN, ' ') for match in matches: fin = fin.replace(HTML_REPLACER, match, 1) diff --git a/test/test_reading.py b/test/test_reading.py index 727f2d5..4149cd2 100644 --- a/test/test_reading.py +++ b/test/test_reading.py @@ -58,3 +58,9 @@ def testKanaBetweenKanji(self): self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き込[こ]む") self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り抜[ぬ]く") self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り回[まわ]る") + + # ensure that any regular ASCII space characters (0x20) that are in the original + # string are found in the resultant string as well + def testSpacesRetained(self): + 
self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この文[ぶん]に 空白[くうはく]が あります") + self.assertEqual(reading.mecab.reading("hello world"), "hello world")