Skip to content

Commit

Permalink
Calculate IsLower and IsAlpha sets in advance
Browse files (browse the repository at this point in the history)
  • Loading branch information
jelmervdl committed Oct 30, 2023
1 parent 65543c3 commit 8163608
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions sacremoses/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-

import re
+ from itertools import chain

from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
Expand Down Expand Up @@ -30,6 +31,9 @@ class MosesTokenizer(object):
)
IsLower = str("".join(perluniprops.chars("IsLower")))

+ AlphaChars = frozenset(chain(perluniprops.chars("IsAlpha"), VIRAMAS, NUKTAS))
+ LowerChars = frozenset(perluniprops.chars("IsLower"))
+
# Remove ASCII junk.
DEDUPLICATE_SPACE = re.compile(r"\s+"), r" "
ASCII_JUNK = re.compile(r"[\000-\037]"), r""
Expand Down Expand Up @@ -357,10 +361,10 @@ def restore_multidots(self, text):
return re.sub(r"DOTMULTI", r".", text)

def islower(self, text):
- return not set(text).difference(set(self.IsLower))
+ return bool(set(text) <= self.LowerChars)

def isanyalpha(self, text):
- return any(set(text).intersection(set(self.IsAlpha)))
+ return bool(set(text) & self.AlphaChars)

def has_numeric_only(self, text):
return bool(re.search(r"[\s]+(\#NUMERIC_ONLY\#)", text))
Expand Down

0 comments on commit 8163608

Please sign in to comment.