Skip to content

Commit

Permalink
Merge pull request #143 from hplt-project/regex-optim-alt
Browse files Browse the repository at this point in the history
Compile regexp in detokenizer
  • Loading branch information
jelmervdl committed Sep 27, 2023
2 parents d04249b + 38d83b5 commit 303ae7f
Showing 1 changed file with 20 additions and 8 deletions.
28 changes: 20 additions & 8 deletions sacremoses/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,18 @@ class MosesDetokenizer(object):
"|".join(FINNISH_MORPHSET_3),
))

# Patterns below are compiled once at class-definition time so that tokenize()
# can call `.search(token)` on them instead of rebuilding the pattern string
# for every token. IsSc and IsAlpha are character-class strings defined
# elsewhere in this file (not visible in this hunk).

# Token made up entirely of IsSc characters and/or opening brackets
# "(", "[", "{" and the inverted marks "¿", "¡". The doubled "{{" is the
# str.format escape for a literal "{".
IS_CURRENCY_SYMBOL = re.compile(r"^[{}\(\[\{{\¿\¡]+$".format(IsSc))

# Token that starts with an apostrophe immediately followed by an IsAlpha
# character; tokenize() consults it only when lang == "en".
IS_ENGLISH_CONTRACTION = re.compile(r"^['][{}]".format(IsAlpha))

# Token that ends with an IsAlpha character followed by an apostrophe.
# tokenize() uses it for lang in ["fr", "it", "ga"] and also for "cs".
# NOTE(review): the name is misspelled ("CONRTACTION"); it is kept as-is
# because the call sites in tokenize() reference this exact spelling.
IS_FRENCH_CONRTACTION = re.compile(r"[{}][']$".format(IsAlpha))

# Token whose first character is in IsAlpha.
STARTS_WITH_ALPHA = re.compile(r"^[{}]".format(IsAlpha))

# Token made up entirely of the characters , . ? ! : ; \ % } ] )
IS_PUNCT = re.compile(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$")

# Token made up entirely of the quote characters ' " „ “ `
# NOTE(review): the right double quote ” is not in this set, although the
# branch guarded by this pattern later tests for it — confirm that is intended.
IS_OPEN_QUOTE = re.compile(r"""^[\'\"„“`]+$""")

def __init__(self, lang="en"):
super(MosesDetokenizer, self).__init__()
self.lang = lang
Expand Down Expand Up @@ -708,12 +720,12 @@ def tokenize(self, tokens, return_str=True, unescape=True):
detokenized_text += prepend_space + token
prepend_space = " "
# If it's a currency symbol.
elif re.search(r"^[" + self.IsSc + r"\(\[\{\¿\¡]+$", token):
elif self.IS_CURRENCY_SYMBOL.search(token):
# Perform right shift on currency and other random punctuation items
detokenized_text += prepend_space + token
prepend_space = ""

elif re.search(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$", token):
elif self.IS_PUNCT.search(token):
# In French, these punctuations are prefixed with a non-breakable space.
if self.lang == "fr" and re.search(r"^[\?\!\:\;\\\%]$", token):
detokenized_text += " "
Expand All @@ -724,7 +736,7 @@ def tokenize(self, tokens, return_str=True, unescape=True):
elif (
self.lang == "en"
and i > 0
and re.search(r"^['][{}]".format(self.IsAlpha), token)
and self.IS_ENGLISH_CONTRACTION.search(token)
):
# and re.search('[{}]$'.format(self.IsAlnum), tokens[i-1])):
# For English, left-shift the contraction.
Expand All @@ -747,8 +759,8 @@ def tokenize(self, tokens, return_str=True, unescape=True):
elif (
self.lang in ["fr", "it", "ga"]
and i <= len(tokens) - 2
and re.search(r"[{}][']$".format(self.IsAlpha), token)
and re.search(r"^[{}]".format(self.IsAlpha), tokens[i + 1])
and self.IS_FRENCH_CONRTACTION.search(token)
and self.STARTS_WITH_ALPHA.search(tokens[i + 1])
): # If the next token is alpha.
# For French and Italian, right-shift the contraction.
detokenized_text += prepend_space + token
Expand All @@ -757,7 +769,7 @@ def tokenize(self, tokens, return_str=True, unescape=True):
elif (
self.lang == "cs"
and i <= len(tokens) - 3
and re.search(r"[{}][']$".format(self.IsAlpha), token)
and self.IS_FRENCH_CONRTACTION.search(token)
and re.search(r"^[-–]$", tokens[i + 1])
and re.search(r"^li$|^mail.*", tokens[i + 2], re.IGNORECASE)
): # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
Expand All @@ -767,7 +779,7 @@ def tokenize(self, tokens, return_str=True, unescape=True):
prepend_space = ""

# Combine punctuation smartly.
elif re.search(r"""^[\'\"„“`]+$""", token):
elif self.IS_OPEN_QUOTE.search(token):
normalized_quo = token
if re.search(r"^[„“”]+$", token):
normalized_quo = '"'
Expand Down Expand Up @@ -803,7 +815,7 @@ def tokenize(self, tokens, return_str=True, unescape=True):
elif (
self.lang == "fi"
and re.search(r":$", tokens[i - 1])
and re.search(self.FINNISH_REGEX, token)
and self.FINNISH_REGEX.search(token)
):
# Finnish : without intervening space if followed by case suffix
# EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
Expand Down

0 comments on commit 303ae7f

Please sign in to comment.