omegat-org · miurahr · Feb 16, 2024 · Feb 17, 2024 · Feb 17, 2024
diff --git a/config/checkstyle/suppressions.xml b/config/checkstyle/suppressions.xml
@@ -79,7 +79,7 @@
  <!-- util/Preferences -->
  <suppress files="Preferences\.java" checks="LineLength" lines="197"/>
  <!-- core/stat -->
- <suppress checks="(ParameterNumber|MethodLength)" files="FindMatches\.java" lines="164,350,459"/>
+ <suppress checks="ParameterNumber" files="FindMatches\.java"/>
  <!-- util/xml -->
  <suppress checks="(EmptyBlock|MethodLength)" files="XMLStreamReader\.java"/>
  <!-- util -->

diff --git a/src/org/omegat/core/statistics/CalcMatchStatistics.java b/src/org/omegat/core/statistics/CalcMatchStatistics.java
@@ -101,7 +101,7 @@ public class CalcMatchStatistics extends LongProcessThread {
  private final ThreadLocal<ISimilarityCalculator> distanceCalculator = ThreadLocal
  .withInitial(LevenshteinDistance::new);
  private final ThreadLocal<FindMatches> finder = ThreadLocal.withInitial(
- () -> new FindMatches(Core.getProject(), OConsts.MAX_NEAR_STRINGS, true, false, false));
+ () -> new FindMatches(Core.getProject(), OConsts.MAX_NEAR_STRINGS, false, false));
  private final StringBuilder textForLog = new StringBuilder();
 
  public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
@@ -299,7 +299,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
  int calcMaxSimilarity(SourceTextEntry ste) {
  String srcNoXmlTags = removeXmlTags(ste);
  FindMatches localFinder = finder.get();
- List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
+ List<NearString> nears = localFinder.search(srcNoXmlTags, false, false, this::isInterrupted);
  final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
  int maxSimilarity = 0;
  CACHE: for (NearString near : nears) {

diff --git a/src/org/omegat/core/statistics/FindMatches.java b/src/org/omegat/core/statistics/FindMatches.java
@@ -7,6 +7,7 @@
  2008 Alex Buloichik
  2012 Thomas Cordonnier, Martin Fleurke
  2013 Aaron Madlon-Kay, Alex Buloichik
+ 2024 Hiroshi Miura
  Home page: https://www.omegat.org/
  Support center: https://omegat.org/support
 
@@ -42,8 +43,6 @@
 import org.omegat.core.data.ExternalTMFactory;
 import org.omegat.core.data.ExternalTMX;
 import org.omegat.core.data.IProject;
-import org.omegat.core.data.IProject.DefaultTranslationsIterator;
-import org.omegat.core.data.IProject.MultipleTranslationsIterator;
 import org.omegat.core.data.ITMXEntry;
 import org.omegat.core.data.SourceTextEntry;
 import org.omegat.core.data.TMXEntry;
@@ -64,31 +63,30 @@
 
 /**
  * Class to find matches by specified criteria.
- *
+ * <p>
  * Since we can use stemmers to prepare tokens, we should use 3-pass comparison
  * of similarity. Similarity will be calculated in 3 steps:
- *
- * 1. Split original segment into word-only tokens using stemmer (with stop
- * words list), then compare tokens.
- *
- * 2. Split original segment into word-only tokens without stemmer, then compare
- * tokens.
- *
- * 3. Split original segment into not-only-words tokens (including numbers and
- * tags) without stemmer, then compare tokens.
- *
- * This class is not thread safe ! Must be used in the one thread only.
+ * <ol>
+ * <li>Split the original segment into word-only tokens using a stemmer (with a
+ * stop-word list), then compare tokens.</li>
+ * <li>Split the original segment into word-only tokens without a stemmer, then compare
+ * tokens.</li>
+ * <li>Split the original segment into not-only-words tokens (including numbers and
+ * tags) without a stemmer, then compare tokens.</li>
+ * </ol>
+ * This class is not thread-safe! It must be used from a single thread only.
  *
  * @author Maxym Mykhalchuk
  * @author Alex Buloichik ([email protected])
  * @author Martin Fleurke
  * @author Aaron Madlon-Kay
+ * @author Hiroshi Miura
  */
 public class FindMatches {
 
  /**
- * According to gettext source code, PO fuzzies are created above 60%
- * https://sourceforge.net/p/omegat/feature-requests/1258/
+ * According to gettext source code, PO fuzzy items are created above 60%
+ * <a href="https://sourceforge.net/p/omegat/feature-requests/1258/">RFE#1258</a>
  */
  static final int PENALTY_FOR_FUZZY = 40;
  private static final int PENALTY_FOR_REMOVED = 5;
@@ -127,46 +125,53 @@ public class FindMatches {
  /** Tokens for original string, includes numbers and tags. */
  private Token[] strTokensAll;
 
- // This finder used for search separate segment matches
- private FindMatches separateSegmentMatcher;
-
- private final int fuzzyMatchThreshold;
+ private int fuzzyMatchThreshold = 0;
 
  private final boolean applyThreshold;
 
- /**
- * @param searchExactlyTheSame
- * allows to search similarities with the same text as source
- * segment. This mode used only for separate sentence match in
- * paragraph project, i.e. where source is just part of current
- * source.
- */
+ /** @deprecated {@code allowSeparateSegmentMatch} is ignored; see {@link #searchSegmented}. */
+ @Deprecated
  public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
- boolean searchExactlyTheSame) {
- this(project, maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true);
+ boolean searchExactlyTheSame, boolean applyThreshold) {
+ this(project, maxCount, searchExactlyTheSame, applyThreshold);
  }
 
- public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
- boolean searchExactlyTheSame, boolean applyThreshold) {
+ /**
+ * Constructor.
+ * @param project OmegaT project.
+ * @param maxCount limit the maximum count of the results.
+ * @param searchExactlyTheSame
+ * allows searching similarities with the same text as source
+ * segment. This mode is used only for separate sentence match in
+ * a paragraph project, i.e. where a source is just part of the
+ * current source.
+ * @param applyThreshold
+ * cut off the results at the fuzzy match threshold taken from the
+ * preferences. It is useful when results are used only for display.
+ */
+ public FindMatches(IProject project, int maxCount, boolean searchExactlyTheSame, boolean applyThreshold) {
  this.project = project;
  this.tok = project.getSourceTokenizer();
  this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
  this.maxCount = maxCount;
  this.searchExactlyTheSame = searchExactlyTheSame;
- if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
- separateSegmentMatcher = new FindMatches(project, 1, false, true);
- }
- this.fuzzyMatchThreshold = Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
- OConsts.FUZZY_MATCH_THRESHOLD);
+ this.result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
  this.applyThreshold = applyThreshold;
  }
 
  public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
  IStopped stop) throws StoppedException {
- result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
+ return searchNormal(searchText, requiresTranslation, fillSimilarityData, false, stop);
+ }
 
+ private void init(String searchText) {
+ result.clear();
  srcText = searchText;
  removedText = "";
+ if (applyThreshold) {
+ fuzzyMatchThreshold = Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
+ OConsts.FUZZY_MATCH_THRESHOLD);
+ }
 
  // remove part that is to be removed according to user settings.
  // Rationale: it might be a big string influencing the 'editing
@@ -187,10 +192,16 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
  strTokensNoStem = tokenizeNoStem(srcText);
  strTokensAll = tokenizeAll(srcText);
  /* HP: includes non - word tokens */
+ }
+
+ private List<NearString> searchNormal(String searchText, boolean requiresTranslation, boolean isFillSimilarityData,
+ boolean skipExternal, IStopped stop) throws StoppedException {
+ init(searchText);
 
  // travel by project entries, including orphaned
  if (project.getProjectProperties().isSupportDefaultTranslations()) {
- project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {
+ project.iterateByDefaultTranslations(new IProject.DefaultTranslationsIterator() {
+ @Override
  public void iterate(String source, TMXEntry trans) {
  checkStopped(stop);
  if (!searchExactlyTheSame && source.equals(searchText)) {
@@ -207,7 +218,8 @@ public void iterate(String source, TMXEntry trans) {
  }
  });
  }
- project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {
+ project.iterateByMultipleTranslations(new IProject.MultipleTranslationsIterator() {
+ @Override
  public void iterate(EntryKey source, TMXEntry trans) {
  checkStopped(stop);
  if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
@@ -224,6 +236,14 @@ public void iterate(EntryKey source, TMXEntry trans) {
  }
  });
 
+ if (!skipExternal) {
+ travelExternalTMs(requiresTranslation, stop);
+ }
+ finish(isFillSimilarityData, stop);
+ return result;
+ }
+
+ private void travelExternalTMs(boolean requiresTranslation, IStopped stop) {
  /*
  * Penalty applied for fuzzy matches in another language (if no match in
  * the target language was found).
@@ -259,7 +279,46 @@ public void iterate(EntryKey source, TMXEntry trans) {
  tmen.getCreationDate(), tmen.getChanger(), tmen.getChangeDate(), tmen.getProperties());
  }
  }
+ }
 
+ public List<NearString> searchSegmented(String searchText, IStopped stop) throws StoppedException {
+ FindMatches separateSegmentMatcher = new FindMatches(project, 1, true, true);
+ init(searchText);
+
+ // split paragraph even when segmentation disabled, then find
+ // matches for every segment
+ List<StringBuilder> spaces = new ArrayList<>();
+ List<Rule> brules = new ArrayList<>();
+ Language sourceLang = project.getProjectProperties().getSourceLanguage();
+ Language targetLang = project.getProjectProperties().getTargetLanguage();
+ List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
+ if (segments.size() > 1) {
+ List<String> fsrc = new ArrayList<>(segments.size());
+ List<String> ftrans = new ArrayList<>(segments.size());
+ // multiple segments
+ for (String onesrc : segments) {
+ // find match for separate segment
+ List<NearString> segmentMatch = separateSegmentMatcher.searchNormal(onesrc, true, false, true, stop);
+ if (!segmentMatch.isEmpty()
+ && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
+ fsrc.add(segmentMatch.get(0).source);
+ ftrans.add(segmentMatch.get(0).translation);
+ } else {
+ fsrc.add("");
+ ftrans.add("");
+ }
+ }
+ // glue found sources
+ String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
+ // glue found translations
+ String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
+ processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
+ 0, null);
+ }
+ return result;
+ }
+
+ private void finish(boolean fillSimilarityData, IStopped stop) {
  // travel by all entries for check source file translations
  for (SourceTextEntry ste : project.getAllEntries()) {
  checkStopped(stop);
@@ -269,58 +328,19 @@ public void iterate(EntryKey source, TMXEntry trans) {
  "", 0, "", 0, null);
  }
  }
-
- if (separateSegmentMatcher != null) {
- // split paragraph even when segmentation disabled, then find
- // matches for every segment
- List<StringBuilder> spaces = new ArrayList<StringBuilder>();
- List<Rule> brules = new ArrayList<Rule>();
- Language sourceLang = project.getProjectProperties().getSourceLanguage();
- Language targetLang = project.getProjectProperties().getTargetLanguage();
- List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
- if (segments.size() > 1) {
- List<String> fsrc = new ArrayList<String>(segments.size());
- List<String> ftrans = new ArrayList<String>(segments.size());
- // multiple segments
- for (short i = 0; i < segments.size(); i++) {
- String onesrc = segments.get(i);
-
- // find match for separate segment
- List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
- false, stop);
- if (!segmentMatch.isEmpty()
- && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
- fsrc.add(segmentMatch.get(0).source);
- ftrans.add(segmentMatch.get(0).translation);
- } else {
- fsrc.add("");
- ftrans.add("");
- }
- }
- // glue found sources
- String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
- // glue found translations
- String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
- processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
- 0, null);
- }
- }
-
  if (fillSimilarityData) {
- // fill similarity data only for result
+ // fill similarity data only for the entries in the result list
  for (NearString near : result) {
  // fix for bug 1586397
  byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll,
  tokenizeAll(near.source));
  near.attr = similarityData;
  }
  }
-
- return result;
  }
 
  /**
- * Compare one entry with original entry.
+ * Compare one entry with the original entry.
  *
  * @param key
  * entry to compare
@@ -381,7 +401,7 @@ protected void processEntry(EntryKey key, String source, String translation,
  similarityStem -= realPenaltyForRemoved;
 
  // check if we have chance by first percentage only
- if (!haveChanceToAdd(similarityStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
+ if (noChanceToAdd(similarityStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
  return;
  }
 
@@ -396,7 +416,7 @@ protected void processEntry(EntryKey key, String source, String translation,
  similarityNoStem -= realPenaltyForRemoved;
 
  // check if we have chance by first and second percentages
- if (!haveChanceToAdd(similarityStem, similarityNoStem, Integer.MAX_VALUE)) {
+ if (noChanceToAdd(similarityStem, similarityNoStem, Integer.MAX_VALUE)) {
  return;
  }
 
@@ -411,7 +431,7 @@ protected void processEntry(EntryKey key, String source, String translation,
  simAdjusted -= realPenaltyForRemoved;
 
  // check if we have chance by first, second and third percentages
- if (!haveChanceToAdd(similarityStem, similarityNoStem, simAdjusted)) {
+ if (noChanceToAdd(similarityStem, similarityNoStem, simAdjusted)) {
  return;
  }
 
@@ -437,9 +457,9 @@ protected void processEntry(EntryKey key, String source, String translation,
  * exactly similarity
- * @return true if we have chance
+ * @return true if the candidate has no chance of being added to the result
  */
- protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final int simExactly) {
+ private boolean noChanceToAdd(int simStem, int simNoStem, int simExactly) {
  if (result.size() < maxCount) {
- return true;
+ return false;
  }
  NearString st = result.get(result.size() - 1);
  int chance = Integer.compare(st.scores[0].score, simStem);
@@ -449,7 +469,7 @@ protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final
  if (chance == 0) {
  chance = Integer.compare(st.scores[0].adjustedScore, simExactly);
  }
- return chance != 1;
+ return chance == 1;
  }
 
  /**
@@ -508,9 +528,9 @@ protected void addNearString(final EntryKey key, final String source, final Stri
  /*
  * Methods for tokenize strings with caching.
  */
- Map<String, Token[]> tokenizeStemCache = new HashMap<String, Token[]>();
- Map<String, Token[]> tokenizeNoStemCache = new HashMap<String, Token[]>();
- Map<String, Token[]> tokenizeAllCache = new HashMap<String, Token[]>();
+ Map<String, Token[]> tokenizeStemCache = new HashMap<>();
+ Map<String, Token[]> tokenizeNoStemCache = new HashMap<>();
+ Map<String, Token[]> tokenizeAllCache = new HashMap<>();
 
  public Token[] tokenizeStem(String str) {
  Token[] tokens = tokenizeStemCache.get(str);
@@ -552,8 +572,8 @@ protected void checkStopped(IStopped stop) throws StoppedException {
  }
 
  /**
- * Process will throw this exception if it stopped.All callers must catch it
- * and just skip.
+ * The process throws this exception when it is stopped. All callers must
+ * catch it and just skip.
  */
  @SuppressWarnings("serial")
  public static class StoppedException extends RuntimeException {