[BUGS#1248] feat: add regression test
Signed-off-by: Hiroshi Miura <[email protected]>
miurahr committed Feb 17, 2024
1 parent 2f99f45 commit e42b6d3
Showing 2 changed files with 65 additions and 7 deletions.
16 changes: 16 additions & 0 deletions test/data/tmx/penalty-010/segment_1.tmx
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE tmx PUBLIC "-//LISA OSCAR:1998//DTD for Translation Memory eXchange//EN" "tmx14.dtd">
+
+<tmx version="1.4">
+<header creationtoolversion="0.1" adminlang="en" segtype="paragraph" creationdate="20230930T155211Z" datatype="unknown" srclang="ja" creationtool="txt2tmx" o-tmf="TextEdit"></header>
+<body>
+<tu>
+<tuv xml:lang="fr">
+<seg>weird behavior</seg>
+</tuv>
+<tuv xml:lang="ja">
+<seg>地力の搾取と浪費が現われる。(1)</seg>
+</tuv>
+</tu>
+</body>
+</tmx>
56 changes: 49 additions & 7 deletions test/src/org/omegat/core/statistics/FindMatchesTest.java
@@ -56,11 +56,14 @@
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.NearString;
+import org.omegat.core.segmentation.Rule;
import org.omegat.core.segmentation.SRX;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.tokenizer.DefaultTokenizer;
import org.omegat.tokenizer.ITokenizer;
+import org.omegat.tokenizer.LuceneCJKTokenizer;
import org.omegat.tokenizer.LuceneEnglishTokenizer;
+import org.omegat.tokenizer.LuceneFrenchTokenizer;
import org.omegat.util.Language;
import org.omegat.util.OConsts;
import org.omegat.util.Preferences;
@@ -71,6 +74,7 @@ public class FindMatchesTest {

private static final File TMX_EN_US_SR = new File("test/data/tmx/en-US_sr.tmx");
private static final File TMX_EN_US_GB_SR = new File("test/data/tmx/en-US_en-GB_fr_sr.tmx");
+private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx");
private static Path tmpDir;

/**
@@ -98,7 +102,7 @@ public void testSearchRFE1578() throws Exception {
prop.setSentenceSegmentingEnabled(false);
IProject project = new TestProject(prop, TMX_EN_US_SR);
Core.setProject(project);
-Core.setSegmenter(new Segmenter(new SRX()));
+Core.setSegmenter(new Segmenter(SRX.getDefault()));
IStopped iStopped = () -> false;
FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
List<NearString> result = finder.search("XXX", true, true, iStopped);
@@ -133,7 +137,7 @@ public void testSearchRFE1578_2() throws Exception {
prop.setSentenceSegmentingEnabled(false);
IProject project = new TestProject(prop, TMX_EN_US_GB_SR);
Core.setProject(project);
-Core.setSegmenter(new Segmenter(new SRX()));
+Core.setSegmenter(new Segmenter(SRX.getDefault()));
IStopped iStopped = () -> false;
FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
// Search source "XXx" in en-US
@@ -146,6 +150,34 @@
assertEquals(3, result.size());
}

+@Test
+public void testSearchBUGS1248() throws Exception {
+ProjectProperties prop = new ProjectProperties(tmpDir.toFile());
+prop.setSourceLanguage("ja");
+prop.setTargetLanguage("fr");
+prop.setSupportDefaultTranslations(true);
+prop.setSentenceSegmentingEnabled(false);
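+// Tokenizers suited to the ja -> fr pair: CJK for the source, French for the target.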
+IProject project = new TestProject(prop, TMX_SEGMENT, new LuceneCJKTokenizer(), new LuceneFrenchTokenizer());
+Core.setProject(project);
+Core.setSegmenter(new Segmenter(SRX.getDefault()));
+String srcText = "地力の搾取と浪費が現われる。(1)";
+// Sanity check: the default SRX rules split the source text into two segments.
+List<StringBuilder> spaces = new ArrayList<>();
+List<Rule> brules = new ArrayList<>();
+Language sourceLanguage = prop.getSourceLanguage();
+List<String> segments = Core.getSegmenter().segment(sourceLanguage, srcText, spaces, brules);
+assertEquals(2, segments.size());
+// Searching the full source text should still return exactly one TM match.
+IStopped iStopped = () -> false;
+FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
+List<NearString> result = finder.search(srcText, true, true, iStopped);
+assertEquals(srcText, result.get(0).source);
+assertEquals("TM", result.get(0).comesFrom.name());
+assertEquals(1, result.size());
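+// The TMX resides in a "penalty-010" folder, so a 10% penalty should apply (100 -> 90).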
+assertEquals(90, result.get(0).scores[0].score);
+assertEquals("weird behavior", result.get(0).translation);
+}

@BeforeClass
public static void setUpClass() throws Exception {
tmpDir = Files.createTempDirectory("omegat");
@@ -164,12 +196,20 @@ public void setUp() throws Exception {
}

static class TestProject extends NotLoadedProject implements IProject {
-private ProjectProperties prop;
-private File testTmx;
+private final ProjectProperties prop;
+private final File testTmx;
+private final ITokenizer sourceTokenizer;
+private final ITokenizer targetTokenizer;

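+// Convenience constructor: defaults to the tokenizers this class previously hard-coded.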
+TestProject(ProjectProperties prop, File testTmx) {
+this(prop, testTmx, new LuceneEnglishTokenizer(), new DefaultTokenizer());
+}

-TestProject(final ProjectProperties prop, final File testTmx) {
+TestProject(ProjectProperties prop, File testTmx, ITokenizer source, ITokenizer target) {
this.prop = prop;
this.testTmx = testTmx;
+sourceTokenizer = source;
+targetTokenizer = target;
}

@Override
@@ -182,17 +222,19 @@ public List<SourceTextEntry> getAllEntries() {
List<SourceTextEntry> ste = new ArrayList<>();
ste.add(new SourceTextEntry(new EntryKey("source.txt", "XXX", null, "", "", null),
1, null, null, new ArrayList<>()));
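+// Add a project entry containing the same Japanese source text as the TMX.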
ste.add(new SourceTextEntry(new EntryKey("source.txt", "地力の搾取と浪費が現われる。(1)", null, "", "", null),
1, null, null, Collections.emptyList()));
return ste;
}

@Override
public ITokenizer getSourceTokenizer() {
-return new LuceneEnglishTokenizer();
+return sourceTokenizer;
};

@Override
public ITokenizer getTargetTokenizer() {
-return new DefaultTokenizer();
+return targetTokenizer;
}

@Override