Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update tmx writer, escaper, and unit test #1049

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/org/omegat/util/TMXWriter2.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import static com.ctc.wstx.api.WstxOutputProperties.P_OUTPUT_ESCAPE_CR;
import static org.codehaus.stax2.XMLOutputFactory2.P_AUTOMATIC_EMPTY_ELEMENTS;
import static org.codehaus.stax2.XMLOutputFactory2.P_TEXT_ESCAPER;

import java.io.BufferedOutputStream;
import java.io.File;
Expand Down Expand Up @@ -109,6 +110,7 @@ public TMXWriter2(final File file, final Language sourceLanguage, final Language
factory = XMLOutputFactory.newInstance();
factory.setProperty(P_OUTPUT_ESCAPE_CR, false);
factory.setProperty(P_AUTOMATIC_EMPTY_ELEMENTS, true);
factory.setProperty(P_TEXT_ESCAPER, new TmxEscapingWriterFactory());

out = new BufferedOutputStream(Files.newOutputStream(file.toPath()));
xml = factory.createXMLStreamWriter(out, StandardCharsets.UTF_8.name());
Expand Down
137 changes: 137 additions & 0 deletions src/org/omegat/util/TmxEscapingWriterFactory.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* OmegaT - Computer Assisted Translation (CAT) tool
* with fuzzy matching, translation memory, keyword search,
* glossaries, and translation leveraging into updated projects.
*
* Copyright (C) 2024 Hiroshi Miura
* Home page: https://www.omegat.org/
* Support center: https://omegat.org/support
*
* This file is part of OmegaT.
*
* OmegaT is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OmegaT is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package org.omegat.util;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.codehaus.stax2.io.EscapingWriterFactory;
import org.jetbrains.annotations.NotNull;

/**
 * Factory of text-escaping writers for TMX output.
 * <p>
 * It is registered on the StAX output factory through the
 * {@code P_TEXT_ESCAPER} property, so that element text is escaped by
 * {@link EscapeWriter} instead of the serializer's built-in escaping.
 */
public class TmxEscapingWriterFactory implements EscapingWriterFactory {

    /**
     * Create an escaping writer on top of a character stream.
     *
     * @param writer
     *            underlying writer that receives the escaped text.
     * @param s
     *            name of the encoding used by the underlying writer; it is
     *            not needed here because the escaper works on characters.
     * @return writer that escapes TEXT content.
     */
    @Override
    public Writer createEscapingWriterFor(@NotNull final Writer writer, final String s) {
        return new EscapeWriter(writer);
    }

    /**
     * Create an escaping writer on top of a byte stream.
     *
     * @param outputStream
     *            underlying stream that receives the encoded, escaped text.
     * @param s
     *            name of the encoding to produce; UTF-8 is used when it is
     *            {@code null} or empty.
     * @return writer that escapes TEXT content.
     * @throws IllegalArgumentException
     *             when the encoding name is illegal or not supported by
     *             this JVM (as raised by {@code Charset.forName}).
     */
    @Override
    public Writer createEscapingWriterFor(@NotNull final OutputStream outputStream, final String s) {
        // Honor the requested encoding; hard-coding UTF-8 would corrupt the
        // output of documents that are declared with any other encoding.
        final java.nio.charset.Charset charset = (s == null || s.isEmpty())
                ? StandardCharsets.UTF_8
                : java.nio.charset.Charset.forName(s);
        return new EscapeWriter(new OutputStreamWriter(outputStream, charset));
    }

    /**
     * Writer that escapes characters of XML TEXT content and forwards the
     * result to a delegate writer.
     */
    public static class EscapeWriter extends Writer {

        // Copy from woodstox:com.ctc.wstx.sw.BufferingXmlWriter
        /** Non-zero entry means the character (code below 256) must be escaped. */
        private static final byte[] QUOTABLE_TEXT_CHARS;
        /** Characters at or above this code are always written as a reference. */
        private static final int HIGH_ENC = 0xFFFE;

        static {
            byte[] q = new byte[256];
            // Control characters 0x00-0x1F and 0x7F-0x9F are escaped by default.
            Arrays.fill(q, 0, 32, (byte) 1);
            Arrays.fill(q, 127, 160, (byte) 1);
            // TAB and LF are written as-is.
            q['\t'] = 0;
            q['\n'] = 0;
            q['<'] = 1;
            q['>'] = 1;
            q['&'] = 1;
            // CR is left as-is on Windows and escaped elsewhere.
            q['\r'] = (byte) (Platform.isWindows ? 0 : 1);
            QUOTABLE_TEXT_CHARS = q;
        }

        private final Writer delegate;

        /**
         * @param delegate
         *            writer that receives the escaped output.
         */
        public EscapeWriter(@NotNull Writer delegate) {
            this.delegate = delegate;
        }

        /**
         * Return the replacement for a character, or {@code null} when the
         * character can be written unchanged.
         */
        private static String entityFor(final int c) {
            if (c < 256) {
                if (QUOTABLE_TEXT_CHARS[c] == 0) {
                    return null;
                }
                switch (c) {
                case '<':
                    return "&lt;";
                case '>':
                    return "&gt;";
                case '&':
                    return "&amp;";
                default:
                    return String.format("&#x%02x;", c);
                }
            }
            return c >= HIGH_ENC ? String.format("&#x%04x;", c) : null;
        }

        /**
         * Wrap Writer and escape characters for TEXT output.
         * <p>
         * The characters {@code <}, {@code >} and {@code &} are replaced
         * with predefined entities; other characters flagged in the escape
         * table, and characters at or above 0xFFFE, are replaced with
         * hexadecimal character references. Quotes are not escaped, because
         * this does not consider the text as an attribute value.
         *
         * @param cbuf
         *            Array of characters
         * @param off
         *            Offset from which to start writing characters
         * @param len
         *            Number of characters to write
         * @throws IOException
         *             when underlying writer object raises.
         */
        @Override
        public void write(@NotNull final char[] cbuf, final int off, final int len) throws IOException {
            final int end = off + len;
            // Start of the pending run of characters that need no escaping.
            int runStart = off;
            for (int i = off; i < end; i++) {
                final String ent = entityFor(cbuf[i]);
                if (ent == null) {
                    continue;
                }
                // Flush the unescaped run in one call, then the replacement.
                if (i > runStart) {
                    delegate.write(cbuf, runStart, i - runStart);
                }
                delegate.write(ent);
                runStart = i + 1;
            }
            if (end > runStart) {
                delegate.write(cbuf, runStart, end - runStart);
            }
        }

        /** Flush the underlying writer. */
        @Override
        public void flush() throws IOException {
            delegate.flush();
        }

        /** Close the underlying writer. */
        @Override
        public void close() throws IOException {
            delegate.close();
        }
    }
}
23 changes: 11 additions & 12 deletions test/data/tmx/test-save-tmx14.tmx
Original file line number Diff line number Diff line change
@@ -1,41 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx SYSTEM "tmx14.dtd">
<tmx version="1.4">
<header datatype="plaintext" srclang="en-US" adminlang="EN-US" o-tmf="OmegaT TMX" segtype="paragraph" creationtoolversion="test" creationtool="test"/>
<header datatype="plaintext" srclang="en-US" adminlang="EN-US" o-tmf="OmegaT TMX" segtype="paragraph"
creationtoolversion="test" creationtool="test"/>
<body>
<tu>
<tuv xml:lang="en-US">
<seg>source</seg>
</tuv>
<tuv xml:lang="be-BY">
<seg>target</seg>
</tuv>
<tuv xml:lang="en-US">
<seg>source</seg>
</tuv>
<tuv xml:lang="be-BY">
<seg>target</seg>
</tuv>
</tu>

<tu>
<tuv xml:lang="en-US">
<seg>1<ph x='1'>&lt;a1/&gt;</ph>2</seg>
<seg>1<ph x="1">&lt;a1/&gt;</ph>2</seg>
</tuv>
<tuv xml:lang="be-BY">
<seg>zz</seg>
</tuv>
</tu>
<tu>
<tuv xml:lang="en-US">
<seg>3<bpt i='1' x='1'>&lt;a1&gt;</bpt>4<ept i='1'>&lt;/a1&gt;</ept>5</seg>
<seg>3<bpt i="1" x="1">&lt;a1&gt;</bpt>4<ept i="1">&lt;/a1&gt;</ept>5</seg>
</tuv>
<tuv xml:lang="be-BY">
<seg>zz</seg>
</tuv>
</tu>
<tu>
<tuv xml:lang="en-US">
<seg>6<it pos='begin' x='1'>&lt;a1&gt;</it>7</seg>
<seg>6<it pos="begin" x="1">&lt;a1&gt;</it>7</seg>
</tuv>
<tuv xml:lang="be-BY">
<seg>zz</seg>
</tuv>
</tu>

</body>
</tmx>
3 changes: 2 additions & 1 deletion test/src/org/omegat/util/TMXWriterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
**************************************************************************/
package org.omegat.util;

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

Expand Down Expand Up @@ -178,7 +179,7 @@ public void testEOLwrite() throws Exception {
text.append(buffer, 0, len);
}
}
assertTrue(text.toString().contains("tar" + eol + "get"));
assertThat(text.toString()).as("Preserve EOL mark in text.").contains("tar" + eol + "get");

final List<String> trs = new ArrayList<>();
load(null, trs, true, false);
Expand Down
132 changes: 132 additions & 0 deletions test/src/org/omegat/util/TmxEscapingWriterTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* OmegaT - Computer Assisted Translation (CAT) tool
* with fuzzy matching, translation memory, keyword search,
* glossaries, and translation leveraging into updated projects.
*
* Copyright (C) 2024 Hiroshi Miura
* Home page: https://www.omegat.org/
* Support center: https://omegat.org/support
*
* This file is part of OmegaT.
*
* OmegaT is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OmegaT is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package org.omegat.util;

import static org.assertj.core.api.Assertions.assertThat;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

import org.junit.Before;
import org.junit.Test;

/**
 * Unit tests for the text escaping done by the writer that
 * {@link TmxEscapingWriterFactory} creates.
 */
public class TmxEscapingWriterTest {

    private ByteArrayOutputStream outputStream;
    private Writer writer;

    /**
     * Create an escaping writer over an in-memory stream; no encoding name
     * is given to the factory.
     */
    @Before
    public void setUp() throws UnsupportedEncodingException {
        outputStream = new ByteArrayOutputStream();
        var factory = new TmxEscapingWriterFactory();
        writer = factory.createEscapingWriterFor(outputStream, null);
    }

    /** Decode everything written so far as UTF-8. */
    private String result() {
        return outputStream.toString(StandardCharsets.UTF_8);
    }

    /**
     * Test basic characters, which should be written as-is.
     *
     * @throws IOException I/O error happened.
     */
    @Test
    public void escapeBasic() throws IOException {
        writer.write("Hello world!\n");
        writer.flush();
        assertThat(result()).isEqualTo("Hello world!\n");
    }

    /**
     * Test signs to be escaped: less-than, greater-than and ampersand
     * become entities, single and double quotes stay as-is.
     *
     * @throws IOException I/O error happened.
     */
    @Test
    public void escapeToEntities() throws IOException {
        // The input includes '&' so that all three signs named in the
        // description are really exercised.
        writer.write("'<escape>&\"");
        writer.flush();
        assertThat(result())
                .as("Check escape of < & and > signs and as-is for single/double quote")
                .isEqualTo("'&lt;escape&gt;&amp;\"");
    }

    /**
     * Test adjacent characters to be escaped, and an empty write.
     *
     * @throws IOException I/O error happened.
     */
    @Test
    public void escapeConsecutiveSpecials() throws IOException {
        writer.write("");
        writer.write("<<&>>");
        writer.flush();
        assertThat(result())
                .as("Check every adjacent special sign is escaped and none is dropped")
                .isEqualTo("&lt;&lt;&amp;&gt;&gt;");
    }

    /**
     * Test NBSP (U+00A0), which should not be escaped.
     *
     * @throws IOException I/O error happened.
     */
    @Test
    public void testNBSP() throws IOException {
        writer.write("[\u00A0]");
        writer.flush();
        assertThat(result())
                .as("Check NBSP is not escaped.")
                .isEqualTo("[\u00A0]");
    }

    /**
     * Test control character No-Break-Here (U+0083), which should be
     * escaped as a character reference.
     *
     * @throws IOException I/O error happened.
     */
    @Test
    public void testNBH() throws IOException {
        writer.write("\u0083");
        writer.flush();
        assertThat(result())
                .as("Check NO_BREAK_HERE control character to be escaped")
                .isEqualTo("&#x83;");
    }

    /**
     * Test an emoji that needs a surrogate pair, which should be written
     * as-is and not escaped.
     *
     * @throws IOException I/O error happened.
     */
    @Test
    public void testSurrogatePair() throws IOException {
        writer.write("[😀]");
        writer.flush();
        assertThat(result())
                .as("Check emoji and flag that requires surrogate pair for encode.")
                .isEqualTo("[😀]");
    }

    /**
     * Test the noncharacter U+FFFE (the byte-swapped form of the BOM
     * U+FEFF), which should be escaped as a character reference.
     *
     * @throws IOException I/O error happened.
     */
    @Test
    public void testInvalidChar() throws IOException {
        writer.write((char) 0xFFFE);
        writer.flush();
        assertThat(result())
                .as("check BOM mark to be escaped when appeared in TEXT.")
                .isEqualToIgnoringCase("&#xfffe;");
    }
}
Loading