Skip to content

Commit

Permalink
Modify default encoding detector
Browse files Browse the repository at this point in the history
Replace HtmlEncodingDetector with StandardHtmlEncodingDetector
Adjust some test cases
  • Loading branch information
PeterAlfredLee committed Dec 5, 2020
1 parent 0e7b475 commit 99eaa8a
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
Expand All @@ -56,7 +57,7 @@ public void testDefault() {
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
assertEquals(3, detectors.size());
assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector);
assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
<!-- Explicitly request default parsers -->
<parsers/>
<encodingDetectors>
<!-- All detectors except HtmlEncodingDetector -->
<!-- All detectors except StandardHtmlEncodingDetector -->
<encodingDetector class="org.apache.tika.detect.DefaultEncodingDetector">
<encodingDetector-exclude class="org.apache.tika.parser.html.HtmlEncodingDetector"/>
<encodingDetector-exclude class="org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector"/>
</encodingDetector>
<!-- One other detector, to check ordering -->
<encodingDetector class="org.apache.tika.detect.NonDetectingEncodingDetector"/>
Expand Down

0 comments on commit 99eaa8a

Please sign in to comment.