Skip to content

Commit

Permalink
Modify default encoding detector
Browse files Browse the repository at this point in the history
Replace HtmlEncodingDetector with StandardHtmlEncodingDetector
Adjust some test cases
  • Loading branch information
PeterAlfredLee committed Aug 13, 2020
1 parent 5021f34 commit 2a1b6a5
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

org.apache.tika.parser.html.HtmlEncodingDetector
org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
org.apache.tika.parser.txt.UniversalEncodingDetector
org.apache.tika.parser.txt.Icu4jEncodingDetector
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
Expand All @@ -52,7 +53,7 @@ public void testDefault() {
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
assertEquals(3, detectors.size());
assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector);
assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
<!-- Explicitly request default parsers -->
<parsers/>
<encodingDetectors>
<!-- All detectors except HtmlEncodingDetector -->
<!-- All detectors except StandardHtmlEncodingDetector -->
<encodingDetector class="org.apache.tika.detect.DefaultEncodingDetector">
<encodingDetector-exclude class="org.apache.tika.parser.html.HtmlEncodingDetector"/>
<encodingDetector-exclude class="org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector"/>
</encodingDetector>
<!-- One other detector, to check ordering -->
<encodingDetector class="org.apache.tika.detect.NonDetectingEncodingDetector"/>
Expand Down

0 comments on commit 2a1b6a5

Please sign in to comment.