Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for handling unbound prefixes on tag names and attribute keys #1682

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
51 changes: 51 additions & 0 deletions src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class Attribute implements Map.Entry<String, String>, Cloneable {
};

private String key;
// the key with every character that is not a digit or an upper/lower-case letter replaced by 'U' followed by
// six uppercase hex digits (e.g. "xlink:href" -> "xlinkU00003Ahref"); letter case is kept as in the key
private String convertedKey;
@Nullable private String val;
@Nullable Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface

Expand All @@ -48,10 +49,52 @@ public Attribute(String key, @Nullable String val, @Nullable Attributes parent)
key = key.trim();
Validate.notEmpty(key); // trimming could potentially make empty, so validate here
this.key = key;
this.convertedKey = convertSymbol(key);
this.val = val;
this.parent = parent;
}

/**
 * Escape an attribute key so that it only contains digits and upper/lower-case letters, in the manner of the
 * HTML Living Standard's "coercing an HTML DOM into an infoset" rules
 * (https://html.spec.whatwg.org/#coercing-an-html-dom-into-an-infoset).
 * <p>
 * Every other character is replaced by {@code U} followed by the six-digit, uppercase hexadecimal value of its
 * Unicode code point; e.g. {@code xlink:href} becomes {@code xlinkU00003Ahref}. The key is walked by code point,
 * so a character outside the Basic Multilingual Plane yields a single escape (U+1F600 becomes {@code U01F600})
 * rather than one escape per UTF-16 surrogate.
 *
 * @param key attribute key; case is preserved. Must not be null.
 * @return the converted key; an empty string if {@code key} is empty.
 */
public String convertSymbol(String key) {
    final String hexDigits = "0123456789ABCDEF";
    StringBuilder converted = new StringBuilder(key.length());

    for (int i = 0; i < key.length(); ) {
        int codePoint = key.codePointAt(i);
        i += Character.charCount(codePoint); // step over both halves of a surrogate pair

        boolean keep = Character.isDigit(codePoint)
            || Character.isLowerCase(codePoint)
            || Character.isUpperCase(codePoint);

        if (keep) {
            converted.appendCodePoint(codePoint);
        } else {
            // 'U' + six hex digits; the largest code point (0x10FFFF) fits in exactly six nibbles
            converted.append('U');
            for (int shift = 20; shift >= 0; shift -= 4) {
                converted.append(hexDigits.charAt((codePoint >> shift) & 0xF));
            }
        }
    }

    return converted.toString();
}

/**
Get the attribute key.
@return the attribute key
Expand All @@ -60,6 +103,14 @@ public String getKey() {
return key;
}

/**
 Get the attribute key with every character other than digits and upper/lower-case letters escaped, e.g.
 {@code xlink:href} is returned as {@code xlinkU00003Ahref}.
 <p>The result is derived from the current key on each call rather than from the value cached at construction,
 so it does not go stale if the key is changed after the attribute was created.</p>
 @return the attribute converted key
 */
public String getConvertedKey() {
    return convertSymbol(key);
}

/**
Set the attribute key; case is preserved.
@param key the new key; must not be null
Expand Down
52 changes: 52 additions & 0 deletions src/main/java/org/jsoup/parser/Tag.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public class Tag implements Cloneable {

private String tagName;
private String normalName; // always the lower case version of this tag, regardless of case preservation mode
// the tag name with every character that is not a digit or an upper/lower-case letter replaced by 'U' followed by
// six uppercase hex digits (e.g. "test:h1" -> "testU00003Ah1"); computed from the name passed to the constructor
private String unicodeName;
private boolean isBlock = true; // block
private boolean formatAsBlock = true; // should be formatted as a block
private boolean empty = false; // can hold nothing; e.g. img
Expand All @@ -27,6 +28,7 @@ public class Tag implements Cloneable {
private Tag(String tagName) {
    // all three name forms are derived directly from the name as supplied
    this.tagName = tagName;
    this.normalName = Normalizer.lowerCase(tagName);
    this.unicodeName = convertSymbol(tagName);
}

/**
Expand All @@ -46,6 +48,56 @@ public String normalName() {
return normalName;
}

/**
 * Get the escaped form of this tag's name, in which characters other than digits and upper/lower-case letters
 * appear as {@code U} plus six hex digits (e.g. {@code test:h1} is {@code testU00003Ah1}).
 * @return the tag's converted name.
 */
public String unicodeName() {
    return this.unicodeName;
}

/**
 * Escape a tag name so that it only contains digits and upper/lower-case letters, in the manner of the
 * HTML Living Standard's "coercing an HTML DOM into an infoset" rules
 * (https://html.spec.whatwg.org/#coercing-an-html-dom-into-an-infoset).
 * <p>
 * Every other character is replaced by {@code U} followed by the six-digit, uppercase hexadecimal value of its
 * Unicode code point; e.g. {@code test:h1} becomes {@code testU00003Ah1}. The name is walked by code point, so a
 * character outside the Basic Multilingual Plane yields a single escape (U+1F600 becomes {@code U01F600})
 * rather than one escape per UTF-16 surrogate.
 *
 * @param tagName Name of tag, e.g. "p", case is preserved. Must not be null.
 * @return the tag's converted name; an empty string if {@code tagName} is empty.
 */
public String convertSymbol(String tagName) {
    final String hexDigits = "0123456789ABCDEF";
    StringBuilder converted = new StringBuilder(tagName.length());

    for (int i = 0; i < tagName.length(); ) {
        int codePoint = tagName.codePointAt(i);
        i += Character.charCount(codePoint); // step over both halves of a surrogate pair

        boolean keep = Character.isDigit(codePoint)
            || Character.isLowerCase(codePoint)
            || Character.isUpperCase(codePoint);

        if (keep) {
            converted.appendCodePoint(codePoint);
        } else {
            // 'U' + six hex digits; the largest code point (0x10FFFF) fits in exactly six nibbles
            converted.append('U');
            for (int shift = 20; shift >= 0; shift -= 4) {
                converted.append(hexDigits.charAt((codePoint >> shift) & 0xF));
            }
        }
    }

    return converted.toString();
}


/**
* Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
* <p>
Expand Down
27 changes: 27 additions & 0 deletions src/test/java/org/jsoup/parser/AttributeParseTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,31 @@ public class AttributeParseTest {
doc = Jsoup.parse(html, "", Parser.xmlParser());
assertEquals("<img onerror=\"doMyJob\" />", doc.html());
}

// An attribute key with an unbound prefix, such as `xlink:href`, must keep its raw key and value
// while also exposing the escaped form of the key.
// See issue #1341: https://github.com/jhy/jsoup/issues/1341
@Test public void handleUnboundPrefixofXlink() {
    String html = "<!doctype html>\n" +
        "<html lang=\"de\">\n" +
        " <head>\n" +
        "\n" +
        " </head>\n" +
        " <body>\n" +
        "\t<test:h1>UnboundPrefix</test:h1>\n" +
        "\t<svg width=\"180\" height=\"180\" xlink:href=\"UnboundPrefix\">\n" +
        " \t<rect x=\"20\" y=\"20\" rx=\"20\" ry=\"20\" width=\"100\" height=\"100\" style=\"fill:lightgray; stroke:#1c87c9; stroke-width:4;\"/>\n" +
        " \t</svg>\n" +
        " </body>\n" +
        "</html>\n";

    Document parsed = Jsoup.parse(html);
    Element body = parsed.select("body").get(0);
    // second child of <body> is the <svg>; its third attribute is xlink:href
    Element svg = body.children().get(1);

    assertEquals("xlinkU00003Ahref", svg.attributes().asList().get(2).getConvertedKey());

    assertEquals("UnboundPrefix", svg.attributes().asList().get(2).getValue());
    assertEquals("UnboundPrefix", svg.attributes().get("xlink:href"));
}
}
28 changes: 28 additions & 0 deletions src/test/java/org/jsoup/parser/TagTest.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
package org.jsoup.parser;

import org.jsoup.Jsoup;
import org.jsoup.MultiLocaleExtension.MultiLocaleTest;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.jupiter.api.Test;

import java.util.List;
import java.util.Locale;

import static org.junit.jupiter.api.Assertions.*;
Expand Down Expand Up @@ -81,4 +85,28 @@ public void canBeInsensitive(Locale locale) {
assertTrue(Tag.isKnownTag("div"));
assertFalse(Tag.isKnownTag("explain"));
}

// A tag name containing a symbol such as `:` must keep its raw name
// while also exposing the escaped form of the name.
// See issue #1341: https://github.com/jhy/jsoup/issues/1341
@Test public void handleSymbolTags() {
    String html = "<!doctype html>\n" +
        "<html lang=\"de\">\n" +
        " <head>\n" +
        "\n" +
        " </head>\n" +
        " <body>\n" +
        "\t<test:h1>UnboundPrefix</test:h1>\n" +
        "\t<svg width=\"180\" height=\"180\" xlink:href=\"UnboundPrefix\">\n" +
        " \t<rect x=\"20\" y=\"20\" rx=\"20\" ry=\"20\" width=\"100\" height=\"100\" style=\"fill:lightgray; stroke:#1c87c9; stroke-width:4;\"/>\n" +
        " \t</svg>\n" +
        " </body>\n" +
        "</html>\n";

    Document parsed = Jsoup.parse(html);
    Element body = parsed.select("body").get(0);
    // first child of <body> is the <test:h1> element
    Element prefixed = body.children().get(0);

    assertEquals("testU00003Ah1", prefixed.tag().unicodeName());
    assertEquals("test:h1", prefixed.tagName());
}
}