jhy · duanyang25 · Dec 4, 2021 · Dec 7, 2021 · Dec 7, 2021 · Dec 7, 2021
diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java
@@ -24,6 +24,7 @@ public class Attribute implements Map.Entry<String, String>, Cloneable {
  };
 
  private String key;
+ private String convertedKey; // escaped form of the key as returned by convertSymbol(key): digits and upper/lower case letters are kept, anything else becomes 'U' plus six uppercase hex digits (':' -> "U00003A")
  @Nullable private String val;
  @Nullable Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface
 
@@ -48,10 +49,52 @@ public Attribute(String key, @Nullable String val, @Nullable Attributes parent)
  key = key.trim();
  Validate.notEmpty(key); // trimming could potentially make empty, so validate here
  this.key = key;
+ this.convertedKey = convertSymbol(key);
  this.val = val;
  this.parent = parent;
  }
 
+ /**
+  * Escape an attribute key so that it consists only of digits and upper or lower case letters.
+  * Every other code point is replaced by {@code U} followed by its value as six uppercase
+  * hexadecimal digits, the mapping described by the HTML Living Standard at
+  * https://html.spec.whatwg.org/#coercing-an-html-dom-into-an-infoset. For example
+  * {@code xlink:href} becomes {@code xlinkU00003Ahref}.
+  * <p>
+  * Characters outside the Basic Multilingual Plane are treated as a single code point, so they
+  * produce one escape rather than one per UTF-16 surrogate.
+  *
+  * @param key attribute key; case is preserved.
+  * @return the converted key.
+  */
+ public String convertSymbol(String key) {
+     // a builder avoids the quadratic cost of concatenating Strings inside the loop
+     StringBuilder converted = new StringBuilder(key.length() + 16);
+
+     // step by code point, not by char, so a surrogate pair is escaped once as its real value
+     for (int i = 0; i < key.length(); ) {
+         int codePoint = key.codePointAt(i);
+         i += Character.charCount(codePoint);
+
+         boolean keep = Character.isDigit(codePoint)
+             || Character.isLowerCase(codePoint)
+             || Character.isUpperCase(codePoint);
+
+         if (keep) {
+             converted.appendCodePoint(codePoint);
+         } else {
+             String hex = Integer.toHexString(codePoint).toUpperCase();
+             converted.append('U');
+             for (int pad = hex.length(); pad < 6; pad++) {
+                 converted.append('0');
+             }
+             converted.append(hex);
+         }
+     }
+
+     return converted.toString();
+ }
+
  /**
  Get the attribute key.
  @return the attribute key
@@ -60,6 +103,14 @@ public String getKey() {
  return key;
  }
 
+ /**
+  Get the converted form of the attribute key held by this attribute.
+  @return the converted attribute key
+  */
+ public String getConvertedKey() {
+     return this.convertedKey;
+ }
+
  /**
  Set the attribute key; case is preserved.
  @param key the new key; must not be null

diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java
@@ -16,6 +16,7 @@ public class Tag implements Cloneable {
 
  private String tagName;
  private String normalName; // always the lower case version of this tag, regardless of case preservation mode
+ private String unicodeName; // escaped form of tagName as returned by convertSymbol(tagName): digits and letters keep their original case, any other character becomes 'U' plus six uppercase hex digits (':' -> "U00003A")
  private boolean isBlock = true; // block
  private boolean formatAsBlock = true; // should be formatted as a block
  private boolean empty = false; // can hold nothing; e.g. img
@@ -27,6 +28,7 @@ public class Tag implements Cloneable {
  private Tag(String tagName) {
  this.tagName = tagName;
  normalName = Normalizer.lowerCase(tagName);
+ unicodeName = convertSymbol(tagName); // computed once from the raw tagName (not from normalName), so the original case is kept
  }
 
  /**
@@ -46,6 +48,56 @@ public String normalName() {
  return normalName;
  }
 
+ /**
+  * Get the form of this tag's name in which symbols have been converted to Unicode escapes.
+  * @return the tag's converted name.
+  */
+ public String unicodeName() {
+     return this.unicodeName;
+ }
+
+ /**
+  * Escape a tag name so that it consists only of digits and upper or lower case letters.
+  * Every other code point is replaced by {@code U} followed by its value as six uppercase
+  * hexadecimal digits, the mapping described by the HTML Living Standard at
+  * https://html.spec.whatwg.org/#coercing-an-html-dom-into-an-infoset. For example
+  * {@code test:h1} becomes {@code testU00003Ah1}.
+  * <p>
+  * Characters outside the Basic Multilingual Plane are treated as a single code point, so they
+  * produce one escape rather than one per UTF-16 surrogate.
+  *
+  * @param tagName Name of tag, e.g. "p", case is preserved.
+  * @return the tag's converted name.
+  */
+ public String convertSymbol(String tagName) {
+     // a builder avoids the quadratic cost of concatenating Strings inside the loop
+     StringBuilder converted = new StringBuilder(tagName.length() + 16);
+
+     // step by code point, not by char, so a surrogate pair is escaped once as its real value
+     for (int i = 0; i < tagName.length(); ) {
+         int codePoint = tagName.codePointAt(i);
+         i += Character.charCount(codePoint);
+
+         boolean keep = Character.isDigit(codePoint)
+             || Character.isLowerCase(codePoint)
+             || Character.isUpperCase(codePoint);
+
+         if (keep) {
+             converted.appendCodePoint(codePoint);
+         } else {
+             String hex = Integer.toHexString(codePoint).toUpperCase();
+             converted.append('U');
+             for (int pad = hex.length(); pad < 6; pad++) {
+                 converted.append('0');
+             }
+             converted.append(hex);
+         }
+     }
+
+     return converted.toString();
+ }
+
+
  /**
  * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
  * <p>

diff --git a/src/test/java/org/jsoup/parser/AttributeParseTest.java b/src/test/java/org/jsoup/parser/AttributeParseTest.java
@@ -96,4 +96,31 @@ public class AttributeParseTest {
  doc = Jsoup.parse(html, "", Parser.xmlParser());
  assertEquals("<img onerror=\"doMyJob\" />", doc.html());
  }
+
+ // Attribute names carrying an unbound prefix, such as `xlink:href`, keep their raw key and value
+ // Issue #1341
+ // https://github.com/jhy/jsoup/issues/1341
+ @Test public void handleUnboundPrefixofXlink() {
+     final String html = "<!doctype html>\n"
+         + "<html lang=\"de\">\n"
+         + " <head>\n"
+         + "\n"
+         + " </head>\n"
+         + " <body>\n"
+         + "\t<test:h1>UnboundPrefix</test:h1>\n"
+         + "\t<svg width=\"180\" height=\"180\" xlink:href=\"UnboundPrefix\">\n"
+         + " \t<rect x=\"20\" y=\"20\" rx=\"20\" ry=\"20\" width=\"100\" height=\"100\" style=\"fill:lightgray; stroke:#1c87c9; stroke-width:4;\"/>\n"
+         + " \t</svg>\n"
+         + " </body>\n"
+         + "</html>\n";
+
+     // the svg element is written as the second child of body; xlink:href is its third attribute
+     final Document parsed = Jsoup.parse(html);
+     final Element svg = parsed.select("body").get(0).children().get(1);
+
+     // first the escaped form of the key, then the value looked up by position and by raw key
+     assertEquals("xlinkU00003Ahref", svg.attributes().asList().get(2).getConvertedKey());
+     assertEquals("UnboundPrefix", svg.attributes().asList().get(2).getValue());
+     assertEquals("UnboundPrefix", svg.attributes().get("xlink:href"));
+ }
 }
diff --git a/src/test/java/org/jsoup/parser/TagTest.java b/src/test/java/org/jsoup/parser/TagTest.java
@@ -1,8 +1,12 @@
 package org.jsoup.parser;
 
+import org.jsoup.Jsoup;
 import org.jsoup.MultiLocaleExtension.MultiLocaleTest;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.junit.jupiter.api.Test;
 
+import java.util.List;
 import java.util.Locale;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -81,4 +85,28 @@ public void canBeInsensitive(Locale locale) {
  assertTrue(Tag.isKnownTag("div"));
  assertFalse(Tag.isKnownTag("explain"));
  }
+
+ // Tag names containing a symbol such as `:` expose an escaped name and keep their raw name
+ // Issue #1341
+ // https://github.com/jhy/jsoup/issues/1341
+ @Test public void handleSymbolTags() {
+     final String html = "<!doctype html>\n"
+         + "<html lang=\"de\">\n"
+         + " <head>\n"
+         + "\n"
+         + " </head>\n"
+         + " <body>\n"
+         + "\t<test:h1>UnboundPrefix</test:h1>\n"
+         + "\t<svg width=\"180\" height=\"180\" xlink:href=\"UnboundPrefix\">\n"
+         + " \t<rect x=\"20\" y=\"20\" rx=\"20\" ry=\"20\" width=\"100\" height=\"100\" style=\"fill:lightgray; stroke:#1c87c9; stroke-width:4;\"/>\n"
+         + " \t</svg>\n"
+         + " </body>\n"
+         + "</html>\n";
+
+     // the <test:h1> element is written as the first child of body
+     final Document parsed = Jsoup.parse(html);
+     final Element prefixed = parsed.select("body").get(0).children().get(0);
+     assertEquals("testU00003Ah1", prefixed.tag().unicodeName());
+     assertEquals("test:h1", prefixed.tagName());
+ }
 }