dgtlmoon · Constantin1489 · May 2, 2024 · May 2, 2024 · May 2, 2024 · May 2, 2024
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
@@ -110,6 +110,38 @@ def elementpath_tostring(obj):
 
  return str(obj)
 
+def forest_transplanting(root):
+    """
+    Re-parent a multi-root ("forest") parse result under one synthetic root.
+
+    libxml2 violates the DOM rule of a single root element: broken HTML can
+    leave extra element nodes as siblings of the root. Transplanting them all
+    to a new root emulates libxml2's XPath 1.0 HTML paths like '/html[2]/*'.
+    See also https://gitlab.gnome.org/GNOME/libxml2/-/issues/716
+
+    Returns a (tree, is_fragment) tuple: (root, False) when root has no
+    sibling elements, otherwise (new_root, True). When is_fragment is True,
+    'fragment=True' in elementpath.select is required.
+    """
+    from lxml import etree
+
+    # Snapshot both sides up front; the preceding side is yielded nearest-first.
+    preceding = list(root.itersiblings(preceding=True))
+    following = list(root.itersiblings())
+
+    # Comments and processing instructions have a callable tag with __name__;
+    # elements do not. Root is an element, so one more means two roots.
+    has_sibling_element = any(not hasattr(node.tag, '__name__') for node in preceding + following)
+    if not has_sibling_element:
+        return root, False
+
+    new_root = etree.Element("new_root")
+    # Re-attach in document order: preceding siblings, root, then followers.
+    for node in [*reversed(preceding), root, *following]:
+        new_root.append(node)
+    return new_root, True
+
+
 # Return str Utf-8 of matched rules
 def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
  from lxml import etree, html
@@ -123,9 +155,10 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
  parser = etree.XMLParser(strip_cdata=False)
 
  tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
+ tree, is_fragment = forest_transplanting(tree)
  html_block = ""
 
- r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
+ r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=is_fragment)
  #@note: //title/text() wont work where <title>CDATA..
 
  if type(r) != list:

diff --git a/changedetectionio/tests/test_xpath_selector_unit.py b/changedetectionio/tests/test_xpath_selector_unit.py
@@ -201,3 +201,61 @@ def test_trips(html_content, xpath, answer):
  html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
  assert type(html_content) == str
  assert answer in html_content
+
+# Deliberately invalid fixture: two sibling <html> roots, which lxml/libxml2 keeps as separate root nodes.
+DOM_violation_two_html_root_element = """<!DOCTYPE html>
+<html>
+ <body>
+ <h1>Hello world</h1>
+ <p>First paragraph.</p>
+ </body>
+</html>
+<html>
+ <body>
+ <h1>Hello world</h1>
+ <p>Browsers parse this part by fixing it but lxml doesn't and returns two root element node</p>
+ <p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p>
+ </body>
+</html>"""
+@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
+@pytest.mark.parametrize("xpath, answer", [
+    ("/html/body/p[1]", "First paragraph."),
+    ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("count(/html/body/p[1])", "2"),
+    ("count(/html)", "2"),
+    ("count(//html)", "2"),
+    ("count(//body)", "2"),
+    ("count(/html/body)", "2"),
+    ("//html/body/p[1]", "First paragraph."),
+    ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("//body/p[1]", "First paragraph."),
+    ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+    ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
+])
+def test_broken_DOM_01(html_content, xpath, answer):
+    """A bare elementpath.select fails on two-root HTML; html_tools.xpath_filter must cope."""
+    from lxml import etree, html
+    import elementpath
+    from elementpath.xpath3 import XPath3Parser
+
+    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=etree.HTMLParser())
+    # Only select sits in the raises block, so an import or parse failure cannot pass for the expected error.
+    with pytest.raises(Exception):
+        elementpath.select(tree, xpath.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
+
+    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
+    assert type(filtered) == str
+    assert answer in filtered
+
+@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
+@pytest.mark.parametrize("xpath, answer", [
+    ("/html[2]/body/p[1]", "First paragraph."),
+    ("//html[2]/body/p[1]", "First paragraph."),
+])
+def test_Broken_DOM_02(html_content, xpath, answer):
+    """Selecting under the second <html> root must not return text from the first root."""
+    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
+    assert type(filtered) == str
+    # The first root's paragraph must be absent when only the second root is selected.
+    assert answer not in filtered
diff --git a/requirements.txt b/requirements.txt
@@ -55,7 +55,7 @@ beautifulsoup4
 lxml >=4.8.0,<6
 
 # XPath 2.0-3.1 support - 4.2.0 broke something?
-elementpath==4.1.5
+elementpath==4.4.0
 
 selenium~=4.14.0