Replace html5lib with html5rdf, make it an optional dependency (#2951)

* Revert previous commit that made html support non-optional. html support is now optional again, and it uses html5rdf rather than html5lib/html5lib-modern. * Revert "Auxiliary commit to revert individual files from 18d1d70" This reverts commit 6f80ad9cd2cc4b76ff4e1bc0998951768aff7573. * Add html5lib-modern back into dockerfile dependencies so the docker image can be built with the released rdflib v7.1.0 * Reformat test file again. * lxml is not actually required for operation of html5rdf * Fix differences in comparison of XML and HTML nodes when html5rdf is used vs when it is not used. * Add correct type hint to _XML_COMPARABLE variable. * Fix logic in using non-ill-typed XML literals for comparison
RDFLib · Oct 28, 2024 · 638a867 · 638a867
1 parent 4afa455
commit 638a867
Show file tree

Hide file tree

Showing 10 changed files with 90 additions and 49 deletions.
diff --git a/devtools/constraints.min b/devtools/constraints.min
@@ -6,6 +6,6 @@ pyparsing==2.1.0
 importlib-metadata==4.0.0
 berkeleydb==18.1.2
 networkx==2.0
-html5lib-modern==1.2.0
+html5rdf==1.2.0
 lxml==4.3.0
 orjson==3.9.14
diff --git a/docker/latest/requirements.in b/docker/latest/requirements.in
@@ -1,6 +1,6 @@
 # This file is used for building a docker image of the latest rdflib release. It
 # will be updated by dependabot when new releases are made.
 rdflib==7.1.0
+html5rdf==1.2.0
+# html5lib-modern is required to allow the Dockerfile to build with pre-RDFLib-7.1.1 releases.
 html5lib-modern==1.2.0
-# isodate is required to allow the Dockerfile to build on with pre-RDFLib-7.1 releases.
-isodate==0.7.2
diff --git a/docker/latest/requirements.txt b/docker/latest/requirements.txt
@@ -4,11 +4,11 @@
 #
 #    pip-compile docker/latest/requirements.in
 #
-html5lib-modern==1.2
+html5rdf==1.2
     # via
     #   -r docker/latest/requirements.in
     #   rdflib
-isodate==0.7.2
+html5lib-modern==1.2
     # via -r docker/latest/requirements.in
 pyparsing==3.0.9
     # via rdflib

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,7 +43,7 @@ isodate = {version=">=0.7.2,<1.0.0", python = "<3.11"}
 pyparsing = ">=2.1.0,<4"
 berkeleydb = {version = "^18.1.0", optional = true}
 networkx = {version = ">=2,<4", optional = true}
-html5lib-modern = "^1.2"
+html5rdf = {version = ">=1.2,<2", optional = true}
 lxml = {version = ">=4.3,<6.0", optional = true}
 orjson = {version = ">=3.9.14,<4", optional = true}
 
@@ -74,6 +74,9 @@ ruff = ">=0.0.286,<0.8.0"
 [tool.poetry.extras]
 berkeleydb = ["berkeleydb"]
 networkx = ["networkx"]
+# html support is optional, it is used only in tokenizing `rdf:HTML` type Literals
+html = ["html5rdf"]
+# lxml support is optional, it is used only for parsing XML-formatted SPARQL results
 lxml = ["lxml"]
 orjson = ["orjson"]
 

diff --git a/rdflib/term.py b/rdflib/term.py
@@ -65,8 +65,6 @@
 from urllib.parse import urldefrag, urljoin, urlparse
 from uuid import uuid4
 
-import html5lib
-
 import rdflib
 import rdflib.util
 from rdflib.compat import long_type
@@ -86,6 +84,14 @@
     from .namespace import NamespaceManager
     from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath
 
+_HAS_HTML5RDF = False
+
+try:
+    import html5rdf
+
+    _HAS_HTML5RDF = True
+except ImportError:
+    html5rdf = None
 
 _SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
 
@@ -1107,7 +1113,7 @@ def __gt__(self, other: Any) -> bool:
         if other is None:
             return True  # Everything is greater than None
         if isinstance(other, Literal):
-            # Fast path for comapring numeric literals
+            # Fast path for comparing numeric literals
             # that are not ill-typed and don't have a None value
             if (
                 (
@@ -1350,9 +1356,15 @@ def eq(self, other: Any) -> bool:
 
         """
         if isinstance(other, Literal):
+            # Fast path for comparing numeric literals
+            # that are not ill-typed and don't have a None value
             if (
-                self.datatype in _NUMERIC_LITERAL_TYPES
-                and other.datatype in _NUMERIC_LITERAL_TYPES
+                (
+                    self.datatype in _NUMERIC_LITERAL_TYPES
+                    and other.datatype in _NUMERIC_LITERAL_TYPES
+                )
+                and ((not self.ill_typed) and (not other.ill_typed))
+                and (self.value is not None and other.value is not None)
             ):
                 if self.value is not None and other.value is not None:
                     return self.value == other.value
@@ -1374,6 +1386,16 @@ def eq(self, other: Any) -> bool:
                 # string/plain literals, compare on lexical form
                 return str.__eq__(self, other)
 
+            # XML can be compared to HTML, only if html5rdf is enabled
+            if (
+                (dtself in _XML_COMPARABLE and dtother in _XML_COMPARABLE)
+                and
+                # Ill-typed can be None if unknown, but we don't want it to be True.
+                ((self.ill_typed is not True) and (other.ill_typed is not True))
+                and (self.value is not None and other.value is not None)
+            ):
+                return _isEqualXMLNode(self.value, other.value)
+
             if dtself != dtother:
                 if rdflib.DAWG_LITERAL_COLLATION:
                     raise TypeError(
@@ -1387,9 +1409,6 @@ def eq(self, other: Any) -> bool:
             # maybe there are counter examples
 
             if self.value is not None and other.value is not None:
-                if self.datatype in (_RDF_XMLLITERAL, _RDF_HTMLLITERAL):
-                    return _isEqualXMLNode(self.value, other.value)
-
                 return self.value == other.value
             else:
                 if str.__eq__(self, other):
@@ -1668,19 +1687,19 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document:  # noqa: N802
 def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
     """
     Parse the lexical form of an HTML literal into a document fragment
-    using the ``dom`` from html5lib tree builder.
+    using the ``dom`` from html5rdf tree builder.
 
     :param lexical_form: The lexical form of the HTML literal.
     :return: A document fragment representing the HTML literal.
-    :raises: `html5lib.html5parser.ParseError` if the lexical form is
+    :raises: `html5rdf.html5parser.ParseError` if the lexical form is
         not valid HTML.
     """
-    parser = html5lib.HTMLParser(
-        tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
+    parser = html5rdf.HTMLParser(
+        tree=html5rdf.treebuilders.getTreeBuilder("dom"), strict=True
     )
     try:
         result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
-    except html5lib.html5parser.ParseError as e:
+    except html5rdf.html5parser.ParseError as e:
         logger.info(f"Failed to parse HTML: {e}")
         raise e
     result.normalize()
@@ -1695,7 +1714,7 @@ def _write_html(value: xml.dom.minidom.DocumentFragment) -> bytes:
     :param value: A document fragment representing an HTML literal.
     :return: The lexical form of the HTML literal.
     """
-    result = html5lib.serialize(value, tree="dom")
+    result = html5rdf.serialize(value, tree="dom")
     return result
 
 
@@ -2012,14 +2031,21 @@ def _castPythonToLiteral(  # noqa: N802
     (Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
     (timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
     (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
-    # This is a bit dirty, by accident the html5lib parser produces
-    # DocumentFragments, and the xml parser Documents, letting this
-    # decide what datatype to use makes roundtripping easier, but it a
-    # bit random.
-    (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
     (Fraction, (None, _OWL_RATIONAL)),
 ]
 
+if html5rdf is not None:
+    # This is a bit dirty, by accident the html5rdf parser produces
+    # DocumentFragments, and the xml parser Documents, letting this
+    # decide what datatype to use makes roundtripping easier, but it's a
+    # bit random.
+
+    # This must happen before _GenericPythonToXSDRules is assigned to
+    # _OriginalGenericPythonToXSDRules.
+    _GenericPythonToXSDRules.append(
+        (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
+    )
+
 _OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)
 
 _SpecificPythonToXSDRules: List[
@@ -2069,10 +2095,17 @@ def _castPythonToLiteral(  # noqa: N802
     URIRef(_XSD_PFX + "double"): float,
     URIRef(_XSD_PFX + "base64Binary"): b64decode,
     URIRef(_XSD_PFX + "anyURI"): None,
-    _RDF_HTMLLITERAL: _parse_html,
     _RDF_XMLLITERAL: _parseXML,
 }
 
+if html5rdf is not None:
+    # It is probably best to keep this close to the definition of
+    # _GenericPythonToXSDRules so nobody misses it.
+    XSDToPython[_RDF_HTMLLITERAL] = _parse_html
+    _XML_COMPARABLE: Tuple[URIRef, ...] = (_RDF_XMLLITERAL, _RDF_HTMLLITERAL)
+else:
+    _XML_COMPARABLE = (_RDF_XMLLITERAL,)
+
 _check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
     URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
     URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,

diff --git a/test/test_literal/test_literal.py b/test/test_literal/test_literal.py
@@ -23,11 +23,11 @@
 
 
 try:
-    import html5lib as _  # noqa: F401
+    import html5rdf as _  # noqa: F401
 
-    _HAVE_HTML5LIB = True
+    _HAVE_HTML5RDF = True
 except ImportError:
-    _HAVE_HTML5LIB = False
+    _HAVE_HTML5RDF = False
 
 import pytest
 
@@ -981,7 +981,7 @@ def __eq__(self, __value: object) -> bool:
         (
             lambda: Literal("<body>", datatype=RDF.HTML),
             LiteralChecker(
-                ..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
+                ..., None, RDF.HTML, True if _HAVE_HTML5RDF else None, "<body>"
             ),
         ),
         (
@@ -990,7 +990,7 @@ def __eq__(self, __value: object) -> bool:
                 ...,
                 None,
                 RDF.HTML,
-                False if _HAVE_HTML5LIB else None,
+                False if _HAVE_HTML5RDF else None,
                 "<table></table>",
             ),
         ),

diff --git a/test/test_literal/test_literal_html5lib.py b/test/test_literal/test_literal_html5lib.py
@@ -1,7 +1,6 @@
 import xml.dom.minidom
 from typing import Callable
 
-import html5lib  # noqa: F401
 import pytest
 
 import rdflib.term
@@ -10,8 +9,14 @@
 from test.utils.literal import LiteralChecker
 from test.utils.outcome import OutcomeChecker, OutcomePrimitives
 
+try:
+    import html5rdf as _  # noqa: F401
+except ImportError:
+    pytest.skip("html5rdf not installed", allow_module_level=True)
 
-def test_has_html5lib() -> None:
+
+def test_has_html5rdf() -> None:
+    assert rdflib.term._HAS_HTML5RDF is True
     assert RDF.HTML in rdflib.term.XSDToPython
     rule = next(
         (
@@ -29,7 +34,7 @@ def test_has_html5lib() -> None:
     ["factory", "outcome"],
     [
         # Ill-typed literals, these have lexical forms that result in
-        # errors when parsed as HTML by html5lib.
+        # errors when parsed as HTML by html5rdf.
         (
             lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
             LiteralChecker(
@@ -47,7 +52,7 @@ def test_has_html5lib() -> None:
             ),
         ),
         # Well-typed literals, these have lexical forms that parse
-        # without errors with html5lib.
+        # without errors with html5rdf.
         (
             lambda: Literal("<table></table>", datatype=RDF.HTML),
             LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),

diff --git a/test/test_literal/test_xmlliterals.py b/test/test_literal/test_xmlliterals.py
@@ -9,11 +9,11 @@
 from rdflib import RDF, Literal
 
 try:
-    import html5lib  # noqa: F401
+    import html5rdf  # noqa: F401
 
-    have_html5lib = True
+    have_html5rdf = True
 except ImportError:
-    have_html5lib = False
+    have_html5rdf = False
 
 
 def testPythonRoundtrip():  # noqa: N802
@@ -90,7 +90,7 @@ def testRoundtrip():  # noqa: N802
     roundtrip("nt")
 
 
-@pytest.mark.skipif(not have_html5lib, reason="requires html5lib")
+@pytest.mark.skipif(not have_html5rdf, reason="requires html5rdf")
 def testHTML():  # noqa: N802
     l1 = Literal("<msg>hello</msg>", datatype=RDF.XMLLiteral)
     assert l1.value is not None, "xml must have been parsed"
@@ -126,7 +126,7 @@ def testHTML():  # noqa: N802
                         textwrap.dedent(
                             """\
                     <!DOCTYPE example>
-                    <something/>
+                    <something2/>
                     """
                         )
                     ),
@@ -137,7 +137,7 @@ def testHTML():  # noqa: N802
                         textwrap.dedent(
                             """\
                     <!DOCTYPE example>
-                    <something />
+                    <something2 />
                     """
                         )
                     ),

diff --git a/tox.ini b/tox.ini
@@ -15,7 +15,7 @@ setenv =
     COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
     MYPY_CACHE_DIR = {envdir}/.mypy_cache
     docs: POETRY_ARGS_docs = --only=docs
-    extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
+    extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
     lxml: POETRY_ARGS_lxml = --extras=lxml
 commands_pre =
     py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
@@ -59,7 +59,7 @@ setenv =
     PYTHONHASHSEED = 0
 commands_pre =
     poetry lock --check
-    poetry install --only=main --only=docs
+    poetry install --only=main --only=docs --extras=html
     poetry env info
 commands =
     poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html