Skip to content

Commit

Permalink
Replace html5lib with html5rdf, make it an optional dependency (#2951)
Browse files Browse the repository at this point in the history
* Revert previous commit that made html support non-optional.
html support is now optional again, and it uses html5rdf rather than html5lib/html5lib-modern.

* Revert "Auxiliary commit to revert individual files from 18d1d70"

This reverts commit 6f80ad9cd2cc4b76ff4e1bc0998951768aff7573.

* Add html5lib-modern back into dockerfile dependencies so the docker image can be built with the released rdflib v7.1.0

* Reformat test file again.

* lxml is not actually required for operation of html5rdf

* Fix differences in comparison of XML and HTML nodes when html5rdf is used vs when it is not used.

* Add correct type hint to _XML_COMPARABLE variable.

* Fix logic in using non-ill-typed XML literals for comparison
  • Loading branch information
ashleysommer authored Oct 28, 2024
1 parent 4afa455 commit 638a867
Show file tree
Hide file tree
Showing 10 changed files with 90 additions and 49 deletions.
2 changes: 1 addition & 1 deletion devtools/constraints.min
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ pyparsing==2.1.0
importlib-metadata==4.0.0
berkeleydb==18.1.2
networkx==2.0
html5lib-modern==1.2.0
html5rdf==1.2.0
lxml==4.3.0
orjson==3.9.14
4 changes: 2 additions & 2 deletions docker/latest/requirements.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is used for building a docker image of the latest rdflib release. It
# will be updated by dependabot when new releases are made.
rdflib==7.1.0
html5rdf==1.2.0
# html5lib-modern is required to allow the Dockerfile to build with pre-RDFLib-7.1.1 releases.
html5lib-modern==1.2.0
# isodate is required to allow the Dockerfile to build with pre-RDFLib-7.1 releases.
isodate==0.7.2
4 changes: 2 additions & 2 deletions docker/latest/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
#
# pip-compile docker/latest/requirements.in
#
html5lib-modern==1.2
html5rdf==1.2
# via
# -r docker/latest/requirements.in
# rdflib
isodate==0.7.2
html5lib-modern==1.2
# via -r docker/latest/requirements.in
pyparsing==3.0.9
# via rdflib
Expand Down
12 changes: 6 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ isodate = {version=">=0.7.2,<1.0.0", python = "<3.11"}
pyparsing = ">=2.1.0,<4"
berkeleydb = {version = "^18.1.0", optional = true}
networkx = {version = ">=2,<4", optional = true}
html5lib-modern = "^1.2"
html5rdf = {version = ">=1.2,<2", optional = true}
lxml = {version = ">=4.3,<6.0", optional = true}
orjson = {version = ">=3.9.14,<4", optional = true}

Expand Down Expand Up @@ -74,6 +74,9 @@ ruff = ">=0.0.286,<0.8.0"
[tool.poetry.extras]
berkeleydb = ["berkeleydb"]
networkx = ["networkx"]
# html support is optional, it is used only in tokenizing `rdf:HTML` type Literals
html = ["html5rdf"]
# lxml support is optional, it is used only for parsing XML-formatted SPARQL results
lxml = ["lxml"]
orjson = ["orjson"]

Expand Down
73 changes: 53 additions & 20 deletions rdflib/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,6 @@
from urllib.parse import urldefrag, urljoin, urlparse
from uuid import uuid4

import html5lib

import rdflib
import rdflib.util
from rdflib.compat import long_type
Expand All @@ -86,6 +84,14 @@
from .namespace import NamespaceManager
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath

_HAS_HTML5RDF = False

try:
import html5rdf

_HAS_HTML5RDF = True
except ImportError:
html5rdf = None

_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"

Expand Down Expand Up @@ -1107,7 +1113,7 @@ def __gt__(self, other: Any) -> bool:
if other is None:
return True # Everything is greater than None
if isinstance(other, Literal):
# Fast path for comapring numeric literals
# Fast path for comparing numeric literals
# that are not ill-typed and don't have a None value
if (
(
Expand Down Expand Up @@ -1350,9 +1356,15 @@ def eq(self, other: Any) -> bool:
"""
if isinstance(other, Literal):
# Fast path for comparing numeric literals
# that are not ill-typed and don't have a None value
if (
self.datatype in _NUMERIC_LITERAL_TYPES
and other.datatype in _NUMERIC_LITERAL_TYPES
(
self.datatype in _NUMERIC_LITERAL_TYPES
and other.datatype in _NUMERIC_LITERAL_TYPES
)
and ((not self.ill_typed) and (not other.ill_typed))
and (self.value is not None and other.value is not None)
):
if self.value is not None and other.value is not None:
return self.value == other.value
Expand All @@ -1374,6 +1386,16 @@ def eq(self, other: Any) -> bool:
# string/plain literals, compare on lexical form
return str.__eq__(self, other)

# XML can be compared to HTML, only if html5rdf is enabled
if (
(dtself in _XML_COMPARABLE and dtother in _XML_COMPARABLE)
and
# Ill-typed can be None if unknown, but we don't want it to be True.
((self.ill_typed is not True) and (other.ill_typed is not True))
and (self.value is not None and other.value is not None)
):
return _isEqualXMLNode(self.value, other.value)

if dtself != dtother:
if rdflib.DAWG_LITERAL_COLLATION:
raise TypeError(
Expand All @@ -1387,9 +1409,6 @@ def eq(self, other: Any) -> bool:
# maybe there are counter examples

if self.value is not None and other.value is not None:
if self.datatype in (_RDF_XMLLITERAL, _RDF_HTMLLITERAL):
return _isEqualXMLNode(self.value, other.value)

return self.value == other.value
else:
if str.__eq__(self, other):
Expand Down Expand Up @@ -1668,19 +1687,19 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802
def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
"""
Parse the lexical form of an HTML literal into a document fragment
using the ``dom`` from html5lib tree builder.
using the ``dom`` from html5rdf tree builder.
:param lexical_form: The lexical form of the HTML literal.
:return: A document fragment representing the HTML literal.
:raises: `html5lib.html5parser.ParseError` if the lexical form is
:raises: `html5rdf.html5parser.ParseError` if the lexical form is
not valid HTML.
"""
parser = html5lib.HTMLParser(
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
parser = html5rdf.HTMLParser(
tree=html5rdf.treebuilders.getTreeBuilder("dom"), strict=True
)
try:
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
except html5lib.html5parser.ParseError as e:
except html5rdf.html5parser.ParseError as e:
logger.info(f"Failed to parse HTML: {e}")
raise e
result.normalize()
Expand All @@ -1695,7 +1714,7 @@ def _write_html(value: xml.dom.minidom.DocumentFragment) -> bytes:
:param value: A document fragment representing an HTML literal.
:return: The lexical form of the HTML literal.
"""
result = html5lib.serialize(value, tree="dom")
result = html5rdf.serialize(value, tree="dom")
return result


Expand Down Expand Up @@ -2012,14 +2031,21 @@ def _castPythonToLiteral( # noqa: N802
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
# This is a bit dirty, by accident the html5lib parser produces
# DocumentFragments, and the xml parser Documents, letting this
# decide what datatype to use makes roundtripping easier, but it a
# bit random.
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
(Fraction, (None, _OWL_RATIONAL)),
]

if html5rdf is not None:
# This is a bit dirty, by accident the html5rdf parser produces
# DocumentFragments, and the xml parser Documents, letting this
# decide what datatype to use makes roundtripping easier, but it's a
# bit random.

# This must happen before _GenericPythonToXSDRules is assigned to
# _OriginalGenericPythonToXSDRules.
_GenericPythonToXSDRules.append(
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
)

_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)

_SpecificPythonToXSDRules: List[
Expand Down Expand Up @@ -2069,10 +2095,17 @@ def _castPythonToLiteral( # noqa: N802
URIRef(_XSD_PFX + "double"): float,
URIRef(_XSD_PFX + "base64Binary"): b64decode,
URIRef(_XSD_PFX + "anyURI"): None,
_RDF_HTMLLITERAL: _parse_html,
_RDF_XMLLITERAL: _parseXML,
}

if html5rdf is not None:
# It is probably best to keep this close to the definition of
# _GenericPythonToXSDRules so nobody misses it.
XSDToPython[_RDF_HTMLLITERAL] = _parse_html
_XML_COMPARABLE: Tuple[URIRef, ...] = (_RDF_XMLLITERAL, _RDF_HTMLLITERAL)
else:
_XML_COMPARABLE = (_RDF_XMLLITERAL,)

_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,
Expand Down
10 changes: 5 additions & 5 deletions test/test_literal/test_literal.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@


try:
import html5lib as _ # noqa: F401
import html5rdf as _ # noqa: F401

_HAVE_HTML5LIB = True
_HAVE_HTML5RDF = True
except ImportError:
_HAVE_HTML5LIB = False
_HAVE_HTML5RDF = False

import pytest

Expand Down Expand Up @@ -981,7 +981,7 @@ def __eq__(self, __value: object) -> bool:
(
lambda: Literal("<body>", datatype=RDF.HTML),
LiteralChecker(
..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
..., None, RDF.HTML, True if _HAVE_HTML5RDF else None, "<body>"
),
),
(
Expand All @@ -990,7 +990,7 @@ def __eq__(self, __value: object) -> bool:
...,
None,
RDF.HTML,
False if _HAVE_HTML5LIB else None,
False if _HAVE_HTML5RDF else None,
"<table></table>",
),
),
Expand Down
13 changes: 9 additions & 4 deletions test/test_literal/test_literal_html5lib.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import xml.dom.minidom
from typing import Callable

import html5lib # noqa: F401
import pytest

import rdflib.term
Expand All @@ -10,8 +9,14 @@
from test.utils.literal import LiteralChecker
from test.utils.outcome import OutcomeChecker, OutcomePrimitives

try:
import html5rdf as _ # noqa: F401
except ImportError:
pytest.skip("html5rdf not installed", allow_module_level=True)

def test_has_html5lib() -> None:

def test_has_html5rdf() -> None:
assert rdflib.term._HAS_HTML5RDF is True
assert RDF.HTML in rdflib.term.XSDToPython
rule = next(
(
Expand All @@ -29,7 +34,7 @@ def test_has_html5lib() -> None:
["factory", "outcome"],
[
# Ill-typed literals, these have lexical forms that result in
# errors when parsed as HTML by html5lib.
# errors when parsed as HTML by html5rdf.
(
lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
LiteralChecker(
Expand All @@ -47,7 +52,7 @@ def test_has_html5lib() -> None:
),
),
# Well-typed literals, these have lexical forms that parse
# without errors with html5lib.
# without errors with html5rdf.
(
lambda: Literal("<table></table>", datatype=RDF.HTML),
LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),
Expand Down
12 changes: 6 additions & 6 deletions test/test_literal/test_xmlliterals.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from rdflib import RDF, Literal

try:
import html5lib # noqa: F401
import html5rdf # noqa: F401

have_html5lib = True
have_html5rdf = True
except ImportError:
have_html5lib = False
have_html5rdf = False


def testPythonRoundtrip(): # noqa: N802
Expand Down Expand Up @@ -90,7 +90,7 @@ def testRoundtrip(): # noqa: N802
roundtrip("nt")


@pytest.mark.skipif(not have_html5lib, reason="requires html5lib")
@pytest.mark.skipif(not have_html5rdf, reason="requires html5rdf")
def testHTML(): # noqa: N802
l1 = Literal("<msg>hello</msg>", datatype=RDF.XMLLiteral)
assert l1.value is not None, "xml must have been parsed"
Expand Down Expand Up @@ -126,7 +126,7 @@ def testHTML(): # noqa: N802
textwrap.dedent(
"""\
<!DOCTYPE example>
<something/>
<something2/>
"""
)
),
Expand All @@ -137,7 +137,7 @@ def testHTML(): # noqa: N802
textwrap.dedent(
"""\
<!DOCTYPE example>
<something />
<something2 />
"""
)
),
Expand Down
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ setenv =
COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
MYPY_CACHE_DIR = {envdir}/.mypy_cache
docs: POETRY_ARGS_docs = --only=docs
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
lxml: POETRY_ARGS_lxml = --extras=lxml
commands_pre =
py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
Expand Down Expand Up @@ -59,7 +59,7 @@ setenv =
PYTHONHASHSEED = 0
commands_pre =
poetry lock --check
poetry install --only=main --only=docs
poetry install --only=main --only=docs --extras=html
poetry env info
commands =
poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html
Expand Down

0 comments on commit 638a867

Please # to comment.