Skip to content

Commit

Permalink
Fix rebuild of validator per instance & cache bug
Browse files Browse the repository at this point in the history
The schema loader is asked to retrieve a validator per instance, in
order to support the interface of metaschema evaluation. However, this
results in a complete rebuild of the validator even when there is one
schema being used for many instances (as in the `--schemafile` usage
mode).

Introducing an LRU cache on the validator builder results in the same
validator being reused (and is sensitive to changes in
settings/parameters).

Additionally fix a bug in the remote ref download caching in which the
key used for lookup was incorrect, rendering the cache completely
ineffective.

A new regression test confirms that the caching is fully effective by
only observing the number of requests made by the program over "N"
instancefiles, where N>=1.
  • Loading branch information
sirosen committed Jul 27, 2024
1 parent 74b003b commit f9645ee
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 4 deletions.
11 changes: 11 additions & 0 deletions src/check_jsonschema/schema_loader/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import functools
import pathlib
import typing as t
import urllib.error
Expand Down Expand Up @@ -130,11 +131,21 @@ def get_validator(
instance_doc: dict[str, t.Any],
format_opts: FormatOptions,
fill_defaults: bool,
) -> jsonschema.protocols.Validator:
return self._get_validator(format_opts, fill_defaults)

@functools.lru_cache
def _get_validator(
self,
format_opts: FormatOptions,
fill_defaults: bool,
) -> jsonschema.protocols.Validator:
retrieval_uri = self.get_schema_retrieval_uri()
schema = self.get_schema()

schema_dialect = schema.get("$schema")
if schema_dialect is not None and not isinstance(schema_dialect, str):
schema_dialect = None

# format checker (which may be None)
format_checker = make_format_checker(format_opts, schema_dialect)
Expand Down
8 changes: 4 additions & 4 deletions src/check_jsonschema/schema_loader/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ def retrieve_reference(uri: str) -> referencing.Resource[Schema]:
else:
full_uri = uri

if full_uri in cache._cache:
return cache[uri]
if full_uri in cache:
return cache[full_uri]

full_uri_scheme = urllib.parse.urlsplit(full_uri).scheme
if full_uri_scheme in ("http", "https"):
Expand All @@ -100,8 +100,8 @@ def validation_callback(content: bytes) -> None:
else:
parsed_object = get_local_file(full_uri)

cache[uri] = parsed_object
return cache[uri]
cache[full_uri] = parsed_object
return cache[full_uri]

return retrieve_reference

Expand Down
57 changes: 57 additions & 0 deletions tests/acceptance/test_remote_ref_resolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,60 @@ def test_ref_resolution_with_custom_base_uri(run_line, tmp_path, check_passes):
assert result.exit_code == 0, output
else:
assert result.exit_code == 1, output


@pytest.mark.parametrize("num_instances", (1, 2, 10))
@pytest.mark.parametrize("check_passes", (True, False))
def test_remote_ref_resolution_callout_count_is_scale_free_in_instancefiles(
run_line, tmp_path, num_instances, check_passes
):
"""
Test that for any N > 1, validation of a schema with a ref against N instance files
has exactly the same number of callouts as validation when N=1
This proves that the validator and caching are working correctly, and we aren't
repeating callouts to rebuild state.
"""
schema_uri = "https://example.org/schemas/main.json"
ref_uri = "https://example.org/schemas/title_schema.json"

main_schema = {
"$id": schema_uri,
"$schema": "http://json-schema.org/draft-07/schema",
"properties": {
"title": {"$ref": "./title_schema.json"},
},
"additionalProperties": False,
}
title_schema = {"type": "string"}
responses.add("GET", schema_uri, json=main_schema)
responses.add("GET", ref_uri, json=title_schema)

# write N documents
instance_doc = {"title": "doc one" if check_passes else 2}
instance_paths = []
for i in range(num_instances):
instance_path = tmp_path / f"instance{i}.json"
instance_path.write_text(json.dumps(instance_doc))
instance_paths.append(str(instance_path))

result = run_line(
[
"check-jsonschema",
"--schemafile",
schema_uri,
]
+ instance_paths
)
output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}"
if check_passes:
assert result.exit_code == 0, output
else:
assert result.exit_code == 1, output

# this is the moment of the "real" test run here:
# no matter how many instances there were, there should only have been two calls
# (one for the schema and one for the $ref)
assert len(responses.calls) == 2
assert len([c for c in responses.calls if c.request.url == schema_uri]) == 1
assert len([c for c in responses.calls if c.request.url == ref_uri]) == 1

0 comments on commit f9645ee

Please # to comment.