Skip to content

Commit

Permalink
unicode version check
Browse files Browse the repository at this point in the history
  • Loading branch information
Carbon225 committed Apr 7, 2023
1 parent 96e86c6 commit 33c14be
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 4 deletions.
22 changes: 19 additions & 3 deletions ens_normalize/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import os
import pickle
import pickletools
from pyunormalize import NFC, NFD
from pyunormalize import NFC, NFD, UNICODE_VERSION
import warnings


SPEC_PATH = os.path.join(os.path.dirname(__file__), 'spec.json')
Expand Down Expand Up @@ -395,12 +396,14 @@ class NormalizationData:
# Increment VERSION when the spec changes
# or if the code in this class changes.
# It will force the cache to be regenerated.
VERSION = 1
VERSION = 2

def __init__(self):
with open(SPEC_PATH, encoding='utf-8') as f:
spec = json.load(f)

self.version = NormalizationData.VERSION
self.unicode_version: str = spec['unicode']
self.ignored: Set[int] = set(spec['ignored'])
self.mapped: Dict[int, List[int]] = {cp_src: mapping for cp_src, mapping in spec['mapped']}
self.cm: Set[int] = set(spec['cm'])
Expand Down Expand Up @@ -431,7 +434,7 @@ def load_normalization_data() -> NormalizationData:
if os.path.exists(cache_path):
with open(cache_path, 'rb') as f:
data: NormalizationData = pickle.load(f)
if data.VERSION == NormalizationData.VERSION:
if getattr(data, 'version', None) == NormalizationData.VERSION:
return data
data = NormalizationData()
# Python >= 3.8 is required for protocol 5
Expand All @@ -445,6 +448,19 @@ def load_normalization_data() -> NormalizationData:
# Shared normalization data returned by load_normalization_data();
# check_spec_unicode_version() reads its `unicode_version` attribute.
NORMALIZATION = load_normalization_data()


def check_spec_unicode_version():
    """
    Warn when pyunormalize and the ENS Normalization spec target different Unicode versions.

    The version strings are compared component by component: the check passes
    when every dot-separated component of pyunormalize's UNICODE_VERSION equals
    the corresponding leading component of the spec's version
    (e.g. '15.0' is compatible with '15.0.0').
    Otherwise a UnicodeWarning is emitted; no exception is raised here.
    """
    spec_parts = NORMALIZATION.unicode_version.split('.')
    lib_parts = UNICODE_VERSION.split('.')
    # Compare whole components instead of a raw string prefix, so that
    # e.g. '15.1' is not treated as a match for '15.10.0'.
    if spec_parts[:len(lib_parts)] != lib_parts:
        warnings.warn(
            f'Unicode version mismatch: '
            f'pyunormalize is using {UNICODE_VERSION}, '
            f'but the ENS Normalization spec is for {NORMALIZATION.unicode_version}',
            UnicodeWarning,
        )


# Run the Unicode version check here; it emits a UnicodeWarning when
# pyunormalize's Unicode version does not match the spec's.
check_spec_unicode_version()


def collapse_valid_tokens(tokens: List[Token]) -> List[Token]:
"""
Combine cps from continuous valid tokens into single tokens.
Expand Down
20 changes: 19 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pyunormalize = "^15.0.0"
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.1"
pytest-cov = "^4.0.0"
pytest-mock = "^3.10.0"

[build-system]
requires = ["poetry-core"]
Expand Down
8 changes: 8 additions & 0 deletions tests/test_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from ens_normalize import *
import ens_normalize as ens_normalize_module
import warnings


TESTS_PATH = os.path.join(os.path.dirname(__file__), 'ens-normalize-tests.json')
Expand Down Expand Up @@ -372,3 +373,10 @@ def test_error_meta():
e: CurableError = ens_process(f'bitcoin.bitcin.bi̇tcin.bitсin{c}').error
assert e.general_info == 'Contains visually confusing characters from multiple scripts (Latin plus other scripts)'
assert e.disallowed_sequence_info == 'This character is disallowed because it is visually confusing with another character from the Latin script'


def test_unicode_version_check(mocker):
    """
    check_spec_unicode_version() must emit a UnicodeWarning on a version mismatch.

    UNICODE_VERSION is patched to '15.0.1'; this assumes the bundled spec
    targets a different Unicode version.
    """
    mocker.patch('ens_normalize.normalization.UNICODE_VERSION', '15.0.1')
    # pytest.warns captures the warning inside a scoped context, so the
    # process-wide warning filters are left untouched for other tests
    # (a bare warnings.filterwarnings('error') is never undone).
    with pytest.warns(UnicodeWarning):
        ens_normalize_module.normalization.check_spec_unicode_version()

0 comments on commit 33c14be

Please sign in to comment.