unicode version check

namehash · Apr 7, 2023 · 33c14be · 33c14be
1 parent 96e86c6
commit 33c14be
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 4 deletions.
diff --git a/ens_normalize/normalization.py b/ens_normalize/normalization.py
@@ -5,7 +5,8 @@
 import os
 import pickle
 import pickletools
-from pyunormalize import NFC, NFD
+from pyunormalize import NFC, NFD, UNICODE_VERSION
+import warnings
 
 
 SPEC_PATH = os.path.join(os.path.dirname(__file__), 'spec.json')
@@ -395,12 +396,14 @@ class NormalizationData:
     # Increment VERSION when the spec changes
     # or if the code in this class changes.
     # It will force the cache to be regenerated.
-    VERSION = 1
+    VERSION = 2
 
     def __init__(self):
         with open(SPEC_PATH, encoding='utf-8') as f:
             spec = json.load(f)
 
+        self.version = NormalizationData.VERSION
+        self.unicode_version: str = spec['unicode']
         self.ignored: Set[int] = set(spec['ignored'])
         self.mapped: Dict[int, List[int]] = {cp_src: mapping for cp_src, mapping in spec['mapped']}
         self.cm: Set[int] = set(spec['cm'])
@@ -431,7 +434,7 @@ def load_normalization_data() -> NormalizationData:
     if os.path.exists(cache_path):
         with open(cache_path, 'rb') as f:
             data: NormalizationData = pickle.load(f)
-            if data.VERSION == NormalizationData.VERSION:
+            if getattr(data, 'version', None) == NormalizationData.VERSION:
                 return data
     data = NormalizationData()
     # Python >= 3.8 is required for protocol 5
@@ -445,6 +448,19 @@ def load_normalization_data() -> NormalizationData:
 NORMALIZATION = load_normalization_data()
 
 
+def check_spec_unicode_version():
+    """Emit a UnicodeWarning unless the spec's Unicode version starts with pyunormalize's."""
+    spec_version = NORMALIZATION.unicode_version
+    if spec_version.startswith(UNICODE_VERSION):
+        return
+    message = (f'Unicode version mismatch: ' f'pyunormalize is using {UNICODE_VERSION}, '
+               f'but the ENS Normalization spec is for {spec_version}')
+    warnings.warn(message, UnicodeWarning)
+
+
+check_spec_unicode_version()
+
+
 def collapse_valid_tokens(tokens: List[Token]) -> List[Token]:
     """
     Combine cps from continuous valid tokens into single tokens.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,7 @@ pyunormalize = "^15.0.0"
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.1"
 pytest-cov = "^4.0.0"
+pytest-mock = "^3.10.0"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/tests/test_normalization.py b/tests/test_normalization.py
@@ -3,6 +3,7 @@
 import os
 from ens_normalize import *
 import ens_normalize as ens_normalize_module
+import warnings
 
 
 TESTS_PATH = os.path.join(os.path.dirname(__file__), 'ens-normalize-tests.json')
@@ -372,3 +373,10 @@ def test_error_meta():
     e: CurableError = ens_process(f'bitcoin.bitcin.bi̇tcin.bitсin{c}').error
     assert e.general_info == 'Contains visually confusing characters from multiple scripts (Latin plus other scripts)'
     assert e.disallowed_sequence_info == 'This character is disallowed because it is visually confusing with another character from the Latin script'
+
+
+def test_unicode_version_check(mocker):
+    """Patching in a mismatching pyunormalize version must make the check emit UnicodeWarning."""
+    mocker.patch('ens_normalize.normalization.UNICODE_VERSION', '15.0.1')
+    with pytest.warns(UnicodeWarning):  # scoped capture; global warning filters stay untouched
+        ens_normalize_module.normalization.check_spec_unicode_version()