Advertise typing support #10

Merged · 2 commits · Mar 1, 2024
5 changes: 4 additions & 1 deletion .github/workflows/tests.yml
@@ -30,6 +30,9 @@ jobs:
- name: Lint
run: |
make lint
- name: Typing
run: |
make mypy
- name: Test
run: |
make test
9 changes: 6 additions & 3 deletions Makefile
@@ -8,10 +8,13 @@ help:


install:
pip3 install -e .[dev]
python -m pip install --editable '.[dev]'

lint:
flake8 pyiso4 tests --max-line-length=120 --ignore=N802
python -m flake8 pyiso4 tests --max-line-length=120 --ignore=N802

mypy:
python -m mypy pyiso4 tests

test:
python -m unittest discover -s tests
10 changes: 5 additions & 5 deletions pyiso4/lexer.py
@@ -1,4 +1,4 @@
from typing import List, Iterable
from typing import List, Iterable, Optional
from enum import Enum, unique
from unicodedata import normalize
import re
@@ -46,17 +46,17 @@ def __repr__(self) -> str:


class Lexer:
def __init__(self, inp, stopwords: List[str]):
def __init__(self, inp: str, stopwords: List[str]) -> None:
self.input = normalize('NFC', inp)
Review thread on this line:

Owner: That should be "Lexer" not None, then :)

alexfikl (Contributor Author), Mar 1, 2024: __init__ always returns None (__new__ returns the type of the object):
https://docs.python.org/3/reference/datamodel.html#object.__init__

Owner: Oh, ok, I thought they were returning the Object itself. Good to know :)

self.pos = 0
self.count = -1
self.start_word = 0
self.current_word = None
self.current_word: Optional[str] = None
self.stopwords = stopwords

self.next()

def _skip_space(self):
def _skip_space(self) -> None:
while self.pos < len(self.input) and self.input[self.pos] in SPACES:
self.pos += 1

@@ -77,7 +77,7 @@ def next(self) -> None:
self.count += 1

def tokenize(self) -> Iterable[Token]:
def yield_hyphenated(word: str, base_pos: int):
def yield_hyphenated(word: str, base_pos: int) -> Iterable[Token]:
is_first = True
len_ = 0
for w in word.split('-'):
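A footnote to the review thread above: a minimal sketch (not part of this PR) of the convention being discussed. __init__ is annotated as returning None, and the type checker still infers the constructed type at the call site:

class Lexer:
    # __init__ initializes an already-created instance and always returns
    # None; the instance itself comes from __new__. mypy therefore expects
    # the explicit '-> None' on __init__.
    def __init__(self, inp: str) -> None:
        self.input = inp

lexer = Lexer('Journal of Physics')  # inferred as Lexer, not None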
45 changes: 27 additions & 18 deletions pyiso4/ltwa.py
@@ -1,10 +1,10 @@
from typing import List, Tuple
from typing import List, Tuple, Optional, Union
from unidecode import unidecode
import re
import pathlib

from pyiso4.prefix_tree import PrefixTree
from pyiso4.lexer import Lexer, TokenType
from pyiso4.lexer import Lexer, Token, TokenType
from pyiso4.normalize_string import normalize, Level, BOUNDARY, number_of_ligatures


@@ -27,7 +27,7 @@ def normalize(inp: str) -> str:
return normalize(inp, Level.NORMAL).lower()

@classmethod
def from_line(cls, line: str):
def from_line(cls, line: str) -> 'Pattern':
"""Constructed from a LTWA csv line"""

fields = line.split('\t')
@@ -55,7 +55,7 @@ def to_key(self) -> str:
else:
return self.pattern

def match(self, sentence: str, langs: List[str] = None) -> bool:
def match(self, sentence: str, langs: Optional[List[str]] = None) -> bool:
"""Check if the pattern matches the begining of ``sentence``.
Assume that it has been normalized.
"""
@@ -106,7 +106,7 @@ def match(self, sentence: str, langs: List[str] = None) -> bool:
else: # if that's a boundary, then its a match
return BOUNDARY.match(sentence[final_pos + 1:]) is not None

def __repr__(self):
def __repr__(self) -> str:
return 'Pattern({}, {})'.format(self.pattern, self.replacement)


@@ -121,10 +121,10 @@ def __init__(self, ltwa_prefix: PrefixTree, ltwa_suffix: PrefixTree, stopwords:
self.stopwords = stopwords

@classmethod
def create(
cls,
ltwa_file: str = _here / 'LTWA_20210702.csv',
stopwords: str = _here / 'stopwords.txt'):
def create(cls,
ltwa_file: Union[str, pathlib.Path] = _here / 'LTWA_20210702.csv',
stopwords: Union[str, pathlib.Path] = _here / 'stopwords.txt',
) -> 'Abbreviate':
"""Create an object from the LTWA CSV file and a newline-separated list of stopwords"""

ltwa_prefix = PrefixTree()
@@ -154,15 +154,17 @@ def create(

return cls(ltwa_prefix, ltwa_suffix, stopwds)

def _potential_matches(self, sentence: str, langs: List[str] = None) -> List[Pattern]:
def _potential_matches(self,
sentence: str,
langs: Optional[List[str]] = None) -> List[Pattern]:
# look into prefix
results = self.ltwa_prefix.search(sentence)

# look into suffixes
results += self.ltwa_suffix.search(str(reversed(sentence)))

# remove everything that does not match
results = filter(lambda p: p.match(sentence, langs), results)
results = list(filter(lambda p: p.match(sentence, langs), results))

# return longer matches first, with ending dashes if possible
return sorted(
@@ -175,15 +177,19 @@ def match_capitalization_and_diacritic(abbrv: str, original: str) -> str:
"""Matches the capitalization and diacritics of the `original` word, as long as they are similar
"""

abbrv = list(normalize(abbrv, Level.SOFT))
for i, c in enumerate(abbrv):
normalized_abbrv = list(normalize(abbrv, Level.SOFT))
for i, c in enumerate(normalized_abbrv):
unided = unidecode(original[i])
if unidecode(c) in [unided.lower(), unided.upper()]:
abbrv[i] = original[i]
normalized_abbrv[i] = original[i]

return ''.join(abbrv)
return ''.join(normalized_abbrv)

def abbreviate(self, sentence: str, fallback: str, guide: str, langs: List[str] = None) -> Tuple[str, int]:
def abbreviate(self,
sentence: str,
fallback: str,
guide: str,
langs: Optional[List[str]] = None) -> Tuple[str, int]:
"""Abbreviate the beginning of ``sentence`` by looking for an appropriate pattern.
If not found, use ``fallback``. If found, matches the capitalization given by ``guide``.
Also returns the length of the sentence that was replaced.
@@ -199,7 +205,10 @@ def abbreviate(self, sentence: str, fallback: str, guide: str, langs: List[str]

return fallback, len(fallback)

def __call__(self, title: str, remove_part: bool = True, langs: List[str] = None) -> str:
def __call__(self,
title: str,
remove_part: bool = True,
langs: Optional[List[str]] = None) -> str:
"""Abbreviate a title according to the rules of Section 7 in the ISSN manual
(https://www.issn.org/understanding-the-issn/assignment-rules/issn-manual/)

@@ -219,7 +228,7 @@ def __call__(self, title: str, remove_part: bool = True, langs: List[str] = None) -> str:
title_normalized = Pattern.normalize(title)

lexer = Lexer(title_soft_normalized, self.stopwords)
tokens = []
tokens: List[Token] = []
prev_article = None

# filter tokens
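The recurring signature change in this file, langs: List[str] = None becoming langs: Optional[List[str]] = None, is needed because mypy does not implicitly promote a parameter with a None default to Optional. A minimal sketch, assuming mypy's default no_implicit_optional behavior (also part of strict mode); the function name and body are illustrative only:

from typing import List, Optional

# Rejected by mypy: List[str] does not admit None, and the None default
# is not implicitly widened to Optional[List[str]].
# def match(sentence: str, langs: List[str] = None) -> bool: ...

# Accepted, matching the style used throughout this PR:
def match(sentence: str, langs: Optional[List[str]] = None) -> bool:
    if langs is None:
        langs = []
    return len(sentence) > 0  # placeholder body for the sketch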
2 changes: 1 addition & 1 deletion pyiso4/normalize_string.py
@@ -9,7 +9,7 @@
LIGATURES = 'ŒœÆæ'


def number_of_ligatures(word: str):
def number_of_ligatures(word: str) -> int:
return sum(1 for c in word if c in LIGATURES)


4 changes: 2 additions & 2 deletions pyiso4/prefix_tree.py
@@ -32,7 +32,7 @@ def insert(self, key: str, obj: Any, position: int = 0) -> None:
self.objs.append((key, obj))
self._split(position)

def _split(self, position) -> None:
def _split(self, position: int) -> None:
"""Split the node if it contains too much objects"""

if not self.split and self.char is not None and len(self.objs) > self.MAX_OBJS:
@@ -76,7 +76,7 @@ class PrefixTree:
"""Prefix tree that return correct results up to a certain point (depending on `Node.MAX_OBJS`).
"""

def __init__(self):
def __init__(self) -> None:
self.root = Node('')

def insert(self, key: str, obj: Any) -> None:
Empty file added pyiso4/py.typed
Empty file.
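This empty marker file, together with the [tool.setuptools.package-data] entry added below, is what actually advertises the typing support: per PEP 561, type checkers only use a package's inline annotations when py.typed ships with it. A hypothetical downstream snippet (names taken from this diff) that mypy can now check against pyiso4's own signatures:

# downstream.py -- hypothetical consumer code, assuming pyiso4 is installed
from pyiso4.ltwa import Abbreviate

abbreviator = Abbreviate.create()
# checked against __call__(self, title: str, ...) -> str from ltwa.py
abbreviation: str = abbreviator('Journal of Physical Chemistry')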
4 changes: 2 additions & 2 deletions pyiso4/script.py
@@ -6,7 +6,7 @@
from pyiso4.ltwa import Abbreviate


def get_arguments_parser():
def get_arguments_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description=pyiso4.__doc__)
parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + pyiso4.__version__)

@@ -30,7 +30,7 @@ def get_arguments_parser():
return parser


def main():
def main() -> None:
args = get_arguments_parser().parse_args()

# load LTWA
31 changes: 21 additions & 10 deletions pyproject.toml
@@ -1,43 +1,54 @@
[build-system]
build-backend = "setuptools.build_meta"
Review thread on this line:

Owner: Is this mandatory? :)

alexfikl (Contributor Author): If I understand correctly, setuptools is the default at the moment (https://peps.python.org/pep-0518/#build-system-table), but I guess it's good to be explicit?

Owner: OK, then :)

requires = ["setuptools>=62"]

[project]
name = "pyiso4"
dynamic = ["version"]
authors = [
{name = "Pierre Beaujean", email = "pierre.beaujean@unamur.be"},
]
description = "Abbreviate a scientific journal title following the ISO-4 rules"
readme = "README.md"
license = {file = "LICENSE"}
authors = [{name = "Pierre Beaujean", email = "pierre.beaujean@unamur.be"}]
requires-python = ">=3.8"
classifiers = [
"Development Status :: 4 - Beta",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]

dependencies = [
'unidecode'
"unidecode",
]

[project.optional-dependencies]
dev = [
"flake8",
"flake8-quotes",
"autopep8",
"bump2version",
"flake8",
"flake8-quotes",
"mypy",
]

[project.scripts]
iso4abbreviate = 'pyiso4.script:main'
iso4abbreviate = "pyiso4.script:main"

[tool.setuptools]
packages = ['pyiso4']
packages = ["pyiso4"]

[tool.setuptools.package-data]
pyiso4 = ["py.typed"]

[tool.setuptools.dynamic]
version = {attr = "pyiso4.__version__"}

[tool.mypy]
strict = true
hide_error_codes = false
warn_unused_ignores = true
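For context, strict = true turns on mypy's full set of strictness flags, including disallow_untyped_defs and no_implicit_optional, which is why every function in this PR gains annotations. A minimal sketch (not from the PR) of the kind of error it surfaces:

# Flagged under strict mode: missing parameter and return annotations.
# def shorten(title):
#     return title[:5]

# Accepted: fully annotated.
def shorten(title: str) -> str:
    return title[:5]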
26 changes: 12 additions & 14 deletions tests/tests.py
@@ -1,4 +1,5 @@
import unittest
from typing import Any

from pyiso4.lexer import Lexer, TokenType
from pyiso4.ltwa import Pattern, Abbreviate
@@ -8,7 +9,7 @@
class TestNormalize(unittest.TestCase):
"""Test the unicode normalization"""

def test_normalize(self):
def test_normalize(self) -> None:
tests = [
('test', 'test'),
('abbréviation', 'abbreviation'),
@@ -21,7 +22,7 @@ def test_normalize(self):
for inp, out in tests:
self.assertEqual(out, normalize(inp, Level.NORMAL))

def test_normalize_extra(self):
def test_normalize_extra(self) -> None:
tests = [
('TeSt', 'test'),
("Côte-d'Azur", 'cote d azur')
@@ -30,17 +31,14 @@ def test_normalize_extra(self):
for inp, out in tests:
self.assertEqual(out, normalize(inp, Level.HARD))

def test_ligatures(self):
def test_ligatures(self) -> None:
self.assertEqual(number_of_ligatures('test'), 0)
self.assertEqual(number_of_ligatures('coeur'), 0)
self.assertEqual(number_of_ligatures('cœur'), 1)


class TestLexer(unittest.TestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def test_stopword(self):
def test_stopword(self) -> None:
stopwords = ['x', 'y']
text = ' '.join(stopwords)

@@ -54,13 +52,13 @@ def test_stopword(self):
for t in tokens[:-1]: # skip EOS
self.assertEqual(t.type, TokenType.STOPWORD)

def test_token_position(self):
def test_token_position(self) -> None:
text = 'this is a test-case for you'
for t in Lexer(text, []).tokenize():
if t.position >= 0:
self.assertEqual(text[t.position], t.value[0])

def test_hyphenated_words(self):
def test_hyphenated_words(self) -> None:
cpd1 = 'état'
cpd2 = 'nation'
text = '{}-{}'.format(cpd1, cpd2)
@@ -71,7 +69,7 @@ def test_hyphenated_words(self):
self.assertEqual(tokens[2].type, TokenType.WORD)
self.assertEqual(tokens[2].value, cpd2)

def test_surname_as_abbreviation(self):
def test_surname_as_abbreviation(self) -> None:
abbrv = 'A.'
text = 'Legacy of {} Einstein'.format(abbrv)
tokens = list(Lexer(text, ['of']).tokenize())
@@ -82,7 +80,7 @@ def test_surname_as_abbreviation(self):


class TestPattern(unittest.TestCase):
def test_pattern_match(self):
def test_pattern_match(self) -> None:
text = 'abc'

# no dash
@@ -114,7 +112,7 @@ def test_pattern_match(self):
self.assertTrue(pattern.match(word + 's')) # plural form
self.assertFalse(pattern.match(word + 'x')) # not an inflexion

def test_patter_match_on_sentence(self):
def test_patter_match_on_sentence(self) -> None:
# no dash
text = 'abc'
pattern_without_dash = Pattern.from_line('{}\tx\tmul'.format(text))
@@ -133,11 +131,11 @@ def test_patter_match_on_sentence(self):


class TestAbbreviate(unittest.TestCase):
def __init__(self, *args, **kwargs):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.abbreviate = Abbreviate.create()

def test_abbreviations(self):
def test_abbreviations(self) -> None:
with open('tests/tests.tsv') as f:
for line in f.readlines():
fields = line.split('\t')