Advertise typing support #10

Merged · 2 commits · Mar 1, 2024
5 changes: 4 additions & 1 deletion .github/workflows/tests.yml
@@ -30,6 +30,9 @@ jobs:
- name: Lint
run: |
make lint
- name: Typing
run: |
make mypy
- name: Test
run: |
make test
9 changes: 6 additions & 3 deletions Makefile
@@ -8,10 +8,13 @@ help:


install:
pip3 install -e .[dev]
python -m pip install --editable '.[dev]'

lint:
flake8 pyiso4 tests --max-line-length=120 --ignore=N802
python -m flake8 pyiso4 tests --max-line-length=120 --ignore=N802

mypy:
python -m mypy pyiso4 tests

test:
python -m unittest discover -s tests
10 changes: 5 additions & 5 deletions pyiso4/lexer.py
@@ -1,4 +1,4 @@
from typing import List, Iterable
from typing import List, Iterable, Optional
from enum import Enum, unique
from unicodedata import normalize
import re
@@ -46,17 +46,17 @@ def __repr__(self) -> str:


class Lexer:
def __init__(self, inp, stopwords: List[str]):
def __init__(self, inp: str, stopwords: List[str]) -> None:
self.input = normalize('NFC', inp)
Review thread on this line:

Owner: That should be "Lexer" not None, then :)

alexfikl (Contributor Author), Mar 1, 2024: __init__ always returns None (__new__ returns the type of the object):
https://docs.python.org/3/reference/datamodel.html#object.__init__

Owner: Oh, ok, I thought they were returning the Object itself. Good to know :)

self.pos = 0
self.count = -1
self.start_word = 0
self.current_word = None
self.current_word: Optional[str] = None
self.stopwords = stopwords

self.next()

def _skip_space(self):
def _skip_space(self) -> None:
while self.pos < len(self.input) and self.input[self.pos] in SPACES:
self.pos += 1

@@ -77,7 +77,7 @@ def next(self) -> None:
self.count += 1

def tokenize(self) -> Iterable[Token]:
def yield_hyphenated(word: str, base_pos: int):
def yield_hyphenated(word: str, base_pos: int) -> Iterable[Token]:
is_first = True
len_ = 0
for w in word.split('-'):
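A footnote to the review thread above: a minimal sketch (not part of this PR) of the convention being discussed. __init__ is annotated as returning None, and the type checker still infers the constructed type at the call site:

class Lexer:
    # __init__ initializes an already-created instance and always returns
    # None; the instance itself comes from __new__. mypy therefore expects
    # the explicit '-> None' on __init__.
    def __init__(self, inp: str) -> None:
        self.input = inp

lexer = Lexer('Journal of Physics')  # inferred as Lexer, not None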
45 changes: 27 additions & 18 deletions pyiso4/ltwa.py
@@ -1,10 +1,10 @@
from typing import List, Tuple
from typing import List, Tuple, Optional, Union
from unidecode import unidecode
import re
import pathlib

from pyiso4.prefix_tree import PrefixTree
from pyiso4.lexer import Lexer, TokenType
from pyiso4.lexer import Lexer, Token, TokenType
from pyiso4.normalize_string import normalize, Level, BOUNDARY, number_of_ligatures


@@ -27,7 +27,7 @@ def normalize(inp: str) -> str:
return normalize(inp, Level.NORMAL).lower()

@classmethod
def from_line(cls, line: str):
def from_line(cls, line: str) -> 'Pattern':
"""Constructed from a LTWA csv line"""

fields = line.split('\t')
@@ -55,7 +55,7 @@ def to_key(self) -> str:
else:
return self.pattern

def match(self, sentence: str, langs: List[str] = None) -> bool:
def match(self, sentence: str, langs: Optional[List[str]] = None) -> bool:
"""Check if the pattern matches the begining of ``sentence``.
Assume that it has been normalized.
"""
@@ -106,7 +106,7 @@ def match(self, sentence: str, langs: List[str] = None) -> bool:
else: # if that's a boundary, then its a match
return BOUNDARY.match(sentence[final_pos + 1:]) is not None

def __repr__(self):
def __repr__(self) -> str:
return 'Pattern({}, {})'.format(self.pattern, self.replacement)


@@ -121,10 +121,10 @@ def __init__(self, ltwa_prefix: PrefixTree, ltwa_suffix: PrefixTree, stopwords:
self.stopwords = stopwords

@classmethod
def create(
cls,
ltwa_file: str = _here / 'LTWA_20210702.csv',
stopwords: str = _here / 'stopwords.txt'):
def create(cls,
ltwa_file: Union[str, pathlib.Path] = _here / 'LTWA_20210702.csv',
stopwords: Union[str, pathlib.Path] = _here / 'stopwords.txt',
) -> 'Abbreviate':
"""Create an object from the LTWA CSV file and a newline-separated list of stopwords"""

ltwa_prefix = PrefixTree()
@@ -154,15 +154,17 @@ def create(

return cls(ltwa_prefix, ltwa_suffix, stopwds)

def _potential_matches(self, sentence: str, langs: List[str] = None) -> List[Pattern]:
def _potential_matches(self,
sentence: str,
langs: Optional[List[str]] = None) -> List[Pattern]:
# look into prefix
results = self.ltwa_prefix.search(sentence)

# look into suffixes
results += self.ltwa_suffix.search(str(reversed(sentence)))

# remove everything that does not match
results = filter(lambda p: p.match(sentence, langs), results)
results = list(filter(lambda p: p.match(sentence, langs), results))

# return longer matches first, with ending dashes if possible
return sorted(
@@ -175,15 +177,19 @@ def match_capitalization_and_diacritic(abbrv: str, original: str) -> str:
"""Matches the capitalization and diacritics of the `original` word, as long as they are similar
"""

abbrv = list(normalize(abbrv, Level.SOFT))
for i, c in enumerate(abbrv):
normalized_abbrv = list(normalize(abbrv, Level.SOFT))
for i, c in enumerate(normalized_abbrv):
unided = unidecode(original[i])
if unidecode(c) in [unided.lower(), unided.upper()]:
abbrv[i] = original[i]
normalized_abbrv[i] = original[i]

return ''.join(abbrv)
return ''.join(normalized_abbrv)

def abbreviate(self, sentence: str, fallback: str, guide: str, langs: List[str] = None) -> Tuple[str, int]:
def abbreviate(self,
sentence: str,
fallback: str,
guide: str,
langs: Optional[List[str]] = None) -> Tuple[str, int]:
"""Abbreviate the beginning of ``sentence`` by looking for an appropriate pattern.
If not found, use ``fallback``. If found, matches the capitalization given by ``guide``.
Also returns the length of the sentence that was replaced.
@@ -199,7 +205,10 @@ def abbreviate(self, sentence: str, fallback: str, guide: str, langs: List[str]

return fallback, len(fallback)

def __call__(self, title: str, remove_part: bool = True, langs: List[str] = None) -> str:
def __call__(self,
title: str,
remove_part: bool = True,
langs: Optional[List[str]] = None) -> str:
"""Abbreviate a title according to the rules of Section 7 in the ISSN manual
(https://www.issn.org/understanding-the-issn/assignment-rules/issn-manual/)

@@ -219,7 +228,7 @@ def __call__(self, title: str, remove_part: bool = True, langs: List[str] = None) -> str:
title_normalized = Pattern.normalize(title)

lexer = Lexer(title_soft_normalized, self.stopwords)
tokens = []
tokens: List[Token] = []
prev_article = None

# filter tokens
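The recurring signature change in this file, langs: List[str] = None becoming langs: Optional[List[str]] = None, is needed because mypy does not implicitly promote a parameter with a None default to Optional. A minimal sketch, assuming mypy's default no_implicit_optional behavior (also part of strict mode); the function name and body are illustrative only:

from typing import List, Optional

# Rejected by mypy: List[str] does not admit None, and the None default
# is not implicitly widened to Optional[List[str]].
# def match(sentence: str, langs: List[str] = None) -> bool: ...

# Accepted, matching the style used throughout this PR:
def match(sentence: str, langs: Optional[List[str]] = None) -> bool:
    if langs is None:
        langs = []
    return len(sentence) > 0  # placeholder body for the sketch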
2 changes: 1 addition & 1 deletion pyiso4/normalize_string.py
@@ -9,7 +9,7 @@
LIGATURES = 'ŒœÆæ'


def number_of_ligatures(word: str):
def number_of_ligatures(word: str) -> int:
return sum(1 for c in word if c in LIGATURES)


4 changes: 2 additions & 2 deletions pyiso4/prefix_tree.py
@@ -32,7 +32,7 @@ def insert(self, key: str, obj: Any, position: int = 0) -> None:
self.objs.append((key, obj))
self._split(position)

def _split(self, position) -> None:
def _split(self, position: int) -> None:
"""Split the node if it contains too much objects"""

if not self.split and self.char is not None and len(self.objs) > self.MAX_OBJS:
@@ -76,7 +76,7 @@ class PrefixTree:
"""Prefix tree that return correct results up to a certain point (depending on `Node.MAX_OBJS`).
"""

def __init__(self):
def __init__(self) -> None:
self.root = Node('')

def insert(self, key: str, obj: Any) -> None:
Empty file added pyiso4/py.typed
Empty file.
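This empty marker file, together with the [tool.setuptools.package-data] entry added below, is what actually advertises the typing support: per PEP 561, type checkers only use a package's inline annotations when py.typed ships with it. A hypothetical downstream snippet (names taken from this diff) that mypy can now check against pyiso4's own signatures:

# downstream.py -- hypothetical consumer code, assuming pyiso4 is installed
from pyiso4.ltwa import Abbreviate

abbreviator = Abbreviate.create()
# checked against __call__(self, title: str, ...) -> str from ltwa.py
abbreviation: str = abbreviator('Journal of Physical Chemistry')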
4 changes: 2 additions & 2 deletions pyiso4/script.py
@@ -6,7 +6,7 @@
from pyiso4.ltwa import Abbreviate


def get_arguments_parser():
def get_arguments_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description=pyiso4.__doc__)
parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + pyiso4.__version__)

@@ -30,7 +30,7 @@ def get_arguments_parser():
return parser


def main():
def main() -> None:
args = get_arguments_parser().parse_args()

# load LTWA
31 changes: 21 additions & 10 deletions pyproject.toml
@@ -1,43 +1,54 @@
[build-system]
build-backend = "setuptools.build_meta"
Review thread on this line:

Owner: Is this mandatory? :)

alexfikl (Contributor Author): If I understand correctly, setuptools is the default at the moment (https://peps.python.org/pep-0518/#build-system-table), but I guess it's good to be explicit?

Owner: OK, then :)

requires = ["setuptools>=62"]

[project]
name = "pyiso4"
dynamic = ["version"]
authors = [
{name = "Pierre Beaujean", email = "pierre.beaujean@unamur.be"},
]
description = "Abbreviate a scientific journal title following the ISO-4 rules"
readme = "README.md"
license = {file = "LICENSE"}
authors = [{name = "Pierre Beaujean", email = "pierre.beaujean@unamur.be"}]
requires-python = ">=3.8"
classifiers = [
"Development Status :: 4 - Beta",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]

dependencies = [
'unidecode'
"unidecode",
]

[project.optional-dependencies]
dev = [
"flake8",
"flake8-quotes",
"autopep8",
"bump2version",
"flake8",
"flake8-quotes",
"mypy",
]

[project.scripts]
iso4abbreviate = 'pyiso4.script:main'
iso4abbreviate = "pyiso4.script:main"

[tool.setuptools]
packages = ['pyiso4']
packages = ["pyiso4"]

[tool.setuptools.package-data]
pyiso4 = ["py.typed"]

[tool.setuptools.dynamic]
version = {attr = "pyiso4.__version__"}

[tool.mypy]
strict = true
hide_error_codes = false
warn_unused_ignores = true
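For context, strict = true turns on mypy's full set of strictness flags, including disallow_untyped_defs and no_implicit_optional, which is why every function in this PR gains annotations. A minimal sketch (not from the PR) of the kind of error it surfaces:

# Flagged under strict mode: missing parameter and return annotations.
# def shorten(title):
#     return title[:5]

# Accepted: fully annotated.
def shorten(title: str) -> str:
    return title[:5]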
26 changes: 12 additions & 14 deletions tests/tests.py
@@ -1,4 +1,5 @@
import unittest
from typing import Any

from pyiso4.lexer import Lexer, TokenType
from pyiso4.ltwa import Pattern, Abbreviate
@@ -8,7 +9,7 @@
class TestNormalize(unittest.TestCase):
"""Test the unicode normalization"""

def test_normalize(self):
def test_normalize(self) -> None:
tests = [
('test', 'test'),
('abbréviation', 'abbreviation'),
@@ -21,7 +22,7 @@ def test_normalize(self):
for inp, out in tests:
self.assertEqual(out, normalize(inp, Level.NORMAL))

def test_normalize_extra(self):
def test_normalize_extra(self) -> None:
tests = [
('TeSt', 'test'),
("Côte-d'Azur", 'cote d azur')
@@ -30,17 +31,14 @@ def test_normalize_extra(self):
for inp, out in tests:
self.assertEqual(out, normalize(inp, Level.HARD))

def test_ligatures(self):
def test_ligatures(self) -> None:
self.assertEqual(number_of_ligatures('test'), 0)
self.assertEqual(number_of_ligatures('coeur'), 0)
self.assertEqual(number_of_ligatures('cœur'), 1)


class TestLexer(unittest.TestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def test_stopword(self):
def test_stopword(self) -> None:
stopwords = ['x', 'y']
text = ' '.join(stopwords)

@@ -54,13 +52,13 @@ def test_stopword(self):
for t in tokens[:-1]: # skip EOS
self.assertEqual(t.type, TokenType.STOPWORD)

def test_token_position(self):
def test_token_position(self) -> None:
text = 'this is a test-case for you'
for t in Lexer(text, []).tokenize():
if t.position >= 0:
self.assertEqual(text[t.position], t.value[0])

def test_hyphenated_words(self):
def test_hyphenated_words(self) -> None:
cpd1 = 'état'
cpd2 = 'nation'
text = '{}-{}'.format(cpd1, cpd2)
@@ -71,7 +69,7 @@ def test_hyphenated_words(self):
self.assertEqual(tokens[2].type, TokenType.WORD)
self.assertEqual(tokens[2].value, cpd2)

def test_surname_as_abbreviation(self):
def test_surname_as_abbreviation(self) -> None:
abbrv = 'A.'
text = 'Legacy of {} Einstein'.format(abbrv)
tokens = list(Lexer(text, ['of']).tokenize())
@@ -82,7 +80,7 @@ def test_surname_as_abbreviation(self):


class TestPattern(unittest.TestCase):
def test_pattern_match(self):
def test_pattern_match(self) -> None:
text = 'abc'

# no dash
@@ -114,7 +112,7 @@ def test_pattern_match(self):
self.assertTrue(pattern.match(word + 's')) # plural form
self.assertFalse(pattern.match(word + 'x')) # not an inflexion

def test_patter_match_on_sentence(self):
def test_patter_match_on_sentence(self) -> None:
# no dash
text = 'abc'
pattern_without_dash = Pattern.from_line('{}\tx\tmul'.format(text))
@@ -133,11 +131,11 @@ def test_patter_match_on_sentence(self):


class TestAbbreviate(unittest.TestCase):
def __init__(self, *args, **kwargs):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.abbreviate = Abbreviate.create()

def test_abbreviations(self):
def test_abbreviations(self) -> None:
with open('tests/tests.tsv') as f:
for line in f.readlines():
fields = line.split('\t')