From 7955b63fcba85737cb09b1e1e3be243582e40eee Mon Sep 17 00:00:00 2001 From: Karl Besser Date: Fri, 5 Jul 2024 19:35:26 -0400 Subject: [PATCH] Fix missed detection of single letter part names Fix the misclassification of single-letter part names when the single letter is also a stopword. --- pyiso4/lexer.py | 3 ++- tests/tests.tsv | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pyiso4/lexer.py b/pyiso4/lexer.py index 3c369a6..7156e39 100644 --- a/pyiso4/lexer.py +++ b/pyiso4/lexer.py @@ -132,7 +132,7 @@ def yield_hyphenated(word: str, base_pos: int) -> Iterable[Token]: yield Token(TokenType.PART, word, self.start_word) was_part = self.count # check if ordinal (preceded by PART) - elif IS_ORDINAL.match(word) and self.count == was_part + 1: + elif IS_ORDINAL.fullmatch(word) and self.count == was_part + 1: yield Token(TokenType.ORDINAL, word, self.start_word) # check if article (after ordinal, so "a" is detected as ordinal if preceded by PART) elif lower_word in ARTICLES: @@ -155,6 +155,7 @@ def yield_hyphenated(word: str, base_pos: int) -> Iterable[Token]: # yield the remaining symbols, if any if len(end_symbols) > 0: yield Token(TokenType.SYMBOLS, end_symbols, self.pos - len(end_symbols)) + was_part = self.count self.next() diff --git a/tests/tests.tsv b/tests/tests.tsv index bcea7ee..31fea47 100644 --- a/tests/tests.tsv +++ b/tests/tests.tsv @@ -43,3 +43,9 @@ Norsk Militært Tidsskrift Nor. Mil. Tidsskr. Proceedings of the 2024 Conference on Science Proc. 2024 Conf. Sci. IEEE Power and Energy Magazine IEEE Power Energy Mag. IEEE Transactions on Automatic Control IEEE Trans. Autom. Control +E.S.A. bulletin E.S.A. bull. +Acta Universitatis Carolinae. Iuridica Acta Univ. Carol., Iurid. +Physical Review. A Phys. Rev., A +Physical Review. D Phys. Rev., D +Physical Review. E Phys. Rev., E +Physical Review. I Phys. Rev., I