From 5fda810ef51900b7d46a728d247a7f70cb6543cd Mon Sep 17 00:00:00 2001 From: cpburnz <2126043+cpburnz@users.noreply.github.com> Date: Sat, 9 Dec 2023 17:13:23 -0500 Subject: [PATCH] Fix issue 81 --- CHANGES.rst | 2 + pathspec/gitignore.py | 3 - pathspec/patterns/gitwildmatch.py | 208 +++++++++++++++--------------- tests/test_02_gitwildmatch.py | 47 ++++++- 4 files changed, 147 insertions(+), 113 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 0610f84..9916547 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,6 +22,7 @@ New features: Bug fixes: +- `Issue #81`_: GitIgnoreSpec behaviors differ from git. - `Pull #83`_: Fix ReadTheDocs builds. Improvements: @@ -32,6 +33,7 @@ Improvements: - Improve type hint on *on_error* parameter on `pathspec.util.iter_tree_entries()`. +.. _`Issue #81`: https://github.com/cpburnz/python-pathspec/issues/81 .. _`Pull #82`: https://github.com/cpburnz/python-pathspec/pull/82 .. _`Pull #83`: https://github.com/cpburnz/python-pathspec/pull/83 diff --git a/pathspec/gitignore.py b/pathspec/gitignore.py index a8d3c43..994a2c7 100644 --- a/pathspec/gitignore.py +++ b/pathspec/gitignore.py @@ -138,9 +138,6 @@ def _match_file( # Check for directory marker. dir_mark = match.match.groupdict().get(_DIR_MARK) - # TODO: A exclude (whitelist) dir pattern here needs to deprioritize - # for 81-c. - if dir_mark: # Pattern matched by a directory pattern. priority = 1 diff --git a/pathspec/patterns/gitwildmatch.py b/pathspec/patterns/gitwildmatch.py index 5c00086..6a3d6d5 100644 --- a/pathspec/patterns/gitwildmatch.py +++ b/pathspec/patterns/gitwildmatch.py @@ -1,15 +1,14 @@ """ -This module implements Git's wildmatch pattern matching which itself is -derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" -files. +This module implements Git's wildmatch pattern matching which itself is derived +from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files. """ import re import warnings from typing import ( AnyStr, - Optional, - Tuple) + Optional, # Replaced by `X | None` in 3.10. + Tuple) # Replaced by `tuple` in 3.9. from .. import util from ..pattern import RegexPattern @@ -36,8 +35,8 @@ class GitWildMatchPatternError(ValueError): class GitWildMatchPattern(RegexPattern): """ - The :class:`GitWildMatchPattern` class represents a compiled Git - wildmatch pattern. + The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch + pattern. """ # Keep the dict-less class hierarchy. @@ -51,13 +50,12 @@ def pattern_to_regex( """ Convert the pattern into a regular expression. - *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert - into a regular expression. + *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a + regular expression. - Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, - or :data:`None`); and whether matched files should be included - (:data:`True`), excluded (:data:`False`), or if it is a - null-operation (:data:`None`). + Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or + :data:`None`); and whether matched files should be included (:data:`True`), + excluded (:data:`False`), or if it is a null-operation (:data:`None`). """ if isinstance(pattern, str): return_type = str @@ -70,51 +68,52 @@ def pattern_to_regex( original_pattern = pattern if pattern.endswith('\\ '): - # EDGE CASE: Spaces can be escaped with backslash. - # If a pattern that ends with backslash followed by a space, - # only strip from left. + # EDGE CASE: Spaces can be escaped with backslash. If a pattern that ends + # with backslash followed by a space, only strip from left. pattern = pattern.lstrip() else: pattern = pattern.strip() if pattern.startswith('#'): - # A pattern starting with a hash ('#') serves as a comment - # (neither includes nor excludes files). Escape the hash with a - # back-slash to match a literal hash (i.e., '\#'). + # A pattern starting with a hash ('#') serves as a comment (neither + # includes nor excludes files). Escape the hash with a back-slash to match + # a literal hash (i.e., '\#'). regex = None include = None elif pattern == '/': - # EDGE CASE: According to `git check-ignore` (v2.4.1), a single - # '/' does not match any file. + # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does + # not match any file. regex = None include = None elif pattern: if pattern.startswith('!'): - # A pattern starting with an exclamation mark ('!') negates the - # pattern (exclude instead of include). Escape the exclamation - # mark with a back-slash to match a literal exclamation mark - # (i.e., '\!'). + # A pattern starting with an exclamation mark ('!') negates the pattern + # (exclude instead of include). Escape the exclamation mark with a + # back-slash to match a literal exclamation mark (i.e., '\!'). include = False # Remove leading exclamation mark. pattern = pattern[1:] else: include = True - # Allow a regex override for edge cases that cannot be handled - # through normalization. + # Allow a regex override for edge cases that cannot be handled through + # normalization. override_regex = None # Split pattern into segments. pattern_segs = pattern.split('/') + # Check whether the pattern is specifically a directory pattern before + # normalization. + is_dir_pattern = not pattern_segs[-1] + # Normalize pattern to make processing easier. - # EDGE CASE: Deal with duplicate double-asterisk sequences. - # Collapse each sequence down to one double-asterisk. Iterate over - # the segments in reverse and remove the duplicate double - # asterisks as we go. + # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each + # sequence down to one double-asterisk. Iterate over the segments in + # reverse and remove the duplicate double asterisks as we go. for i in range(len(pattern_segs) - 1, 0, -1): prev = pattern_segs[i-1] seg = pattern_segs[i] @@ -122,45 +121,42 @@ def pattern_to_regex( del pattern_segs[i] if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]: - # EDGE CASE: The '**/' pattern should match everything except - # individual files in the root directory. This case cannot be - # adequately handled through normalization. Use the override. + # EDGE CASE: The '**/' pattern should match everything except individual + # files in the root directory. This case cannot be adequately handled + # through normalization. Use the override. override_regex = f'^.+(?P<{_DIR_MARK}>/).*$' if not pattern_segs[0]: - # A pattern beginning with a slash ('/') will only match paths - # directly on the root directory instead of any descendant - # paths. So, remove empty first segment to make pattern relative - # to root. + # A pattern beginning with a slash ('/') will only match paths directly + # on the root directory instead of any descendant paths. So, remove + # empty first segment to make pattern relative to root. del pattern_segs[0] elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]): - # A single pattern without a beginning slash ('/') will match - # any descendant path. This is equivalent to "**/{pattern}". So, - # prepend with double-asterisks to make pattern relative to - # root. - # EDGE CASE: This also holds for a single pattern with a - # trailing slash (e.g. dir/). + # A single pattern without a beginning slash ('/') will match any + # descendant path. This is equivalent to "**/{pattern}". So, prepend + # with double-asterisks to make pattern relative to root. + # - EDGE CASE: This also holds for a single pattern with a trailing + # slash (e.g. dir/). if pattern_segs[0] != '**': pattern_segs.insert(0, '**') else: - # EDGE CASE: A pattern without a beginning slash ('/') but - # contains at least one prepended directory (e.g. - # "dir/{pattern}") should not match "**/dir/{pattern}", - # according to `git check-ignore` (v2.4.1). + # EDGE CASE: A pattern without a beginning slash ('/') but contains at + # least one prepended directory (e.g. "dir/{pattern}") should not match + # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1). pass if not pattern_segs: - # After resolving the edge cases, we end up with no pattern at - # all. This must be because the pattern is invalid. + # After resolving the edge cases, we end up with no pattern at all. This + # must be because the pattern is invalid. raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") if not pattern_segs[-1] and len(pattern_segs) > 1: - # A pattern ending with a slash ('/') will match all descendant - # paths if it is a directory but not if it is a regular file. - # This is equivalent to "{pattern}/**". So, set last segment to - # a double-asterisk to include all descendants. + # A pattern ending with a slash ('/') will match all descendant paths if + # it is a directory but not if it is a regular file. This is equivalent + # to "{pattern}/**". So, set last segment to a double-asterisk to + # include all descendants. pattern_segs[-1] = '**' if override_regex is None: @@ -171,21 +167,27 @@ def pattern_to_regex( for i, seg in enumerate(pattern_segs): if seg == '**': if i == 0 and i == end: - # A pattern consisting solely of double-asterisks ('**') - # will match every path. - output.append(f'[^/]+(?:(?P<{_DIR_MARK}>/).*)?') + # A pattern consisting solely of double-asterisks ('**') will + # match every path. + output.append(f'[^/]+(?:/.*)?') + elif i == 0: # A normalized pattern beginning with double-asterisks # ('**') will match any leading path segments. output.append('(?:.+/)?') need_slash = False + elif i == end: - # A normalized pattern ending with double-asterisks ('**') - # will match any trailing path segments. - output.append(f'(?P<{_DIR_MARK}>/).*') + # A normalized pattern ending with double-asterisks ('**') will + # match any trailing path segments. + if is_dir_pattern: + output.append(f'(?P<{_DIR_MARK}>/).*') + else: + output.append(f'/.*') + else: - # A pattern with inner double-asterisks ('**') will match - # multiple (or zero) inner path segments. + # A pattern with inner double-asterisks ('**') will match multiple + # (or zero) inner path segments. output.append('(?:/.+)?') need_slash = True @@ -197,9 +199,9 @@ def pattern_to_regex( output.append('[^/]+') if i == end: - # A pattern ending without a slash ('/') will match a file - # or a directory (with paths underneath it). E.g., "foo" - # matches "foo", "foo/bar", "foo/bar/baz", etc. + # A pattern ending without a slash ('/') will match a file or a + # directory (with paths underneath it). E.g., "foo" matches "foo", + # "foo/bar", "foo/bar/baz", etc. output.append(f'(?:(?P<{_DIR_MARK}>/).*)?') need_slash = True @@ -215,9 +217,9 @@ def pattern_to_regex( raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e if i == end: - # A pattern ending without a slash ('/') will match a file - # or a directory (with paths underneath it). E.g., "foo" - # matches "foo", "foo/bar", "foo/bar/baz", etc. + # A pattern ending without a slash ('/') will match a file or a + # directory (with paths underneath it). E.g., "foo" matches "foo", + # "foo/bar", "foo/bar/baz", etc. output.append(f'(?:(?P<{_DIR_MARK}>/).*)?') need_slash = True @@ -230,8 +232,8 @@ def pattern_to_regex( regex = override_regex else: - # A blank pattern is a null-operation (neither includes nor - # excludes files). + # A blank pattern is a null-operation (neither includes nor excludes + # files). regex = None include = None @@ -243,16 +245,16 @@ def pattern_to_regex( @staticmethod def _translate_segment_glob(pattern: str) -> str: """ - Translates the glob pattern to a regular expression. This is used in - the constructor to translate a path segment glob pattern to its - corresponding regular expression. + Translates the glob pattern to a regular expression. This is used in the + constructor to translate a path segment glob pattern to its corresponding + regular expression. *pattern* (:class:`str`) is the glob pattern. Returns the regular expression (:class:`str`). """ - # NOTE: This is derived from `fnmatch.translate()` and is similar to - # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set. + # NOTE: This is derived from `fnmatch.translate()` and is similar to the + # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set. escape = False regex = '' @@ -272,41 +274,40 @@ def _translate_segment_glob(pattern: str) -> str: escape = True elif char == '*': - # Multi-character wildcard. Match any string (except slashes), - # including an empty string. + # Multi-character wildcard. Match any string (except slashes), including + # an empty string. regex += '[^/]*' elif char == '?': - # Single-character wildcard. Match any single character (except - # a slash). + # Single-character wildcard. Match any single character (except a + # slash). regex += '[^/]' elif char == '[': - # Bracket expression wildcard. Except for the beginning - # exclamation mark, the whole bracket expression can be used - # directly as regex but we have to find where the expression - # ends. + # Bracket expression wildcard. Except for the beginning exclamation + # mark, the whole bracket expression can be used directly as regex, but + # we have to find where the expression ends. # - "[][!]" matches ']', '[' and '!'. # - "[]-]" matches ']' and '-'. # - "[!]a-]" matches any character except ']', 'a' and '-'. j = i - + # Pass bracket expression negation. if j < end and (pattern[j] == '!' or pattern[j] == '^'): j += 1 - + # Pass first closing bracket if it is at the beginning of the # expression. if j < end and pattern[j] == ']': j += 1 - + # Find closing bracket. Stop once we reach the end or find it. while j < end and pattern[j] != ']': j += 1 if j < end: - # Found end of bracket expression. Increment j to be one past - # the closing bracket: + # Found end of bracket expression. Increment j to be one past the + # closing bracket: # # [...] # ^ ^ @@ -320,17 +321,16 @@ def _translate_segment_glob(pattern: str) -> str: expr += '^' i += 1 elif pattern[i] == '^': - # POSIX declares that the regex bracket expression negation - # "[^...]" is undefined in a glob pattern. Python's - # `fnmatch.translate()` escapes the caret ('^') as a - # literal. Git supports the using a caret for negation. - # Maintain consistency with Git because that is the expected - # behavior. + # POSIX declares that the regex bracket expression negation "[^...]" + # is undefined in a glob pattern. Python's `fnmatch.translate()` + # escapes the caret ('^') as a literal. Git supports the using a + # caret for negation. Maintain consistency with Git because that is + # the expected behavior. expr += '^' i += 1 - # Build regex bracket expression. Escape slashes so they are - # treated as literal slashes by regex as defined by POSIX. + # Build regex bracket expression. Escape slashes so they are treated + # as literal slashes by regex as defined by POSIX. expr += pattern[i:j].replace('\\', '\\\\') # Add regex bracket expression to regex result. @@ -340,8 +340,8 @@ def _translate_segment_glob(pattern: str) -> str: i = j else: - # Failed to find closing bracket, treat opening bracket as a - # bracket literal instead of as an expression. + # Failed to find closing bracket, treat opening bracket as a bracket + # literal instead of as an expression. regex += '\\[' else: @@ -358,8 +358,8 @@ def escape(s: AnyStr) -> AnyStr: """ Escape special characters in the given string. - *s* (:class:`str` or :class:`bytes`) a filename or a string that you - want to escape, usually before adding it to a ".gitignore". + *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to + escape, usually before adding it to a ".gitignore". Returns the escaped string (:class:`str` or :class:`bytes`). """ @@ -404,8 +404,8 @@ def _deprecated() -> None: Warn about deprecation. """ warnings.warn(( - "GitIgnorePattern ('gitignore') is deprecated. Use " - "GitWildMatchPattern ('gitwildmatch') instead." + "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern " + "('gitwildmatch') instead." ), DeprecationWarning, stacklevel=3) @classmethod @@ -416,6 +416,6 @@ def pattern_to_regex(cls, *args, **kw): cls._deprecated() return super(GitIgnorePattern, cls).pattern_to_regex(*args, **kw) -# Register `GitIgnorePattern` as "gitignore" for backward compatibility -# with v0.4. +# Register `GitIgnorePattern` as "gitignore" for backward compatibility with +# v0.4. util.register_pattern('gitignore', GitIgnorePattern) diff --git a/tests/test_02_gitwildmatch.py b/tests/test_02_gitwildmatch.py index be915cf..3d272a4 100644 --- a/tests/test_02_gitwildmatch.py +++ b/tests/test_02_gitwildmatch.py @@ -208,7 +208,7 @@ def test_03_child_double_asterisk(self): """ regex, include = GitWildMatchPattern.pattern_to_regex('spam/**') self.assertTrue(include) - self.assertEqual(regex, f'^spam{RE_DIR}.*$') + self.assertEqual(regex, "^spam/.*$") pattern = GitWildMatchPattern(re.compile(regex), include) results = set(filter(pattern.match_file, [ @@ -257,7 +257,7 @@ def test_03_only_double_asterisk(self): """ regex, include = GitWildMatchPattern.pattern_to_regex('**') self.assertTrue(include) - self.assertEqual(regex, f'^[^/]+{RE_SUB}$') + self.assertEqual(regex, f'^[^/]+(?:/.*)?$') pattern = GitWildMatchPattern(re.compile(regex), include) results = set(filter(pattern.match_file, [ @@ -314,7 +314,7 @@ def test_03_duplicate_leading_double_asterisk_edge_case(self): """ regex, include = GitWildMatchPattern.pattern_to_regex('**') self.assertTrue(include) - self.assertEqual(regex, f'^[^/]+{RE_SUB}$') + self.assertEqual(regex, "^[^/]+(?:/.*)?$") equivalent_regex, include = GitWildMatchPattern.pattern_to_regex('**/**') self.assertTrue(include) @@ -336,10 +336,14 @@ def test_03_duplicate_leading_double_asterisk_edge_case(self): self.assertTrue(include) self.assertEqual(regex, f'^(?:.+/)?api{RE_DIR}.*$') - equivalent_regex, include = GitWildMatchPattern.pattern_to_regex('**/api/**') + equivalent_regex, include = GitWildMatchPattern.pattern_to_regex(f'**/**/api/') self.assertTrue(include) self.assertEqual(equivalent_regex, regex) + regex, include = GitWildMatchPattern.pattern_to_regex('**/api/**') + self.assertTrue(include) + self.assertEqual(regex, "^(?:.+/)?api/.*$") + equivalent_regex, include = GitWildMatchPattern.pattern_to_regex('**/**/api/**/**') self.assertTrue(include) self.assertEqual(equivalent_regex, regex) @@ -817,10 +821,41 @@ def test_13_issue_77_2_regex(self): """ Test the resulting regex for regex bracket expression negation. """ - regex, include = GitWildMatchPattern.pattern_to_regex('a[^b]c') + regex, include = GitWildMatchPattern.pattern_to_regex("a[^b]c") self.assertTrue(include) - equiv_regex, include = GitWildMatchPattern.pattern_to_regex('a[!b]c') + equiv_regex, include = GitWildMatchPattern.pattern_to_regex("a[!b]c") self.assertTrue(include) self.assertEqual(regex, equiv_regex) + + def test_14_issue_81_a(self): + """ + Test ignoring files in a directory, scenario A. + """ + pattern = GitWildMatchPattern("!libfoo/**") + + self.assertEqual(pattern.regex.pattern, "^libfoo/.*$") + self.assertIs(pattern.include, False) + self.assertTrue(pattern.match_file("libfoo/__init__.py")) + + def test_14_issue_81_b(self): + """ + Test ignoring files in a directory, scenario B. + """ + pattern = GitWildMatchPattern("!libfoo/*") + + self.assertEqual(pattern.regex.pattern, f"^libfoo/[^/]+{RE_SUB}$") + self.assertIs(pattern.include, False) + self.assertTrue(pattern.match_file("libfoo/__init__.py")) + + def test_14_issue_81_c(self): + """ + Test ignoring files in a directory, scenario C. + """ + # GitWildMatchPattern will match the file, but GitIgnoreSpec should not. + pattern = GitWildMatchPattern("!libfoo/") + + self.assertEqual(pattern.regex.pattern, f"^(?:.+/)?libfoo{RE_DIR}.*$") + self.assertIs(pattern.include, False) + self.assertTrue(pattern.match_file("libfoo/__init__.py"))