Skip to content

Commit bd3fb7f

Browse files
committed
Refactor regex parsing to use Group class and update README for feature support
1 parent 5c45831 commit bd3fb7f

File tree

5 files changed

+30
-27
lines changed

5 files changed

+30
-27
lines changed

README.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ print(re.next()) # a2b
3838

3939
## What I plan to support
4040

41-
- [ ] Lookahead and lookbehind
41+
I think the following features would slow down the library too much, and they are not widely used. If you have suggestions on how to implement them efficiently, please let me know.
42+
43+
- [ ] Lookahead
44+
- [ ] Lookbehind
4245

4346
## What is not supported
4447

regex_enumerator/regex_enumerator.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .regex_parser import RegexParser
2-
from .regex_tree import RegexTree
2+
from .regex_tree import Group
33

44

55
class RegexEnumerator:
@@ -20,7 +20,7 @@ def __init__(self, regex: str, additional_charset: str | list[str] = None) -> No
2020

2121
charset = ''.join(sorted(set(default_charset + additional)))
2222
parser = RegexParser(regex, charset)
23-
self.regexTree: RegexTree = parser.parse()
23+
self.regexTree: Group = parser.parse()
2424
self.current: list[str] = list(self.regexTree.current)
2525
self.done: bool = self.regexTree.done and len(self.current) == 0
2626

regex_enumerator/regex_parser.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .regex_tree import Alternative, BackReference, CharClass, RegexTree
1+
from .regex_tree import Alternative, BackReference, CharClass, Group
22

33

44
class RegexError(Exception):
@@ -22,15 +22,15 @@ def __init__(self, regex: str, charset: str) -> None:
2222
self.regex = regex
2323
self.charset = charset
2424

25-
def parse(self) -> RegexTree:
25+
def parse(self) -> Group:
2626
self.index = 0
27-
return self._parseRegex(False)
27+
return self._parseGroup(False)
2828

29-
def _parseRegex(self, to_close: bool) -> RegexTree:
29+
def _parseGroup(self, to_close: bool) -> Group:
3030
alternatives: list[Alternative] = []
31-
elements: list[CharClass | RegexTree | BackReference] = []
32-
named_groups: dict[str, RegexTree] = {}
33-
ordered_groups: list[RegexTree] = []
31+
elements: list[CharClass | Group | BackReference] = []
32+
named_groups: dict[str, Group] = {}
33+
ordered_groups: list[Group] = []
3434
min_len_group, max_len_group = 1, 1
3535

3636
while self.index < len(self.regex):
@@ -53,16 +53,16 @@ def _parseRegex(self, to_close: bool) -> RegexTree:
5353
self.index += 1
5454
if name in named_groups:
5555
self._raise_error("Duplicate named group")
56-
subTree = self._parseRegex(True)
56+
subTree = self._parseGroup(True)
5757
named_groups[name] = subTree
5858
ordered_groups.append(subTree)
5959
elif self.regex[self.index] == ':':
6060
self.index += 1
61-
subTree = self._parseRegex(True)
61+
subTree = self._parseGroup(True)
6262
else:
6363
self._raise_error("Invalid group")
6464
else:
65-
subTree = self._parseRegex(True)
65+
subTree = self._parseGroup(True)
6666
ordered_groups.append(subTree)
6767
elements.append(subTree)
6868
case ')':
@@ -115,7 +115,7 @@ def _parseRegex(self, to_close: bool) -> RegexTree:
115115
self._raise_error("Unmatched opening parenthesis")
116116

117117
alternatives.append(Alternative(elements))
118-
return RegexTree(alternatives, min_len_group, max_len_group)
118+
return Group(alternatives, min_len_group, max_len_group)
119119

120120
def _parseBackReferenceLookahead(self) -> str | int | None:
121121
if len(self.regex) <= self.index:

regex_enumerator/regex_tree.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
class RegexTree:
1+
class Group:
22
pass
33

44

@@ -46,11 +46,11 @@ def next(self) -> list[str]:
4646

4747

4848
class BackReference:
49-
def __init__(self, reference: RegexTree, min_len: int, max_len: int | None):
49+
def __init__(self, reference: Group, min_len: int, max_len: int | None):
5050
self._min_len = min_len
5151
self._max_len = max_len
5252
self._index = 0
53-
self.reference: RegexTree = reference
53+
self.reference: Group = reference
5454
self.done = max_len == 0 or (
5555
reference.done and len(reference.current) == 0)
5656
self.current = self._first()
@@ -91,7 +91,7 @@ def next(self) -> dict[str, list[str]]:
9191

9292

9393
class Alternative:
94-
def __init__(self, elements: list[CharClass | RegexTree | BackReference]):
94+
def __init__(self, elements: list[CharClass | Group | BackReference]):
9595
self._index = 0
9696
self._elements = [e for e in elements if not e.done or len(e.current)]
9797
self._noBackreference = not any(isinstance(
@@ -116,9 +116,9 @@ def next(self) -> set[str]:
116116
index = 0
117117

118118
self._index = index
119-
result: list[tuple[str, dict[RegexTree, str]]] = []
119+
result: list[tuple[str, dict[Group, str]]] = []
120120

121-
if isinstance(self._elements[0], RegexTree) and len(self._elements[0].references):
121+
if isinstance(self._elements[0], Group) and len(self._elements[0].references):
122122
for string in self._elements[0].next() if index == 0 else self._elements[0].current:
123123
result.append((string, {self._elements[0]: string}))
124124
else:
@@ -138,7 +138,7 @@ def next(self) -> set[str]:
138138
for sfx in element.current[reference]:
139139
temp.append(
140140
(pfx[0] + sfx, pfx[1]))
141-
elif isinstance(element, RegexTree) and len(element.references):
141+
elif isinstance(element, Group) and len(element.references):
142142
for sfx in element.next() if i == index else element.current:
143143
for pfx in result:
144144
temp.append((pfx[0] + sfx, {**pfx[1], element: sfx}))
@@ -202,9 +202,9 @@ def _first(self) -> set[str]:
202202

203203
assert not isinstance(self._elements[0], BackReference)
204204

205-
result: list[tuple[str, dict[RegexTree, str]]] = []
205+
result: list[tuple[str, dict[Group, str]]] = []
206206

207-
if isinstance(self._elements[0], RegexTree) and len(self._elements[0].references):
207+
if isinstance(self._elements[0], Group) and len(self._elements[0].references):
208208
for char in self._elements[0].current:
209209
result.append((char, {self._elements[0]: char}))
210210
else:
@@ -214,7 +214,7 @@ def _first(self) -> set[str]:
214214
done = self._elements[0].done
215215

216216
for element in self._elements[1:]:
217-
temp: list[tuple[str, dict[RegexTree, str]]] = []
217+
temp: list[tuple[str, dict[Group, str]]] = []
218218
done = done and element.done
219219
if isinstance(element, BackReference):
220220
for pfx in result:
@@ -223,7 +223,7 @@ def _first(self) -> set[str]:
223223
for sfx in element.current[reference]:
224224
temp.append(
225225
(pfx[0] + sfx, pfx[1]))
226-
elif isinstance(element, RegexTree) and len(element.references):
226+
elif isinstance(element, Group) and len(element.references):
227227
for pfx in result:
228228
for sfx in element.current:
229229
temp.append((pfx[0] + sfx, {**pfx[1], element: sfx}))
@@ -238,7 +238,7 @@ def _first(self) -> set[str]:
238238
return {struct[0] for struct in result}
239239

240240

241-
class RegexTree:
241+
class Group:
242242
def __init__(self, alternatives: list[Alternative], min_len: int, max_len: int | None):
243243
self.references: list[BackReference] = []
244244
self._alternatives: list[Alternative] = alternatives

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setup(
77
name='regex_enumerator',
8-
version='0.10.1',
8+
version='1.0.0',
99
packages=find_packages(include=['regex_enumerator', 'regex_enumerator.*']),
1010
description='Enumerate all strings that match a given regex',
1111
author='Vincenzo Greco',

0 commit comments

Comments
 (0)