diff --git a/README.md b/README.md index 6ffdab8..77d03cd 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,17 @@ # Regex enumerator + [![PyPI version](https://img.shields.io/pypi/v/regex-enumerator.svg)](https://pypi.org/project/regex-enumerator/) This library is meant to generate all the strings that match a given regex pattern. It is written in python and uses no external libraries. ## Installation + ```bash pip install regex-enumerator ``` ## Usage + Here's an example of how to use the library: ```python @@ -24,25 +27,71 @@ print(re.next()) # a2b ``` ## What is supported + - [x] Character classes - [x] Quantifiers for character classes - [x] Quantifiers for groups - [x] Groups (named and unnamed) -- [x] Alternation -- [x] Escaped characters +- [x] Alternation +- [x] Escaped characters - [x] Backreferences (named and unnamed) +## What I plan to support + +- [ ] Lookahead and lookbehind +- [ ] Non-capturing groups + +## What is not supported + +- [ ] Unicode properties +- [ ] Word boundaries +- [ ] Anchors +- [ ] Non-greedy quantifiers + +## Charset + +The library supports ASCII characters by default. To handle Unicode characters, include them explicitly in your regex or define a custom character set. + +```python +from regex_enumerator import RegexEnumerator + +# Directly in regex +regex_enum = RegexEnumerator(r'£') +print(regex_enum.next()) # £ + +# Using additional_charset +unicode_charset = [chr(i) for i in range(ord('¡'), ord('£'))] +unicode_charset = ['¡', '¢', '£'] +unicode_charset = '¡¢£' +unicode_charset = ['¡¢', '£'] + +regex_enum = RegexEnumerator(r'.', additional_charset=unicode_charset) + +result = [] +while (char := regex_enum.next()) is not None: + result.append(char) + +assert '¡' in result +assert '¢' in result +assert '£' in result +``` + ## How it works + This library works by parsing the regex pattern into a tree structure. Once parsed, it performs a breadth-first search (BFS) on the tree to generate all matching strings. This ensures it does not get stuck on unbounded quantifiers for character classes or groups. ## Tests + The library includes a comprehensive test suite. To run the tests, use the following command: + ```bash pytest ``` ## License + I don't know what license to use, so I'm going to use the MIT license. If you have any suggestions, please let me know. ## Contributors + Feel free to contribute to this project. I'm open to suggestions and improvements. diff --git a/regex_enumerator/regex_enumerator.py b/regex_enumerator/regex_enumerator.py index 717e106..94098e5 100644 --- a/regex_enumerator/regex_enumerator.py +++ b/regex_enumerator/regex_enumerator.py @@ -3,8 +3,18 @@ class RegexEnumerator: - def __init__(self, regex: str): - parser = RegexParser(regex) + def __init__(self, regex: str, additional_charset: str | list[str] = None) -> None: + default_charset = [chr(c) for c in range(32, 127)] + + if additional_charset is None: + additional = [] + elif isinstance(additional_charset, list): + additional = list(''.join(additional_charset)) + else: + additional = list(additional_charset) + + charset = ''.join(sorted(set(default_charset + additional))) + parser = RegexParser(regex, charset) self.regexTree: RegexTree = parser.parse() self.current: list[str] = list(self.regexTree.current) self.done: bool = self.regexTree.done and len(self.current) == 0 diff --git a/regex_enumerator/regex_parser.py b/regex_enumerator/regex_parser.py index 8e3bdbc..7dabf76 100644 --- a/regex_enumerator/regex_parser.py +++ b/regex_enumerator/regex_parser.py @@ -1,6 +1,5 @@ from .regex_tree import Alternative, BackReference, CharClasses, RegexTree - class RegexError(Exception): def __init__(self, regex: str, index: int, message: str): self.regex = regex @@ -13,13 +12,13 @@ def __str__(self): class RegexParser: - CHARSET = [chr(c) for c in range(32, 127)] WORDS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_' HEX = '0123456789abcdefABCDEF' DIGITS = '0123456789' - def __init__(self, regex: str): + def __init__(self, regex: str, charset: str): self.regex = regex + self.charset = charset def parse(self) -> RegexTree: self.index = 0 @@ -73,7 +72,7 @@ def _parseRegex(self, to_close: bool) -> RegexTree: min_len, max_len = self._parseQuantifier() elements.append(CharClasses(chars, min_len, max_len)) case '.': - chars = list(self.CHARSET) + chars = list(self.charset) min_len, max_len = self._parseQuantifier() elements.append(CharClasses(chars, min_len, max_len)) case '\\': @@ -143,11 +142,11 @@ def _parseEscapeChar(self) -> str: match char: case 'd': return self.DIGITS - case 'D': return ''.join([c for c in self.CHARSET if not c.isdigit()]) + case 'D': return ''.join([c for c in self.charset if not c.isdigit()]) case 'w': return self.WORDS - case 'W': return ''.join([c for c in self.CHARSET if c not in self.WORDS]) + case 'W': return ''.join([c for c in self.charset if c not in self.WORDS]) case 's': return ' \t\n\r\f\v' - case 'S': return ''.join([c for c in self.CHARSET if c not in ' \t\n\r\f\v']) + case 'S': return ''.join([c for c in self.charset if c not in ' \t\n\r\f\v']) case 't': return '\t' case 'r': return '\r' case 'n': return '\n' @@ -165,6 +164,17 @@ def _parseEscapeChar(self) -> str: if num < 32 or num > 126: self._raise_error(f"Invalid escape character {num}") return chr(num) + case 'u': + code = [] + for _ in range(4): + if len(self.regex) <= self.index or self.regex[self.index] not in self.HEX: + self._raise_error("Invalid escape character") + code.append(self.regex[self.index]) + self.index += 1 + num = int(''.join(code), 16) + return chr(num) + case 'p' | 'P': + self._raise_error("Unicode property not supported") case _: return char def _parseCharClass(self) -> list[str]: @@ -226,7 +236,7 @@ def _parseCharClass(self) -> list[str]: chars_list.append(first_char) if negated: - chars_list = [c for c in self.CHARSET if c not in chars_list] + chars_list = [c for c in self.charset if c not in chars_list] return chars_list diff --git a/setup.py b/setup.py index 479ca4a..29c7fae 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='regex_enumerator', - version='0.5.0', + version='0.7.0', packages=find_packages(include=['regex_enumerator', 'regex_enumerator.*']), description='Enumerate all strings that match a given regex', author='Vincenzo Greco', diff --git a/tests/test_char_classes.py b/tests/test_char_classes.py index 96b523e..413f8f6 100644 --- a/tests/test_char_classes.py +++ b/tests/test_char_classes.py @@ -107,3 +107,10 @@ def test_2_ranges(): possibilities = ['1', 'a', 'b', 'c', 'f', 'g', 'r', '3'] f_finite(regexEnumerator, possibilities) + + +def test_unicode_character_class(): + regexEnumerator = RegexEnumerator(r'[à-å]') + possibilities = ['à', 'á', 'â', 'ã', 'ä', 'å'] + + f_finite(regexEnumerator, possibilities) diff --git a/tests/test_escape_char.py b/tests/test_escape_char.py index 3c1699f..ee5f9af 100644 --- a/tests/test_escape_char.py +++ b/tests/test_escape_char.py @@ -133,3 +133,10 @@ def test_escaped_char_interrups_range_after_1st_char(): possibilities = ['[', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] f_finite(regexEnumerator, possibilities) + + +def test_escaped_unicode_literal(): + regexEnumerator = RegexEnumerator(r'\u00E0') + possibilities = ['à'] + + f_finite(regexEnumerator, possibilities) diff --git a/tests/test_mixed.py b/tests/test_mixed.py index 0920adb..6a26d91 100644 --- a/tests/test_mixed.py +++ b/tests/test_mixed.py @@ -17,6 +17,11 @@ def test_single_wildcard(): f_finite(regexEnumerator, possibilities) +def test_wildcard_with_unicode(): + regexEnumerator = RegexEnumerator(r'.', additional_charset='¡¢£') + possibilities = [chr(i) for i in range(32, 127)] + ['¡', '¢', '£'] + + def test_done(): regexEnumerator = RegexEnumerator(r'') possibilities = ['', None]