Skip to content

Commit

Permalink
Add support for unicode characters
Browse files Browse the repository at this point in the history
  • Loading branch information
Buba98 committed Dec 19, 2024
1 parent c063c68 commit 1f49c77
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 13 deletions.
53 changes: 51 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
# Regex enumerator

[![PyPI version](https://img.shields.io/pypi/v/regex-enumerator.svg)](https://pypi.org/project/regex-enumerator/)

This library generates all the strings that match a given regex pattern. It is written in Python and uses no external libraries.

## Installation

```bash
pip install regex-enumerator
```

## Usage

Here's an example of how to use the library:

```python
Expand All @@ -24,25 +27,71 @@ print(re.next()) # a2b
```

## What is supported

- [x] Character classes
- [x] Quantifiers for character classes
- [x] Quantifiers for groups
- [x] Groups (named and unnamed)
- [x] Alternation
- [x] Escaped characters
- [x] Alternation
- [x] Escaped characters
- [x] Backreferences (named and unnamed)

## What I plan to support

- [ ] Lookahead and lookbehind
- [ ] Non-capturing groups

## What is not supported

- [ ] Unicode properties
- [ ] Word boundaries
- [ ] Anchors
- [ ] Non-greedy quantifiers

## Charset

By default the library uses the printable ASCII characters (code points 32–126) as its character set. To handle Unicode characters, either include them explicitly in your regex or pass extra characters through the `additional_charset` parameter.

```python
from regex_enumerator import RegexEnumerator

# Directly in regex
regex_enum = RegexEnumerator(r'£')
print(regex_enum.next()) # £

# Using additional_charset (each of the following forms is equivalent)
unicode_charset = [chr(i) for i in range(ord('¡'), ord('£') + 1)]
unicode_charset = ['¡', '¢', '£']
unicode_charset = '¡¢£'
unicode_charset = ['¡¢', '£']

regex_enum = RegexEnumerator(r'.', additional_charset=unicode_charset)

result = []
while (char := regex_enum.next()) is not None:
result.append(char)

assert '¡' in result
assert '¢' in result
assert '£' in result
```

## How it works

This library works by parsing the regex pattern into a tree structure. Once parsed, it performs a breadth-first search (BFS) on the tree to generate all matching strings. This ensures it does not get stuck on unbounded quantifiers for character classes or groups.

## Tests

The library includes a comprehensive test suite. To run the tests, use the following command:

```bash
pytest
```

## License

I don't know what license to use, so I'm going to use the MIT license. If you have any suggestions, please let me know.

## Contributors

Feel free to contribute to this project. I'm open to suggestions and improvements.
14 changes: 12 additions & 2 deletions regex_enumerator/regex_enumerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,18 @@


class RegexEnumerator:
def __init__(self, regex: str):
parser = RegexParser(regex)
def __init__(self, regex: str, additional_charset: str | list[str] = None) -> None:
default_charset = [chr(c) for c in range(32, 127)]

if additional_charset is None:
additional = []
elif isinstance(additional_charset, list):
additional = list(''.join(additional_charset))
else:
additional = list(additional_charset)

charset = ''.join(sorted(set(default_charset + additional)))
parser = RegexParser(regex, charset)
self.regexTree: RegexTree = parser.parse()
self.current: list[str] = list(self.regexTree.current)
self.done: bool = self.regexTree.done and len(self.current) == 0
Expand Down
26 changes: 18 additions & 8 deletions regex_enumerator/regex_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from .regex_tree import Alternative, BackReference, CharClasses, RegexTree


class RegexError(Exception):
def __init__(self, regex: str, index: int, message: str):
self.regex = regex
Expand All @@ -13,13 +12,13 @@ def __str__(self):


class RegexParser:
CHARSET = [chr(c) for c in range(32, 127)]
WORDS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
HEX = '0123456789abcdefABCDEF'
DIGITS = '0123456789'

def __init__(self, regex: str):
def __init__(self, regex: str, charset: str):
self.regex = regex
self.charset = charset

def parse(self) -> RegexTree:
self.index = 0
Expand Down Expand Up @@ -73,7 +72,7 @@ def _parseRegex(self, to_close: bool) -> RegexTree:
min_len, max_len = self._parseQuantifier()
elements.append(CharClasses(chars, min_len, max_len))
case '.':
chars = list(self.CHARSET)
chars = list(self.charset)
min_len, max_len = self._parseQuantifier()
elements.append(CharClasses(chars, min_len, max_len))
case '\\':
Expand Down Expand Up @@ -143,11 +142,11 @@ def _parseEscapeChar(self) -> str:

match char:
case 'd': return self.DIGITS
case 'D': return ''.join([c for c in self.CHARSET if not c.isdigit()])
case 'D': return ''.join([c for c in self.charset if not c.isdigit()])
case 'w': return self.WORDS
case 'W': return ''.join([c for c in self.CHARSET if c not in self.WORDS])
case 'W': return ''.join([c for c in self.charset if c not in self.WORDS])
case 's': return ' \t\n\r\f\v'
case 'S': return ''.join([c for c in self.CHARSET if c not in ' \t\n\r\f\v'])
case 'S': return ''.join([c for c in self.charset if c not in ' \t\n\r\f\v'])
case 't': return '\t'
case 'r': return '\r'
case 'n': return '\n'
Expand All @@ -165,6 +164,17 @@ def _parseEscapeChar(self) -> str:
if num < 32 or num > 126:
self._raise_error(f"Invalid escape character {num}")
return chr(num)
case 'u':
code = []
for _ in range(4):
if len(self.regex) <= self.index or self.regex[self.index] not in self.HEX:
self._raise_error("Invalid escape character")
code.append(self.regex[self.index])
self.index += 1
num = int(''.join(code), 16)
return chr(num)
case 'p' | 'P':
self._raise_error("Unicode property not supported")
case _: return char

def _parseCharClass(self) -> list[str]:
Expand Down Expand Up @@ -226,7 +236,7 @@ def _parseCharClass(self) -> list[str]:
chars_list.append(first_char)

if negated:
chars_list = [c for c in self.CHARSET if c not in chars_list]
chars_list = [c for c in self.charset if c not in chars_list]

return chars_list

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name='regex_enumerator',
version='0.5.0',
version='0.7.0',
packages=find_packages(include=['regex_enumerator', 'regex_enumerator.*']),
description='Enumerate all strings that match a given regex',
author='Vincenzo Greco',
Expand Down
7 changes: 7 additions & 0 deletions tests/test_char_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,10 @@ def test_2_ranges():
possibilities = ['1', 'a', 'b', 'c', 'f', 'g', 'r', '3']

f_finite(regexEnumerator, possibilities)


def test_unicode_character_class():
    """Check that a character-class range with non-ASCII endpoints is enumerated.

    The class ``[à-å]`` is expected to produce the six characters listed
    below; the comparison itself is delegated to the ``f_finite`` helper.
    """
    expected = ['à', 'á', 'â', 'ã', 'ä', 'å']
    enumerator = RegexEnumerator(r'[à-å]')

    f_finite(enumerator, expected)
7 changes: 7 additions & 0 deletions tests/test_escape_char.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,10 @@ def test_escaped_char_interrups_range_after_1st_char():
possibilities = ['[', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

f_finite(regexEnumerator, possibilities)


def test_escaped_unicode_literal():
    """Check that a ``\\uXXXX`` escape in the pattern yields the matching character.

    ``\\u00E0`` is expected to produce exactly ``'à'``; the comparison itself
    is delegated to the ``f_finite`` helper.
    """
    expected = ['à']
    enumerator = RegexEnumerator(r'\u00E0')

    f_finite(enumerator, expected)
5 changes: 5 additions & 0 deletions tests/test_mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ def test_single_wildcard():
f_finite(regexEnumerator, possibilities)


def test_wildcard_with_unicode():
    """Check that ``.`` enumerates the default charset plus ``additional_charset``.

    The expected result is every character with code point 32-126 followed
    by the three extra characters passed through ``additional_charset``.
    """
    regexEnumerator = RegexEnumerator(r'.', additional_charset='¡¢£')
    possibilities = [chr(i) for i in range(32, 127)] + ['¡', '¢', '£']

    # Without this call the test builds its inputs but never verifies
    # anything, so it would pass regardless of the enumerator's behaviour.
    f_finite(regexEnumerator, possibilities)


def test_done():
regexEnumerator = RegexEnumerator(r'')
possibilities = ['', None]
Expand Down

0 comments on commit 1f49c77

Please sign in to comment.