Add support for unicode characters

Buba98 · Dec 19, 2024 · 1f49c77 · 1f49c77
1 parent c063c68
commit 1f49c77
Show file tree

Hide file tree

Showing 7 changed files with 101 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -1,14 +1,17 @@
 # Regex enumerator
+
 [![PyPI version](https://img.shields.io/pypi/v/regex-enumerator.svg)](https://pypi.org/project/regex-enumerator/)
 
 This library is meant to generate all the strings that match a given regex pattern. It is written in Python and uses no external libraries.
 
 ## Installation
+
 ```bash
 pip install regex-enumerator
 ```
 
 ## Usage
+
 Here's an example of how to use the library:
 
 ```python
@@ -24,25 +27,71 @@ print(re.next()) # a2b
 ```
 
 ## What is supported
+
 - [x] Character classes
 - [x] Quantifiers for character classes
 - [x] Quantifiers for groups
 - [x] Groups (named and unnamed)
-- [x] Alternation 
-- [x] Escaped characters 
+- [x] Alternation
+- [x] Escaped characters
 - [x] Backreferences (named and unnamed)
 
+## What I plan to support
+
+- [ ] Lookahead and lookbehind
+- [ ] Non-capturing groups
+
+## What is not supported
+
+- [ ] Unicode properties
+- [ ] Word boundaries
+- [ ] Anchors
+- [ ] Non-greedy quantifiers
+
+## Charset
+
+By default the library enumerates over the printable ASCII characters (code points 32–126). To handle other Unicode characters, either write them explicitly in your regex or pass them through the `additional_charset` parameter.
+
+```python
+from regex_enumerator import RegexEnumerator
+
+# Directly in regex
+regex_enum = RegexEnumerator(r'£')
+print(regex_enum.next())  # £
+
+# Using additional_charset (any one of these equivalent forms works)
+unicode_charset = [chr(i) for i in range(ord('¡'), ord('£') + 1)]
+unicode_charset = ['¡', '¢', '£']
+unicode_charset = '¡¢£'
+unicode_charset = ['¡¢', '£']
+
+regex_enum = RegexEnumerator(r'.', additional_charset=unicode_charset)
+
+result = []
+while (char := regex_enum.next()) is not None:
+    result.append(char)
+
+assert '¡' in result
+assert '¢' in result
+assert '£' in result
+```
+
 ## How it works
+
 This library works by parsing the regex pattern into a tree structure. Once parsed, it performs a breadth-first search (BFS) on the tree to generate all matching strings. This ensures it does not get stuck on unbounded quantifiers for character classes or groups.
 
 ## Tests
+
 The library includes a comprehensive test suite. To run the tests, use the following command:
+
 ```bash
 pytest
 ```
 
 ## License
+
 I don't know what license to use, so I'm going to use the MIT license. If you have any suggestions, please let me know.
 
 ## Contributors
+
 Feel free to contribute to this project. I'm open to suggestions and improvements.
diff --git a/regex_enumerator/regex_enumerator.py b/regex_enumerator/regex_enumerator.py
@@ -3,8 +3,18 @@
 
 
 class RegexEnumerator:
-    def __init__(self, regex: str):
-        parser = RegexParser(regex)
+    def __init__(self, regex: str, additional_charset: str | list[str] = None) -> None:
+        default_charset = [chr(c) for c in range(32, 127)]
+
+        if additional_charset is None:
+            additional = []
+        elif isinstance(additional_charset, list):
+            additional = list(''.join(additional_charset))
+        else:
+            additional = list(additional_charset)
+
+        charset = ''.join(sorted(set(default_charset + additional)))
+        parser = RegexParser(regex, charset)
         self.regexTree: RegexTree = parser.parse()
         self.current: list[str] = list(self.regexTree.current)
         self.done: bool = self.regexTree.done and len(self.current) == 0

diff --git a/regex_enumerator/regex_parser.py b/regex_enumerator/regex_parser.py
@@ -1,6 +1,5 @@
 from .regex_tree import Alternative, BackReference, CharClasses, RegexTree
 
-
 class RegexError(Exception):
     def __init__(self, regex: str, index: int, message: str):
         self.regex = regex
@@ -13,13 +12,13 @@ def __str__(self):
 
 
 class RegexParser:
-    CHARSET = [chr(c) for c in range(32, 127)]
     WORDS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
     HEX = '0123456789abcdefABCDEF'
     DIGITS = '0123456789'
 
-    def __init__(self, regex: str):
+    def __init__(self, regex: str, charset: str):
         self.regex = regex
+        self.charset = charset
 
     def parse(self) -> RegexTree:
         self.index = 0
@@ -73,7 +72,7 @@ def _parseRegex(self, to_close: bool) -> RegexTree:
                     min_len, max_len = self._parseQuantifier()
                     elements.append(CharClasses(chars, min_len, max_len))
                 case '.':
-                    chars = list(self.CHARSET)
+                    chars = list(self.charset)
                     min_len, max_len = self._parseQuantifier()
                     elements.append(CharClasses(chars, min_len, max_len))
                 case '\\':
@@ -143,11 +142,11 @@ def _parseEscapeChar(self) -> str:
 
         match char:
             case 'd': return self.DIGITS
-            case 'D': return ''.join([c for c in self.CHARSET if not c.isdigit()])
+            case 'D': return ''.join([c for c in self.charset if not c.isdigit()])
             case 'w': return self.WORDS
-            case 'W': return ''.join([c for c in self.CHARSET if c not in self.WORDS])
+            case 'W': return ''.join([c for c in self.charset if c not in self.WORDS])
             case 's': return ' \t\n\r\f\v'
-            case 'S': return ''.join([c for c in self.CHARSET if c not in ' \t\n\r\f\v'])
+            case 'S': return ''.join([c for c in self.charset if c not in ' \t\n\r\f\v'])
             case 't': return '\t'
             case 'r': return '\r'
             case 'n': return '\n'
@@ -165,6 +164,17 @@ def _parseEscapeChar(self) -> str:
                 if num < 32 or num > 126:
                     self._raise_error(f"Invalid escape character {num}")
                 return chr(num)
+            case 'u':
+                code = []
+                for _ in range(4):
+                    if len(self.regex) <= self.index or self.regex[self.index] not in self.HEX:
+                        self._raise_error("Invalid escape character")
+                    code.append(self.regex[self.index])
+                    self.index += 1
+                num = int(''.join(code), 16)
+                return chr(num)
+            case 'p' | 'P':
+                self._raise_error("Unicode property not supported")
             case _: return char
 
     def _parseCharClass(self) -> list[str]:
@@ -226,7 +236,7 @@ def _parseCharClass(self) -> list[str]:
             chars_list.append(first_char)
 
         if negated:
-            chars_list = [c for c in self.CHARSET if c not in chars_list]
+            chars_list = [c for c in self.charset if c not in chars_list]
 
         return chars_list
 

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='regex_enumerator',
-    version='0.5.0',
+    version='0.7.0',
     packages=find_packages(include=['regex_enumerator', 'regex_enumerator.*']),
     description='Enumerate all strings that match a given regex',
     author='Vincenzo Greco',

diff --git a/tests/test_char_classes.py b/tests/test_char_classes.py
@@ -107,3 +107,10 @@ def test_2_ranges():
     possibilities = ['1', 'a', 'b', 'c', 'f', 'g', 'r', '3']
 
     f_finite(regexEnumerator, possibilities)
+
+
+def test_unicode_character_class():
+    # A character-class range whose endpoints are non-ASCII (U+00E0..U+00E5)
+    # must expand to every code point in between, without additional_charset.
+    regexEnumerator = RegexEnumerator(r'[à-å]')
+    possibilities = ['à', 'á', 'â', 'ã', 'ä', 'å']
+
+    f_finite(regexEnumerator, possibilities)
diff --git a/tests/test_escape_char.py b/tests/test_escape_char.py
@@ -133,3 +133,10 @@ def test_escaped_char_interrups_range_after_1st_char():
     possibilities = ['[', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
 
     f_finite(regexEnumerator, possibilities)
+
+
+def test_escaped_unicode_literal():
+    # The \uXXXX escape (four hex digits) must yield the single character
+    # with that code point; U+00E0 is 'à'.
+    regexEnumerator = RegexEnumerator(r'\u00E0')
+    possibilities = ['à']
+
+    f_finite(regexEnumerator, possibilities)
diff --git a/tests/test_mixed.py b/tests/test_mixed.py
@@ -17,6 +17,11 @@ def test_single_wildcard():
     f_finite(regexEnumerator, possibilities)
 
 
+def test_wildcard_with_unicode():
+    # '.' must enumerate the default printable-ASCII charset (32..126) plus
+    # every character supplied through additional_charset.
+    regexEnumerator = RegexEnumerator(r'.', additional_charset='¡¢£')
+    possibilities = [chr(i) for i in range(32, 127)] + ['¡', '¢', '£']
+
+    # Without this call the test builds its inputs but never checks anything.
+    f_finite(regexEnumerator, possibilities)
+
+
 def test_done():
     regexEnumerator = RegexEnumerator(r'')
     possibilities = ['', None]