#2251

mabel-dev · Jan 16, 2025 · d322caa · d322caa
1 parent 0f2f81a
commit d322caa
Show file tree

Hide file tree

Showing 4 changed files with 160 additions and 11 deletions.
diff --git a/opteryx/compiled/list_ops/list_ops.pyx b/opteryx/compiled/list_ops/list_ops.pyx
@@ -14,9 +14,13 @@ from cython import Py_ssize_t
 from numpy cimport ndarray
 from cpython.unicode cimport PyUnicode_AsUTF8String
 from cpython.bytes cimport PyBytes_AsString
+from libc.stdint cimport int32_t
 
 cnp.import_array()
 
+cdef extern from "string.h":
+    int strncasecmp(const char *s1, const char *s2, size_t n)
+    int memcmp(const void *s1, const void *s2, size_t n)
 
 cpdef cnp.ndarray[cnp.npy_bool, ndim=1] cython_allop_eq(object literal, cnp.ndarray arr):
     cdef:
@@ -320,8 +324,102 @@ cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_contains_any(cnp.ndarray array, cnp.
     return res
 
 
-cdef extern from "string.h":
-    void *memmem(const void *haystack, size_t haystacklen, const void *needle, size_t needlelen)
+cdef int boyer_moore_horspool(const char *haystack, size_t haystacklen, const char *needle, size_t needlelen):
+    """
+    Case-sensitive Boyer-Moore-Horspool substring search.
+
+    Parameters:
+        haystack (const char *): The text to search in.
+        haystacklen (size_t): The length of the haystack in bytes.
+        needle (const char *): The pattern to search for.
+        needlelen (size_t): The length of the needle in bytes.
+
+    Returns:
+        int: 1 if the needle exists in the haystack, 0 if it does not, and
+        -1 when the needle is empty. -1 is non-zero, so a caller that only
+        tests truthiness treats an empty needle as a match.
+    """
+    # The table holds shift distances up to needlelen, so its entries must be
+    # as wide as needlelen. With unsigned char entries a needle of 256+ bytes
+    # has its shifts truncated, and a shift truncated to 0 stalls the search
+    # loop forever.
+    cdef size_t skip[256]
+    cdef size_t i
+    cdef size_t last
+
+    if needlelen == 0:
+        return -1  # No valid search possible
+
+    if haystacklen < needlelen:
+        return 0  # Needle is longer than haystack
+
+    last = needlelen - 1
+
+    # Bytes that do not occur in the needle allow a shift of the full length
+    for i in range(256):
+        skip[i] = needlelen
+
+    # Every needle byte except the final one shifts by its distance from the
+    # end; later (right-most) occurrences overwrite earlier ones.
+    for i in range(last):
+        skip[<unsigned char>needle[i]] = last - i
+
+    i = 0  # Reset i before main search loop
+
+    # haystacklen >= needlelen is guaranteed above, so the subtraction cannot wrap
+    while i <= haystacklen - needlelen:
+        # Use memcmp for full substring comparison
+        if memcmp(&haystack[i], needle, needlelen) == 0:
+            return 1  # Match found
+
+        # Shift by the table entry for the byte aligned with the needle's last
+        # position; the loop condition keeps i + last inside the haystack.
+        i += skip[<unsigned char>haystack[i + last]]
+
+    return 0  # No match found
+
+
+cdef int boyer_moore_horspool_case_insensitive(const char *haystack, size_t haystacklen, const char *needle, size_t needlelen):
+    """
+    Case-insensitive Boyer-Moore-Horspool substring search.
+
+    Bytes are compared one at a time with strncasecmp, so case folding is
+    applied per byte, not per (possibly multi-byte) character.
+
+    Parameters:
+        haystack (const char *): The text to search in.
+        haystacklen (size_t): The length of the haystack in bytes.
+        needle (const char *): The pattern to search for.
+        needlelen (size_t): The length of the needle in bytes.
+
+    Returns:
+        int: 1 if the needle exists in the haystack, 0 if it does not, and
+        -1 when the needle is empty. -1 is non-zero, so a caller that only
+        tests truthiness treats an empty needle as a match.
+    """
+    # The table holds shift distances up to needlelen, so its entries must be
+    # as wide as needlelen. With unsigned char entries a needle of 256+ bytes
+    # has its shifts truncated, and a shift truncated to 0 stalls the search
+    # loop forever.
+    cdef size_t skip[256]
+    cdef size_t i
+    cdef size_t last
+    cdef size_t shift
+    cdef size_t matched
+
+    if needlelen == 0:
+        return -1  # No valid search possible
+
+    if haystacklen < needlelen:
+        return 0  # Needle is longer than haystack
+
+    last = needlelen - 1
+
+    # Initialize skip table with default shift length
+    for i in range(256):
+        skip[i] = needlelen
+
+    # Populate skip table with actual values from needle. The byte with bit
+    # 0x20 flipped is registered too, which covers the other case of an ASCII
+    # letter. For non-letters this only lowers an unrelated entry, and a
+    # smaller shift can never skip over a match.
+    for i in range(last):
+        shift = last - i
+        skip[<unsigned char>needle[i]] = shift
+        skip[<unsigned char>(needle[i] ^ 32)] = shift  # Case-insensitive mapping
+
+    i = 0  # Start searching from the beginning
+
+    # haystacklen >= needlelen is guaranteed above, so the subtraction cannot wrap
+    while i <= haystacklen - needlelen:
+        # Compare right-to-left, counting matched bytes with an unsigned
+        # counter so no index is ever decremented below zero.
+        matched = 0
+        while matched < needlelen and strncasecmp(&haystack[i + last - matched], &needle[last - matched], 1) == 0:
+            matched += 1
+
+        if matched == needlelen:
+            return 1  # Match found
+
+        # Move i forward based on skip table
+        i += skip[<unsigned char>haystack[i + last]]
+
+    return 0  # No match found
+
+
 
 cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring(cnp.ndarray[cnp.str, ndim=1] haystack, str needle):
     """
@@ -341,16 +439,53 @@ cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring(cnp.ndarray[cnp.str, ndim=
     if isinstance(haystack[0], str):
         for i in range(n):
             item = PyUnicode_AsUTF8String(haystack[i])
+            data = <char*> PyBytes_AsString(item)
+            length = len(item)
+            if length >= pattern_length:
+                if boyer_moore_horspool(data, length, c_pattern, pattern_length):
+                    result[i] = 1
+    else:
+        for i in range(n):
+            item = haystack[i]
             data = <char*> item
             length = len(item)
-            if memmem(data, length, c_pattern, pattern_length) != NULL:
-                result[i] = 1
+            if length >= pattern_length:
+                if boyer_moore_horspool(data, length, c_pattern, pattern_length):
+                    result[i] = 1
+
+    return result
+
+
+cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring_case_insensitive(cnp.ndarray[cnp.str, ndim=1] haystack, str needle):
+    """
+    Case-insensitive counterpart of list_substring: flag each element of
+    `haystack` that contains `needle`.
+
+    Parameters:
+        haystack: Array of values to search. The type of the first element
+            decides how every element is read: str values are encoded to
+            UTF-8 first, anything else is cast directly to char*.
+        needle (str): The pattern to look for; it is encoded to UTF-8.
+
+    Returns:
+        numpy uint8 array of the same length as `haystack`, 1 where
+        boyer_moore_horspool_case_insensitive reports a match and 0 elsewhere.
+
+    NOTE(review): haystack[0] is read unconditionally, so an empty array
+    presumably raises here - confirm callers never pass one.
+    """
+    cdef Py_ssize_t n = haystack.shape[0]
+    cdef bytes needle_bytes = needle.encode('utf-8')
+    cdef char *c_pattern = PyBytes_AsString(needle_bytes)
+    cdef size_t pattern_length = len(needle_bytes)
+    cdef cnp.ndarray[cnp.uint8_t, ndim=1] result = numpy.zeros(n, dtype=numpy.uint8)
+    cdef Py_ssize_t i = 0
+    cdef Py_ssize_t length
+    cdef char *data
+
+    # Check the type of the first item to decide the processing method
+    if isinstance(haystack[0], str):
+        for i in range(n):
+            # Encode to UTF-8 so the search runs over the same byte
+            # representation as the encoded needle
+            item = PyUnicode_AsUTF8String(haystack[i])
+            data = <char*> PyBytes_AsString(item)
+            length = len(item)
+            # Elements shorter than the pattern cannot contain it
+            if length >= pattern_length:
+                if boyer_moore_horspool_case_insensitive(data, length, c_pattern, pattern_length):
+                    result[i] = 1
     else:
         for i in range(n):
             item = haystack[i]
             data = <char*> item
             length = len(item)
-            if memmem(data, length, c_pattern, pattern_length) != NULL:
-                result[i] = 1
+            if length >= pattern_length:
+                if boyer_moore_horspool_case_insensitive(data, length, c_pattern, pattern_length):
+                    result[i] = 1
 
     return result
diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py
@@ -65,8 +65,15 @@ def search(array, item, ignore_case: Optional[List[bool]] = None):
         return numpy.array([False], dtype=numpy.bool_)
 
     if array_type in (str, bytes):
-        # return True if the value is in the string
-        results_mask = compute.match_substring(array, pattern=item, ignore_case=ignore_case[0])
+        # Return True if the value is in the string
+        # We're essentially doing a LIKE here
+        from opteryx.compiled import list_ops
+        if ignore_case[0]:
+            results_mask = list_ops.list_ops.list_substring_case_insensitive(array, str(item)).astype(
+                numpy.bool_
+            )
+        else:
+            results_mask = list_ops.list_ops.list_substring(array, str(item)).astype(numpy.bool_)
     elif array_type == numpy.ndarray:
         # converting to a set is faster for a handful of items which is what we're
         # almost definitely working with here - note compute.index is about 50x slower

diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py
@@ -142,6 +142,13 @@ def _inner_filter_operations(arr, operator, value):
         needle = str(value[0])
         matches = list_ops.list_ops.list_substring(arr, needle)  # [#325]
         return numpy.invert(matches.astype(dtype=bool))
+    if operator == "IInStr":
+        needle = str(value[0])
+        return list_ops.list_ops.list_substring_case_insensitive(arr, needle)  # [#325]
+    if operator == "NotIInStr":
+        needle = str(value[0])
+        matches = list_ops.list_ops.list_substring_case_insensitive(arr, needle)  # [#325]
+        return numpy.invert(matches.astype(dtype=bool))
     if operator == "Like":
         # MODIFIED FOR OPTERYX
         # null input emits null output, which should be false/0

diff --git a/opteryx/planner/optimizer/strategies/predicate_rewriter.py b/opteryx/planner/optimizer/strategies/predicate_rewriter.py
@@ -38,7 +38,7 @@
 IN_REWRITES = {"InList": "Eq", "NotInList": "NotEq"}
 LIKE_REWRITES = {"Like": "Eq", "NotLike": "NotEq"}
 LITERALS_TO_THE_RIGHT = {"Plus": "Minus", "Minus": "Plus"}
-INSTR_REWRITES = {"Like": "InStr", "NotLike": "NotInStr"}
+INSTR_REWRITES = {"Like": "InStr", "NotLike": "NotInStr", "ILike": "IInStr", "NotILike": "NotIInStr"}
 
 
 def remove_adjacent_wildcards(predicate):
@@ -129,12 +129,12 @@ def _rewrite_predicate(predicate, statistics: QueryStatistics):
             statistics.optimization_predicate_rewriter_remove_adjacent_wildcards += 1
             predicate = dispatcher["remove_adjacent_wildcards"](predicate)
 
-    if predicate.value in {"Like", "NotLike"}:
+    if predicate.value in LIKE_REWRITES:
         if "%" not in predicate.right.value and "_" not in predicate.right.value:
             statistics.optimization_predicate_rewriter_remove_redundant_like += 1
             predicate.value = LIKE_REWRITES[predicate.value]
 
-    if predicate.value in {"Like", "NotLike"}:
+    if predicate.value in INSTR_REWRITES:
         if (
             "_" not in predicate.right.value
             and predicate.right.value.endswith("%")