Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Jan 16, 2025
1 parent 0f2f81a commit d322caa
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 11 deletions.
147 changes: 141 additions & 6 deletions opteryx/compiled/list_ops/list_ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@ from cython import Py_ssize_t
from numpy cimport ndarray
from cpython.unicode cimport PyUnicode_AsUTF8String
from cpython.bytes cimport PyBytes_AsString
from libc.stdint cimport int32_t

cnp.import_array()

cdef extern from "string.h":
int strncasecmp(const char *s1, const char *s2, size_t n)
int memcmp(const void *s1, const void *s2, size_t n)

cpdef cnp.ndarray[cnp.npy_bool, ndim=1] cython_allop_eq(object literal, cnp.ndarray arr):
cdef:
Expand Down Expand Up @@ -320,8 +324,102 @@ cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_contains_any(cnp.ndarray array, cnp.
return res


cdef extern from "string.h":
void *memmem(const void *haystack, size_t haystacklen, const void *needle, size_t needlelen)
cdef int boyer_moore_horspool(const char *haystack, size_t haystacklen, const char *needle, size_t needlelen):
    """
    Case-sensitive Boyer-Moore-Horspool substring search.

    Parameters:
        haystack (const char *): The text to search in.
        haystacklen (size_t): The length of the haystack in bytes.
        needle (const char *): The pattern to search for.
        needlelen (size_t): The length of the needle in bytes.

    Returns:
        int: 1 if the needle exists in the haystack, 0 if it does not, and
            -1 when the needle is empty. Callers that only test the result
            for truthiness therefore treat an empty needle as a match.
    """
    # Shift distances are held as size_t. A narrower element type would
    # truncate needle lengths above 255, and a truncated shift of 0 would
    # stop the search loop below from ever advancing.
    cdef size_t skip[256]
    cdef size_t i
    cdef size_t last

    if needlelen == 0:
        return -1  # No valid search possible

    if haystacklen < needlelen:
        return 0  # Needle is longer than haystack

    last = needlelen - 1

    # Bytes that do not occur in the needle allow a shift of the full length
    for i in range(256):
        skip[i] = needlelen

    # Bytes in the needle (excluding its final byte) shift by their distance
    # from the end of the needle; later occurrences overwrite earlier ones.
    for i in range(last):
        skip[<unsigned char>needle[i]] = last - i

    i = 0
    # haystacklen >= needlelen is guaranteed above, so the subtraction
    # cannot wrap around.
    while i <= haystacklen - needlelen:
        if memcmp(&haystack[i], needle, needlelen) == 0:
            return 1  # Match found

        # The loop condition guarantees i + last < haystacklen, so the byte
        # aligned with the end of the needle is always in bounds.
        i += skip[<unsigned char>haystack[i + last]]

    return 0  # No match found


cdef int boyer_moore_horspool_case_insensitive(const char *haystack, size_t haystacklen, const char *needle, size_t needlelen):
    """
    Case-insensitive Boyer-Moore-Horspool substring search.

    Bytes are compared one at a time with strncasecmp.

    Parameters:
        haystack (const char *): The text to search in.
        haystacklen (size_t): The length of the haystack in bytes.
        needle (const char *): The pattern to search for.
        needlelen (size_t): The length of the needle in bytes.

    Returns:
        int: 1 if the needle exists in the haystack, 0 if it does not, and
            -1 when the needle is empty. Callers that only test the result
            for truthiness therefore treat an empty needle as a match.
    """
    # Shift distances are held as size_t. A narrower element type would
    # truncate needle lengths above 255, and a truncated shift of 0 would
    # stop the search loop below from ever advancing.
    cdef size_t skip[256]
    cdef size_t i
    cdef size_t last
    cdef Py_ssize_t j  # Signed so the backwards scan can step below zero

    if needlelen == 0:
        return -1  # No valid search possible

    if haystacklen < needlelen:
        return 0  # Needle is longer than haystack

    last = needlelen - 1

    # Bytes that do not occur in the needle allow a shift of the full length
    for i in range(256):
        skip[i] = needlelen

    # Bytes in the needle (excluding its final byte) shift by their distance
    # from the end of the needle. Flipping bit 5 (^ 32) toggles the case of
    # an ASCII letter, so both cases get the same shift. For other bytes it
    # only gives one extra byte a shorter shift, which is still safe.
    for i in range(last):
        skip[<unsigned char>needle[i]] = last - i
        skip[<unsigned char>(needle[i] ^ 32)] = last - i

    i = 0
    # haystacklen >= needlelen is guaranteed above, so the subtraction
    # cannot wrap around.
    while i <= haystacklen - needlelen:
        # Compare the window against the needle from its last byte backwards
        j = <Py_ssize_t>last
        while j >= 0 and strncasecmp(&haystack[i + <size_t>j], &needle[j], 1) == 0:
            j -= 1

        if j < 0:
            return 1  # Every byte of the needle matched

        # The loop condition guarantees i + last < haystacklen, so the byte
        # aligned with the end of the needle is always in bounds.
        i += skip[<unsigned char>haystack[i + last]]

    return 0  # No match found



cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring(cnp.ndarray[cnp.str, ndim=1] haystack, str needle):
"""
Expand All @@ -341,16 +439,53 @@ cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring(cnp.ndarray[cnp.str, ndim=
if isinstance(haystack[0], str):
for i in range(n):
item = PyUnicode_AsUTF8String(haystack[i])
data = <char*> PyBytes_AsString(item)
length = len(item)
if length >= pattern_length:
if boyer_moore_horspool(data, length, c_pattern, pattern_length):
result[i] = 1
else:
for i in range(n):
item = haystack[i]
data = <char*> item
length = len(item)
if memmem(data, length, c_pattern, pattern_length) != NULL:
result[i] = 1
if length >= pattern_length:
if boyer_moore_horspool(data, length, c_pattern, pattern_length):
result[i] = 1

return result


cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring_case_insensitive(cnp.ndarray[cnp.str, ndim=1] haystack, str needle):
    """
    Case-insensitive substring test of `needle` against every element of `haystack`.

    This is the case-insensitive counterpart of list_substring and backs the
    IInStr / NotIInStr operators.

    Parameters:
        haystack: one-dimensional array of str elements, or of elements that
            can be cast to char* (presumably bytes - confirm against callers).
            The type of the first element selects the handling for the whole
            array.
        needle: the text to look for; it is encoded as UTF-8 before searching.

    Returns:
        A uint8 array of the same length as `haystack`, holding 1 where the
        needle was found and 0 elsewhere.
    """
    cdef Py_ssize_t n = haystack.shape[0]
    cdef bytes needle_bytes = needle.encode('utf-8')
    cdef char *c_pattern = PyBytes_AsString(needle_bytes)
    cdef size_t pattern_length = len(needle_bytes)
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] result = numpy.zeros(n, dtype=numpy.uint8)
    cdef Py_ssize_t i = 0
    cdef Py_ssize_t length
    cdef char *data

    # Nothing to search; this also keeps haystack[0] below from raising
    if n == 0:
        return result

    # Check the type of the first item to decide the processing method
    if isinstance(haystack[0], str):
        for i in range(n):
            # `item` keeps the encoded bytes alive while `data` points into them
            item = PyUnicode_AsUTF8String(haystack[i])
            data = <char*> PyBytes_AsString(item)
            length = len(item)
            if <size_t>length >= pattern_length:
                if boyer_moore_horspool_case_insensitive(data, length, c_pattern, pattern_length):
                    result[i] = 1
    else:
        for i in range(n):
            item = haystack[i]
            data = <char*> item
            length = len(item)
            # A separate case-sensitive memmem check is not needed here: any
            # exact match is also found by the case-insensitive search.
            if <size_t>length >= pattern_length:
                if boyer_moore_horspool_case_insensitive(data, length, c_pattern, pattern_length):
                    result[i] = 1

    return result
11 changes: 9 additions & 2 deletions opteryx/functions/other_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,15 @@ def search(array, item, ignore_case: Optional[List[bool]] = None):
return numpy.array([False], dtype=numpy.bool_)

if array_type in (str, bytes):
# return True if the value is in the string
results_mask = compute.match_substring(array, pattern=item, ignore_case=ignore_case[0])
# Return True if the value is in the string
# We're essentially doing a LIKE here
from opteryx.compiled import list_ops
if ignore_case[0]:
results_mask = list_ops.list_ops.list_substring_case_insensitive(array, str(item)).astype(
numpy.bool_
)
else:
results_mask = list_ops.list_ops.list_substring(array, str(item)).astype(numpy.bool_)
elif array_type == numpy.ndarray:
# converting to a set is faster for a handful of items which is what we're
# almost definitely working with here - note compute.index is about 50x slower
Expand Down
7 changes: 7 additions & 0 deletions opteryx/managers/expression/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,13 @@ def _inner_filter_operations(arr, operator, value):
needle = str(value[0])
matches = list_ops.list_ops.list_substring(arr, needle) # [#325]
return numpy.invert(matches.astype(dtype=bool))
if operator == "IInStr":
needle = str(value[0])
return list_ops.list_ops.list_substring_case_insensitive(arr, needle) # [#325]
if operator == "NotIInStr":
needle = str(value[0])
matches = list_ops.list_ops.list_substring_case_insensitive(arr, needle) # [#325]
return numpy.invert(matches.astype(dtype=bool))
if operator == "Like":
# MODIFIED FOR OPTERYX
# null input emits null output, which should be false/0
Expand Down
6 changes: 3 additions & 3 deletions opteryx/planner/optimizer/strategies/predicate_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
# Operator-name lookup tables used by the predicate rewriter: each key is an
# operator name and each value is the operator name it is rewritten to.
IN_REWRITES = {"InList": "Eq", "NotInList": "NotEq"}
LIKE_REWRITES = {"Like": "Eq", "NotLike": "NotEq"}
LITERALS_TO_THE_RIGHT = {"Plus": "Minus", "Minus": "Plus"}
# LIKE-family operators and the substring-search operator that replaces them;
# the case-insensitive variants map to the case-insensitive search operators.
# Defined once: an earlier duplicate assignment holding only the Like/NotLike
# entries was dead code because this definition immediately replaced it.
INSTR_REWRITES = {
    "Like": "InStr",
    "NotLike": "NotInStr",
    "ILike": "IInStr",
    "NotILike": "NotIInStr",
}


def remove_adjacent_wildcards(predicate):
Expand Down Expand Up @@ -129,12 +129,12 @@ def _rewrite_predicate(predicate, statistics: QueryStatistics):
statistics.optimization_predicate_rewriter_remove_adjacent_wildcards += 1
predicate = dispatcher["remove_adjacent_wildcards"](predicate)

if predicate.value in {"Like", "NotLike"}:
if predicate.value in LIKE_REWRITES:
if "%" not in predicate.right.value and "_" not in predicate.right.value:
statistics.optimization_predicate_rewriter_remove_redundant_like += 1
predicate.value = LIKE_REWRITES[predicate.value]

if predicate.value in {"Like", "NotLike"}:
if predicate.value in INSTR_REWRITES:
if (
"_" not in predicate.right.value
and predicate.right.value.endswith("%")
Expand Down

0 comments on commit d322caa

Please sign in to comment.