Skip to content

Commit

Permalink
Merge pull request #2349 from mabel-dev/joocer/issue2346
Browse files Browse the repository at this point in the history
Joocer/issue2346
  • Loading branch information
joocer authored Feb 3, 2025
2 parents fde7394 + 9136c43 commit 5464fa8
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 5 deletions.
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 1054
__build__ = 1055

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
43 changes: 43 additions & 0 deletions opteryx/compiled/list_ops/list_ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -546,3 +546,46 @@ cpdef cnp.ndarray[cnp.int64_t, ndim=1] hash_bytes_column(cnp.ndarray[cnp.bytes]
result_view[i] = PyObject_Hash(values_view[i])

return numpy.asarray(result_view, dtype=numpy.int64)


cpdef cnp.ndarray[cnp.uint8_t, ndim=1] in_list(object[::1] arr, set values):
    """
    Element-wise membership test backing the "InList" operator.

    Parameters:
        arr: Contiguous one-dimensional buffer of Python objects to test.
        values: Set of values to test each element against.

    Returns:
        NumPy uint8 array with one entry per element of `arr`:
        1 where the element is in `values`, 0 otherwise.
    """
    cdef Py_ssize_t n_items = arr.shape[0]
    cdef Py_ssize_t idx
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] mask = numpy.empty(n_items, dtype=numpy.uint8)
    # Write through a typed view so the stores avoid Python-level indexing.
    cdef uint8_t[::1] mask_view = mask

    for idx in range(n_items):
        mask_view[idx] = arr[idx] in values

    return mask

cpdef cnp.ndarray[cnp.uint8_t, ndim=1] in_list_int64(const int64_t[::1] arr, set values, Py_ssize_t size):
    """
    Element-wise membership test for "InList", specialised for int64 data.

    Parameters:
        arr: Contiguous one-dimensional int64 buffer to test.
        values: Set of values to test each element against.
        size: Number of leading elements of `arr` to test. Must satisfy
            0 <= size <= len(arr).

    Returns:
        NumPy uint8 array of length `size`: 1 where arr[i] is in `values`,
        0 otherwise.

    Raises:
        ValueError: If `size` is negative or larger than the length of `arr`.
    """
    # `size` is supplied by the caller independently of the buffer, so check it
    # before indexing: an oversized value would otherwise read past the end of
    # `arr` (undefined behaviour if bounds checking is disabled for this module).
    if size < 0 or size > arr.shape[0]:
        raise ValueError("size must be between 0 and the length of arr")

    cdef Py_ssize_t i
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] result = numpy.empty(size, dtype=numpy.uint8)
    cdef uint8_t[::1] result_view = result
    cdef int64_t value

    for i in range(size):
        # Read into a typed local first so the buffer access stays a plain C load.
        value = arr[i]
        result_view[i] = value in values

    return result
13 changes: 9 additions & 4 deletions opteryx/managers/expression/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,18 @@ def _inner_filter_operations(arr, operator, value):
if operator == "GtEq":
return compute.greater_equal(arr, value).to_numpy(False).astype(dtype=bool)
if operator == "InList":
# MODIFIED FOR OPTERYX
values = set(value[0])
return numpy.array([a in values for a in arr], dtype=numpy.bool_) # [#325]?
if arr.dtype == numpy.int64:
return list_ops.list_ops.in_list_int64(memoryview(arr), values, len(arr))
else:
return list_ops.list_ops.in_list(arr.astype(object), values)
if operator == "NotInList":
# MODIFIED FOR OPTERYX - see comment above
values = set(value[0])
return numpy.array([a not in values for a in arr], dtype=numpy.bool_) # [#325]?
if arr.dtype == numpy.int64:
matches = list_ops.list_ops.in_list_int64(memoryview(arr), values, len(arr))
else:
matches = list_ops.list_ops.in_list(arr.astype(object), values)
return numpy.invert(matches.astype(dtype=bool))
if operator == "InStr":
needle = str(value[0])
return list_ops.list_ops.list_substring(arr, needle).astype(dtype=bool)
Expand Down

0 comments on commit 5464fa8

Please sign in to comment.