Skip to content

Commit

Permalink
Merge pull request #2261 from mabel-dev/#2185-2
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer authored Jan 17, 2025
2 parents 29e00db + 11e7e71 commit 3cb277c
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 16 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/regression_suite.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ jobs:
MAX_LOCAL_BUFFER_CAPACITY: 100
MAX_CACHE_EVICTIONS_PER_QUERY: 4
DATA_CATALOG_PROVIDER: 'ICEBERG'
DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}'
DATA_CATALOG_CONNECTION: '${{ secrets.DATA_CATALOG_CONNECTION }}'
DATA_CATALOG_STORAGE: '${{ secrets.DATA_CATALOG_STORAGE }}'
VALKEY_CONNECTION: '${{ secrets.VALKEY_CONFIG }}'

- name: Check Coverage
Expand Down
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 993
__build__ = 994

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
5 changes: 1 addition & 4 deletions opteryx/compiled/list_ops/list_ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ from cython import Py_ssize_t
from numpy cimport ndarray
from cpython.unicode cimport PyUnicode_AsUTF8String
from cpython.bytes cimport PyBytes_AsString
from libc.stdint cimport int32_t

cnp.import_array()

Expand Down Expand Up @@ -338,8 +337,7 @@ cdef int boyer_moore_horspool(const char *haystack, size_t haystacklen, const ch
int: 1 if the needle exists in the haystack, 0 otherwise.
"""
cdef unsigned char skip[256]
cdef size_t i, k
cdef int j # Use int to handle negative values safely
cdef size_t i

if needlelen == 0:
return -1 # No valid search possible
Expand Down Expand Up @@ -420,7 +418,6 @@ cdef int boyer_moore_horspool_case_insensitive(const char *haystack, size_t hays
return 0 # No match found



cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring(cnp.ndarray[cnp.str, ndim=1] haystack, str needle):
"""
Used as the InStr operator, which was written to replace using LIKE to execute list_substring
Expand Down
7 changes: 4 additions & 3 deletions opteryx/functions/other_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,11 @@ def search(array, item, ignore_case: Optional[List[bool]] = None):
# Return True if the value is in the string
# We're essentially doing a LIKE here
from opteryx.compiled import list_ops

if ignore_case[0]:
results_mask = list_ops.list_ops.list_substring_case_insensitive(array, str(item)).astype(
numpy.bool_
)
results_mask = list_ops.list_ops.list_substring_case_insensitive(
array, str(item)
).astype(numpy.bool_)
else:
results_mask = list_ops.list_ops.list_substring(array, str(item)).astype(numpy.bool_)
elif array_type == numpy.ndarray:
Expand Down
7 changes: 6 additions & 1 deletion opteryx/planner/optimizer/strategies/predicate_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,12 @@
IN_REWRITES = {"InList": "Eq", "NotInList": "NotEq"}
LIKE_REWRITES = {"Like": "Eq", "NotLike": "NotEq"}
LITERALS_TO_THE_RIGHT = {"Plus": "Minus", "Minus": "Plus"}
INSTR_REWRITES = {"Like": "InStr", "NotLike": "NotInStr", "ILike": "IInStr", "NotILike": "NotIInStr"}
INSTR_REWRITES = {
"Like": "InStr",
"NotLike": "NotInStr",
"ILike": "IInStr",
"NotILike": "NotIInStr",
}


def remove_adjacent_wildcards(predicate):
Expand Down
15 changes: 11 additions & 4 deletions opteryx/utils/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,16 +206,23 @@ def date_trunc(truncate_to, date_values) -> numpy.ndarray:
return date_values.astype("datetime64[Y]").astype("datetime64[s]")
elif truncate_to == "quarter":
months = date_values.astype("datetime64[M]").astype(int) // 3 * 3
return numpy.array(months,
return numpy.array(
months,
dtype="datetime64[M]",
).astype("datetime64[s]")
elif truncate_to == "month":
return date_values.astype("datetime64[M]").astype("datetime64[s]")
elif truncate_to == "week":
return (
date_values
- ((date_values.astype("datetime64[D]").astype(int) - 4) % 7).astype("timedelta64[D]")
).astype("datetime64[D]").astype("datetime64[s]")
(
date_values
- ((date_values.astype("datetime64[D]").astype(int) - 4) % 7).astype(
"timedelta64[D]"
)
)
.astype("datetime64[D]")
.astype("datetime64[s]")
)
elif truncate_to == "day":
return date_values.astype("datetime64[D]").astype("datetime64[s]")
elif truncate_to == "hour":
Expand Down
23 changes: 21 additions & 2 deletions tests/catalog/test_iceberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
# this is how we get the raw list of files for the scan
# print([task.file.file_path for task in self.table.scan().plan_files()])



def set_up_iceberg():
"""
Set up a local Iceberg catalog for testing with NVD data.
Expand Down Expand Up @@ -101,6 +99,27 @@ def test_iceberg_get_schema():
table = catalog.load_table("iceberg.tweets")
table.schema().as_arrow()

@skip_if(is_arm() or is_windows() or is_mac())
def test_iceberg_remote():
    """
    Query the remote Iceberg catalog through the registered "iceberg" store.

    The catalog URI and warehouse location are read from the
    DATA_CATALOG_CONNECTION and DATA_CATALOG_STORAGE environment variables.
    """
    from pyiceberg.catalog import load_catalog

    # connection details are supplied through the environment
    catalog_uri = os.environ.get("DATA_CATALOG_CONNECTION")
    warehouse_location = os.environ.get("DATA_CATALOG_STORAGE")

    remote_catalog = load_catalog(
        "opteryx",
        uri=catalog_uri,
        warehouse=warehouse_location,
    )
    opteryx.register_store("iceberg", IcebergConnector, catalog=remote_catalog)

    result = opteryx.query("SELECT * FROM iceberg.tweets WHERE followers = 10")
    assert result.shape[0] == 353


if __name__ == "__main__": # pragma: no cover
from tests.tools import run_tests
Expand Down

0 comments on commit 3cb277c

Please sign in to comment.