Merge pull request #2261 from mabel-dev/#2185-2

#2185
mabel-dev · Jan 17, 2025 · 3cb277c
2 parents 29e00db + 11e7e71
commit 3cb277c
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 16 deletions.
diff --git a/.github/workflows/regression_suite.yaml b/.github/workflows/regression_suite.yaml
@@ -79,7 +79,8 @@ jobs:
           MAX_LOCAL_BUFFER_CAPACITY: 100
           MAX_CACHE_EVICTIONS_PER_QUERY: 4
           DATA_CATALOG_PROVIDER: 'ICEBERG'
-          DATA_CATALOG_CONFIGURATION: '${{ secrets.DATA_CATALOG_CONFIGURATION }}'
+          DATA_CATALOG_CONNECTION: '${{ secrets.DATA_CATALOG_CONNECTION }}'
+          DATA_CATALOG_STORAGE: '${{ secrets.DATA_CATALOG_STORAGE }}'
           VALKEY_CONNECTION: '${{ secrets.VALKEY_CONFIG }}'
 
       - name: Check Coverage

diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 993
+__build__ = 994
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/opteryx/compiled/list_ops/list_ops.pyx b/opteryx/compiled/list_ops/list_ops.pyx
@@ -14,7 +14,6 @@ from cython import Py_ssize_t
 from numpy cimport ndarray
 from cpython.unicode cimport PyUnicode_AsUTF8String
 from cpython.bytes cimport PyBytes_AsString
-from libc.stdint cimport int32_t
 
 cnp.import_array()
 
@@ -338,8 +337,7 @@ cdef int boyer_moore_horspool(const char *haystack, size_t haystacklen, const ch
         int: 1 if the needle exists in the haystack, 0 otherwise.
     """
     cdef unsigned char skip[256]
-    cdef size_t i, k
-    cdef int j  # Use int to handle negative values safely
+    cdef size_t i
 
     if needlelen == 0:
         return -1  # No valid search possible
@@ -420,7 +418,6 @@ cdef int boyer_moore_horspool_case_insensitive(const char *haystack, size_t hays
     return 0  # No match found
 
 
-
 cpdef cnp.ndarray[cnp.uint8_t, ndim=1] list_substring(cnp.ndarray[cnp.str, ndim=1] haystack, str needle):
     """
     Used as the InStr operator, which was written to replace using LIKE to execute list_substring

diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py
@@ -68,10 +68,11 @@ def search(array, item, ignore_case: Optional[List[bool]] = None):
         # Return True if the value is in the string
         # We're essentially doing a LIKE here
         from opteryx.compiled import list_ops
+
         if ignore_case[0]:
-            results_mask = list_ops.list_ops.list_substring_case_insensitive(array, str(item)).astype(
-                numpy.bool_
-            )
+            results_mask = list_ops.list_ops.list_substring_case_insensitive(
+                array, str(item)
+            ).astype(numpy.bool_)
         else:
             results_mask = list_ops.list_ops.list_substring(array, str(item)).astype(numpy.bool_)
     elif array_type == numpy.ndarray:

diff --git a/opteryx/planner/optimizer/strategies/predicate_rewriter.py b/opteryx/planner/optimizer/strategies/predicate_rewriter.py
@@ -38,7 +38,12 @@
 IN_REWRITES = {"InList": "Eq", "NotInList": "NotEq"}
 LIKE_REWRITES = {"Like": "Eq", "NotLike": "NotEq"}
 LITERALS_TO_THE_RIGHT = {"Plus": "Minus", "Minus": "Plus"}
-INSTR_REWRITES = {"Like": "InStr", "NotLike": "NotInStr", "ILike": "IInStr", "NotILike": "NotIInStr"}
+INSTR_REWRITES = {
+    "Like": "InStr",
+    "NotLike": "NotInStr",
+    "ILike": "IInStr",
+    "NotILike": "NotIInStr",
+}
 
 
 def remove_adjacent_wildcards(predicate):

diff --git a/opteryx/utils/dates.py b/opteryx/utils/dates.py
@@ -206,16 +206,23 @@ def date_trunc(truncate_to, date_values) -> numpy.ndarray:
         return date_values.astype("datetime64[Y]").astype("datetime64[s]")
     elif truncate_to == "quarter":
         months = date_values.astype("datetime64[M]").astype(int) // 3 * 3
-        return numpy.array(months,
+        return numpy.array(
+            months,
             dtype="datetime64[M]",
         ).astype("datetime64[s]")
     elif truncate_to == "month":
         return date_values.astype("datetime64[M]").astype("datetime64[s]")
     elif truncate_to == "week":
         return (
-            date_values
-            - ((date_values.astype("datetime64[D]").astype(int) - 4) % 7).astype("timedelta64[D]")
-        ).astype("datetime64[D]").astype("datetime64[s]")
+            (
+                date_values
+                - ((date_values.astype("datetime64[D]").astype(int) - 4) % 7).astype(
+                    "timedelta64[D]"
+                )
+            )
+            .astype("datetime64[D]")
+            .astype("datetime64[s]")
+        )
     elif truncate_to == "day":
         return date_values.astype("datetime64[D]").astype("datetime64[s]")
     elif truncate_to == "hour":

diff --git a/tests/catalog/test_iceberg.py b/tests/catalog/test_iceberg.py
@@ -14,8 +14,6 @@
 # this is how we get the raw list of files for the scan
 # print([task.file.file_path for task in self.table.scan().plan_files()])
 
-
-
 def set_up_iceberg():
     """
     Set up a local Iceberg catalog for testing with NVD data.
@@ -101,6 +99,27 @@ def test_iceberg_get_schema():
     table = catalog.load_table("iceberg.tweets")
     table.schema().as_arrow()
 
+@skip_if(is_arm() or is_windows() or is_mac())
+def test_iceberg_remote():
+
+    from pyiceberg.catalog import load_catalog
+
+    DATA_CATALOG_CONNECTION = os.environ.get("DATA_CATALOG_CONNECTION")
+    DATA_CATALOG_STORAGE = os.environ.get("DATA_CATALOG_STORAGE")
+
+    catalog = load_catalog(
+        "opteryx",
+        **{
+            "uri": DATA_CATALOG_CONNECTION,
+            "warehouse": DATA_CATALOG_STORAGE,
+        }
+    )
+
+    opteryx.register_store("iceberg", IcebergConnector, catalog=catalog)
+
+    table = opteryx.query("SELECT * FROM iceberg.tweets WHERE followers = 10")
+    assert table.shape[0] == 353
+
 
 if __name__ == "__main__":  # pragma: no cover
     from tests.tools import run_tests