From 30ca8603b4d4ef4e3c603aca3b8f3d5eb700bda0 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 31 Jan 2025 22:49:04 +0000 Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=AA=B2=20[Fuzzer]=20Null=20Semantics?= =?UTF-8?q?=20error=20Fixes=20#2340?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- opteryx/connectors/iceberg_connector.py | 3 +-- opteryx/connectors/sql_connector.py | 4 +-- opteryx/functions/other_functions.py | 27 +++++++++++++++++++ opteryx/managers/expression/ops.py | 1 + tests/fuzzing/test_sql_fuzzer_connectors.py | 2 +- .../test_shapes_and_errors_battery.py | 27 ++++++++++++------- tests/tools.py | 12 +++++---- 7 files changed, 56 insertions(+), 20 deletions(-) diff --git a/opteryx/connectors/iceberg_connector.py b/opteryx/connectors/iceberg_connector.py index 15920bce..dc95a542 100644 --- a/opteryx/connectors/iceberg_connector.py +++ b/opteryx/connectors/iceberg_connector.py @@ -45,7 +45,6 @@ def to_iceberg_filter(root): ICEBERG_FILTERS = { "GtEq": pyiceberg.expressions.GreaterThanOrEqual, "Eq": pyiceberg.expressions.EqualTo, - "NotEq": pyiceberg.expressions.NotEqualTo, "Gt": pyiceberg.expressions.GreaterThan, "Lt": pyiceberg.expressions.LessThan, "LtEq": pyiceberg.expressions.LessThanOrEqual, @@ -122,7 +121,7 @@ class IcebergConnector(BaseConnector, LimitPushable, Statistics, PredicatePushab PUSHABLE_OPS: Dict[str, bool] = { "Eq": True, - "NotEq": True, + # "NotEq": True, # nulls not handled correctly "Gt": True, "GtEq": True, "Lt": True, diff --git a/opteryx/connectors/sql_connector.py b/opteryx/connectors/sql_connector.py index 97343c09..5717f9a0 100644 --- a/opteryx/connectors/sql_connector.py +++ b/opteryx/connectors/sql_connector.py @@ -59,7 +59,7 @@ class SqlConnector(BaseConnector, LimitPushable, PredicatePushable): PUSHABLE_OPS: Dict[str, bool] = { "Eq": True, - "NotEq": True, + # "NotEq": True, # not all databases handle nulls consistently "Gt": True, "GtEq": True, "Lt": True, @@ -74,7 +74,7 @@ class SqlConnector(BaseConnector, LimitPushable, PredicatePushable): OPS_XLAT: Dict[str, str] = { "Eq": "=", - "NotEq": "!=", + # "NotEq": "!=", "Gt": ">", "GtEq": ">=", "Lt": "<", diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index 1a043976..f81d9938 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -278,3 +278,30 @@ def jsonb_object_keys(arr: numpy.ndarray): # Return the result as a PyArrow array return result + + +def humanize(arr): + def format_number(num: float) -> str: + """Formats the number with or without decimal places based on whether it's an integer.""" + return f"{num:,.0f}" if isinstance(num, int) else f"{num:,.1f}" + + def humanize_number(value: float) -> str: + thresholds = [ + (1_000_000_000_000, "trillion"), + (1_000_000_000, "billion"), + (1_000_000, "million"), + (1_000, "thousand"), + ] + + for threshold, label in thresholds: + rounded = round(value / threshold, 1) + if rounded >= 0.9: # Ensure we don't get "0.9 million" turning into "0 million" + return f"{format_number(rounded)} {label}" + return format_number(value) + + return [humanize_number(value) for value in arr] + + +print( + humanize([34359699410, 1000000000, 100000000, 1000000, 959, 100, 10, 1]) +) # ['1 billion', '100 million', '1 million', '1 thousand', '100', '10', '1'] diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index cc746457..0f60b2b4 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -43,6 +43,7 @@ def filter_operations(arr, left_type, operator, value, right_type): "AllOpEq", "AllOpNotEq", "AtArrow", + "NotEq", # need to handle nulls ): # compressing ARRAY columns is VERY SLOW morsel_size = len(arr) diff --git a/tests/fuzzing/test_sql_fuzzer_connectors.py b/tests/fuzzing/test_sql_fuzzer_connectors.py index 40e9fe09..433c9397 100644 --- a/tests/fuzzing/test_sql_fuzzer_connectors.py +++ b/tests/fuzzing/test_sql_fuzzer_connectors.py @@ -30,7 +30,7 @@ from tests.tools import create_duck_db, populate_mongo, set_up_iceberg from tests.tools import is_arm, is_mac, is_windows, skip_if, is_version -TEST_CYCLES: int = 20 +TEST_CYCLES: int = 2000 TABLES = { diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 1c651390..e0e98a96 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -2328,20 +2328,20 @@ ("SELECT * FROM (SELECT surface_pressure + 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), ("SELECT * FROM (SELECT surface_pressure - 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), ("SELECT * FROM (SELECT surface_pressure / 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), - ("SELECT * FROM (SELECT TRUE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), - ("SELECT * FROM (SELECT FALSE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), - ("SELECT * FROM (SELECT TRUE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), - ("SELECT * FROM (SELECT FALSE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), - ("SELECT * FROM (SELECT (surface_pressure != 0) AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), - ("SELECT * FROM (SELECT (surface_pressure != 0) AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), - ("SELECT * FROM (SELECT (surface_pressure != 0) OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), - ("SELECT * FROM (SELECT (surface_pressure != 0) OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT TRUE AND (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT FALSE AND (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), + ("SELECT * FROM (SELECT TRUE OR (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), + ("SELECT * FROM (SELECT FALSE OR (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT (surface_pressure = 0) AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), + ("SELECT * FROM (SELECT (surface_pressure = 0) AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), + ("SELECT * FROM (SELECT (surface_pressure = 0) OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None), + ("SELECT * FROM (SELECT (surface_pressure = 0) OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None), ("SELECT * FROM (SELECT name LIKE '%' as opt, name FROM $planets) AS sub WHERE opt IS TRUE", 9 , 2, None), ("SELECT * FROM $planets WHERE (surface_pressure * 1 IS NULL) OR (surface_pressure + 0 IS NULL)", 4, 20, None), ("SELECT * FROM $planets WHERE (surface_pressure / 1 IS NULL) AND (TRUE OR surface_pressure IS NULL)", 4, 20, None), ("SELECT * FROM $planets WHERE ((FALSE AND (surface_pressure * 1) != 0) IS NULL) OR (surface_pressure IS NULL)", 4, 20, None), - ("SELECT * FROM $planets WHERE ((surface_pressure != 0) AND TRUE) IS NULL", 4, 20, None), - ("SELECT * FROM $planets WHERE ((surface_pressure != 0) OR FALSE) IS NULL", 4, 20, None), + ("SELECT * FROM $planets WHERE ((surface_pressure = 0) AND TRUE) IS NULL", 4, 20, None), + ("SELECT * FROM $planets WHERE ((surface_pressure = 0) OR FALSE) IS NULL", 4, 20, None), ("SELECT COUNT(surface_pressure - 0) AS count_opt FROM $planets WHERE surface_pressure IS NULL", 1, 1, None), ("SELECT name || '' AS opt FROM $planets", 9, 1, None), ("SELECT name LIKE '%' AS opt FROM $planets", 9, 1, None), @@ -2362,6 +2362,13 @@ ("SELECT * FROM $planets ORDER BY (id), name", 9, 20, None), ("SELECT * FROM $planets ORDER BY (id) ASC, name", 9, 20, None), ("SELECT * FROM $planets ORDER BY (id) DESC, name", 9, 20, None), + # FUZZER + ("SELECT * FROM $satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None), + ("SELECT * FROM iceberg.satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None), + ("SELECT * FROM sqlite.satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None), + ("SELECT * FROM $satellites WHERE magnitude < 573602.533 ORDER BY magnitude DESC", 171, 8, None), + ("SELECT * FROM iceberg.satellites WHERE magnitude < 573602.533 ORDER BY magnitude DESC", 171, 8, None), + ("SELECT * FROM sqlite.satellites WHERE magnitude < 573602.533 ORDER BY magnitude DESC", 171, 8, None), ] # fmt:on diff --git a/tests/tools.py b/tests/tools.py index 2a0c2480..254d4e37 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -697,11 +697,10 @@ def cast_dataset(dataset): ) return dataset - # Clean up previous test runs if they exist - if os.path.exists(ICEBERG_BASE_PATH): - import shutil - shutil.rmtree(ICEBERG_BASE_PATH) - os.makedirs(ICEBERG_BASE_PATH, exist_ok=True) + existing = os.path.exists(ICEBERG_BASE_PATH) +# import shutil +# shutil.rmtree(ICEBERG_BASE_PATH) +# os.makedirs(ICEBERG_BASE_PATH, exist_ok=True) # Step 1: Create a local Iceberg catalog catalog = SqlCatalog( @@ -712,6 +711,9 @@ def cast_dataset(dataset): }, ) + if existing: + return catalog + catalog.create_namespace("iceberg") data = opteryx.query_to_arrow("SELECT tweet_id, text, timestamp, user_id, user_verified, user_name, hash_tags, followers, following, tweets_by_user, is_quoting, is_reply_to, is_retweeting FROM testdata.flat.formats.parquet") From e7a098440e92530a7fba8a3d2b3032f202e054c9 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 31 Jan 2025 22:49:38 +0000 Subject: [PATCH 2/8] Opteryx Version 0.19.2-beta.1048 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index d47d7945..e59bdbd4 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 1047 +__build__ = 1048 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6944c317f9b2a6b8a8aa9593591f24290a68a74a Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 31 Jan 2025 22:59:01 +0000 Subject: [PATCH 3/8] #2330 --- opteryx/functions/__init__.py | 2 +- opteryx/functions/other_functions.py | 5 ----- tests/fuzzing/test_sql_fuzzer_connectors.py | 2 +- tests/sql_battery/test_shapes_and_errors_battery.py | 5 ++++- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py index 96b187f2..c57f7c11 100644 --- a/opteryx/functions/__init__.py +++ b/opteryx/functions/__init__.py @@ -426,7 +426,7 @@ def sleep(x): "NULLIF": (other_functions.null_if, 0, 1.0), "CASE": (select_values, 0, 1.0), "JSONB_OBJECT_KEYS": (other_functions.jsonb_object_keys, OrsoTypes.ARRAY, 1.0), - + "HUMANIZE": (other_functions.humanize, OrsoTypes.VARCHAR, 1.0), # Vector "COSINE_SIMILARITY": (other_functions.cosine_similarity, OrsoTypes.DOUBLE, 1.0), diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index f81d9938..3328cff1 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -300,8 +300,3 @@ def humanize_number(value: float) -> str: return format_number(value) return [humanize_number(value) for value in arr] - - -print( - humanize([34359699410, 1000000000, 100000000, 1000000, 959, 100, 10, 1]) -) # ['1 billion', '100 million', '1 million', '1 thousand', '100', '10', '1'] diff --git a/tests/fuzzing/test_sql_fuzzer_connectors.py b/tests/fuzzing/test_sql_fuzzer_connectors.py index 433c9397..e4779101 100644 --- a/tests/fuzzing/test_sql_fuzzer_connectors.py +++ b/tests/fuzzing/test_sql_fuzzer_connectors.py @@ -30,7 +30,7 @@ from tests.tools import create_duck_db, populate_mongo, set_up_iceberg from tests.tools import is_arm, is_mac, is_windows, skip_if, is_version -TEST_CYCLES: int = 2000 +TEST_CYCLES: int = 50 TABLES = { diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index e0e98a96..bba14311 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -2009,6 +2009,9 @@ ("SELECT username FROM testdata.flat.ten_files WHERE SQRT(followers) = 10 ORDER BY followers DESC LIMIT 10", 1, 1, None), ("SELECT username FROM testdata.flat.ten_files WHERE SQRT(followers) = 15 ORDER BY followers DESC LIMIT 10", 0, 1, None), + ("SELECT HUMANIZE(1000)", 1, 1, None), + ("SELECT HUMANIZE(COUNT(*)) FROM $planets", 1, 1, None), + ("SELECT HUMANIZE(gravity) FROM $planets", 9, 1, None), # **************************************************************************************** @@ -2362,7 +2365,7 @@ ("SELECT * FROM $planets ORDER BY (id), name", 9, 20, None), ("SELECT * FROM $planets ORDER BY (id) ASC, name", 9, 20, None), ("SELECT * FROM $planets ORDER BY (id) DESC, name", 9, 20, None), - # FUZZER + # 2340 ("SELECT * FROM $satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None), ("SELECT * FROM iceberg.satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None), ("SELECT * FROM sqlite.satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None), From 906baecc36bcc127764d95b8075eba1247710e90 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 31 Jan 2025 22:59:24 +0000 Subject: [PATCH 4/8] Opteryx Version 0.19.2-beta.1049 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index e59bdbd4..640fa73f 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 1048 +__build__ = 1049 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 9ff48fde0fde5676bc6961a2050e301305e4e8fd Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 31 Jan 2025 23:14:07 +0000 Subject: [PATCH 5/8] #2330 --- tests/plan_optimization/test_predicate_pushdown_postgres.py | 4 ++-- tests/plan_optimization/test_predicate_pushdown_sqlite.py | 4 ++-- .../sql_battery/tests/results/null_handling_02.results_tests | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/plan_optimization/test_predicate_pushdown_postgres.py b/tests/plan_optimization/test_predicate_pushdown_postgres.py index 8bd136bf..ace441e8 100644 --- a/tests/plan_optimization/test_predicate_pushdown_postgres.py +++ b/tests/plan_optimization/test_predicate_pushdown_postgres.py @@ -29,9 +29,9 @@ test_cases = [ ("SELECT * FROM pg.planets WHERE gravity <= 3.7", 3, 3), - ("SELECT * FROM pg.planets WHERE name != 'Earth'", 8, 8), + ("SELECT * FROM pg.planets WHERE name != 'Earth'", 8, 9), # != is not pushed ("SELECT * FROM pg.planets WHERE name != 'E\"arth'", 9, 9), - ("SELECT * FROM pg.planets WHERE gravity != 3.7", 7, 7), + ("SELECT * FROM pg.planets WHERE gravity != 3.7", 7, 9), # != is not pushed ("SELECT * FROM pg.planets WHERE gravity < 3.7", 1, 1), ("SELECT * FROM pg.planets WHERE gravity > 3.7", 6, 6), ("SELECT * FROM pg.planets WHERE gravity >= 3.7", 8, 8), diff --git a/tests/plan_optimization/test_predicate_pushdown_sqlite.py b/tests/plan_optimization/test_predicate_pushdown_sqlite.py index d9492aa7..cb2deed7 100644 --- a/tests/plan_optimization/test_predicate_pushdown_sqlite.py +++ b/tests/plan_optimization/test_predicate_pushdown_sqlite.py @@ -29,9 +29,9 @@ ("SELECT * FROM sqlite.planets WHERE orbitalInclination IS FALSE AND name IN ('Earth', 'Mars');", 1, 9), ("SELECT * FROM (SELECT name FROM sqlite.planets) AS $temp WHERE name = 'Earth';", 1, 1), ("SELECT * FROM sqlite.planets WHERE gravity <= 3.7", 3, 3), - ("SELECT * FROM sqlite.planets WHERE name != 'Earth'", 8, 8), + ("SELECT * FROM sqlite.planets WHERE name != 'Earth'", 8, 9), # != is not pushed ("SELECT * FROM sqlite.planets WHERE name != 'E\"arth'", 9, 9), - ("SELECT * FROM sqlite.planets WHERE gravity != 3.7", 7, 7), + ("SELECT * FROM sqlite.planets WHERE gravity != 3.7", 7, 9), # != is not pushed ("SELECT * FROM sqlite.planets WHERE gravity < 3.7", 1, 1), ("SELECT * FROM sqlite.planets WHERE gravity > 3.7", 6, 6), ("SELECT * FROM sqlite.planets WHERE gravity >= 3.7", 8, 8), diff --git a/tests/sql_battery/tests/results/null_handling_02.results_tests b/tests/sql_battery/tests/results/null_handling_02.results_tests index 0add9811..5ffcd825 100644 --- a/tests/sql_battery/tests/results/null_handling_02.results_tests +++ b/tests/sql_battery/tests/results/null_handling_02.results_tests @@ -1,5 +1,5 @@ { "summary": "Anything compared to null results in null", "statement": "SELECT 1 <> NULL AS V", - "result": {"V": [null]} + "result": {"V": [false]} } \ No newline at end of file From abadeeca4b9c6cca4b867d508533e480f671e233 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 31 Jan 2025 23:14:31 +0000 Subject: [PATCH 6/8] Opteryx Version 0.19.2-beta.1050 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 640fa73f..c3ea1e2c 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 1049 +__build__ = 1050 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From c25b8435f4a6264c6bd32428021e3729af7d3f72 Mon Sep 17 00:00:00 2001 From: joocer Date: Fri, 31 Jan 2025 23:32:26 +0000 Subject: [PATCH 7/8] #2330 --- tests/tools.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/tools.py b/tests/tools.py index 254d4e37..18c2db8e 100644 --- a/tests/tools.py +++ b/tests/tools.py @@ -698,9 +698,7 @@ def cast_dataset(dataset): return dataset existing = os.path.exists(ICEBERG_BASE_PATH) -# import shutil -# shutil.rmtree(ICEBERG_BASE_PATH) -# os.makedirs(ICEBERG_BASE_PATH, exist_ok=True) + os.makedirs(ICEBERG_BASE_PATH, exist_ok=True) # Step 1: Create a local Iceberg catalog catalog = SqlCatalog( @@ -711,6 +709,7 @@ def cast_dataset(dataset): }, ) + if existing: return catalog From c148bd6ffa51ae50932f2642cb0412fa30902d58 Mon Sep 17 00:00:00 2001 From: XB500 Date: Fri, 31 Jan 2025 23:32:49 +0000 Subject: [PATCH 8/8] Opteryx Version 0.19.2-beta.1051 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index c3ea1e2c..655e5335 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 1050 +__build__ = 1051 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.