Skip to content

Commit

Permalink
Merge pull request #2342 from mabel-dev/#2330
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer authored Jan 31, 2025
2 parents 6894d43 + c148bd6 commit 29eb5fc
Show file tree
Hide file tree
Showing 12 changed files with 59 additions and 26 deletions.
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 1047
__build__ = 1051

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
3 changes: 1 addition & 2 deletions opteryx/connectors/iceberg_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def to_iceberg_filter(root):
ICEBERG_FILTERS = {
"GtEq": pyiceberg.expressions.GreaterThanOrEqual,
"Eq": pyiceberg.expressions.EqualTo,
"NotEq": pyiceberg.expressions.NotEqualTo,
"Gt": pyiceberg.expressions.GreaterThan,
"Lt": pyiceberg.expressions.LessThan,
"LtEq": pyiceberg.expressions.LessThanOrEqual,
Expand Down Expand Up @@ -122,7 +121,7 @@ class IcebergConnector(BaseConnector, LimitPushable, Statistics, PredicatePushab

PUSHABLE_OPS: Dict[str, bool] = {
"Eq": True,
"NotEq": True,
# "NotEq": True, # nulls not handled correctly
"Gt": True,
"GtEq": True,
"Lt": True,
Expand Down
4 changes: 2 additions & 2 deletions opteryx/connectors/sql_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class SqlConnector(BaseConnector, LimitPushable, PredicatePushable):

PUSHABLE_OPS: Dict[str, bool] = {
"Eq": True,
"NotEq": True,
# "NotEq": True, # not all databases handle nulls consistently
"Gt": True,
"GtEq": True,
"Lt": True,
Expand All @@ -74,7 +74,7 @@ class SqlConnector(BaseConnector, LimitPushable, PredicatePushable):

OPS_XLAT: Dict[str, str] = {
"Eq": "=",
"NotEq": "!=",
# "NotEq": "!=",
"Gt": ">",
"GtEq": ">=",
"Lt": "<",
Expand Down
2 changes: 1 addition & 1 deletion opteryx/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ def sleep(x):
"NULLIF": (other_functions.null_if, 0, 1.0),
"CASE": (select_values, 0, 1.0),
"JSONB_OBJECT_KEYS": (other_functions.jsonb_object_keys, OrsoTypes.ARRAY, 1.0),

"HUMANIZE": (other_functions.humanize, OrsoTypes.VARCHAR, 1.0),

# Vector
"COSINE_SIMILARITY": (other_functions.cosine_similarity, OrsoTypes.DOUBLE, 1.0),
Expand Down
22 changes: 22 additions & 0 deletions opteryx/functions/other_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,3 +278,25 @@ def jsonb_object_keys(arr: numpy.ndarray):

# Return the result as a PyArrow array
return result


def humanize(arr):
def format_number(num: float) -> str:
"""Formats the number with or without decimal places based on whether it's an integer."""
return f"{num:,.0f}" if isinstance(num, int) else f"{num:,.1f}"

def humanize_number(value: float) -> str:
thresholds = [
(1_000_000_000_000, "trillion"),
(1_000_000_000, "billion"),
(1_000_000, "million"),
(1_000, "thousand"),
]

for threshold, label in thresholds:
rounded = round(value / threshold, 1)
if rounded >= 0.9: # Ensure we don't get "0.9 million" turning into "0 million"
return f"{format_number(rounded)} {label}"
return format_number(value)

return [humanize_number(value) for value in arr]
1 change: 1 addition & 0 deletions opteryx/managers/expression/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def filter_operations(arr, left_type, operator, value, right_type):
"AllOpEq",
"AllOpNotEq",
"AtArrow",
"NotEq", # need to handle nulls
):
# compressing ARRAY columns is VERY SLOW
morsel_size = len(arr)
Expand Down
2 changes: 1 addition & 1 deletion tests/fuzzing/test_sql_fuzzer_connectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from tests.tools import create_duck_db, populate_mongo, set_up_iceberg
from tests.tools import is_arm, is_mac, is_windows, skip_if, is_version

TEST_CYCLES: int = 20
TEST_CYCLES: int = 50


TABLES = {
Expand Down
4 changes: 2 additions & 2 deletions tests/plan_optimization/test_predicate_pushdown_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@

test_cases = [
("SELECT * FROM pg.planets WHERE gravity <= 3.7", 3, 3),
("SELECT * FROM pg.planets WHERE name != 'Earth'", 8, 8),
("SELECT * FROM pg.planets WHERE name != 'Earth'", 8, 9), # != is not pushed
("SELECT * FROM pg.planets WHERE name != 'E\"arth'", 9, 9),
("SELECT * FROM pg.planets WHERE gravity != 3.7", 7, 7),
("SELECT * FROM pg.planets WHERE gravity != 3.7", 7, 9), # != is not pushed
("SELECT * FROM pg.planets WHERE gravity < 3.7", 1, 1),
("SELECT * FROM pg.planets WHERE gravity > 3.7", 6, 6),
("SELECT * FROM pg.planets WHERE gravity >= 3.7", 8, 8),
Expand Down
4 changes: 2 additions & 2 deletions tests/plan_optimization/test_predicate_pushdown_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
("SELECT * FROM sqlite.planets WHERE orbitalInclination IS FALSE AND name IN ('Earth', 'Mars');", 1, 9),
("SELECT * FROM (SELECT name FROM sqlite.planets) AS $temp WHERE name = 'Earth';", 1, 1),
("SELECT * FROM sqlite.planets WHERE gravity <= 3.7", 3, 3),
("SELECT * FROM sqlite.planets WHERE name != 'Earth'", 8, 8),
("SELECT * FROM sqlite.planets WHERE name != 'Earth'", 8, 9), # != is not pushed
("SELECT * FROM sqlite.planets WHERE name != 'E\"arth'", 9, 9),
("SELECT * FROM sqlite.planets WHERE gravity != 3.7", 7, 7),
("SELECT * FROM sqlite.planets WHERE gravity != 3.7", 7, 9), # != is not pushed
("SELECT * FROM sqlite.planets WHERE gravity < 3.7", 1, 1),
("SELECT * FROM sqlite.planets WHERE gravity > 3.7", 6, 6),
("SELECT * FROM sqlite.planets WHERE gravity >= 3.7", 8, 8),
Expand Down
30 changes: 20 additions & 10 deletions tests/sql_battery/test_shapes_and_errors_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -2009,6 +2009,9 @@
("SELECT username FROM testdata.flat.ten_files WHERE SQRT(followers) = 10 ORDER BY followers DESC LIMIT 10", 1, 1, None),
("SELECT username FROM testdata.flat.ten_files WHERE SQRT(followers) = 15 ORDER BY followers DESC LIMIT 10", 0, 1, None),

("SELECT HUMANIZE(1000)", 1, 1, None),
("SELECT HUMANIZE(COUNT(*)) FROM $planets", 1, 1, None),
("SELECT HUMANIZE(gravity) FROM $planets", 9, 1, None),

# ****************************************************************************************

Expand Down Expand Up @@ -2328,20 +2331,20 @@
("SELECT * FROM (SELECT surface_pressure + 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT surface_pressure - 0 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT surface_pressure / 1 as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT TRUE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT FALSE AND (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT TRUE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT FALSE OR (surface_pressure != 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT (surface_pressure != 0) AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT (surface_pressure != 0) AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT (surface_pressure != 0) OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT (surface_pressure != 0) OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT TRUE AND (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT FALSE AND (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT TRUE OR (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT FALSE OR (surface_pressure = 0) as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT (surface_pressure = 0) AND TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT (surface_pressure = 0) AND FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT (surface_pressure = 0) OR TRUE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 0, 2, None),
("SELECT * FROM (SELECT (surface_pressure = 0) OR FALSE as opt, surface_pressure FROM $planets) AS sub WHERE opt IS NULL", 4, 2, None),
("SELECT * FROM (SELECT name LIKE '%' as opt, name FROM $planets) AS sub WHERE opt IS TRUE", 9 , 2, None),
("SELECT * FROM $planets WHERE (surface_pressure * 1 IS NULL) OR (surface_pressure + 0 IS NULL)", 4, 20, None),
("SELECT * FROM $planets WHERE (surface_pressure / 1 IS NULL) AND (TRUE OR surface_pressure IS NULL)", 4, 20, None),
("SELECT * FROM $planets WHERE ((FALSE AND (surface_pressure * 1) != 0) IS NULL) OR (surface_pressure IS NULL)", 4, 20, None),
("SELECT * FROM $planets WHERE ((surface_pressure != 0) AND TRUE) IS NULL", 4, 20, None),
("SELECT * FROM $planets WHERE ((surface_pressure != 0) OR FALSE) IS NULL", 4, 20, None),
("SELECT * FROM $planets WHERE ((surface_pressure = 0) AND TRUE) IS NULL", 4, 20, None),
("SELECT * FROM $planets WHERE ((surface_pressure = 0) OR FALSE) IS NULL", 4, 20, None),
("SELECT COUNT(surface_pressure - 0) AS count_opt FROM $planets WHERE surface_pressure IS NULL", 1, 1, None),
("SELECT name || '' AS opt FROM $planets", 9, 1, None),
("SELECT name LIKE '%' AS opt FROM $planets", 9, 1, None),
Expand All @@ -2362,6 +2365,13 @@
("SELECT * FROM $planets ORDER BY (id), name", 9, 20, None),
("SELECT * FROM $planets ORDER BY (id) ASC, name", 9, 20, None),
("SELECT * FROM $planets ORDER BY (id) DESC, name", 9, 20, None),
# 2340
("SELECT * FROM $satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None),
("SELECT * FROM iceberg.satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None),
("SELECT * FROM sqlite.satellites WHERE magnitude != 573602.533 ORDER BY magnitude DESC", 177, 8, None),
("SELECT * FROM $satellites WHERE magnitude < 573602.533 ORDER BY magnitude DESC", 171, 8, None),
("SELECT * FROM iceberg.satellites WHERE magnitude < 573602.533 ORDER BY magnitude DESC", 171, 8, None),
("SELECT * FROM sqlite.satellites WHERE magnitude < 573602.533 ORDER BY magnitude DESC", 171, 8, None),
]
# fmt:on

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"summary": "Anything compared to null results in null",
"statement": "SELECT 1 <> NULL AS V",
"result": {"V": [null]}
"result": {"V": [false]}
}
9 changes: 5 additions & 4 deletions tests/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,10 +697,7 @@ def cast_dataset(dataset):
)
return dataset

# Clean up previous test runs if they exist
if os.path.exists(ICEBERG_BASE_PATH):
import shutil
shutil.rmtree(ICEBERG_BASE_PATH)
existing = os.path.exists(ICEBERG_BASE_PATH)
os.makedirs(ICEBERG_BASE_PATH, exist_ok=True)

# Step 1: Create a local Iceberg catalog
Expand All @@ -712,6 +709,10 @@ def cast_dataset(dataset):
},
)


if existing:
return catalog

catalog.create_namespace("iceberg")

data = opteryx.query_to_arrow("SELECT tweet_id, text, timestamp, user_id, user_verified, user_name, hash_tags, followers, following, tweets_by_user, is_quoting, is_reply_to, is_retweeting FROM testdata.flat.formats.parquet")
Expand Down

0 comments on commit 29eb5fc

Please sign in to comment.