-
Notifications
You must be signed in to change notification settings - Fork 609
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: improve map(), struct(), array() #8666
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,10 +86,14 @@ def _make_duration(value, dtype): | |
def literal(op, **_): | ||
value = op.value | ||
dtype = op.dtype | ||
if dtype.is_interval(): | ||
return _make_duration(value, dtype) | ||
|
||
if dtype.is_array(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you avoid moving the code around here and only handle the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need to look at this again, but IIRC this was actually the minimum possible change; something was erroring with NULL duration handling. Will check again. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this reorg is needed. ibis's Duration.DAY and friends are not representable in polars' datatypes, so if you call |
||
typ = PolarsType.from_ibis(dtype) | ||
if value is None: | ||
return pl.lit(None, dtype=typ) | ||
elif dtype.is_array(): | ||
value = pl.Series("", value) | ||
typ = PolarsType.from_ibis(dtype) | ||
val = pl.lit(value, dtype=typ) | ||
return val.implode() | ||
elif dtype.is_struct(): | ||
|
@@ -98,14 +102,11 @@ def literal(op, **_): | |
for k, v in value.items() | ||
] | ||
return pl.struct(values) | ||
elif dtype.is_interval(): | ||
return _make_duration(value, dtype) | ||
elif dtype.is_null(): | ||
return pl.lit(value) | ||
elif dtype.is_binary(): | ||
return pl.lit(value) | ||
else: | ||
typ = PolarsType.from_ibis(dtype) | ||
return pl.lit(op.value, dtype=typ) | ||
|
||
|
||
|
@@ -985,9 +986,12 @@ def array_concat(op, **kw): | |
|
||
|
||
@translate.register(ops.Array) | ||
def array_column(op, **kw): | ||
cols = [translate(col, **kw) for col in op.exprs] | ||
return pl.concat_list(cols) | ||
def array_literal(op, **kw): | ||
pdt = PolarsType.from_ibis(op.dtype) | ||
if op.exprs: | ||
return pl.concat_list([translate(col, **kw) for col in op.exprs]).cast(pdt) | ||
else: | ||
return pl.lit([], dtype=pdt) | ||
|
||
|
||
@translate.register(ops.ArrayCollect) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1019,8 +1019,8 @@ def visit_InSubquery(self, op, *, rel, needle): | |
query = sg.select(STAR).from_(query) | ||
return needle.isin(query=query) | ||
|
||
def visit_Array(self, op, *, exprs): | ||
return self.f.array(*exprs) | ||
def visit_Array(self, op, *, exprs, dtype): | ||
return self.cast(self.f.array(*exprs), dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This seems extraordinarily heavy-handed and overly broad. Do we really need to cast the entire array on construction, for all SQL backends? |
||
|
||
def visit_StructColumn(self, op, *, names, values): | ||
return sge.Struct.from_arg_list( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,7 @@ | |
SnowflakeProgrammingError, | ||
TrinoUserError, | ||
) | ||
from ibis.common.annotations import ValidationError | ||
from ibis.common.collections import frozendict | ||
|
||
pytestmark = [ | ||
|
@@ -73,6 +74,75 @@ | |
# list. | ||
|
||
|
||
def test_array_factory(con): | ||
a = ibis.array([1, 2, 3]) | ||
assert a.type() == dt.Array(value_type=dt.Int8) | ||
assert con.execute(a) == [1, 2, 3] | ||
|
||
a2 = ibis.array(a) | ||
assert a.type() == dt.Array(value_type=dt.Int8) | ||
assert con.execute(a2) == [1, 2, 3] | ||
|
||
|
||
def test_array_factory_typed(con): | ||
typed = ibis.array([1, 2, 3], type="array<string>") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't really like the implicit conversion here. Is it necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The conversion is exactly the part of the API I was trying to test here. I think this conversion is something that this API should explicitly support. What do you think? Maybe I should parametrize the test over both input forms, like this: @pytest.mark.parametrize("raw", [[1, 2, 3], ["1", "2", "3"]])
def test_array_factory_typed(con, raw):
typed = ibis.array(raw, type="array<string>")
... |
||
assert con.execute(typed) == ["1", "2", "3"] | ||
|
||
typed2 = ibis.array(ibis.array([1, 2, 3]), type="array<string>") | ||
assert con.execute(typed2) == ["1", "2", "3"] | ||
|
||
|
||
@pytest.mark.notimpl("flink", raises=Py4JJavaError) | ||
@pytest.mark.notimpl(["pandas", "dask"], raises=ValueError) | ||
def test_array_factory_empty(con): | ||
with pytest.raises(ValidationError): | ||
ibis.array([]) | ||
|
||
empty_typed = ibis.array([], type="array<string>") | ||
assert empty_typed.type() == dt.Array(value_type=dt.string) | ||
assert con.execute(empty_typed) == [] | ||
|
||
|
||
@pytest.mark.notyet( | ||
"clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL" | ||
) | ||
@pytest.mark.notyet( | ||
"flink", raises=Py4JJavaError, reason="Parameters must be of the same type" | ||
) | ||
def test_array_factory_null(con): | ||
with pytest.raises(ValidationError): | ||
ibis.array(None) | ||
with pytest.raises(ValidationError): | ||
ibis.array(None, type="int64") | ||
none_typed = ibis.array(None, type="array<string>") | ||
assert none_typed.type() == dt.Array(value_type=dt.string) | ||
assert con.execute(none_typed) is None | ||
|
||
nones = ibis.array([None, None], type="array<string>") | ||
assert nones.type() == dt.Array(value_type=dt.string) | ||
assert con.execute(nones) == [None, None] | ||
|
||
# Execute a real value here, so the backends that don't support arrays | ||
# actually xfail as we expect them to. | ||
# Otherwise would have to @mark.xfail every test in this file besides this one. | ||
assert con.execute(ibis.array([1, 2])) == [1, 2] | ||
|
||
|
||
@pytest.mark.broken( | ||
["datafusion", "flink", "polars"], | ||
raises=AssertionError, | ||
reason="[None, 1] executes to [np.nan, 1.0]", | ||
) | ||
def test_array_factory_null_mixed(con): | ||
none_and_val = ibis.array([None, 1]) | ||
assert none_and_val.type() == dt.Array(value_type=dt.Int8) | ||
assert con.execute(none_and_val) == [None, 1] | ||
|
||
none_and_val_typed = ibis.array([None, 1], type="array<string>") | ||
assert none_and_val_typed.type() == dt.Array(value_type=dt.String) | ||
assert con.execute(none_and_val_typed) == [None, "1"] | ||
|
||
|
||
def test_array_column(backend, alltypes, df): | ||
expr = ibis.array( | ||
[alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ops.Array executes to an np.ndarray (both before and after this change). We never ran into this scenario in tests before, so this tweak fixes a pre-existing bug.