Skip to content

Commit

Permalink
Merge pull request #2318 from mabel-dev/joocer/issue2317
Browse files: browse the repository at this point in the history
✨ prefer cython hash
  • Loading branch information
joocer authored Jan 25, 2025
2 parents a11ccf6 + e2a2d03 commit fdf1759
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 13 deletions.
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 1038
__build__ = 1040

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
14 changes: 8 additions & 6 deletions opteryx/compiled/joins/filter_join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import numpy as np

cimport numpy as cnp
from libc.stdint cimport int64_t
from cpython.object cimport PyObject_Hash

from opteryx.third_party.abseil.containers cimport FlatHashSet

cpdef FlatHashSet filter_join_set(relation, list join_columns, FlatHashSet seen_hashes):
Expand All @@ -31,14 +33,14 @@ cpdef FlatHashSet filter_join_set(relation, list join_columns, FlatHashSet seen_
if num_columns == 1:
col = values_array[0, :]
for i in range(len(col)):
hash_value = <int64_t>hash(col[i])
hash_value = PyObject_Hash(col[i])
seen_hashes.insert(hash_value)
else:
for i in range(values_array.shape[1]):
# Combine the hashes of each value in the row
hash_value = 0
for value in values_array[:, i]:
hash_value = <int64_t>(hash_value * 31 + hash(value))
hash_value = <int64_t>(hash_value * 31 + PyObject_Hash(value))
seen_hashes.insert(hash_value)

return seen_hashes
Expand All @@ -55,7 +57,7 @@ cpdef anti_join(relation, list join_columns, FlatHashSet seen_hashes):
if num_columns == 1:
col = values_array[0, :]
for i in range(len(col)):
hash_value = <int64_t>hash(col[i])
hash_value = PyObject_Hash(col[i])
if not seen_hashes.contains(hash_value):
index_buffer[idx_count] = i
idx_count += 1
Expand All @@ -64,7 +66,7 @@ cpdef anti_join(relation, list join_columns, FlatHashSet seen_hashes):
# Combine the hashes of each value in the row
hash_value = 0
for value in values_array[:, i]:
hash_value = <int64_t>(hash_value * 31 + hash(value))
hash_value = <int64_t>(hash_value * 31 + PyObject_Hash(value))
if not seen_hashes.contains(hash_value):
index_buffer[idx_count] = i
idx_count += 1
Expand All @@ -87,7 +89,7 @@ cpdef semi_join(relation, list join_columns, FlatHashSet seen_hashes):
if num_columns == 1:
col = values_array[0, :]
for i in range(len(col)):
hash_value = <int64_t>hash(col[i])
hash_value = PyObject_Hash(col[i])
if seen_hashes.contains(hash_value):
index_buffer[idx_count] = i
idx_count += 1
Expand All @@ -96,7 +98,7 @@ cpdef semi_join(relation, list join_columns, FlatHashSet seen_hashes):
# Combine the hashes of each value in the row
hash_value = 0
for value in values_array[:, i]:
hash_value = <int64_t>(hash_value * 31 + hash(value))
hash_value = <int64_t>(hash_value * 31 + PyObject_Hash(value))
if seen_hashes.contains(hash_value):
index_buffer[idx_count] = i
idx_count += 1
Expand Down
5 changes: 3 additions & 2 deletions opteryx/compiled/joins/inner_join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import numpy
from libc.stdint cimport uint8_t, int64_t

from opteryx.third_party.abseil.containers cimport FlatHashMap
from cpython.object cimport PyObject_Hash

cpdef FlatHashMap abs_hash_join_map(relation, list join_columns):
"""
Expand Down Expand Up @@ -66,14 +67,14 @@ cpdef FlatHashMap abs_hash_join_map(relation, list join_columns):
if num_columns == 1:
col = values_array[0, :]
for i in range(len(col)):
hash_value = <int64_t>hash(col[i])
hash_value = PyObject_Hash(col[i])
ht.insert(hash_value, non_null_indices[i])
else:
for i in range(values_array.shape[1]):
# Combine the hashes of each value in the row
hash_value = 0
for value in values_array[:, i]:
hash_value = <int64_t>(hash_value * 31 + hash(value))
hash_value = <int64_t>(hash_value * 31 + PyObject_Hash(value))
ht.insert(hash_value, non_null_indices[i])

return ht
7 changes: 4 additions & 3 deletions opteryx/compiled/structures/hash_table.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
from libcpp.vector cimport vector
from libc.stdint cimport int64_t, uint8_t
from cpython.object cimport PyObject_Hash

cimport numpy as cnp

Expand Down Expand Up @@ -64,7 +65,7 @@ cpdef tuple list_distinct(cnp.ndarray values, cnp.int64_t[::1] indices, HashSet

for i in range(n):
v = values[i]
hash_value = <int64_t>hash(v)
hash_value = PyObject_Hash(v)
if seen_hashes.insert(hash_value):
new_values[j] = v
new_indices[j] = indices[i]
Expand Down Expand Up @@ -127,14 +128,14 @@ cpdef HashTable hash_join_map(relation, list join_columns):
if num_columns == 1:
col = values_array[0, :]
for i in range(len(col)):
hash_value = <int64_t>hash(col[i])
hash_value = PyObject_Hash(col[i])
ht.insert(hash_value, non_null_indices[i])
else:
for i in range(values_array.shape[1]):
# Combine the hashes of each value in the row
hash_value = 0
for value in values_array[:, i]:
hash_value = <int64_t>(hash_value * 31 + hash(value))
hash_value = <int64_t>(hash_value * 31 + PyObject_Hash(value))
ht.insert(hash_value, non_null_indices[i])

return ht
1 change: 0 additions & 1 deletion opteryx/connectors/iceberg_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from typing import Union

import pyarrow
import pyiceberg.types
from orso.schema import FlatColumn
from orso.schema import RelationSchema

Expand Down

0 comments on commit fdf1759

Please sign in to comment.