-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
277 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import multiprocessing as mp | ||
from queue import Empty | ||
from typing import Any | ||
from typing import Generator | ||
from typing import Tuple | ||
|
||
import pyarrow | ||
|
||
from opteryx import EOS | ||
from opteryx.constants import ResultType | ||
from opteryx.exceptions import InvalidInternalStateError | ||
from opteryx.models import PhysicalPlan | ||
from opteryx.models import QueryStatistics | ||
|
||
WORKERS = 4 | ||
kill = object() | ||
|
||
|
||
def execute(
    plan: PhysicalPlan, statistics: QueryStatistics = None, num_workers: int = WORKERS
) -> Tuple[Generator[pyarrow.Table, Any, Any], ResultType]:
    """
    Execute the physical plan with morsel-level parallelism.

    Parameters:
        plan: PhysicalPlan
            The physical plan to execute.
        statistics: QueryStatistics, optional
            Object to collect query statistics, defaults to None.
            (Currently unused in the parallel path — TODO wire up.)
        num_workers: int, optional
            Number of parallel workers for processing morsels, defaults to 4.

    Returns:
        Tuple[Generator[pyarrow.Table, Any, Any], ResultType]
            A generator producing pyarrow tables and the result type.

    Raises:
        InvalidInternalStateError
            If the plan does not have exactly one head (exit) node.
    """
    # "fork" lets workers inherit the plan and queues without re-pickling.
    # NOTE(review): fork is unavailable on Windows — confirm supported platforms.
    mp.set_start_method("fork", force=True)

    # Ensure there's a single head node
    head_nodes = list(set(plan.get_exit_points()))
    if len(head_nodes) != 1:
        raise InvalidInternalStateError(
            f"Query plan has {len(head_nodes)} heads, expected exactly 1."
        )

    # Queue for incoming morsels and a queue for results
    work_queue = mp.Queue()
    result_queue = mp.Queue()

    # Create a worker pool for processing morsels; workers loop in
    # _worker_init pulling from work_queue until terminated.
    pool = mp.Pool(num_workers, _worker_init, (plan, work_queue, result_queue))

    def inner_execute(plan: PhysicalPlan) -> Generator:
        try:
            # Get the pump (scan) nodes from the plan and prime the work
            # queue with one payload and one EOS marker for each of them.
            pump_nodes = [
                (nid, node) for nid, node in plan.depth_first_search_flat() if node.is_scan
            ]
            for pump_nid, _ in pump_nodes:
                work_queue.put((pump_nid, None, None))
                work_queue.put((pump_nid, EOS, None))
            # Poll for the final result; EOS markers are skipped. The
            # timeout keeps the loop responsive to interrupts.
            while True:
                try:
                    result = result_queue.get(timeout=0.1)
                except Empty:
                    continue
                if result == EOS:
                    continue
                yield result
                return
        finally:
            # Workers loop forever, so terminate (not close) the pool
            # once the consumer is done with the generator.
            pool.terminate()
            pool.join()

    return inner_execute(plan), ResultType.TABULAR
|
||
|
||
def _worker_init(plan: PhysicalPlan, work_queue: mp.Queue, completion_queue: mp.Queue):
    """
    Worker loop for morsel-level parallelism.

    Pulls (node_id, morsel, join_leg) work items off the queue, runs the
    corresponding operator, then routes each result either to the operator's
    children (as new work items) or — for terminal operators — to the
    completion queue. Runs until the hosting process is terminated.

    Parameters:
        plan: PhysicalPlan
            The overall physical plan (inherited via fork).
        work_queue: mp.Queue
            Queue from which (node_id, morsel, join_leg) work items are fetched.
        completion_queue: mp.Queue
            Queue to which results from terminal (head) operators are pushed.
    """
    while True:
        try:
            work = work_queue.get(timeout=0.1)
        except Empty:
            continue

        nid, morsel, join_leg = work
        operator = plan[nid]
        results = operator(morsel, join_leg)

        # None means the operator produced nothing for this morsel.
        if results is None:
            continue

        # The outgoing edges are the same for every result of this operator,
        # so look them up once rather than per result.
        children = plan.outgoing_edges(nid)

        for result in (result for result in results if result is not None):
            if len(children) == 0:
                # Terminal operator — this is a final result.
                completion_queue.put(result)
            for _, child, leg in children:
                work_queue.put((child, result, leg))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
95 changes: 95 additions & 0 deletions
95
opteryx/planner/optimizer/strategies/correlated_filters.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# See the License at http://www.apache.org/licenses/LICENSE-2.0 | ||
# Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. | ||
|
||
""" | ||
Optimization Rule - Correlated Filters | ||
Type: Heuristic | ||
Goal: Reduce Rows | ||
When fields are joined on, we can infer ranges of values based on statistics | ||
or filters. This can be used to reduce the number of rows that need to be read | ||
and processed. | ||
""" | ||
|
||
from orso.tools import random_string | ||
|
||
from opteryx.managers.expression import NodeType | ||
from opteryx.models import Node | ||
from opteryx.planner import build_literal_node | ||
from opteryx.planner.logical_planner import LogicalPlan | ||
from opteryx.planner.logical_planner import LogicalPlanNode | ||
from opteryx.planner.logical_planner import LogicalPlanStepType | ||
|
||
from .optimization_strategy import OptimizationStrategy | ||
from .optimization_strategy import OptimizerContext | ||
|
||
|
||
def _write_filters(left_column, right_column):
    """
    Build range filters on `right_column` from the statistics of `left_column`.

    For an inner equi-join, any value of the right column outside the
    [lowest, highest] range of the left column can never match, so we can
    pre-filter the right relation using the left column's recorded bounds.

    Parameters:
        left_column:
            The join column whose schema statistics supply the bounds.
        right_column:
            The join column the new filters are applied to.

    Returns:
        list of Filter LogicalPlanNodes (possibly empty when no statistics
        are available).
    """
    new_filters = []

    if left_column.schema_column.highest_value is not None:
        # right_column <= highest(left_column)
        a_side = right_column
        b_side = build_literal_node(left_column.schema_column.highest_value)
        new_filter = Node(
            LogicalPlanStepType.Filter,
            condition=Node(NodeType.COMPARISON_OPERATOR, value="LtEq", left=a_side, right=b_side),
            columns=[right_column],
            relations={right_column.source},
            all_relations={right_column.source},
        )
        new_filters.append(new_filter)

    # Fix: guard the lower bound the same way as the upper bound — previously
    # this filter was built unconditionally, creating a literal from None
    # when no lowest-value statistic was available.
    if left_column.schema_column.lowest_value is not None:
        # right_column >= lowest(left_column)
        a_side = right_column
        b_side = build_literal_node(left_column.schema_column.lowest_value)
        new_filter = Node(
            LogicalPlanStepType.Filter,
            condition=Node(NodeType.COMPARISON_OPERATOR, value="GtEq", left=a_side, right=b_side),
            columns=[right_column],
            relations={right_column.source},
            all_relations={right_column.source},
        )
        new_filters.append(new_filter)

    return new_filters
|
||
|
||
class CorrelatedFiltersStrategy(OptimizationStrategy):
    """
    Optimization Rule - Correlated Filters

    For a two-relation inner join on identifier columns, derive range
    filters from one side's column statistics and insert them ahead of
    the join, reducing the rows that must be read and joined.
    """

    def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerContext:
        if not context.optimized_plan:
            context.optimized_plan = context.pre_optimized_tree.copy()  # type: ignore

        # Only two-relation inner joins are candidates for this rule.
        is_candidate = (
            node.node_type == LogicalPlanStepType.Join
            and node.type == "inner"
            and len(node.all_relations) == 2
        )
        if not is_candidate:
            return context

        left_column = node.on.left
        right_column = node.on.right
        both_identifiers = (
            left_column.node_type == NodeType.IDENTIFIER
            and right_column.node_type == NodeType.IDENTIFIER
        )

        new_filters = []
        # Empty connectors are FUNCTION datasets, we could push filters down and create
        # statistics for them, but there are other issues this creates
        if both_identifiers and left_column.source_connector != set():
            new_filters = _write_filters(left_column, right_column)
        if both_identifiers and right_column.source_connector != set():
            new_filters.extend(_write_filters(right_column, left_column))

        for new_filter in new_filters:
            context.optimized_plan.insert_node_before(random_string(), new_filter, context.node_id)
            self.statistics.optimization_inner_join_correlated_filter += 1

        return context

    def complete(self, plan: LogicalPlan, context: OptimizerContext) -> LogicalPlan:
        # No finalization needed for this strategy
        return plan
Oops, something went wrong.