From 6b0d9cd1ed7c825d6dd2592d5067151dd45f5c72 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 1 Feb 2024 08:16:01 -0800 Subject: [PATCH] Exported the converter from chunk shape to a SimpleGrid. Also explained the various choices for the cost factor, now that we have to document the converter's arguments. --- src/delayedarray/Grid.py | 3 +- src/delayedarray/__init__.py | 2 +- src/delayedarray/chunk_grid.py | 83 +++++++++++++++++++++++++++------- 3 files changed, 70 insertions(+), 18 deletions(-) diff --git a/src/delayedarray/Grid.py b/src/delayedarray/Grid.py index 76cef62..fe1f1d7 100644 --- a/src/delayedarray/Grid.py +++ b/src/delayedarray/Grid.py @@ -74,7 +74,8 @@ def __init__(self, boundaries: Tuple[Sequence[int], ...], cost_factor: float, in Positive number representing the cost of iteration over each element of the grid's array. The actual cost is defined by the product of the cost factor by the array size. This is used to - choose between iteration schemes. + choose between iteration schemes; as a reference, extraction + from an in-memory NumPy array has a cost factor of 1. internals: Internal use only. diff --git a/src/delayedarray/__init__.py b/src/delayedarray/__init__.py index 9d6f435..66d4c4e 100644 --- a/src/delayedarray/__init__.py +++ b/src/delayedarray/__init__.py @@ -41,6 +41,6 @@ from .create_dask_array import create_dask_array from .is_sparse import is_sparse from .is_masked import is_masked -from .chunk_grid import chunk_grid +from .chunk_grid import chunk_grid, chunk_shape_to_grid from .is_pristine import is_pristine from .wrap import wrap diff --git a/src/delayedarray/chunk_grid.py b/src/delayedarray/chunk_grid.py index 758b590..1133a6c 100644 --- a/src/delayedarray/chunk_grid.py +++ b/src/delayedarray/chunk_grid.py @@ -12,7 +12,30 @@ __license__ = "MIT" -def _chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int): +def chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int) -> SimpleGrid: + """ + Convert a chunk shape to a :py:class:`~delayedarray.Grid.SimpleGrid`. + This assumes that the underlying array is split up into regular intervals + on each dimension; the first chunk should start from zero, and only the + last chunk may be of a different size (bounded by the dimension extent). + + Args: + chunks: + Chunk size for each dimension. These should be positive. + + shape: + Extent of each dimension of the array. These should be non-negative + and of the same length as ``chunks``. + + cost_factor: + Cost factor for iterating over each element of the associated + array. This is used to decide between iteration schemes and can be + increased for more expensive types, e.g., file-backed arrays. As a + reference, in-memory NumPy arrays are assigned a cost factor of 1. + + Returns: + A ``SimpleGrid`` object with the chunk shape as the boundaries. + """ out = [] for i, ch in enumerate(chunks): sh = shape[i] @@ -42,8 +65,13 @@ def chunk_grid(x: Any) -> AbstractGrid: @chunk_grid.register -def chunk_grid_ndarray(x: ndarray): - """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.""" +def chunk_grid_ndarray(x: ndarray) -> SimpleGrid: + """ + See :py:meth:`~delayedarray.chunk_grid.chunk_grid`. + + The cost factor for iteration is set to 1, which is considered the lowest + cost for data extraction given that everything is stored in memory. + """ raw = [1] * len(x.shape) if x.flags.f_contiguous: raw[0] = x.shape[0] @@ -52,15 +80,21 @@ def chunk_grid_ndarray(x: ndarray): # to figure that out from NumPy flags. Guess we should just assume # that it's C-contiguous, given that most things are. raw[-1] = x.shape[-1] - return _chunk_shape_to_grid(raw, x.shape, cost_factor=1) + return chunk_shape_to_grid(raw, x.shape, cost_factor=1) @chunk_grid.register -def chunk_grid_SparseNdarray(x: SparseNdarray): - """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.""" +def chunk_grid_SparseNdarray(x: SparseNdarray) -> SimpleGrid: + """ + See :py:meth:`~delayedarray.chunk_grid.chunk_grid`. + + The cost factor for iteration is set to 1.5. This is slightly higher than + that of dense NumPy arrays as the ``SparseNdarray`` is a bit more expensive + for random access on the first dimension. + """ raw = [1] * len(x.shape) raw[0] = x.shape[0] - return _chunk_shape_to_grid(raw, x.shape, cost_factor=1.5) + return chunk_shape_to_grid(raw, x.shape, cost_factor=1.5) # If scipy is installed, we add all the methods for the various scipy.sparse matrices. @@ -70,19 +104,36 @@ def chunk_grid_SparseNdarray(x: SparseNdarray): @chunk_grid.register - def chunk_grid_csc_matrix(x: sp.csc_matrix): - """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.""" - return _chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5) + def chunk_grid_csc_matrix(x: sp.csc_matrix) -> SimpleGrid: + """ + See :py:meth:`~delayedarray.chunk_grid.chunk_grid`. + + The cost factor for iteration is set to 1.5. This is slightly higher + than that of dense NumPy arrays as CSC matrices are a bit more + expensive for random row access. + """ + return chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5) @chunk_grid.register - def chunk_grid_csr_matrix(x: sp.csr_matrix): - """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.""" - return _chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5) + def chunk_grid_csr_matrix(x: sp.csr_matrix) -> SimpleGrid: + """ + See :py:meth:`~delayedarray.chunk_grid.chunk_grid`. + + The cost factor for iteration is set to 1.5. This is slightly higher + than that of dense NumPy arrays as CSR matrices are a bit more + expensive for random column access. + """ + return chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5) @chunk_grid.register - def chunk_grid_coo_matrix(x: sp.coo_matrix): - """See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.""" + def chunk_grid_coo_matrix(x: sp.coo_matrix) -> SimpleGrid: + """ + See :py:meth:`~delayedarray.chunk_grid.chunk_grid`. + + The cost factor for iteration is set to 5, as any extraction from a COO + matrix requires a full scan through all elements. + """ # ???? let's just do our best here, there's no nice way to access COO. - return _chunk_shape_to_grid(x.shape, x.shape, cost_factor=1.5) + return chunk_shape_to_grid(x.shape, x.shape, cost_factor=5)