Skip to content

Commit

Permalink
Exported the converter from chunk shape to a SimpleGrid.
Browse files Browse the repository at this point in the history
Also explained the various choices for the cost factor, now that
we have to document the converter's arguments.
  • Loading branch information
LTLA committed Feb 1, 2024
1 parent 87b7525 commit 6b0d9cd
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 18 deletions.
3 changes: 2 additions & 1 deletion src/delayedarray/Grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def __init__(self, boundaries: Tuple[Sequence[int], ...], cost_factor: float, in
Positive number representing the cost of iteration over each
element of the grid's array. The actual cost is defined as the
product of the cost factor and the array size. This is used to
choose between iteration schemes.
choose between iteration schemes; as a reference, extraction
from an in-memory NumPy array has a cost factor of 1.
internals:
Internal use only.
Expand Down
2 changes: 1 addition & 1 deletion src/delayedarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,6 @@
from .create_dask_array import create_dask_array
from .is_sparse import is_sparse
from .is_masked import is_masked
from .chunk_grid import chunk_grid
from .chunk_grid import chunk_grid, chunk_shape_to_grid
from .is_pristine import is_pristine
from .wrap import wrap
83 changes: 67 additions & 16 deletions src/delayedarray/chunk_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,30 @@
__license__ = "MIT"


def _chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int):
def chunk_shape_to_grid(chunks: Sequence[int], shape: Tuple[int, ...], cost_factor: int) -> SimpleGrid:
"""
Convert a chunk shape to a :py:class:`~delayedarray.Grid.SimpleGrid`.
This assumes that the underlying array is split up into regular intervals
on each dimension; the first chunk should start from zero, and only the
last chunk may be of a different size (bounded by the dimension extent).
Args:
chunks:
Chunk size for each dimension. These should be positive.
shape:
Extent of each dimension of the array. These should be non-negative
and of the same length as ``chunks``.
cost_factor:
Cost factor for iterating over each element of the associated
array. This is used to decide between iteration schemes and can be
increased for more expensive types, e.g., file-backed arrays. As a
reference, in-memory NumPy arrays are assigned a cost factor of 1.
Returns:
A ``SimpleGrid`` object whose boundaries are derived from the chunk shape.
"""
out = []
for i, ch in enumerate(chunks):
sh = shape[i]
Expand Down Expand Up @@ -42,8 +65,13 @@ def chunk_grid(x: Any) -> AbstractGrid:


@chunk_grid.register
def chunk_grid_ndarray(x: ndarray):
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
def chunk_grid_ndarray(x: ndarray) -> SimpleGrid:
"""
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
The cost factor for iteration is set to 1, which is considered the lowest
cost for data extraction given that everything is stored in memory.
"""
raw = [1] * len(x.shape)
if x.flags.f_contiguous:
raw[0] = x.shape[0]
Expand All @@ -52,15 +80,21 @@ def chunk_grid_ndarray(x: ndarray):
# to figure that out from NumPy flags. Guess we should just assume
# that it's C-contiguous, given that most things are.
raw[-1] = x.shape[-1]
return _chunk_shape_to_grid(raw, x.shape, cost_factor=1)
return chunk_shape_to_grid(raw, x.shape, cost_factor=1)


@chunk_grid.register
def chunk_grid_SparseNdarray(x: SparseNdarray):
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
def chunk_grid_SparseNdarray(x: SparseNdarray) -> SimpleGrid:
"""
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
The cost factor for iteration is set to 1.5. This is slightly higher than
that of dense NumPy arrays as the ``SparseNdarray`` is a bit more expensive
for random access on the first dimension.
"""
raw = [1] * len(x.shape)
raw[0] = x.shape[0]
return _chunk_shape_to_grid(raw, x.shape, cost_factor=1.5)
return chunk_shape_to_grid(raw, x.shape, cost_factor=1.5)


# If scipy is installed, we add all the methods for the various scipy.sparse matrices.
Expand All @@ -70,19 +104,36 @@ def chunk_grid_SparseNdarray(x: SparseNdarray):


@chunk_grid.register
def chunk_grid_csc_matrix(x: sp.csc_matrix):
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
return _chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5)
def chunk_grid_csc_matrix(x: sp.csc_matrix) -> SimpleGrid:
"""
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
The cost factor for iteration is set to 1.5. This is slightly higher
than that of dense NumPy arrays as CSC matrices are a bit more
expensive for random row access.
"""
return chunk_shape_to_grid((x.shape[0], 1), x.shape, cost_factor=1.5)


@chunk_grid.register
def chunk_grid_csr_matrix(x: sp.csr_matrix):
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
return _chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5)
def chunk_grid_csr_matrix(x: sp.csr_matrix) -> SimpleGrid:
"""
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
The cost factor for iteration is set to 1.5. This is slightly higher
than that of dense NumPy arrays as CSR matrices are a bit more
expensive for random column access.
"""
return chunk_shape_to_grid((1, x.shape[1]), x.shape, cost_factor=1.5)


@chunk_grid.register
def chunk_grid_coo_matrix(x: sp.coo_matrix):
"""See :py:meth:`~delayedarray.chunk_grid.chunk_grid`."""
def chunk_grid_coo_matrix(x: sp.coo_matrix) -> SimpleGrid:
"""
See :py:meth:`~delayedarray.chunk_grid.chunk_grid`.
The cost factor for iteration is set to 5, as any extraction from a COO
matrix requires a full scan through all elements.
"""
# There is no nice way to access COO, so we just do our best here and treat the whole matrix as a single chunk.
return _chunk_shape_to_grid(x.shape, x.shape, cost_factor=1.5)
return chunk_shape_to_grid(x.shape, x.shape, cost_factor=5)

0 comments on commit 6b0d9cd

Please sign in to comment.