Skip to content

Commit

Permalink
Add tests and rewrote documentation
Browse files Browse the repository at this point in the history
- hypothesis tests in particular
- changed the API a bit
- Rewrote the documentation
  • Loading branch information
kai-tub committed Jul 18, 2022
1 parent a98f405 commit b4ab2c3
Show file tree
Hide file tree
Showing 23 changed files with 1,447 additions and 327 deletions.
24 changes: 17 additions & 7 deletions common_nb_preprocessors/_patterns.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import re

from pydantic import constr, validate_arguments

__all__ = ["build_prefixed_regex_pattern", "build_prefixed_regex_pattern_with_value"]


def build_prefixed_regex_pattern(prefix: str, key_term: str) -> re.Pattern:
@validate_arguments
def build_prefixed_regex_pattern(
*, prefix: constr(min_length=1), key_term: constr(min_length=1)
) -> re.Pattern:
"""
A regular expression builder that returns a compiled
regular expression that matches a string if:
Expand All @@ -29,25 +34,30 @@ def build_prefixed_regex_pattern(prefix: str, key_term: str) -> re.Pattern:
return pattern


@validate_arguments
def build_prefixed_regex_pattern_with_value(
prefix: str, key_term: str, delimiter=r"\s*"
*,
prefix: constr(min_length=1),
key_term: constr(min_length=1),
delimiter: constr(min_length=1) = "=",
) -> re.Pattern:
"""
A regular expression builder that returns a compiled
regular expression that matches a string if:
- An escaped `prefix` string (may have whitespaces before or after)
- The escape `key_term` to capture with the group name `key`
- Followed by an *unescaped* `delimiter`
regular expression that matches a string with:
- The (escaped) `prefix` string (may have whitespace before or after)
- The (escaped) `key_term`, captured with the group name `key`
- Followed by an (escaped) `delimiter` (may have whitespace before or after)
- and captures the rest of the line with the group name `value`
"""
prefix = re.escape(prefix)
key_term = re.escape(key_term)
delimiter = re.escape(delimiter)
pattern = re.compile(
rf"""
^ # match start of each line
\s*{prefix}\s* # allow whitespace before and after prefix
(?P<key>{key_term}) # term to capture
{delimiter}
\s*{delimiter}\s* # allow whitespace before and after delimiter
(?P<value>[^\n\r]+)
$ # match end of each line (excludes \n in MULTILINE)
[\r\n]* # Capture current and all following empty newlines
Expand Down
190 changes: 79 additions & 111 deletions common_nb_preprocessors/metadata_injector.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from pprint import pprint

import nbformat
import yaml
from nbconvert.preprocessors import Preprocessor
from traitlets import Bool, List, Unicode
from traitlets import Bool, List, TraitError, Unicode, validate

from ._patterns import (
build_prefixed_regex_pattern,
Expand All @@ -11,65 +9,36 @@

__all__ = [
"MetaDataListInjectorPreprocessor",
"GlobalMetaDataInjectorPreprocessor",
"MetaDataMapInjectorPreprocessor",
]


class MetaDataListInjectorPreprocessor(Preprocessor):
"""
Parse all *code* cells and append the matched `strings` with the
`prefix` to the `metadata_group` list, which is the `tags` list by default.
These `strings` must be on their own line and only contain the `prefix`
a `string` from `strings` and whitespace characters.
With `remove_line=True` (default) the matched line will be removed from
the output.
With the `Preprocessor` configured as:
- `metadata_group="tags"`
- `strings=["hide"]`
- `remove_line=True`
the code cell with the contents:
.. code-block:: python
# hide
import os
and the following notebook metadata json:
.. code-block:: json
{"metadata": {}}
Will be transformed to the code-cell with the contents:
.. code-block:: python
import os
and the metadata set as:
.. code-block:: json
{"metadata": {"tags": ["hide"]}}
All matched `strings` (also sometimes called *magic* comments)
will be *appended* to the `metadata_group` list.
If the entry already exists, it won't be added again.
Parse all *code* cells and append the matched magic comments with the
`prefix` to the `metadata_group` list.
Each magic comment must be on its own line, and that line may only contain the `prefix`,
a `string` from `strings` (i.e., the magic comment), and whitespace characters.
"""

metadata_group = Unicode(default_value="tags").tag(config=True)
"""Metadata group into which the matched `strings` will be written."""
strings = List(Unicode(), default_value=[]).tag(config=True)
"""List of strings (magic comments) that define the text that will be matched and injected into the selected metadata group."""
"""Metadata group to which the matched magic comment will be appended to if
it doesn't already exist. Default is `tags`."""
strings = List(Unicode(), minlen=1).tag(config=True)
"""List of strings (magic comments) that define the text that will be matched and
injected into the selected metadata group."""
prefix = Unicode(default_value="#").tag(config=True)
"""The prefix that indicates the possible start of a magic comment line. Should be comment character of the language."""
"""The prefix that indicates the possible start of a magic comment line.
Should be the comment character of the language. By default `#`."""
remove_line = Bool(default_value=True).tag(config=True)
"""By default remove the matching line in the code-cell."""

@validate("metadata_group")
def _valid_metadata_group(self, proposal):
if proposal["value"] == "":
raise TraitError("metadata_group must be non-empty string!")
return proposal["value"]

def _write_tag(self, tag, cell):
tags = cell.setdefault("metadata", {}).setdefault(self.metadata_group, [])
if tag not in tags:
Expand All @@ -82,7 +51,7 @@ def preprocess_cell(self, cell, resource, index):
if cell["cell_type"] == "markdown":
return cell, resource
for string in self.strings:
pattern = build_prefixed_regex_pattern(self.prefix, string)
pattern = build_prefixed_regex_pattern(prefix=self.prefix, key_term=string)
m = pattern.search(cell.source)
if m is not None:
tag = m.group("key")
Expand All @@ -93,27 +62,41 @@ def preprocess_cell(self, cell, resource, index):


class MetaDataMapInjectorPreprocessor(Preprocessor):
metadata_group = Unicode(default_value="tags").tag(config=True)
"""Metadata group into which the matched `strings` will be written."""
keys = List(Unicode(), default_value=[]).tag(config=True)
"""
Parse all *code* cells and add the matched key-value pairs with the
`prefix` to the `metadata_group` dictionary.
The key-value pairs are generated by searching for each `key` of `keys` followed
by `delimiter` and the value.
"""

metadata_group = Unicode().tag(config=True)
"""Metadata group into which the matched key-value pairs will be written."""
keys = List(Unicode()).tag(config=True)
"""List of keys that will be used as a key for the `metadata_group` dictionary entry and is followed by the `delimiter` and `value`."""
prefix = Unicode(default_value="#").tag(config=True)
"""The prefix that indicates the possible start of a magic comment line. Should be comment character of the language."""
remove_line = Bool(default_value=True).tag(config=True)
"""By default remove the matching line in the code-cell."""
delimiter = Unicode(default_value=r"\s*=\s*").tag(config=True)
delimiter = Unicode(default_value="=").tag(config=True)
"""Delimiter that separates the key from the value."""
value_to_yaml = Bool(default_value=False).tag(config=True)
"""Parse the value as yaml syntax before writing it as a dictionary. Default is `False`."""

@validate("metadata_group")
def _valid_metadata_group(self, proposal):
if proposal["value"] == "":
raise TraitError("metadata_group must be non-empty string!")
return proposal["value"]

def _write_entry(self, key, value, cell):
entries = cell.setdefault("metadata", {}).setdefault(self.metadata_group, {})
if isinstance(entries, list):
raise ValueError(
raise TraitError(
"Trying to overwrite metadata list type with metadata dictionary.",
self.metadata_group,
)
# should include an option to parse it as yaml
# or similar
value = True if value == "true" else value
value = False if value == "false" else value
if self.value_to_yaml:
value = yaml.safe_load(value)
entries[key] = value
return cell

Expand All @@ -136,53 +119,38 @@ def preprocess_cell(self, cell, resource, index):
return cell, resource


class GlobalMetaDataInjectorPreprocessor(Preprocessor):
"""
Parse all *code* cells and convert the matching `prefix` `key` `value`
lines to the global `metadata` field.
To clean up the output, the lines containing any `string` may be removed
by setting `remove_line=True` (default).
The provided list of `keys` will be used to access the *global* `metadata` field
and insert the value that is followed by the `key` in the code cell.
Note that the global metadata field will be overwritten if multiple cells define the
field's value.
.. code-block:: python
# publish true
import os
Will be transformed to:
```python
import os
```
where the _notebooks_ cell metadata `publish` field may be created and contain the additional entry `true`.
To only add a specific value to a metadata field (usually `tags`) look at `MetaDataInjectorPreprocessor`.
"""

keys = List(Unicode()).tag(config=True)
prefix = Unicode(default_value="#").tag(config=True)
delimiter = Unicode(default_value=r"=").tag(config=True)

def preprocess(self, nb, resources):
if len(self.keys) == 0:
return nb, resources

for cell in nb.cells:
if cell["cell_type"] == "markdown":
continue
for key in self.keys:
pattern = build_prefixed_regex_pattern_with_value(
self.prefix, key, delimiter=self.delimiter
)
m = pattern.search(cell.source)
if m is not None:
value = m.group("value")
nb.setdefault("metadata", {})
nb["metadata"][key] = value
return nb, resources
# class GlobalMetaDataInjectorPreprocessor(Preprocessor):
# """
# Parse all *code* cells and convert the matching `prefix` `key` `value`
# lines to the global `metadata` field.

# To clean up the output, the lines containing any `string` may be removed
# by setting `remove_line=True` (default).

# The provided list of `keys` will be used to access the *global* `metadata` field
# and insert the value that is followed by the `key` in the code cell.
# Note that the global metadata field will be overwritten if multiple cells define the
# field's value.
# """

# keys = List(Unicode()).tag(config=True)
# prefix = Unicode(default_value="#").tag(config=True)
# delimiter = Unicode(default_value=r"=").tag(config=True)

# def preprocess(self, nb, resources):
# if len(self.keys) == 0:
# return nb, resources

# for cell in nb.cells:
# if cell["cell_type"] == "markdown":
# continue
# for key in self.keys:
# pattern = build_prefixed_regex_pattern_with_value(
# self.prefix, key, delimiter=self.delimiter
# )
# m = pattern.search(cell.source)
# if m is not None:
# value = m.group("value")
# nb.setdefault("metadata", {})
# nb["metadata"][key] = value
# return nb, resources
Loading

0 comments on commit b4ab2c3

Please sign in to comment.