Linting, continued
Jakob-98 committed Nov 27, 2023
1 parent 411f1bd commit 15b756f
Showing 14 changed files with 150 additions and 61 deletions.
9 changes: 7 additions & 2 deletions presidio-structured/presidio_structured/__init__.py
@@ -1,9 +1,14 @@
""" presidio-structured root module. """
"""presidio-structured root module."""
import logging

from .analysis_builder import JsonAnalysisBuilder, PandasAnalysisBuilder
from .config import StructuredAnalysis
from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor
from .data import (
CsvReader,
JsonDataProcessor,
JsonReader,
PandasDataProcessor,
)
from .structured_engine import StructuredEngine

logging.getLogger("presidio-structured").addHandler(logging.NullHandler())
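
A quick sketch (assuming presidio_structured is installed) to confirm the reshuffled import above is purely cosmetic and the public names still resolve:

    from presidio_structured import (
        CsvReader,
        JsonDataProcessor,
        JsonReader,
        PandasDataProcessor,
        StructuredEngine,
    )

    # All names are still re-exported at the package root, same as before the wrap.
    print(StructuredEngine, CsvReader, JsonReader)
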
36 changes: 24 additions & 12 deletions presidio-structured/presidio_structured/analysis_builder.py
@@ -16,17 +16,17 @@


class AnalysisBuilder(ABC):
"""
Abstract base class for a configuration generator.
"""
"""Abstract base class for a configuration generator."""

def __init__(self, analyzer: AnalyzerEngine = None) -> None:
"""Initialize the configuration generator."""
self.analyzer = AnalyzerEngine() if analyzer is None else analyzer
self.logger = logging.getLogger("presidio-structured")

@abstractmethod
def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis:
def generate_analysis(
self, data: Union[Dict, DataFrame]
) -> StructuredAnalysis:
"""
Abstract method to generate a configuration from the given data.
@@ -48,14 +48,17 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis:
"""
self.logger.debug("Starting JSON BatchAnalyzer analysis")
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)
analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en")
analyzer_results = batch_analyzer.analyze_dict(
input_dict=data, language="en"
)
return self._generate_analysis_from_results_json(analyzer_results)

def _generate_analysis_from_results_json(
self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = ""
) -> StructuredAnalysis:
"""
Generate a configuration from the given analyzer results. Always uses the first recognizer result if there are more than one.
Generate a configuration from the given analyzer results. \
Always uses the first recognizer result if there are more than one.
:param analyzer_results: The analyzer results.
:param prefix: The prefix for the configuration keys.
@@ -77,17 +80,21 @@ def _generate_analysis_from_results_json(
result.recognizer_results, prefix=current_key + "."
)
mappings.update(nested_mappings.entity_mapping)
first_recognizer_result = next(iter(result.recognizer_results), None)
first_recognizer_result = next(
iter(result.recognizer_results), None
)
if first_recognizer_result is not None:
self.logger.debug(
f"Found entity {first_recognizer_result.entity_type} in {current_key}"
f"Found entity {first_recognizer_result.entity_type} \
in {current_key}"
)
mappings[current_key] = first_recognizer_result.entity_type
return StructuredAnalysis(entity_mapping=mappings)


class TabularAnalysisbuilder(AnalysisBuilder):
"""Placeholder class for generalizing tabular data analysis builders (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now."""
"""Placeholder class for generalizing tabular data analysis builders \
(e.g. PySpark). Only implemented as PandasAnalysisBuilder for now."""

pass

@@ -108,13 +115,16 @@ def generate_analysis(
"""
if n > len(df):
self.logger.debug(
f"Number of samples ({n}) is larger than the number of rows ({len(df)}), using all rows"
f"Number of samples ({n}) is larger than the number of rows \
({len(df)}), using all rows"
)
n = len(df)

df = df.sample(n)

key_recognizer_result_map = self._find_most_common_entity(df, language)
key_recognizer_result_map = self._find_most_common_entity(
df, language
)

key_entity_map = {
key: result.entity_type
@@ -139,7 +149,9 @@ def _find_most_common_entity(
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)

for column in df.columns:
self.logger.debug(f"Finding most common PII entity for column {column}")
self.logger.debug(
f"Finding most common PII entity for column {column}"
)
analyzer_results = batch_analyzer.analyze_iterator(
[val for val in df[column]], language=language
)
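
For orientation, a minimal sketch of how the two builders are called. Hedged: the hunk above confirms the n and language parameters exist on the pandas builder, but their defaults are not shown, so both are passed explicitly; an English presidio-analyzer model is assumed to be available.

    import pandas as pd
    from presidio_structured import JsonAnalysisBuilder, PandasAnalysisBuilder

    # Tabular: samples up to n rows, then maps each column to its most
    # common detected entity.
    df = pd.DataFrame({"name": ["John Doe", "Jane Doe"]})
    tabular = PandasAnalysisBuilder().generate_analysis(df, n=2, language="en")
    print(tabular.entity_mapping)  # e.g. {"name": "PERSON"}

    # JSON: walks the nested dict; keys become dotted paths (the prefix logic above).
    nested = JsonAnalysisBuilder().generate_analysis({"user": {"name": "Jane Doe"}})
    print(nested.entity_mapping)  # e.g. {"user.name": "PERSON"}
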
1 change: 1 addition & 0 deletions presidio-structured/presidio_structured/config/__init__.py
@@ -1,3 +1,4 @@
"""Config module for presidio-structured."""
from .structured_analysis import StructuredAnalysis

__all__ = [
presidio-structured/presidio_structured/config/structured_analysis.py
@@ -1,13 +1,14 @@
""" Structured Analysis module. """
"""Structured Analysis module."""

from dataclasses import dataclass
from typing import Dict


@dataclass
class StructuredAnalysis:
"""Dataclass containing entity analysis from structured data. Currently only contains entity mapping."""
"""Dataclass containing entity analysis from structured data.\
Currently only contains entity mapping."""

entity_mapping: Dict[
str, str
] # NOTE ideally Literal[...] with allowed EntityTypes, but cannot unpack in Literal.
]
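
With the Literal note gone, entity_mapping stays a plain Dict[str, str], so an analysis can also be written by hand instead of generated. A sketch (entity names follow Presidio's usual conventions):

    from presidio_structured import StructuredAnalysis

    analysis = StructuredAnalysis(
        entity_mapping={"name": "PERSON", "email": "EMAIL_ADDRESS"}
    )
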
2 changes: 2 additions & 0 deletions presidio-structured/presidio_structured/data/__init__.py
@@ -1,3 +1,5 @@
"""Data module."""

from .data_reader import CsvReader, JsonReader
from .data_processors import JsonDataProcessor, PandasDataProcessor

41 changes: 28 additions & 13 deletions presidio-structured/presidio_structured/data/data_processors.py
@@ -10,12 +10,10 @@


class DataProcessorBase(ABC):
"""
Abstract base class to handle logic of operations over the text using the operators.
"""
"""Abstract class to handle logic of operations over text using the operators."""

def __init__(self) -> None:
"""Initializes DataProcessorBase object."""
"""Initialize DataProcessorBase object."""
self.logger = logging.getLogger("presidio-structured")

def operate(
@@ -25,7 +23,8 @@ def operate(
operators: Dict[str, OperatorConfig],
) -> Any:
"""
Performs operations over the text using the operators, as per the structured analysis.
Perform operations over the text using the operators, \
as per the structured analysis.
:param data: Data to be operated on.
:param structured_analysis: Analysis schema as per the structured data.
@@ -39,7 +38,9 @@ def operate(

@abstractmethod
def _process(
self, data: Dict | DataFrame, key_to_operator_mapping: Dict[str, Callable]
self,
data: Dict | DataFrame,
key_to_operator_mapping: Dict[str, Callable],
) -> Dict | DataFrame:
"""
Abstract method for subclasses to provide operation implementation.
@@ -71,11 +72,15 @@ def _generate_operator_mapping(

operators_factory = OperatorsFactory()
for key, entity in config.entity_mapping.items():
self.logger.debug(f"Creating operator for key {key} and entity {entity}")
operator_config = operators.get(entity, operators.get("DEFAULT", None))
self.logger.debug(
f"Creating operator for key {key} and entity {entity}"
)
operator_config = operators.get(
entity, operators.get("DEFAULT", None)
)
if operator_config is None:
raise ValueError(f"Operator for entity {entity} not found")
# NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported for now.
# NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported.
operator = operators_factory.create_operator_class(
operator_config.operator_name, OperatorType.Anonymize
)
@@ -102,6 +107,8 @@ def _operate_on_text(


class PandasDataProcessor(DataProcessorBase):
"""Pandas Data Processor."""

def _process(
self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable]
) -> DataFrame:
@@ -155,9 +162,12 @@ def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any:
return data

@staticmethod
def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None:
def _set_nested_value(
data: Union[Dict, List], path: List[str], value: Any
) -> None:
"""
Recursively sets a value in nested data using a given path.
:param data: Nested data (JSON-like).
:param path: List of keys/indexes representing the path.
:param value: Value to be set.
@@ -172,7 +182,9 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None:
continue
else:
for item in data:
JsonDataProcessor._set_nested_value(item, path[i:], value)
JsonDataProcessor._set_nested_value(
item, path[i:], value
)
return
elif isinstance(data, dict):
if i == len(path) - 1:
@@ -181,10 +193,13 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None:
data = data.setdefault(key, {})

def _process(
self, data: Union[Dict, List], key_to_operator_mapping: Dict[str, Callable]
self,
data: Union[Dict, List],
key_to_operator_mapping: Dict[str, Callable],
) -> Union[Dict, List]:
"""
Operates on the given JSON-like data (nested dictionary/list) based on the provided configuration.
Operates on the given JSON-like data based on the provided configuration.
:param data: JSON-like data to be operated on.
:param config: Configuration object containing operator information.
:return: JSON-like data after the operation.
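
The list branch above fans one write out to every element, which is easiest to see in isolation. A behavior sketch, calling the private helper directly purely for illustration (underscore methods are not a supported interface):

    from presidio_structured.data.data_processors import JsonDataProcessor

    data = {"users": [{"id": 1, "name": "John"}, {"id": 2, "name": "Jane"}]}
    # "users" resolves to a list, so the rest of the path applies to each item.
    JsonDataProcessor._set_nested_value(data, ["users", "name"], "<PERSON>")
    print(data)
    # {"users": [{"id": 1, "name": "<PERSON>"}, {"id": 2, "name": "<PERSON>"}]}
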
4 changes: 2 additions & 2 deletions presidio-structured/presidio_structured/data/data_reader.py
@@ -1,4 +1,4 @@
""" Helper data classes, mostly simple wrappers to ensure consistent user interface. """
"""Helper data classes, mostly simple wrappers to ensure consistent user interface."""

import json
from abc import ABC, abstractmethod
@@ -12,7 +12,7 @@ class ReaderBase(ABC):
"""
Base class for data readers.
This class should not be instantiated directly. Instead use or define a reader subclass.
This class should not be instantiated directly, instead init a subclass.
"""

@abstractmethod
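
A usage sketch for the readers. Hedged: a read method taking a file path and returning a DataFrame or dict is inferred from these wrappers' purpose; the exact signature is not shown in this diff.

    from presidio_structured import CsvReader, JsonReader

    df = CsvReader().read("customers.csv")     # assumed: returns a pandas DataFrame
    doc = JsonReader().read("customers.json")  # assumed: returns a nested dict
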
19 changes: 11 additions & 8 deletions presidio-structured/presidio_structured/structured_engine.py
@@ -14,9 +14,7 @@


class StructuredEngine:
"""
Class to implement methods for anonymizing tabular data.
"""
"""Class to implement methods for anonymizing tabular data."""

def __init__(
self, data_processor: DataProcessorBase = PandasDataProcessor()
@@ -46,23 +44,28 @@ def anonymize(
self.logger.debug("Starting anonymization")
operators = self.__check_or_add_default_operator(operators)

return self.data_processor.operate(data, structured_analysis, operators)
return self.data_processor.operate(
data, structured_analysis, operators
)

def __check_or_add_default_operator(
self, operators: Dict[str, OperatorConfig]
) -> Dict[str, OperatorConfig]:
"""
Check if the provided operators dictionary has a default operator.
If not, add a default operator.
Check if the provided operators dictionary has a default operator. \
If not, add a default operator.
:param operators: dictionary of operator configurations.
:return: operators dictionary with the default operator added if it was not initially present.
:return: operators dictionary with the default operator added \
if it was not initially present.
"""
default_operator = OperatorConfig(DEFAULT)
if not operators:
self.logger.debug("No operators provided, using default operator")
return {"DEFAULT": default_operator}
if not operators.get("DEFAULT"):
self.logger.debug("No default operator provided, using default operator")
self.logger.debug(
"No default operator provided, using default operator"
)
operators["DEFAULT"] = default_operator
return operators
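
End to end, the engine ties the pieces together. A sketch under the same assumptions as above (English analyzer models available; the replacement value and column names are illustrative):

    import pandas as pd
    from presidio_anonymizer.entities import OperatorConfig
    from presidio_structured import PandasAnalysisBuilder, StructuredEngine

    df = pd.DataFrame({"name": ["John Doe"], "phone": ["1234567890"]})
    analysis = PandasAnalysisBuilder().generate_analysis(df, n=1, language="en")

    engine = StructuredEngine()  # defaults to PandasDataProcessor()
    anonymized = engine.anonymize(
        df,
        analysis,
        operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<PII>"})},
    )
    print(anonymized)
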
10 changes: 10 additions & 0 deletions presidio-structured/setup.cfg
@@ -0,0 +1,10 @@
[flake8]
max-line-length = 88
exclude =
.git,
__pycache__,
build,
dist,
tests
docstring-convention = numpy
extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC
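
This config pins flake8 to an 88-character line limit (Black's default) with numpy-style docstrings, which is what drives all of the wrapping in this commit. Running flake8 presidio_structured from the presidio-structured directory picks the file up automatically. One inference, not stated in the commit: the D and ANN codes in extend-ignore suggest the flake8-docstrings and flake8-annotations plugins are expected in the lint environment.
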
7 changes: 5 additions & 2 deletions presidio-structured/setup.py
@@ -24,7 +24,9 @@
name="presidio_structured",
python_requires=">=3.5",
version=__version__,
packages=find_packages(include=["presidio_structured", "presidio_structured.*"]),
packages=find_packages(
include=["presidio_structured", "presidio_structured.*"]
),
classifiers=[
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
@@ -36,7 +38,8 @@
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
],
description="Presidio structured package - analyses and anonymizes structured and semistructured data.",
description="Presidio structured package - analyses and anonymizes \
structured and semistructured data.",
license="MIT license",
include_package_data=True,
keywords="presidio_structured",
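
The wrapped find_packages call is behavior-preserving: an editable install (pip install -e ./presidio-structured) still resolves presidio_structured together with its config and data subpackages.
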
27 changes: 22 additions & 5 deletions presidio-structured/tests/conftest.py
@@ -1,3 +1,5 @@
""" Pytest fixtures for presidio-structured tests. """

import pandas as pd
import pytest
from presidio_anonymizer.entities import OperatorConfig
@@ -9,7 +11,11 @@
def sample_df():
data = {
"name": ["John Doe", "Jane Doe", "John Smith"],
"email": ["john@example.com", "jane@example.com", "johnsmith@example.com"],
"email": [
"john@example.com",
"jane@example.com",
"johnsmith@example.com",
],
"phone": ["1234567890", "0987654321", "1122334455"],
}
return pd.DataFrame(data)
@@ -33,7 +39,12 @@ def sample_json():

@pytest.fixture
def sample_json_with_array():
data = {"users": [{"id": 1, "name": "John Doe"}, {"id": 2, "name": "Jane Doe"}]}
data = {
"users": [
{"id": 1, "name": "John Doe"},
{"id": 2, "name": "Jane Doe"},
]
}
return data


@@ -50,15 +61,21 @@ def tabular_analysis_builder():
@pytest.fixture
def operators():
return {
"PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}),
"DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}),
"PERSON": OperatorConfig(
"replace", {"new_value": "PERSON_REPLACEMENT"}
),
"DEFAULT": OperatorConfig(
"replace", {"new_value": "DEFAULT_REPLACEMENT"}
),
}


@pytest.fixture
def operators_no_default():
return {
"PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}),
"PERSON": OperatorConfig(
"replace", {"new_value": "PERSON_REPLACEMENT"}
),
}


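
A sketch of how these fixtures compose in a test. Hypothetical, not part of this commit, and it assumes the tabular_analysis_builder fixture yields a PandasAnalysisBuilder, as its name suggests:

    def test_default_operators_apply(sample_df, tabular_analysis_builder, operators):
        from presidio_structured import StructuredEngine

        analysis = tabular_analysis_builder.generate_analysis(
            sample_df, n=3, language="en"
        )
        result = StructuredEngine().anonymize(sample_df, analysis, operators=operators)
        assert "John Doe" not in result["name"].values
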