diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py index 83242132e..7ad40b67a 100644 --- a/presidio-structured/presidio_structured/__init__.py +++ b/presidio-structured/presidio_structured/__init__.py @@ -1,9 +1,14 @@ -""" presidio-structured root module. """ +"""presidio-structured root module.""" import logging from .analysis_builder import JsonAnalysisBuilder, PandasAnalysisBuilder from .config import StructuredAnalysis -from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor +from .data import ( + CsvReader, + JsonDataProcessor, + JsonReader, + PandasDataProcessor, +) from .structured_engine import StructuredEngine logging.getLogger("presidio-structured").addHandler(logging.NullHandler()) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 473a06927..b679977b9 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -16,9 +16,7 @@ class AnalysisBuilder(ABC): - """ - Abstract base class for a configuration generator. - """ + """Abstract base class for a configuration generator.""" def __init__(self, analyzer: AnalyzerEngine = None) -> None: """Initialize the configuration generator.""" @@ -26,7 +24,9 @@ def __init__(self, analyzer: AnalyzerEngine = None) -> None: self.logger = logging.getLogger("presidio-structured") @abstractmethod - def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis: + def generate_analysis( + self, data: Union[Dict, DataFrame] + ) -> StructuredAnalysis: """ Abstract method to generate a configuration from the given data. @@ -48,14 +48,17 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis: """ self.logger.debug("Starting JSON BatchAnalyzer analysis") batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) - analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en") + analyzer_results = batch_analyzer.analyze_dict( + input_dict=data, language="en" + ) return self._generate_analysis_from_results_json(analyzer_results) def _generate_analysis_from_results_json( self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = "" ) -> StructuredAnalysis: """ - Generate a configuration from the given analyzer results. Always uses the first recognizer result if there are more than one. + Generate a configuration from the given analyzer results. \ + Always uses the first recognizer result if there are more than one. :param analyzer_results: The analyzer results. :param prefix: The prefix for the configuration keys. @@ -77,17 +80,21 @@ def _generate_analysis_from_results_json( result.recognizer_results, prefix=current_key + "." ) mappings.update(nested_mappings.entity_mapping) - first_recognizer_result = next(iter(result.recognizer_results), None) + first_recognizer_result = next( + iter(result.recognizer_results), None + ) if first_recognizer_result is not None: self.logger.debug( - f"Found entity {first_recognizer_result.entity_type} in {current_key}" + f"Found entity {first_recognizer_result.entity_type} \ + in {current_key}" ) mappings[current_key] = first_recognizer_result.entity_type return StructuredAnalysis(entity_mapping=mappings) class TabularAnalysisbuilder(AnalysisBuilder): - """Placeholder class for generalizing tabular data analysis builders (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now.""" + """Placeholder class for generalizing tabular data analysis builders \ + (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now.""" pass @@ -108,13 +115,16 @@ def generate_analysis( """ if n > len(df): self.logger.debug( - f"Number of samples ({n}) is larger than the number of rows ({len(df)}), using all rows" + f"Number of samples ({n}) is larger than the number of rows \ + ({len(df)}), using all rows" ) n = len(df) df = df.sample(n) - key_recognizer_result_map = self._find_most_common_entity(df, language) + key_recognizer_result_map = self._find_most_common_entity( + df, language + ) key_entity_map = { key: result.entity_type @@ -139,7 +149,9 @@ def _find_most_common_entity( batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) for column in df.columns: - self.logger.debug(f"Finding most common PII entity for column {column}") + self.logger.debug( + f"Finding most common PII entity for column {column}" + ) analyzer_results = batch_analyzer.analyze_iterator( [val for val in df[column]], language=language ) diff --git a/presidio-structured/presidio_structured/config/__init__.py b/presidio-structured/presidio_structured/config/__init__.py index 85341c3e5..f7724c726 100644 --- a/presidio-structured/presidio_structured/config/__init__.py +++ b/presidio-structured/presidio_structured/config/__init__.py @@ -1,3 +1,4 @@ +"""Config module for presidio-structured.""" from .structured_analysis import StructuredAnalysis __all__ = [ diff --git a/presidio-structured/presidio_structured/config/structured_analysis.py b/presidio-structured/presidio_structured/config/structured_analysis.py index f9a00c519..ca1b75d2c 100644 --- a/presidio-structured/presidio_structured/config/structured_analysis.py +++ b/presidio-structured/presidio_structured/config/structured_analysis.py @@ -1,4 +1,4 @@ -""" Structured Analysis module. """ +"""Structured Analysis module.""" from dataclasses import dataclass from typing import Dict @@ -6,8 +6,9 @@ @dataclass class StructuredAnalysis: - """Dataclass containing entity analysis from structured data. Currently only contains entity mapping.""" + """Dataclass containing entity analysis from structured data.\ + Currently only contains entity mapping.""" entity_mapping: Dict[ str, str - ] # NOTE ideally Literal[...] with allowed EntityTypes, but cannot unpack in Literal. + ] diff --git a/presidio-structured/presidio_structured/data/__init__.py b/presidio-structured/presidio_structured/data/__init__.py index b888f9829..a65a622dd 100644 --- a/presidio-structured/presidio_structured/data/__init__.py +++ b/presidio-structured/presidio_structured/data/__init__.py @@ -1,3 +1,5 @@ +"""Data module.""" + from .data_reader import CsvReader, JsonReader from .data_processors import JsonDataProcessor, PandasDataProcessor diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index d2eaf9fb1..05b8b82df 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -10,12 +10,10 @@ class DataProcessorBase(ABC): - """ - Abstract base class to handle logic of operations over the text using the operators. - """ + """Abstract class to handle logic of operations over text using the operators.""" def __init__(self) -> None: - """Initializes DataProcessorBase object.""" + """Initialize DataProcessorBase object.""" self.logger = logging.getLogger("presidio-structured") def operate( @@ -25,7 +23,8 @@ def operate( operators: Dict[str, OperatorConfig], ) -> Any: """ - Performs operations over the text using the operators, as per the structured analysis. + Perform operations over the text using the operators, \ + as per the structured analysis. :param data: Data to be operated on. :param structured_analysis: Analysis schema as per the structured data. @@ -39,7 +38,9 @@ def operate( @abstractmethod def _process( - self, data: Dict | DataFrame, key_to_operator_mapping: Dict[str, Callable] + self, + data: Dict | DataFrame, + key_to_operator_mapping: Dict[str, Callable], ) -> Dict | DataFrame: """ Abstract method for subclasses to provide operation implementation. @@ -71,11 +72,15 @@ def _generate_operator_mapping( operators_factory = OperatorsFactory() for key, entity in config.entity_mapping.items(): - self.logger.debug(f"Creating operator for key {key} and entity {entity}") - operator_config = operators.get(entity, operators.get("DEFAULT", None)) + self.logger.debug( + f"Creating operator for key {key} and entity {entity}" + ) + operator_config = operators.get( + entity, operators.get("DEFAULT", None) + ) if operator_config is None: raise ValueError(f"Operator for entity {entity} not found") - # NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported for now. + # NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported. operator = operators_factory.create_operator_class( operator_config.operator_name, OperatorType.Anonymize ) @@ -102,6 +107,8 @@ def _operate_on_text( class PandasDataProcessor(DataProcessorBase): + """Pandas Data Processor.""" + def _process( self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable] ) -> DataFrame: @@ -155,9 +162,12 @@ def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any: return data @staticmethod - def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None: + def _set_nested_value( + data: Union[Dict, List], path: List[str], value: Any + ) -> None: """ Recursively sets a value in nested data using a given path. + :param data: Nested data (JSON-like). :param path: List of keys/indexes representing the path. :param value: Value to be set. @@ -172,7 +182,9 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> N continue else: for item in data: - JsonDataProcessor._set_nested_value(item, path[i:], value) + JsonDataProcessor._set_nested_value( + item, path[i:], value + ) return elif isinstance(data, dict): if i == len(path) - 1: @@ -181,10 +193,13 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> N data = data.setdefault(key, {}) def _process( - self, data: Union[Dict, List], key_to_operator_mapping: Dict[str, Callable] + self, + data: Union[Dict, List], + key_to_operator_mapping: Dict[str, Callable], ) -> Union[Dict, List]: """ - Operates on the given JSON-like data (nested dictionary/list) based on the provided configuration. + Operates on the given JSON-like data based on the provided configuration. + :param data: JSON-like data to be operated on. :param config: Configuration object containing operator information. :return: JSON-like data after the operation. diff --git a/presidio-structured/presidio_structured/data/data_reader.py b/presidio-structured/presidio_structured/data/data_reader.py index 0149f6527..ab1d675a1 100644 --- a/presidio-structured/presidio_structured/data/data_reader.py +++ b/presidio-structured/presidio_structured/data/data_reader.py @@ -1,4 +1,4 @@ -""" Helper data classes, mostly simple wrappers to ensure consistent user interface. """ +"""Helper data classes, mostly simple wrappers to ensure consistent user interface.""" import json from abc import ABC, abstractmethod @@ -12,7 +12,7 @@ class ReaderBase(ABC): """ Base class for data readers. - This class should not be instantiated directly. Instead use or define a reader subclass. + This class should not be instantiated directly, instead init a subclass. """ @abstractmethod diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index 47d367073..aa046693c 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -14,9 +14,7 @@ class StructuredEngine: - """ - Class to implement methods for anonymizing tabular data. - """ + """Class to implement methods for anonymizing tabular data.""" def __init__( self, data_processor: DataProcessorBase = PandasDataProcessor() @@ -46,23 +44,28 @@ def anonymize( self.loggger.debug("Starting anonymization") operators = self.__check_or_add_default_operator(operators) - return self.data_processor.operate(data, structured_analysis, operators) + return self.data_processor.operate( + data, structured_analysis, operators + ) def __check_or_add_default_operator( self, operators: Dict[str, OperatorConfig] ) -> Dict[str, OperatorConfig]: """ - Check if the provided operators dictionary has a default operator. - If not, add a default operator. + Check if the provided operators dictionary has a default operator. \ + If not, add a default operator. :param operators: dictionary of operator configurations. - :return: operators dictionary with the default operator added if it was not initially present. + :return: operators dictionary with the default operator added \ + if it was not initially present. """ default_operator = OperatorConfig(DEFAULT) if not operators: self.logger.debug("No operators provided, using default operator") return {"DEFAULT": default_operator} if not operators.get("DEFAULT"): - self.logger.debug("No default operator provided, using default operator") + self.logger.debug( + "No default operator provided, using default operator" + ) operators["DEFAULT"] = default_operator return operators diff --git a/presidio-structured/setup.cfg b/presidio-structured/setup.cfg new file mode 100644 index 000000000..732559f8e --- /dev/null +++ b/presidio-structured/setup.cfg @@ -0,0 +1,10 @@ +[flake8] +max-line-length = 88 +exclude = + .git, + __pycache__, + build, + dist, + tests +docstring-convention = numpy +extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC \ No newline at end of file diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index 2b3a7e04d..ab9aeeb80 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -24,7 +24,9 @@ name="presidio_structured", python_requires=">=3.5", version=__version__, - packages=find_packages(include=["presidio_structured", "presidio_structured.*"]), + packages=find_packages( + include=["presidio_structured", "presidio_structured.*"] + ), classifiers=[ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", @@ -36,7 +38,8 @@ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ], - description="Presidio structured package - analyses and anonymizes structured and semistructured data.", + description="Presidio structured package - analyses and anonymizes \ + structured and semistructured data.", license="MIT license", include_package_data=True, keywords="presidio_structured", diff --git a/presidio-structured/tests/conftest.py b/presidio-structured/tests/conftest.py index c7560b477..11f78a805 100644 --- a/presidio-structured/tests/conftest.py +++ b/presidio-structured/tests/conftest.py @@ -1,3 +1,5 @@ +""" Pytest fixtures for presidio-structured tests. """ + import pandas as pd import pytest from presidio_anonymizer.entities import OperatorConfig @@ -9,7 +11,11 @@ def sample_df(): data = { "name": ["John Doe", "Jane Doe", "John Smith"], - "email": ["john@example.com", "jane@example.com", "johnsmith@example.com"], + "email": [ + "john@example.com", + "jane@example.com", + "johnsmith@example.com", + ], "phone": ["1234567890", "0987654321", "1122334455"], } return pd.DataFrame(data) @@ -33,7 +39,12 @@ def sample_json(): @pytest.fixture def sample_json_with_array(): - data = {"users": [{"id": 1, "name": "John Doe"}, {"id": 2, "name": "Jane Doe"}]} + data = { + "users": [ + {"id": 1, "name": "John Doe"}, + {"id": 2, "name": "Jane Doe"}, + ] + } return data @@ -50,15 +61,21 @@ def tabular_analysis_builder(): @pytest.fixture def operators(): return { - "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), - "DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}), + "PERSON": OperatorConfig( + "replace", {"new_value": "PERSON_REPLACEMENT"} + ), + "DEFAULT": OperatorConfig( + "replace", {"new_value": "DEFAULT_REPLACEMENT"} + ), } @pytest.fixture def operators_no_default(): return { - "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), + "PERSON": OperatorConfig( + "replace", {"new_value": "PERSON_REPLACEMENT"} + ), } diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py index c9bd365f2..514569ce9 100644 --- a/presidio-structured/tests/data/test_data_transformers.py +++ b/presidio-structured/tests/data/test_data_transformers.py @@ -8,7 +8,9 @@ class TestDataProcessorBase: - def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): + def test_abstract_init_raises( + self, sample_df, tabular_analysis_builder, operators + ): with pytest.raises(TypeError): DataProcessorBase() @@ -29,9 +31,13 @@ def test_process_no_default_should_raise( ): processor = PandasDataProcessor() with pytest.raises(ValueError): - processor.operate(sample_df, tabular_analysis, operators_no_default) + processor.operate( + sample_df, tabular_analysis, operators_no_default + ) - def test_process_invalid_data(self, sample_json, tabular_analysis, operators): + def test_process_invalid_data( + self, sample_json, tabular_analysis, operators + ): processor = PandasDataProcessor() with pytest.raises(ValueError): processor.operate(sample_json, tabular_analysis, operators) @@ -57,7 +63,9 @@ def test_process_no_default_should_raise( ): processor = JsonDataProcessor() with pytest.raises(ValueError): - processor.operate(sample_json, json_analysis, operators_no_default) + processor.operate( + sample_json, json_analysis, operators_no_default + ) def test_process_invalid_data(self, sample_df, json_analysis, operators): processor = JsonDataProcessor() diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 4d1a6a834..92ad91720 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -1,3 +1,5 @@ +""" Test the analysis builder """ + import pandas as pd import pytest @@ -5,15 +7,21 @@ def test_generate_analysis_tabular(tabular_analysis_builder, sample_df): - structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + structured_analysis = tabular_analysis_builder.generate_analysis( + sample_df + ) assert structured_analysis.entity_mapping["name"] == "PERSON" assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" -def test_generate_analysis_tabular_with_sampling(tabular_analysis_builder, sample_df): - structured_analysis = tabular_analysis_builder.generate_analysis(sample_df, n=2) +def test_generate_analysis_tabular_with_sampling( + tabular_analysis_builder, sample_df +): + structured_analysis = tabular_analysis_builder.generate_analysis( + sample_df, n=2 + ) assert len(structured_analysis.entity_mapping) == 3 assert structured_analysis.entity_mapping["name"] == "PERSON" @@ -29,8 +37,8 @@ def test_generate_analysis_tabular_with_invalid_sampling( def test_find_most_common_entity(tabular_analysis_builder, sample_df): - key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity( - sample_df, "en" + key_recognizer_result_map = ( + tabular_analysis_builder._find_most_common_entity(sample_df, "en") ) assert len(key_recognizer_result_map) == 3 @@ -41,8 +49,8 @@ def test_find_most_common_entity(tabular_analysis_builder, sample_df): def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): df = pd.DataFrame() - key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity( - df, "en" + key_recognizer_result_map = ( + tabular_analysis_builder._find_most_common_entity(df, "en") ) assert len(key_recognizer_result_map) == 0 diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py index 45aca7804..3fe02f272 100644 --- a/presidio-structured/tests/test_tabular_engine.py +++ b/presidio-structured/tests/test_tabular_engine.py @@ -21,7 +21,9 @@ def test_structured_engine_anonymize_calls_data_processor_operate(): structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with(data, structured_analysis, operators) + data_processor.operate.assert_called_once_with( + data, structured_analysis, operators + ) def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): @@ -40,7 +42,7 @@ def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): assert "DEFAULT" in args[2] -def test_structured_engine_anonymize_does_not_override_existing_default_operator(): +def test_structured_engine_anonymize_doesnt_override_existing_default_operator(): # Arrange data_processor = Mock() structured_engine = StructuredEngine(data_processor) @@ -52,7 +54,9 @@ def test_structured_engine_anonymize_does_not_override_existing_default_operator structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with(data, structured_analysis, operators) + data_processor.operate.assert_called_once_with( + data, structured_analysis, operators + ) def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis):