Linting, continued
Jakob-98 committed Nov 27, 2023
1 parent 411f1bd commit 15b756f
Showing 14 changed files with 150 additions and 61 deletions.
9 changes: 7 additions & 2 deletions presidio-structured/presidio_structured/__init__.py
@@ -1,9 +1,14 @@
""" presidio-structured root module. """
"""presidio-structured root module."""
import logging

from .analysis_builder import JsonAnalysisBuilder, PandasAnalysisBuilder
from .config import StructuredAnalysis
from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor
from .data import (
CsvReader,
JsonDataProcessor,
JsonReader,
PandasDataProcessor,
)
from .structured_engine import StructuredEngine

logging.getLogger("presidio-structured").addHandler(logging.NullHandler())
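
A quick sketch (assuming presidio_structured is installed) to confirm the reshuffled import above is purely cosmetic and the public names still resolve:

    from presidio_structured import (
        CsvReader,
        JsonDataProcessor,
        JsonReader,
        PandasDataProcessor,
        StructuredEngine,
    )

    # All names are still re-exported at the package root, same as before the wrap.
    print(StructuredEngine, CsvReader, JsonReader)
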
36 changes: 24 additions & 12 deletions presidio-structured/presidio_structured/analysis_builder.py
@@ -16,17 +16,17 @@


class AnalysisBuilder(ABC):
"""
Abstract base class for a configuration generator.
"""
"""Abstract base class for a configuration generator."""

def __init__(self, analyzer: AnalyzerEngine = None) -> None:
"""Initialize the configuration generator."""
self.analyzer = AnalyzerEngine() if analyzer is None else analyzer
self.logger = logging.getLogger("presidio-structured")

@abstractmethod
def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis:
def generate_analysis(
self, data: Union[Dict, DataFrame]
) -> StructuredAnalysis:
"""
Abstract method to generate a configuration from the given data.
@@ -48,14 +48,17 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis:
"""
self.logger.debug("Starting JSON BatchAnalyzer analysis")
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)
analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en")
analyzer_results = batch_analyzer.analyze_dict(
input_dict=data, language="en"
)
return self._generate_analysis_from_results_json(analyzer_results)

def _generate_analysis_from_results_json(
self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = ""
) -> StructuredAnalysis:
"""
Generate a configuration from the given analyzer results. Always uses the first recognizer result if there are more than one.
Generate a configuration from the given analyzer results. \
Always uses the first recognizer result if there are more than one.
:param analyzer_results: The analyzer results.
:param prefix: The prefix for the configuration keys.
@@ -77,17 +80,21 @@ def _generate_analysis_from_results_json(
result.recognizer_results, prefix=current_key + "."
)
mappings.update(nested_mappings.entity_mapping)
first_recognizer_result = next(iter(result.recognizer_results), None)
first_recognizer_result = next(
iter(result.recognizer_results), None
)
if first_recognizer_result is not None:
self.logger.debug(
f"Found entity {first_recognizer_result.entity_type} in {current_key}"
f"Found entity {first_recognizer_result.entity_type} \
in {current_key}"
)
mappings[current_key] = first_recognizer_result.entity_type
return StructuredAnalysis(entity_mapping=mappings)


class TabularAnalysisbuilder(AnalysisBuilder):
"""Placeholder class for generalizing tabular data analysis builders (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now."""
"""Placeholder class for generalizing tabular data analysis builders \
(e.g. PySpark). Only implemented as PandasAnalysisBuilder for now."""

pass

@@ -108,13 +115,16 @@ def generate_analysis(
"""
if n > len(df):
self.logger.debug(
f"Number of samples ({n}) is larger than the number of rows ({len(df)}), using all rows"
f"Number of samples ({n}) is larger than the number of rows \
({len(df)}), using all rows"
)
n = len(df)

df = df.sample(n)

key_recognizer_result_map = self._find_most_common_entity(df, language)
key_recognizer_result_map = self._find_most_common_entity(
df, language
)

key_entity_map = {
key: result.entity_type
@@ -139,7 +149,9 @@ def _find_most_common_entity(
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)

for column in df.columns:
self.logger.debug(f"Finding most common PII entity for column {column}")
self.logger.debug(
f"Finding most common PII entity for column {column}"
)
analyzer_results = batch_analyzer.analyze_iterator(
[val for val in df[column]], language=language
)
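
For orientation, a minimal sketch of how the two builders are called. Hedged: the hunk above confirms the n and language parameters exist on the pandas builder, but their defaults are not shown, so both are passed explicitly; an English presidio-analyzer model is assumed to be available.

    import pandas as pd
    from presidio_structured import JsonAnalysisBuilder, PandasAnalysisBuilder

    # Tabular: samples up to n rows, then maps each column to its most
    # common detected entity.
    df = pd.DataFrame({"name": ["John Doe", "Jane Doe"]})
    tabular = PandasAnalysisBuilder().generate_analysis(df, n=2, language="en")
    print(tabular.entity_mapping)  # e.g. {"name": "PERSON"}

    # JSON: walks the nested dict; keys become dotted paths (the prefix logic above).
    nested = JsonAnalysisBuilder().generate_analysis({"user": {"name": "Jane Doe"}})
    print(nested.entity_mapping)  # e.g. {"user.name": "PERSON"}
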
1 change: 1 addition & 0 deletions presidio-structured/presidio_structured/config/__init__.py
@@ -1,3 +1,4 @@
"""Config module for presidio-structured."""
from .structured_analysis import StructuredAnalysis

__all__ = [
presidio-structured/presidio_structured/config/structured_analysis.py
@@ -1,13 +1,14 @@
""" Structured Analysis module. """
"""Structured Analysis module."""

from dataclasses import dataclass
from typing import Dict


@dataclass
class StructuredAnalysis:
"""Dataclass containing entity analysis from structured data. Currently only contains entity mapping."""
"""Dataclass containing entity analysis from structured data.\
Currently only contains entity mapping."""

entity_mapping: Dict[
str, str
] # NOTE ideally Literal[...] with allowed EntityTypes, but cannot unpack in Literal.
]
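
With the Literal note gone, entity_mapping stays a plain Dict[str, str], so an analysis can also be written by hand instead of generated. A sketch (entity names follow Presidio's usual conventions):

    from presidio_structured import StructuredAnalysis

    analysis = StructuredAnalysis(
        entity_mapping={"name": "PERSON", "email": "EMAIL_ADDRESS"}
    )
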
2 changes: 2 additions & 0 deletions presidio-structured/presidio_structured/data/__init__.py
@@ -1,3 +1,5 @@
"""Data module."""

from .data_reader import CsvReader, JsonReader
from .data_processors import JsonDataProcessor, PandasDataProcessor

41 changes: 28 additions & 13 deletions presidio-structured/presidio_structured/data/data_processors.py
@@ -10,12 +10,10 @@


class DataProcessorBase(ABC):
"""
Abstract base class to handle logic of operations over the text using the operators.
"""
"""Abstract class to handle logic of operations over text using the operators."""

def __init__(self) -> None:
"""Initializes DataProcessorBase object."""
"""Initialize DataProcessorBase object."""
self.logger = logging.getLogger("presidio-structured")

def operate(
@@ -25,7 +23,8 @@ def operate(
operators: Dict[str, OperatorConfig],
) -> Any:
"""
Performs operations over the text using the operators, as per the structured analysis.
Perform operations over the text using the operators, \
as per the structured analysis.
:param data: Data to be operated on.
:param structured_analysis: Analysis schema as per the structured data.
@@ -39,7 +38,9 @@ def operate(

@abstractmethod
def _process(
self, data: Dict | DataFrame, key_to_operator_mapping: Dict[str, Callable]
self,
data: Dict | DataFrame,
key_to_operator_mapping: Dict[str, Callable],
) -> Dict | DataFrame:
"""
Abstract method for subclasses to provide operation implementation.
@@ -71,11 +72,15 @@ def _generate_operator_mapping(

operators_factory = OperatorsFactory()
for key, entity in config.entity_mapping.items():
self.logger.debug(f"Creating operator for key {key} and entity {entity}")
operator_config = operators.get(entity, operators.get("DEFAULT", None))
self.logger.debug(
f"Creating operator for key {key} and entity {entity}"
)
operator_config = operators.get(
entity, operators.get("DEFAULT", None)
)
if operator_config is None:
raise ValueError(f"Operator for entity {entity} not found")
# NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported for now.
# NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported.
operator = operators_factory.create_operator_class(
operator_config.operator_name, OperatorType.Anonymize
)
@@ -102,6 +107,8 @@ def _operate_on_text(


class PandasDataProcessor(DataProcessorBase):
"""Pandas Data Processor."""

def _process(
self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable]
) -> DataFrame:
@@ -155,9 +162,12 @@ def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any:
return data

@staticmethod
def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None:
def _set_nested_value(
data: Union[Dict, List], path: List[str], value: Any
) -> None:
"""
Recursively sets a value in nested data using a given path.
:param data: Nested data (JSON-like).
:param path: List of keys/indexes representing the path.
:param value: Value to be set.
@@ -172,7 +182,9 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None:
continue
else:
for item in data:
JsonDataProcessor._set_nested_value(item, path[i:], value)
JsonDataProcessor._set_nested_value(
item, path[i:], value
)
return
elif isinstance(data, dict):
if i == len(path) - 1:
@@ -181,10 +193,13 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None:
data = data.setdefault(key, {})

def _process(
self, data: Union[Dict, List], key_to_operator_mapping: Dict[str, Callable]
self,
data: Union[Dict, List],
key_to_operator_mapping: Dict[str, Callable],
) -> Union[Dict, List]:
"""
Operates on the given JSON-like data (nested dictionary/list) based on the provided configuration.
Operates on the given JSON-like data based on the provided configuration.
:param data: JSON-like data to be operated on.
:param config: Configuration object containing operator information.
:return: JSON-like data after the operation.
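
The list branch above fans one write out to every element, which is easiest to see in isolation. A behavior sketch, calling the private helper directly purely for illustration (underscore methods are not a supported interface):

    from presidio_structured.data.data_processors import JsonDataProcessor

    data = {"users": [{"id": 1, "name": "John"}, {"id": 2, "name": "Jane"}]}
    # "users" resolves to a list, so the rest of the path applies to each item.
    JsonDataProcessor._set_nested_value(data, ["users", "name"], "<PERSON>")
    print(data)
    # {"users": [{"id": 1, "name": "<PERSON>"}, {"id": 2, "name": "<PERSON>"}]}
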
4 changes: 2 additions & 2 deletions presidio-structured/presidio_structured/data/data_reader.py
@@ -1,4 +1,4 @@
""" Helper data classes, mostly simple wrappers to ensure consistent user interface. """
"""Helper data classes, mostly simple wrappers to ensure consistent user interface."""

import json
from abc import ABC, abstractmethod
@@ -12,7 +12,7 @@ class ReaderBase(ABC):
"""
Base class for data readers.
This class should not be instantiated directly. Instead use or define a reader subclass.
This class should not be instantiated directly, instead init a subclass.
"""

@abstractmethod
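
A usage sketch for the readers. Hedged: a read method taking a file path and returning a DataFrame or dict is inferred from these wrappers' purpose; the exact signature is not shown in this diff.

    from presidio_structured import CsvReader, JsonReader

    df = CsvReader().read("customers.csv")     # assumed: returns a pandas DataFrame
    doc = JsonReader().read("customers.json")  # assumed: returns a nested dict
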
19 changes: 11 additions & 8 deletions presidio-structured/presidio_structured/structured_engine.py
@@ -14,9 +14,7 @@


class StructuredEngine:
"""
Class to implement methods for anonymizing tabular data.
"""
"""Class to implement methods for anonymizing tabular data."""

def __init__(
self, data_processor: DataProcessorBase = PandasDataProcessor()
@@ -46,23 +44,28 @@ def anonymize(
self.logger.debug("Starting anonymization")
operators = self.__check_or_add_default_operator(operators)

return self.data_processor.operate(data, structured_analysis, operators)
return self.data_processor.operate(
data, structured_analysis, operators
)

def __check_or_add_default_operator(
self, operators: Dict[str, OperatorConfig]
) -> Dict[str, OperatorConfig]:
"""
Check if the provided operators dictionary has a default operator.
If not, add a default operator.
Check if the provided operators dictionary has a default operator. \
If not, add a default operator.
:param operators: dictionary of operator configurations.
:return: operators dictionary with the default operator added if it was not initially present.
:return: operators dictionary with the default operator added \
if it was not initially present.
"""
default_operator = OperatorConfig(DEFAULT)
if not operators:
self.logger.debug("No operators provided, using default operator")
return {"DEFAULT": default_operator}
if not operators.get("DEFAULT"):
self.logger.debug("No default operator provided, using default operator")
self.logger.debug(
"No default operator provided, using default operator"
)
operators["DEFAULT"] = default_operator
return operators
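
End to end, the engine ties the pieces together. A sketch under the same assumptions as above (English analyzer models available; the replacement value and column names are illustrative):

    import pandas as pd
    from presidio_anonymizer.entities import OperatorConfig
    from presidio_structured import PandasAnalysisBuilder, StructuredEngine

    df = pd.DataFrame({"name": ["John Doe"], "phone": ["1234567890"]})
    analysis = PandasAnalysisBuilder().generate_analysis(df, n=1, language="en")

    engine = StructuredEngine()  # defaults to PandasDataProcessor()
    anonymized = engine.anonymize(
        df,
        analysis,
        operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<PII>"})},
    )
    print(anonymized)
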
10 changes: 10 additions & 0 deletions presidio-structured/setup.cfg
@@ -0,0 +1,10 @@
[flake8]
max-line-length = 88
exclude =
.git,
__pycache__,
build,
dist,
tests
docstring-convention = numpy
extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC
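
This config pins flake8 to an 88-character line limit (Black's default) with numpy-style docstrings, which is what drives all of the wrapping in this commit. Running flake8 presidio_structured from the presidio-structured directory picks the file up automatically. One inference, not stated in the commit: the D and ANN codes in extend-ignore suggest the flake8-docstrings and flake8-annotations plugins are expected in the lint environment.
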
7 changes: 5 additions & 2 deletions presidio-structured/setup.py
@@ -24,7 +24,9 @@
name="presidio_structured",
python_requires=">=3.5",
version=__version__,
packages=find_packages(include=["presidio_structured", "presidio_structured.*"]),
packages=find_packages(
include=["presidio_structured", "presidio_structured.*"]
),
classifiers=[
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
@@ -36,7 +38,8 @@
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
],
description="Presidio structured package - analyses and anonymizes structured and semistructured data.",
description="Presidio structured package - analyses and anonymizes \
structured and semistructured data.",
license="MIT license",
include_package_data=True,
keywords="presidio_structured",
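
The wrapped find_packages call is behavior-preserving: an editable install (pip install -e ./presidio-structured) still resolves presidio_structured together with its config and data subpackages.
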
27 changes: 22 additions & 5 deletions presidio-structured/tests/conftest.py
@@ -1,3 +1,5 @@
""" Pytest fixtures for presidio-structured tests. """

import pandas as pd
import pytest
from presidio_anonymizer.entities import OperatorConfig
@@ -9,7 +11,11 @@
def sample_df():
data = {
"name": ["John Doe", "Jane Doe", "John Smith"],
"email": ["john@example.com", "jane@example.com", "johnsmith@example.com"],
"email": [
"john@example.com",
"jane@example.com",
"johnsmith@example.com",
],
"phone": ["1234567890", "0987654321", "1122334455"],
}
return pd.DataFrame(data)
@@ -33,7 +39,12 @@ def sample_json():

@pytest.fixture
def sample_json_with_array():
data = {"users": [{"id": 1, "name": "John Doe"}, {"id": 2, "name": "Jane Doe"}]}
data = {
"users": [
{"id": 1, "name": "John Doe"},
{"id": 2, "name": "Jane Doe"},
]
}
return data


@@ -50,15 +61,21 @@ def tabular_analysis_builder():
@pytest.fixture
def operators():
return {
"PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}),
"DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}),
"PERSON": OperatorConfig(
"replace", {"new_value": "PERSON_REPLACEMENT"}
),
"DEFAULT": OperatorConfig(
"replace", {"new_value": "DEFAULT_REPLACEMENT"}
),
}


@pytest.fixture
def operators_no_default():
return {
"PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}),
"PERSON": OperatorConfig(
"replace", {"new_value": "PERSON_REPLACEMENT"}
),
}


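
A sketch of how these fixtures compose in a test. Hypothetical, not part of this commit, and it assumes the tabular_analysis_builder fixture yields a PandasAnalysisBuilder, as its name suggests:

    def test_default_operators_apply(sample_df, tabular_analysis_builder, operators):
        from presidio_structured import StructuredEngine

        analysis = tabular_analysis_builder.generate_analysis(
            sample_df, n=3, language="en"
        )
        result = StructuredEngine().anonymize(sample_df, analysis, operators=operators)
        assert "John Doe" not in result["name"].values
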