Merge pull request #1 from ebotiab/feature/presidio-tabular
Addressing Code Review Feedback for Feature/presidio-structured/1192
Jakob-98 authored Nov 24, 2023
2 parents 463beba + 354e223 commit f637f34
Showing 17 changed files with 63 additions and 62 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file.
## [Unreleased]
### Added
#### Structured
* Added V1 of presidio-structured, a library (presidio-structured) which re-uses existing logic from existing presidio components to allow anonymization of (semi-)structured data.
* Added alpha of presidio-structured, a library which reuses logic from existing presidio components to allow anonymization of (semi-)structured data.

## [2.2.351] - Nov. 6th 2023
### Changed
1 change: 1 addition & 0 deletions docs/samples/index.md
@@ -14,6 +14,7 @@
| Usage | Images | Python Notebook | [Plot custom bounding boxes](https://github.com/microsoft/presidio/blob/main/docs/samples/python/plot_custom_bboxes.ipynb)
| Usage | Text | Python Notebook | [Integrating with external services](https://github.com/microsoft/presidio/blob/main/docs/samples/python/integrating_with_external_services.ipynb) |
| Usage | Text | Python file | [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) |
| Usage | Structured | Python Notebook | [Presidio Structured Basic Usage Notebook](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_structured.ipynb) |
| Usage | Text | Python file | [Azure AI Language as a Remote Recognizer](python/text_analytics/index.md) |
| Usage | CSV | Python file | [Analyze and Anonymize CSV file](https://github.com/microsoft/presidio/blob/main/docs/samples/python/process_csv_file.py) |
| Usage | Text | Python | [Using Flair as an external PII model](https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py)|
@@ -7,9 +7,7 @@
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"sys.path.append(os.path.abspath(\"..\"))\n",
"from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataTransformer, PandasDataTransformer"
"from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor"
]
},
{
@@ -107,10 +105,17 @@
}
],
"source": [
"sample_df = CsvReader().read(\"./sample_data/test_csv.csv\")\n",
"sample_df = CsvReader().read(\"./csv_sample_data/test_structured.csv\")\n",
"sample_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This sample is a structured dataset containing sensitive data like names, emails, and addresses. It differs from the sample used in the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. Personal data embedded in such phrases requires analyzing and anonymizing the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns."
]
},
{
"cell_type": "code",
"execution_count": 3,
@@ -134,7 +139,7 @@
}
],
"source": [
"sample_json = JsonReader().read(\"./sample_data/test_json.json\")\n",
"sample_json = JsonReader().read(\"./sample_data/test_structured.json\")\n",
"sample_json"
]
},
@@ -176,7 +181,7 @@
],
"source": [
"# contains nested objects in lists\n",
"sample_complex_json = JsonReader().read(\"./sample_data/test_complex_json.json\")\n",
"sample_complex_json = JsonReader().read(\"./sample_data/test_structured_complex.json\")\n",
"sample_complex_json"
]
},
@@ -294,7 +299,7 @@
"source": [
"# anonymized data defaults to be replaced with None, unless operators is specified\n",
"\n",
"pandas_engine = StructuredEngine(data_transformer=PandasDataTransformer())\n",
"pandas_engine = StructuredEngine(data_processor=PandasDataProcessor())\n",
"df_to_be_anonymized = sample_df.copy() # in-place anonymization\n",
"anonymized_df = pandas_engine.anonymize(df_to_be_anonymized, tabular_analysis, operators=None) # explicit None for clarity\n",
"anonymized_df"
@@ -490,7 +495,7 @@
],
"source": [
"# anonymizing simple data\n",
"json_engine = StructuredEngine(data_transformer=JsonDataTransformer())\n",
"json_engine = StructuredEngine(data_processor=JsonDataProcessor())\n",
"anonymized_json = json_engine.anonymize(sample_json, json_analysis, operators=operators)\n",
"anonymized_json"
]
11 changes: 7 additions & 4 deletions presidio-structured/README.md
@@ -2,17 +2,20 @@

## Status

### TODO

For TODOs, see draft PR.
**Alpha**: This package is currently in alpha, meaning it is in its early stages of development. Features and functionality may change as the project evolves.

## Description

The Presidio stuctured is..
The Presidio structured package is a flexible and customizable framework designed to identify and protect structured sensitive data. This tool extends the capabilities of Presidio, focusing on structured data formats.

## Deploy Presidio analyzer to Azure

TODO: [Instructions on deploying the Presidio analyzer to Azure will be here]

## Simple usage example

TODO: [A basic example of how to use the Presidio structured package will be here]

## Documentation

TODO: [Link to the comprehensive documentation, guides, and API references]
2 changes: 1 addition & 1 deletion presidio-structured/__init__.py
@@ -4,6 +4,6 @@
# Set up default logging (with NullHandler)


# logging.getLogger("presidio-str").addHandler(logging.NullHandler())
logging.getLogger("presidio-str").addHandler(logging.NullHandler())

# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"]
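The uncommented line above restores the standard convention for library logging: attaching a `NullHandler` to the package logger. A small standalone sketch of why libraries do this (nothing here is presidio-specific):

```python
import logging

# Library side: attach a NullHandler so that importing the library never
# triggers Python's last-resort stderr output for unhandled log records.
lib_logger = logging.getLogger("presidio-str")
lib_logger.addHandler(logging.NullHandler())

# A NullHandler counts as a handler during propagation, so this record is
# swallowed silently instead of being printed by logging.lastResort:
lib_logger.warning("invisible unless the application configures logging")

# Application side: configuring logging makes library records visible again.
logging.basicConfig(level=logging.INFO)
lib_logger.info("now routed to the root handler installed by basicConfig")
```

This keeps the choice of handlers and formatting with the application, while the library stays silent by default.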
6 changes: 3 additions & 3 deletions presidio-structured/presidio_structured/__init__.py
@@ -1,6 +1,6 @@
from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder
from .config import StructuredAnalysis
from .data import CsvReader, JsonDataTransformer, JsonReader, PandasDataTransformer
from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor
from .structured_engine import StructuredEngine

__all__ = [
@@ -10,6 +10,6 @@
"StructuredAnalysis",
"CsvReader",
"JsonReader",
"PandasDataTransformer",
"JsonDataTransformer",
"PandasDataProcessor",
"JsonDataProcessor",
]
28 changes: 7 additions & 21 deletions presidio-structured/presidio_structured/analysis_builder.py
@@ -19,19 +19,17 @@ class AnalysisBuilder(ABC):
Abstract base class for a configuration generator.
"""

def __init__(self):
def __init__(self, analyzer: AnalyzerEngine = None) -> None:
"""Initialize the configuration generator."""
self.analyzer = AnalyzerEngine()
self.analyzer = AnalyzerEngine() if analyzer is None else analyzer

@abstractmethod
def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis:
"""
Abstract method to generate a configuration from the given data.
:param data: The input data. Can be a dictionary or DataFrame instance.
:type data: Union[Dict, DataFrame]
:return: The generated configuration.
:rtype StructuredAnalysis:
"""
pass

@@ -44,9 +42,7 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis:
Generate a configuration from the given JSON data.
:param data: The input JSON data.
:type data: Dict
:return: The generated configuration.
:rtype StructuredAnalysis:
"""
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)
analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en")
@@ -59,11 +55,8 @@ def _generate_analysis_from_results_json(
Generate a configuration from the given analyzer results.
:param analyzer_results: The analyzer results.
:type analyzer_results: Iterator[DictAnalyzerResult]
:param prefix: The prefix for the configuration keys.
:type prefix: str
:return: The generated configuration.
:rtype StructuredAnalysis:
"""
mappings = {}

@@ -78,10 +71,9 @@ def _generate_analysis_from_results_json(
result.recognizer_results, prefix=current_key + "."
)
mappings.update(nested_mappings.entity_mapping)

if sum(1 for _ in result.recognizer_results) > 0:
for recognizer_result in result.recognizer_results:
mappings[current_key] = recognizer_result.entity_type
first_recognizer_result = next(iter(result.recognizer_results), None)
if first_recognizer_result is not None:
mappings[current_key] = first_recognizer_result.entity_type
return StructuredAnalysis(entity_mapping=mappings)


@@ -95,13 +87,9 @@ def generate_analysis(
Generate a configuration from the given tabular data.
:param df: The input tabular data (dataframe).
:type df: DataFrame
:param n: The number of samples to be taken from the dataframe.
:type n: int
:param language: The language to be used for analysis.
:type language: str
:return: The generated configuration.
:rtype StructuredAnalysis:
"""
if n > len(df):
n = len(df)
@@ -125,16 +113,14 @@ def _find_most_common_entity(
Find the most common entity in a dataframe column.
:param df: The dataframe where entities will be searched.
:type df: DataFrame
:param language: Language to be used in the analysis engine.
:type language: str
:return: A dictionary mapping column names to the most common RecognizerResult.
:rtype: Dict[str, RecognizerResult]
"""
key_recognizer_result_map = {}

batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)

for column in df.columns:
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)
analyzer_results = batch_analyzer.analyze_iterator(
[val for val in df[column]], language=language
)
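The hunk above replaces `sum(1 for _ in result.recognizer_results) > 0` with `next(iter(...), None)`. The old guard silently drained the results whenever they were a one-shot iterator, so the loop body that followed saw nothing. A minimal standalone reproduction of the bug and the fix (`FakeResult` is a hypothetical stand-in for presidio's `RecognizerResult`):

```python
from collections import namedtuple

# Hypothetical stand-in for presidio's RecognizerResult, for illustration only.
FakeResult = namedtuple("FakeResult", ["entity_type"])

def first_entity_type(recognizer_results):
    """Return the entity type of the first result, or None if there is none.

    next(iter(...), default) reads at most one element, so it is safe even
    when the results are a one-shot generator.
    """
    first = next(iter(recognizer_results), None)
    return first.entity_type if first is not None else None

# The replaced guard demonstrates the bug: counting a generator drains it.
results = (r for r in [FakeResult("PERSON")])
assert sum(1 for _ in results) > 0        # looks fine...
assert next(iter(results), None) is None  # ...but nothing is left to read

# The new pattern works for lists and generators alike:
gen = (r for r in [FakeResult("PERSON"), FakeResult("EMAIL_ADDRESS")])
print(first_entity_type(gen))  # PERSON
print(first_entity_type([]))   # None
```

Like the code in the hunk, this keeps only the first result per key rather than iterating over all of them.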
6 changes: 3 additions & 3 deletions presidio-structured/presidio_structured/data/__init__.py
@@ -1,9 +1,9 @@
from .data_reader import CsvReader, JsonReader
from .data_transformers import JsonDataTransformer, PandasDataTransformer
from .data_processors import JsonDataProcessor, PandasDataProcessor

__all__ = [
"CsvReader",
"JsonReader",
"PandasDataTransformer",
"JsonDataTransformer",
"PandasDataProcessor",
"JsonDataProcessor",
]
@@ -8,13 +8,13 @@
from presidio_structured.config import StructuredAnalysis


class DataTransformerBase(ABC):
class DataProcessorBase(ABC):
"""
Abstract base class to handle logic of operations over the text using the operators.
"""

def __init__(self) -> None:
"""Initializes DataTransformerBase object."""
"""Initializes DataProcessorBase object."""
pass

def operate(
@@ -99,7 +99,7 @@ def _operate_on_text(
return operator_callable(text_to_operate_on)


class PandasDataTransformer(DataTransformerBase):
class PandasDataProcessor(DataProcessorBase):
def _process(
self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable]
) -> DataFrame:
@@ -124,8 +124,8 @@ def _process(
return data


class JsonDataTransformer(DataTransformerBase):
"""JSON Data Transformer, Supports arbitrary nesting of dictionaries and lists."""
class JsonDataProcessor(DataProcessorBase):
"""JSON Data Processor, Supports arbitrary nesting of dictionaries and lists."""

@staticmethod
def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any:
@@ -142,7 +142,7 @@ def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any:
data = data[int(key)]
else:
return [
JsonDataTransformer._get_nested_value(item, path[i:])
JsonDataProcessor._get_nested_value(item, path[i:])
for item in data
]
elif isinstance(data, dict):
@@ -169,7 +169,7 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None:
continue
else:
for item in data:
JsonDataTransformer._set_nested_value(item, path[i:], value)
JsonDataProcessor._set_nested_value(item, path[i:], value)
return
elif isinstance(data, dict):
if i == len(path) - 1:
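The renamed `JsonDataProcessor` helpers walk a dotted key path through arbitrarily nested dicts and lists, fanning out over list elements. A standalone sketch of that traversal idea (function names are mine, and the real helpers also accept numeric strings as list indices, which is omitted here):

```python
from typing import Any, Dict, List, Union

def get_nested(data: Union[Dict, List], path: List[str]) -> Any:
    """Walk a key path through nested dicts; when a list is met, apply the
    remaining path to every element and collect the results."""
    for i, key in enumerate(path):
        if isinstance(data, list):
            return [get_nested(item, path[i:]) for item in data]
        data = data[key]
    return data

def set_nested(data: Union[Dict, List], path: List[str], value: Any) -> None:
    """Assign value at the key path, fanning out over lists, in place."""
    for i, key in enumerate(path):
        if isinstance(data, list):
            for item in data:
                set_nested(item, path[i:], value)
            return
        if i == len(path) - 1:
            data[key] = value
        else:
            data = data[key]

doc = {"users": [{"name": "Alice"}, {"name": "Bob"}]}
print(get_nested(doc, ["users", "name"]))   # ['Alice', 'Bob']
set_nested(doc, ["users", "name"], "<PERSON>")
print(doc)  # {'users': [{'name': '<PERSON>'}, {'name': '<PERSON>'}]}
```

The in-place fan-out over lists is what lets one entity mapping like `users.name` anonymize every element of a JSON array at once.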
13 changes: 7 additions & 6 deletions presidio-structured/presidio_structured/data/data_reader.py
@@ -2,7 +2,8 @@

import json
from abc import ABC, abstractmethod
from typing import Any, Dict
from pathlib import Path
from typing import Any, Dict, Union

import pandas as pd

@@ -15,7 +16,7 @@ class ReaderBase(ABC):
"""

@abstractmethod
def read(self, path: str) -> Any:
def read(self, path: Union[str, Path], **kwargs) -> Any:
"""
Extract data from file located at path.
@@ -36,14 +37,14 @@ class CsvReader(ReaderBase):
"""

def read(self, path: str) -> pd.DataFrame:
def read(self, path: Union[str, Path], **kwargs) -> pd.DataFrame:
"""
Read csv file to pandas dataframe.
:param path: String defining the location of the csv file to read.
:return: Pandas DataFrame with the data read from the csv file.
"""
return pd.read_csv(path)
return pd.read_csv(path, **kwargs)


class JsonReader(ReaderBase):
@@ -57,13 +58,13 @@ class JsonReader(ReaderBase):
"""

def read(self, path: str) -> Dict[str, Any]:
def read(self, path: Union[str, Path], **kwargs) -> Dict[str, Any]:
"""
Read json file to dict.
:param path: String defining the location of the json file to read.
:return: dictionary with the data read from the json file.
"""
with open(path) as f:
data = json.load(f)
data = json.load(f, **kwargs)
return data
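The reader changes widen `read` to accept `str | Path` and forward `**kwargs` to the underlying loader (`pd.read_csv` / `json.load`). A stdlib-only sketch of the same pattern, avoiding the pandas dependency (`JsonFileReader` is a hypothetical name; the package's `JsonReader` forwards to `json.load` the same way):

```python
import json
import tempfile
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Union

class ReaderBase(ABC):
    """Reader interface after the change: paths may be str or pathlib.Path,
    and extra kwargs are forwarded untouched to the underlying parser."""

    @abstractmethod
    def read(self, path: Union[str, Path], **kwargs) -> Any:
        ...

class JsonFileReader(ReaderBase):
    def read(self, path: Union[str, Path], **kwargs) -> Any:
        # open() accepts Path objects, so no str conversion is needed.
        with open(path) as f:
            return json.load(f, **kwargs)

with tempfile.TemporaryDirectory() as tmpdir:
    sample = Path(tmpdir) / "sample.json"
    sample.write_text('{"name": "Alice", "score": 1.5}')
    # kwargs reach json.load unchanged, e.g. parse_float:
    data = JsonFileReader().read(sample, parse_float=str)

print(data)  # {'name': 'Alice', 'score': '1.5'}
```

Forwarding `**kwargs` keeps the thin reader classes from having to mirror every option of the parsers they wrap.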
17 changes: 11 additions & 6 deletions presidio-structured/presidio_structured/structured_engine.py
@@ -4,7 +4,10 @@
from presidio_anonymizer.entities import OperatorConfig

from presidio_structured.config import StructuredAnalysis
from presidio_structured.data.data_transformers import DataTransformerBase
from presidio_structured.data.data_processors import (
DataProcessorBase,
PandasDataProcessor,
)

DEFAULT = "replace"

@@ -14,13 +17,15 @@ class StructuredEngine:
Class to implement methods for anonymizing tabular data.
"""

def __init__(self, data_transformer: DataTransformerBase):
def __init__(self, data_processor: DataProcessorBase = None) -> None:
"""
Initialize the class with a data transformer.
Initialize the class with a data processor.
:param data_transformer: Instance of DataTransformerBase.
:param data_processor: Instance of DataProcessorBase.
"""
self.data_transformer = data_transformer
if data_processor is None:
data_processor = PandasDataProcessor()
self.data_processor = data_processor

def anonymize(
self,
@@ -38,7 +43,7 @@ def anonymize(
"""
operators = self.__check_or_add_default_operator(operators)

return self.data_transformer.operate(data, structured_analysis, operators)
return self.data_processor.operate(data, structured_analysis, operators)

@staticmethod
def __check_or_add_default_operator(
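Before anonymizing, the engine's private `__check_or_add_default_operator` guarantees a fallback operator. A standalone sketch of that idea, with plain strings standing in for the real `OperatorConfig` objects (the actual helper builds an `OperatorConfig` for the `"replace"` operator):

```python
from typing import Dict, Optional

DEFAULT = "replace"  # mirrors the module-level DEFAULT in the diff

def check_or_add_default_operator(
    operators: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Return an operator mapping that always carries a DEFAULT fallback,
    without overriding a caller-supplied DEFAULT entry."""
    if not operators:
        return {"DEFAULT": DEFAULT}
    operators.setdefault("DEFAULT", DEFAULT)
    return operators

print(check_or_add_default_operator(None))
# {'DEFAULT': 'replace'}
print(check_or_add_default_operator({"PERSON": "hash"}))
# {'PERSON': 'hash', 'DEFAULT': 'replace'}
```

This is why `anonymize(..., operators=None)` in the notebook is valid: entities without an explicit operator fall back to replacement.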