pr comments, nits, minor tests

microsoft · Nov 24, 2023 · db1f3d8 · db1f3d8
1 parent f637f34
commit db1f3d8
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 58 deletions.
diff --git a/docs/samples/python/example_structured.ipynb b/docs/samples/python/example_structured.ipynb
@@ -6,10 +6,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
     "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This sample showcases presidio-structured on structured and semi-structured data containing sensitive data like names, emails, and addresses. It differs from the batch analyzer/anonymizer engines sample, which includes narrative phrases that might contain sensitive data. Personal data embedded in such phrases requires analyzing and anonymizing the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -109,13 +115,6 @@
     "sample_df"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This sample is a structured dataset containing sensitive data like names, emails, and addresses. It differs from the sample for the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. The presence of personal data embedded in these phrases requires to analyze and to anonymize the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -216,7 +215,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -291,7 +290,7 @@
        "2   3  <None>  <None>  789 Pine St  <None>  <None>        11223"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -352,7 +351,7 @@
        "      <th>0</th>\n",
        "      <td>1</td>\n",
        "      <td>person...</td>\n",
-       "      <td>mccoyryan@example.com</td>\n",
+       "      <td>jamestaylor@example.net</td>\n",
        "      <td>123 Main St</td>\n",
        "      <td>&lt;None&gt;</td>\n",
        "      <td>&lt;None&gt;</td>\n",
@@ -362,7 +361,7 @@
        "      <th>1</th>\n",
        "      <td>2</td>\n",
        "      <td>person...</td>\n",
-       "      <td>harristricia@example.net</td>\n",
+       "      <td>brian49@example.com</td>\n",
        "      <td>456 Elm St</td>\n",
        "      <td>&lt;None&gt;</td>\n",
        "      <td>&lt;None&gt;</td>\n",
@@ -372,7 +371,7 @@
        "      <th>2</th>\n",
        "      <td>3</td>\n",
        "      <td>person...</td>\n",
-       "      <td>thomasmikayla@example.org</td>\n",
+       "      <td>clarkcody@example.org</td>\n",
        "      <td>789 Pine St</td>\n",
        "      <td>&lt;None&gt;</td>\n",
        "      <td>&lt;None&gt;</td>\n",
@@ -383,10 +382,10 @@
        "</div>"
       ],
       "text/plain": [
-       "   id       name                      email       street    city   state  \\\n",
-       "0   1  person...      mccoyryan@example.com  123 Main St  <None>  <None>   \n",
-       "1   2  person...   harristricia@example.net   456 Elm St  <None>  <None>   \n",
-       "2   3  person...  thomasmikayla@example.org  789 Pine St  <None>  <None>   \n",
+       "   id       name                    email       street    city   state  \\\n",
+       "0   1  person...  jamestaylor@example.net  123 Main St  <None>  <None>   \n",
+       "1   2  person...      brian49@example.com   456 Elm St  <None>  <None>   \n",
+       "2   3  person...    clarkcody@example.org  789 Pine St  <None>  <None>   \n",
        "\n",
        "   postal_code  \n",
        "0        12345  \n",
@@ -428,7 +427,7 @@
     {
      "data": {
       "text/plain": [
-       "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'URL', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})"
+       "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'EMAIL_ADDRESS', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})"
       ]
      },
      "execution_count": 8,
@@ -481,7 +480,7 @@
       "text/plain": [
        "{'id': 1,\n",
        " 'name': 'person...',\n",
-       " 'email': '<None>',\n",
+       " 'email': 'virginia29@example.org',\n",
        " 'address': {'street': '123 Main St',\n",
        "  'city': '<None>',\n",
        "  'state': '<None>',\n",
@@ -510,21 +509,21 @@
       "text/plain": [
        "{'users': [{'id': 1,\n",
        "   'name': 'person...',\n",
-       "   'email': 'tricia10@example.com',\n",
+       "   'email': 'david90@example.org',\n",
        "   'address': {'street': '<None>',\n",
        "    'city': '<None>',\n",
        "    'state': '<None>',\n",
        "    'postal_code': '12345'}},\n",
        "  {'id': 2,\n",
        "   'name': 'person...',\n",
-       "   'email': 'tricia10@example.com',\n",
+       "   'email': 'david90@example.org',\n",
        "   'address': {'street': '<None>',\n",
        "    'city': '<None>',\n",
        "    'state': '<None>',\n",
        "    'postal_code': '67890'}},\n",
        "  {'id': 3,\n",
        "   'name': 'person...',\n",
-       "   'email': 'tricia10@example.com',\n",
+       "   'email': 'david90@example.org',\n",
        "   'address': {'street': '<None>',\n",
        "    'city': '<None>',\n",
        "    'state': '<None>',\n",

diff --git a/presidio-structured/__init__.py b/presidio-structured/__init__.py
@@ -3,7 +3,4 @@
 
 # Set up default logging (with NullHandler)
 
-
-logging.getLogger("presidio-str").addHandler(logging.NullHandler())
-
-# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"]
+logging.getLogger("presidio-structured").addHandler(logging.NullHandler())
diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py
@@ -52,7 +52,7 @@ def _generate_analysis_from_results_json(
         self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = ""
     ) -> StructuredAnalysis:
         """
-        Generate a configuration from the given analyzer results.
+        Generate a configuration from the given analyzer results. Always uses the first recognizer result if there is more than one.
 
         :param analyzer_results: The analyzer results.
         :param prefix: The prefix for the configuration keys.

diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py
@@ -17,14 +17,12 @@ class StructuredEngine:
     Class to implement methods for anonymizing tabular data.
     """
 
-    def __init__(self, data_processor: DataProcessorBase = None) -> None:
+    def __init__(self, data_processor: DataProcessorBase = PandasDataProcessor()) -> None:
         """
         Initialize the class with a data processor.
 
         :param data_processor: Instance of DataProcessorBase.
         """
-        if data_processor is None:
-            data_processor = PandasDataProcessor()
         self.data_processor = data_processor
 
     def anonymize(

diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py
@@ -1,18 +1,18 @@
 import pytest
 from pandas import DataFrame
 from presidio_anonymizer.entities import OperatorConfig
-from presidio_structured.data.data_transformers import DataTransformerBase, PandasDataTransformer, JsonDataTransformer
+from presidio_structured.data.data_processors import DataProcessorBase, PandasDataProcessor, JsonDataProcessor
 from presidio_structured.config import StructuredAnalysis
 
-class TestDataTransformerBase:
+class TestDataProcessorBase:
     def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators):
         with pytest.raises(TypeError):
-            DataTransformerBase()
+            DataProcessorBase()
 
-class TestPandasDataTransformer:
+class TestPandasDataProcessor:
     def test_process(self, sample_df, operators, tabular_analysis):
-        transformer = PandasDataTransformer()
-        result = transformer.operate(sample_df, tabular_analysis, operators)
+        processor = PandasDataProcessor()
+        result = processor.operate(sample_df, tabular_analysis, operators)
         assert isinstance(result, DataFrame)
         for key in tabular_analysis.entity_mapping:
             if key == 'name':
@@ -21,19 +21,19 @@ def test_process(self, sample_df, operators, tabular_analysis):
                 assert all(result[key] == "DEFAULT_REPLACEMENT")
 
     def test_process_no_default_should_raise(self, sample_df, operators_no_default, tabular_analysis):
-        transformer = PandasDataTransformer()
+        processor = PandasDataProcessor()
         with pytest.raises(ValueError):
-            transformer.operate(sample_df, tabular_analysis, operators_no_default)
+            processor.operate(sample_df, tabular_analysis, operators_no_default)
 
     def test_process_invalid_data(self, sample_json, tabular_analysis, operators):
-        transformer = PandasDataTransformer()
+        processor = PandasDataProcessor()
         with pytest.raises(ValueError):
-            transformer.operate(sample_json, tabular_analysis, operators)
+            processor.operate(sample_json, tabular_analysis, operators)
 
-class TestJsonDataTransformer:
+class TestJsonDataProcessor:
     def test_process(self, sample_json, operators, json_analysis):
-        transformer = JsonDataTransformer()
-        result = transformer.operate(sample_json, json_analysis, operators)
+        processor = JsonDataProcessor()
+        result = processor.operate(sample_json, json_analysis, operators)
         assert isinstance(result, dict)
         for key, value in json_analysis.entity_mapping.items():
             keys = key.split(".")
@@ -46,11 +46,11 @@ def test_process(self, sample_json, operators, json_analysis):
                 assert nested_value == "DEFAULT_REPLACEMENT"
 
     def test_process_no_default_should_raise(self, sample_json, operators_no_default, json_analysis):
-        transformer = JsonDataTransformer()
+        processor = JsonDataProcessor()
         with pytest.raises(ValueError):
-            transformer.operate(sample_json, json_analysis, operators_no_default)
+            processor.operate(sample_json, json_analysis, operators_no_default)
 
     def test_process_invalid_data(self, sample_df, json_analysis, operators):
-        transformer = JsonDataTransformer()
+        processor = JsonDataProcessor()
         with pytest.raises(ValueError):
-          transformer.operate(sample_df, json_analysis, operators)
+          processor.operate(sample_df, json_analysis, operators)
diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py
@@ -1,15 +1,18 @@
 from unittest.mock import Mock
+import pandas as pd
 
 import pytest
+
 from presidio_anonymizer.entities import OperatorConfig
 
 from presidio_structured import StructuredEngine
+from presidio_structured.data.data_processors import JsonDataProcessor
 
 
-def test_structured_engine_anonymize_calls_data_transformer_operate():
+def test_structured_engine_anonymize_calls_data_processor_operate():
     # Arrange
-    data_transformer = Mock()
-    structured_engine = StructuredEngine(data_transformer)
+    data_processor = Mock()
+    structured_engine = StructuredEngine(data_processor)
     data = Mock()
     structured_analysis = Mock()
     operators = {"DEFAULT": OperatorConfig("replace")}
@@ -18,31 +21,31 @@ def test_structured_engine_anonymize_calls_data_transformer_operate():
     structured_engine.anonymize(data, structured_analysis, operators)
 
     # Assert
-    data_transformer.operate.assert_called_once_with(
+    data_processor.operate.assert_called_once_with(
         data, structured_analysis, operators
     )
 
 
 def test_structured_engine_anonymize_adds_default_operator_if_none_provided():
     # Arrange
-    data_transformer = Mock()
-    structured_engine = StructuredEngine(data_transformer)
+    data_processor = Mock()
+    structured_engine = StructuredEngine(data_processor)
     data = Mock()
     structured_analysis = Mock()
 
     # Act
     structured_engine.anonymize(data, structured_analysis)
 
     # Assert
-    data_transformer.operate.assert_called_once()
-    args, _ = data_transformer.operate.call_args
+    data_processor.operate.assert_called_once()
+    args, _ = data_processor.operate.call_args
     assert "DEFAULT" in args[2]
 
 
 def test_structured_engine_anonymize_does_not_override_existing_default_operator():
     # Arrange
-    data_transformer = Mock()
-    structured_engine = StructuredEngine(data_transformer)
+    data_processor = Mock()
+    structured_engine = StructuredEngine(data_processor)
     data = Mock()
     structured_analysis = Mock()
     operators = {"DEFAULT": OperatorConfig("custom")}
@@ -51,6 +54,19 @@ def test_structured_engine_anonymize_does_not_override_existing_default_operator
     structured_engine.anonymize(data, structured_analysis, operators)
 
     # Assert
-    data_transformer.operate.assert_called_once_with(
+    data_processor.operate.assert_called_once_with(
         data, structured_analysis, operators
     )
+
+def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis):
+    data_processor = JsonDataProcessor()
+    structured_engine = StructuredEngine(data_processor)
+    data = pd.DataFrame({"name": ["John", "Jane"]})
+    with pytest.raises(ValueError):
+        structured_engine.anonymize(data, tabular_analysis)
+
+def test_pandas_processor_with_json_will_raise(json_analysis):
+    structured_engine = StructuredEngine() # default PandasDataProcessor
+    data = {"name": ["John", "Jane"]}
+    with pytest.raises(ValueError):
+        structured_engine.anonymize(data, json_analysis)