diff --git a/docs/samples/python/example_structured.ipynb b/docs/samples/python/example_structured.ipynb
index d7f6caf93..f0630a44f 100644
--- a/docs/samples/python/example_structured.ipynb
+++ b/docs/samples/python/example_structured.ipynb
@@ -6,10 +6,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
     "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This sample showcases presidio-structured on structured and semi-structured data containing sensitive data like names, emails, and addresses. It differs from the batch analyzer/anonymizer engines sample, which works on narrative phrases that might contain sensitive data. Personal data embedded in such phrases requires analyzing and anonymizing the text inside each cell, which is not the case for this structured sample, where the sensitive data is already separated into columns."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -109,13 +115,6 @@
     "sample_df"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This sample is a structured dataset containing sensitive data like names, emails, and addresses. It differs from the sample for the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. The presence of personal data embedded in these phrases requires to analyze and to anonymize the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -216,7 +215,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -291,7 +290,7 @@
        "2 3 789 Pine St 11223"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -352,7 +351,7 @@
        " 0\n",
        " 1\n",
        " person...\n",
-       " mccoyryan@example.com\n",
+       " jamestaylor@example.net\n",
        " 123 Main St\n",
        " <None>\n",
        " <None>\n",
@@ -362,7 +361,7 @@
        " 1\n",
        " 2\n",
        " person...\n",
-       " harristricia@example.net\n",
+       " brian49@example.com\n",
        " 456 Elm St\n",
        " <None>\n",
        " <None>\n",
@@ -372,7 +371,7 @@
        " 2\n",
        " 3\n",
        " person...\n",
-       " thomasmikayla@example.org\n",
+       " clarkcody@example.org\n",
        " 789 Pine St\n",
        " <None>\n",
        " <None>\n",
@@ -383,10 +382,10 @@
        ""
       ],
       "text/plain": [
-       " id name email street city state \\\n",
-       "0 1 person... mccoyryan@example.com 123 Main St \n",
-       "1 2 person... harristricia@example.net 456 Elm St \n",
-       "2 3 person... thomasmikayla@example.org 789 Pine St \n",
+       " id name email street city state \\\n",
+       "0 1 person... jamestaylor@example.net 123 Main St \n",
+       "1 2 person... brian49@example.com 456 Elm St \n",
+       "2 3 person... clarkcody@example.org 789 Pine St \n",
        "\n",
        " postal_code \n",
        "0 12345 \n",
@@ -428,7 +427,7 @@
     {
      "data": {
       "text/plain": [
-       "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'URL', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})"
+       "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'EMAIL_ADDRESS', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})"
       ]
      },
      "execution_count": 8,
@@ -481,7 +480,7 @@
       "text/plain": [
        "{'id': 1,\n",
        " 'name': 'person...',\n",
-       " 'email': '',\n",
+       " 'email': 'virginia29@example.org',\n",
        " 'address': {'street': '123 Main St',\n",
        " 'city': '',\n",
        " 'state': '',\n",
@@ -510,21 +509,21 @@
       "text/plain": [
        "{'users': [{'id': 1,\n",
        " 'name': 'person...',\n",
-       " 'email': 'tricia10@example.com',\n",
+       " 'email': 'david90@example.org',\n",
        " 'address': {'street': '',\n",
        " 'city': '',\n",
        " 'state': '',\n",
        " 'postal_code': '12345'}},\n",
        " {'id': 2,\n",
        " 'name': 'person...',\n",
-       " 'email': 'tricia10@example.com',\n",
+       " 'email': 'david90@example.org',\n",
        " 'address': {'street': '',\n",
        " 'city': '',\n",
        " 'state': '',\n",
        " 'postal_code': '67890'}},\n",
        " {'id': 3,\n",
        " 'name': 'person...',\n",
-       " 'email': 'tricia10@example.com',\n",
+       " 'email': 'david90@example.org',\n",
        " 'address': {'street': '',\n",
        " 'city': '',\n",
        " 'state': '',\n",
diff --git a/presidio-structured/__init__.py b/presidio-structured/__init__.py
index a3ec0a887..ed6a669d8 100644
--- a/presidio-structured/__init__.py
+++ b/presidio-structured/__init__.py
@@ -3,7 +3,4 @@
 
 # Set up default logging (with NullHandler)
-
-logging.getLogger("presidio-str").addHandler(logging.NullHandler())
-
-# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"]
+logging.getLogger("presidio-structured").addHandler(logging.NullHandler())
diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py
index 424eab96e..1955c505e 100644
--- a/presidio-structured/presidio_structured/analysis_builder.py
+++ b/presidio-structured/presidio_structured/analysis_builder.py
@@ -52,7 +52,7 @@ def _generate_analysis_from_results_json(
         self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = ""
     ) -> StructuredAnalysis:
         """
-        Generate a configuration from the given analyzer results.
+        Generate a configuration from the given analyzer results. Always uses the first recognizer result if there is more than one.
 
         :param analyzer_results: The analyzer results.
         :param prefix: The prefix for the configuration keys.
diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py
index 3c76f3a55..b36a8f59b 100644
--- a/presidio-structured/presidio_structured/structured_engine.py
+++ b/presidio-structured/presidio_structured/structured_engine.py
@@ -17,14 +17,12 @@ class StructuredEngine:
     Class to implement methods for anonymizing tabular data.
     """
 
-    def __init__(self, data_processor: DataProcessorBase = None) -> None:
+    def __init__(self, data_processor: DataProcessorBase = PandasDataProcessor()) -> None:
         """
         Initialize the class with a data processor.
 
         :param data_processor: Instance of DataProcessorBase.
""" - if data_processor is None: - data_processor = PandasDataProcessor() self.data_processor = data_processor def anonymize( diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py index 0156d77ad..638ed03c9 100644 --- a/presidio-structured/tests/data/test_data_transformers.py +++ b/presidio-structured/tests/data/test_data_transformers.py @@ -1,18 +1,18 @@ import pytest from pandas import DataFrame from presidio_anonymizer.entities import OperatorConfig -from presidio_structured.data.data_transformers import DataTransformerBase, PandasDataTransformer, JsonDataTransformer +from presidio_structured.data.data_processors import DataProcessorBase, PandasDataProcessor, JsonDataProcessor from presidio_structured.config import StructuredAnalysis -class TestDataTransformerBase: +class TestDataProcessorBase: def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): with pytest.raises(TypeError): - DataTransformerBase() + DataProcessorBase() -class TestPandasDataTransformer: +class TestPandasDataProcessor: def test_process(self, sample_df, operators, tabular_analysis): - transformer = PandasDataTransformer() - result = transformer.operate(sample_df, tabular_analysis, operators) + processor = PandasDataProcessor() + result = processor.operate(sample_df, tabular_analysis, operators) assert isinstance(result, DataFrame) for key in tabular_analysis.entity_mapping: if key == 'name': @@ -21,19 +21,19 @@ def test_process(self, sample_df, operators, tabular_analysis): assert all(result[key] == "DEFAULT_REPLACEMENT") def test_process_no_default_should_raise(self, sample_df, operators_no_default, tabular_analysis): - transformer = PandasDataTransformer() + processor = PandasDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_df, tabular_analysis, operators_no_default) + processor.operate(sample_df, tabular_analysis, operators_no_default) def test_process_invalid_data(self, sample_json, tabular_analysis, operators): - transformer = PandasDataTransformer() + processor = PandasDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_json, tabular_analysis, operators) + processor.operate(sample_json, tabular_analysis, operators) -class TestJsonDataTransformer: +class TestJsonDataProcessor: def test_process(self, sample_json, operators, json_analysis): - transformer = JsonDataTransformer() - result = transformer.operate(sample_json, json_analysis, operators) + processor = JsonDataProcessor() + result = processor.operate(sample_json, json_analysis, operators) assert isinstance(result, dict) for key, value in json_analysis.entity_mapping.items(): keys = key.split(".") @@ -46,11 +46,11 @@ def test_process(self, sample_json, operators, json_analysis): assert nested_value == "DEFAULT_REPLACEMENT" def test_process_no_default_should_raise(self, sample_json, operators_no_default, json_analysis): - transformer = JsonDataTransformer() + processor = JsonDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_json, json_analysis, operators_no_default) + processor.operate(sample_json, json_analysis, operators_no_default) def test_process_invalid_data(self, sample_df, json_analysis, operators): - transformer = JsonDataTransformer() + processor = JsonDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_df, json_analysis, operators) \ No newline at end of file + processor.operate(sample_df, json_analysis, operators) \ No newline at end of file 
diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py
index 96d0f9188..4d758a94f 100644
--- a/presidio-structured/tests/test_tabular_engine.py
+++ b/presidio-structured/tests/test_tabular_engine.py
@@ -1,15 +1,18 @@
 from unittest.mock import Mock
 
+import pandas as pd
 import pytest
+
 from presidio_anonymizer.entities import OperatorConfig
 
 from presidio_structured import StructuredEngine
+from presidio_structured.data.data_processors import JsonDataProcessor
 
 
-def test_structured_engine_anonymize_calls_data_transformer_operate():
+def test_structured_engine_anonymize_calls_data_processor_operate():
     # Arrange
-    data_transformer = Mock()
-    structured_engine = StructuredEngine(data_transformer)
+    data_processor = Mock()
+    structured_engine = StructuredEngine(data_processor)
     data = Mock()
     structured_analysis = Mock()
     operators = {"DEFAULT": OperatorConfig("replace")}
@@ -18,15 +21,15 @@
     structured_engine.anonymize(data, structured_analysis, operators)
 
     # Assert
-    data_transformer.operate.assert_called_once_with(
+    data_processor.operate.assert_called_once_with(
         data, structured_analysis, operators
     )
 
 
 def test_structured_engine_anonymize_adds_default_operator_if_none_provided():
     # Arrange
-    data_transformer = Mock()
-    structured_engine = StructuredEngine(data_transformer)
+    data_processor = Mock()
+    structured_engine = StructuredEngine(data_processor)
     data = Mock()
     structured_analysis = Mock()
 
@@ -34,15 +37,15 @@
     structured_engine.anonymize(data, structured_analysis)
 
     # Assert
-    data_transformer.operate.assert_called_once()
-    args, _ = data_transformer.operate.call_args
+    data_processor.operate.assert_called_once()
+    args, _ = data_processor.operate.call_args
     assert "DEFAULT" in args[2]
 
 
 def test_structured_engine_anonymize_does_not_override_existing_default_operator():
     # Arrange
-    data_transformer = Mock()
-    structured_engine = StructuredEngine(data_transformer)
+    data_processor = Mock()
+    structured_engine = StructuredEngine(data_processor)
     data = Mock()
     structured_analysis = Mock()
     operators = {"DEFAULT": OperatorConfig("custom")}
@@ -51,6 +54,19 @@
     structured_engine.anonymize(data, structured_analysis, operators)
 
     # Assert
-    data_transformer.operate.assert_called_once_with(
+    data_processor.operate.assert_called_once_with(
         data, structured_analysis, operators
     )
+
+def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis):
+    data_processor = JsonDataProcessor()
+    structured_engine = StructuredEngine(data_processor)
+    data = pd.DataFrame({"name": ["John", "Jane"]})
+    with pytest.raises(ValueError):
+        structured_engine.anonymize(data, tabular_analysis)
+
+def test_pandas_processor_with_json_will_raise(json_analysis):
+    structured_engine = StructuredEngine()  # default PandasDataProcessor
+    data = {"name": ["John", "Jane"]}
+    with pytest.raises(ValueError):
+        structured_engine.anonymize(data, json_analysis)
\ No newline at end of file
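Taken together, the new PandasDataProcessor default and the two ValueError tests above pin down the intended pairing: pandas DataFrames go through the default PandasDataProcessor, while dict/JSON-like payloads need an explicit JsonDataProcessor. A minimal usage sketch under those assumptions (the entity mapping and operator choice are illustrative, not taken from the test fixtures):

```python
import pandas as pd
from presidio_anonymizer.entities import OperatorConfig
from presidio_structured import JsonDataProcessor, StructuredAnalysis, StructuredEngine

# Illustrative mapping and operators; real code would typically build the
# analysis with TabularAnalysisBuilder / JsonAnalysisBuilder instead.
analysis = StructuredAnalysis(entity_mapping={"name": "PERSON", "email": "EMAIL_ADDRESS"})
operators = {"DEFAULT": OperatorConfig("replace")}

# Tabular input -> default PandasDataProcessor (the new default in this change).
df = pd.DataFrame({"name": ["John Doe"], "email": ["john@example.com"]})
anonymized_df = StructuredEngine().anonymize(df, analysis, operators)

# Dict / JSON-like input -> JsonDataProcessor; swapping the two combinations
# raises ValueError, which is exactly what the new tests assert.
record = {"name": "John Doe", "email": "john@example.com"}
anonymized_record = StructuredEngine(JsonDataProcessor()).anonymize(record, analysis, operators)
```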