Skip to content

Commit

Permalink
pr comments, nits, minor tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakob-98 committed Nov 24, 2023
1 parent f637f34 commit db1f3d8
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 58 deletions.
43 changes: 21 additions & 22 deletions docs/samples/python/example_structured.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This sample showcases presidio-structured on structured and semi-structured data containing sensitive data like names, emails, and addresses. It differs from the batch analyzer/anonymizer engines sample, which includes narrative phrases that might contain sensitive data. The presence of personal data embedded in these phrases requires analyzing and anonymizing the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns."
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -109,13 +115,6 @@
"sample_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This sample is a structured dataset containing sensitive data like names, emails, and addresses. It differs from the sample for the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. The presence of personal data embedded in these phrases requires to analyze and to anonymize the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns."
]
},
{
"cell_type": "code",
"execution_count": 3,
Expand Down Expand Up @@ -216,7 +215,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -291,7 +290,7 @@
"2 3 <None> <None> 789 Pine St <None> <None> 11223"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -352,7 +351,7 @@
" <th>0</th>\n",
" <td>1</td>\n",
" <td>person...</td>\n",
" <td>mccoyryan@example.com</td>\n",
" <td>jamestaylor@example.net</td>\n",
" <td>123 Main St</td>\n",
" <td>&lt;None&gt;</td>\n",
" <td>&lt;None&gt;</td>\n",
Expand All @@ -362,7 +361,7 @@
" <th>1</th>\n",
" <td>2</td>\n",
" <td>person...</td>\n",
" <td>harristricia@example.net</td>\n",
" <td>brian49@example.com</td>\n",
" <td>456 Elm St</td>\n",
" <td>&lt;None&gt;</td>\n",
" <td>&lt;None&gt;</td>\n",
Expand All @@ -372,7 +371,7 @@
" <th>2</th>\n",
" <td>3</td>\n",
" <td>person...</td>\n",
" <td>thomasmikayla@example.org</td>\n",
" <td>clarkcody@example.org</td>\n",
" <td>789 Pine St</td>\n",
" <td>&lt;None&gt;</td>\n",
" <td>&lt;None&gt;</td>\n",
Expand All @@ -383,10 +382,10 @@
"</div>"
],
"text/plain": [
" id name email street city state \\\n",
"0 1 person... mccoyryan@example.com 123 Main St <None> <None> \n",
"1 2 person... harristricia@example.net 456 Elm St <None> <None> \n",
"2 3 person... thomasmikayla@example.org 789 Pine St <None> <None> \n",
" id name email street city state \\\n",
"0 1 person... jamestaylor@example.net 123 Main St <None> <None> \n",
"1 2 person... brian49@example.com 456 Elm St <None> <None> \n",
"2 3 person... clarkcody@example.org 789 Pine St <None> <None> \n",
"\n",
" postal_code \n",
"0 12345 \n",
Expand Down Expand Up @@ -428,7 +427,7 @@
{
"data": {
"text/plain": [
"StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'URL', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})"
"StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'EMAIL_ADDRESS', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})"
]
},
"execution_count": 8,
Expand Down Expand Up @@ -481,7 +480,7 @@
"text/plain": [
"{'id': 1,\n",
" 'name': 'person...',\n",
" 'email': '<None>',\n",
" 'email': 'virginia29@example.org',\n",
" 'address': {'street': '123 Main St',\n",
" 'city': '<None>',\n",
" 'state': '<None>',\n",
Expand Down Expand Up @@ -510,21 +509,21 @@
"text/plain": [
"{'users': [{'id': 1,\n",
" 'name': 'person...',\n",
" 'email': 'tricia10@example.com',\n",
" 'email': 'david90@example.org',\n",
" 'address': {'street': '<None>',\n",
" 'city': '<None>',\n",
" 'state': '<None>',\n",
" 'postal_code': '12345'}},\n",
" {'id': 2,\n",
" 'name': 'person...',\n",
" 'email': 'tricia10@example.com',\n",
" 'email': 'david90@example.org',\n",
" 'address': {'street': '<None>',\n",
" 'city': '<None>',\n",
" 'state': '<None>',\n",
" 'postal_code': '67890'}},\n",
" {'id': 3,\n",
" 'name': 'person...',\n",
" 'email': 'tricia10@example.com',\n",
" 'email': 'david90@example.org',\n",
" 'address': {'street': '<None>',\n",
" 'city': '<None>',\n",
" 'state': '<None>',\n",
Expand Down
5 changes: 1 addition & 4 deletions presidio-structured/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,4 @@

# Set up default logging (with NullHandler)


logging.getLogger("presidio-str").addHandler(logging.NullHandler())

# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"]
logging.getLogger("presidio-structured").addHandler(logging.NullHandler())
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _generate_analysis_from_results_json(
self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = ""
) -> StructuredAnalysis:
"""
Generate a configuration from the given analyzer results.
Generate a configuration from the given analyzer results. Always uses the first recognizer result if there is more than one.
:param analyzer_results: The analyzer results.
:param prefix: The prefix for the configuration keys.
Expand Down
4 changes: 1 addition & 3 deletions presidio-structured/presidio_structured/structured_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,12 @@ class StructuredEngine:
Class to implement methods for anonymizing tabular data.
"""

def __init__(self, data_processor: DataProcessorBase = None) -> None:
    """
    Initialize the class with a data processor.

    :param data_processor: Instance of DataProcessorBase. When omitted (None),
        a new PandasDataProcessor is created for this engine.
    """
    # Build the default inside the call rather than in the signature: a default
    # written as ``PandasDataProcessor()`` is evaluated once, when the function
    # is defined, so every engine created without an argument would share the
    # same processor object.
    if data_processor is None:
        data_processor = PandasDataProcessor()
    self.data_processor = data_processor

def anonymize(
Expand Down
34 changes: 17 additions & 17 deletions presidio-structured/tests/data/test_data_transformers.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
import pytest
from pandas import DataFrame
from presidio_anonymizer.entities import OperatorConfig
from presidio_structured.data.data_transformers import DataTransformerBase, PandasDataTransformer, JsonDataTransformer
from presidio_structured.data.data_processors import DataProcessorBase, PandasDataProcessor, JsonDataProcessor
from presidio_structured.config import StructuredAnalysis

class TestDataTransformerBase:
class TestDataProcessorBase:
def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators):
with pytest.raises(TypeError):
DataTransformerBase()
DataProcessorBase()

class TestPandasDataTransformer:
class TestPandasDataProcessor:
def test_process(self, sample_df, operators, tabular_analysis):
transformer = PandasDataTransformer()
result = transformer.operate(sample_df, tabular_analysis, operators)
processor = PandasDataProcessor()
result = processor.operate(sample_df, tabular_analysis, operators)
assert isinstance(result, DataFrame)
for key in tabular_analysis.entity_mapping:
if key == 'name':
Expand All @@ -21,19 +21,19 @@ def test_process(self, sample_df, operators, tabular_analysis):
assert all(result[key] == "DEFAULT_REPLACEMENT")

def test_process_no_default_should_raise(self, sample_df, operators_no_default, tabular_analysis):
transformer = PandasDataTransformer()
processor = PandasDataProcessor()
with pytest.raises(ValueError):
transformer.operate(sample_df, tabular_analysis, operators_no_default)
processor.operate(sample_df, tabular_analysis, operators_no_default)

def test_process_invalid_data(self, sample_json, tabular_analysis, operators):
transformer = PandasDataTransformer()
processor = PandasDataProcessor()
with pytest.raises(ValueError):
transformer.operate(sample_json, tabular_analysis, operators)
processor.operate(sample_json, tabular_analysis, operators)

class TestJsonDataTransformer:
class TestJsonDataProcessor:
def test_process(self, sample_json, operators, json_analysis):
transformer = JsonDataTransformer()
result = transformer.operate(sample_json, json_analysis, operators)
processor = JsonDataProcessor()
result = processor.operate(sample_json, json_analysis, operators)
assert isinstance(result, dict)
for key, value in json_analysis.entity_mapping.items():
keys = key.split(".")
Expand All @@ -46,11 +46,11 @@ def test_process(self, sample_json, operators, json_analysis):
assert nested_value == "DEFAULT_REPLACEMENT"

def test_process_no_default_should_raise(self, sample_json, operators_no_default, json_analysis):
transformer = JsonDataTransformer()
processor = JsonDataProcessor()
with pytest.raises(ValueError):
transformer.operate(sample_json, json_analysis, operators_no_default)
processor.operate(sample_json, json_analysis, operators_no_default)

def test_process_invalid_data(self, sample_df, json_analysis, operators):
transformer = JsonDataTransformer()
processor = JsonDataProcessor()
with pytest.raises(ValueError):
transformer.operate(sample_df, json_analysis, operators)
processor.operate(sample_df, json_analysis, operators)
38 changes: 27 additions & 11 deletions presidio-structured/tests/test_tabular_engine.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
from unittest.mock import Mock
import pandas as pd

import pytest

from presidio_anonymizer.entities import OperatorConfig

from presidio_structured import StructuredEngine
from presidio_structured.data.data_processors import JsonDataProcessor


def test_structured_engine_anonymize_calls_data_transformer_operate():
def test_structured_engine_anonymize_calls_data_processor_operate():
# Arrange
data_transformer = Mock()
structured_engine = StructuredEngine(data_transformer)
data_processor = Mock()
structured_engine = StructuredEngine(data_processor)
data = Mock()
structured_analysis = Mock()
operators = {"DEFAULT": OperatorConfig("replace")}
Expand All @@ -18,31 +21,31 @@ def test_structured_engine_anonymize_calls_data_transformer_operate():
structured_engine.anonymize(data, structured_analysis, operators)

# Assert
data_transformer.operate.assert_called_once_with(
data_processor.operate.assert_called_once_with(
data, structured_analysis, operators
)


def test_structured_engine_anonymize_adds_default_operator_if_none_provided():
# Arrange
data_transformer = Mock()
structured_engine = StructuredEngine(data_transformer)
data_processor = Mock()
structured_engine = StructuredEngine(data_processor)
data = Mock()
structured_analysis = Mock()

# Act
structured_engine.anonymize(data, structured_analysis)

# Assert
data_transformer.operate.assert_called_once()
args, _ = data_transformer.operate.call_args
data_processor.operate.assert_called_once()
args, _ = data_processor.operate.call_args
assert "DEFAULT" in args[2]


def test_structured_engine_anonymize_does_not_override_existing_default_operator():
# Arrange
data_transformer = Mock()
structured_engine = StructuredEngine(data_transformer)
data_processor = Mock()
structured_engine = StructuredEngine(data_processor)
data = Mock()
structured_analysis = Mock()
operators = {"DEFAULT": OperatorConfig("custom")}
Expand All @@ -51,6 +54,19 @@ def test_structured_engine_anonymize_does_not_override_existing_default_operator
structured_engine.anonymize(data, structured_analysis, operators)

# Assert
data_transformer.operate.assert_called_once_with(
data_processor.operate.assert_called_once_with(
data, structured_analysis, operators
)

def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis):
    """A JSON processor handed a pandas DataFrame must fail with ValueError."""
    frame = pd.DataFrame({"name": ["John", "Jane"]})
    engine = StructuredEngine(JsonDataProcessor())
    with pytest.raises(ValueError):
        engine.anonymize(frame, tabular_analysis)

def test_pandas_processor_with_json_will_raise(json_analysis):
    """The default (pandas) processor handed a dict must fail with ValueError."""
    payload = {"name": ["John", "Jane"]}
    engine = StructuredEngine()  # no argument: falls back to the default processor
    with pytest.raises(ValueError):
        engine.anonymize(payload, json_analysis)

0 comments on commit db1f3d8

Please sign in to comment.