diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
index 61c6386d6..cf07a16f4 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
@@ -33,6 +33,7 @@
 from .au_tfn_recognizer import AuTfnRecognizer
 from .au_medicare_recognizer import AuMedicareRecognizer
 from .in_pan_recognizer import InPanRecognizer
+from .pl_pesel_recognizer import PlPeselRecognizer
 
 
 NLP_RECOGNIZERS = {
@@ -74,4 +75,5 @@
     "ItIdentityCardRecognizer",
     "ItPassportRecognizer",
     "InPanRecognizer",
+    "PlPeselRecognizer",
 ]
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/pl_pesel_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/pl_pesel_recognizer.py
new file mode 100644
index 000000000..065b281b8
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/pl_pesel_recognizer.py
@@ -0,0 +1,70 @@
+from typing import List, Optional
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class PlPeselRecognizer(PatternRecognizer):
+    """
+    Recognize PESEL number using regex and checksum.
+
+    For more information about PESEL: https://en.wikipedia.org/wiki/PESEL
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    """
+
+    # YYMMDD + 5 digits; the month's tens digit also encodes the century,
+    # so the accepted months are 01-12, 21-32, 41-52, 61-72 and 81-92.
+    PATTERNS = [
+        Pattern(
+            "PESEL",
+            r"[0-9]{2}([02468][1-9]|[13579][012])(0[1-9]|1[0-9]|2[0-9]|3[01])[0-9]{5}",
+            0.4,
+        ),
+    ]
+
+    CONTEXT = ["PESEL"]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "pl",
+        supported_entity: str = "PL_PESEL",
+    ):
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> bool:
+        """
+        Validate the PESEL control digit of a matched string.
+
+        The first ten digits are multiplied by the weights
+        1, 3, 7, 9, 1, 3, 7, 9, 1, 3 and summed; the control digit is
+        ``(10 - sum % 10) % 10`` and must equal the eleventh digit.
+
+        :param pattern_text: Text matched by one of the patterns
+        :return: True if the text is 11 digits with a correct control digit
+        """
+        # Custom patterns may match text that is not exactly 11 digits.
+        if len(pattern_text) != 11 or not pattern_text.isdecimal():
+            return False
+
+        digits = [int(digit) for digit in pattern_text]
+        weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3]
+
+        weighted_sum = sum(
+            digit * weight for digit, weight in zip(digits[:10], weights)
+        )
+        # The outer modulo maps a weighted sum ending in 0 to control digit 0.
+        control_digit = (10 - weighted_sum % 10) % 10
+
+        return control_digit == digits[10]
diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry.py
index df2967b75..2f1f09833 100644
--- a/presidio-analyzer/presidio_analyzer/recognizer_registry.py
+++ b/presidio-analyzer/presidio_analyzer/recognizer_registry.py
@@ -44,6 +44,7 @@
     ItPassportRecognizer,
     ItIdentityCardRecognizer,
     InPanRecognizer,
+    PlPeselRecognizer,
 )
 
 logger = logging.getLogger("presidio-analyzer")
@@ -109,6 +110,7 @@ def load_predefined_recognizers(
                 ItIdentityCardRecognizer,
                 ItPassportRecognizer,
             ],
+            "pl": [PlPeselRecognizer],
             "ALL": [
                 CreditCardRecognizer,
                 CryptoRecognizer,
diff --git a/presidio-analyzer/tests/test_pl_pesel_recognizer.py b/presidio-analyzer/tests/test_pl_pesel_recognizer.py
new file mode 100644
index 000000000..ee6cc01c0
--- /dev/null
+++ b/presidio-analyzer/tests/test_pl_pesel_recognizer.py
@@ -0,0 +1,40 @@
+import pytest
+
+from tests import assert_result
+from presidio_analyzer.predefined_recognizers import PlPeselRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return PlPeselRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["PL_PESEL"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions",
+    [
+        # fmt: off
+        # valid PESEL scores
+        ("11111111116", 1, ((0, 11),),),
+        ("44051401359", 1, ((0, 11),),),
+        ("My pesel is 11111111116.", 1, ((12, 23), )),
+        # invalid PESEL scores
+        ("1111321111", 0, ()),
+        ("11110021111", 0, ()),
+        ("11-11-11-11114", 0, ()),
+        # well-formed but wrong control digit
+        ("11111111114", 0, ()),
+        # fmt: on
+    ],
+)
+def test_when_all_pl_pesels_then_succeed(
+    text, expected_len, expected_positions, recognizer, entities, max_score
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos) in zip(results, expected_positions):
+        assert_result(res, entities[0], st_pos, fn_pos, max_score)