Skip to content

Commit

Permalink
Add PL PESEL recognizer (#1209)
Browse files Browse the repository at this point in the history
  • Loading branch information
bckamil authored Nov 14, 2023
1 parent d68c44b commit 2f30b37
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from .au_tfn_recognizer import AuTfnRecognizer
from .au_medicare_recognizer import AuMedicareRecognizer
from .in_pan_recognizer import InPanRecognizer
from .pl_pesel_recognizer import PlPeselRecognizer


NLP_RECOGNIZERS = {
Expand Down Expand Up @@ -74,4 +75,5 @@
"ItIdentityCardRecognizer",
"ItPassportRecognizer",
"InPanRecognizer",
"PlPeselRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class PlPeselRecognizer(PatternRecognizer):
    """
    Recognize PESEL number using regex and checksum.

    For more information about PESEL: https://en.wikipedia.org/wiki/PESEL

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    # Eleven digits laid out as YY MM DD + five trailing digits.
    # The month group accepts an even tens digit followed by 1-9 or an odd
    # tens digit followed by 0-2 (01-12, 21-32, 41-52, ...), which covers the
    # century offsets PESEL adds to the month. The day group accepts 01-31.
    PATTERNS = [
        Pattern(
            "PESEL",
            r"[0-9]{2}([02468][1-9]|[13579][012])(0[1-9]|1[0-9]|2[0-9]|3[01])[0-9]{5}",
            0.4,
        ),
    ]

    CONTEXT = ["PESEL"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "pl",
        supported_entity: str = "PL_PESEL",
    ):
        # Fall back to the class-level defaults when nothing (or an empty
        # list) is supplied.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Validate the PESEL check digit of the matched text.

        The first ten digits are multiplied by the weights
        1, 3, 7, 9, 1, 3, 7, 9, 1, 3 and summed. The eleventh digit must be
        the ten's complement of the last digit of that sum, i.e.
        ``(10 - sum % 10) % 10``. For example ``44051401359`` has a weighted
        sum of 101, so its check digit is 9.

        :param pattern_text: Text matched by one of the recognizer's patterns
        :return: True if the text is 11 ASCII digits with a correct check
            digit, False otherwise
        """
        # The default pattern always yields 11 digits, but user-supplied
        # patterns may not; reject such text rather than raise.
        if len(pattern_text) != 11 or not all(
            "0" <= char <= "9" for char in pattern_text
        ):
            return False

        digits = [int(char) for char in pattern_text]
        weights = (1, 3, 7, 9, 1, 3, 7, 9, 1, 3)

        weighted_sum = sum(
            digit * weight for digit, weight in zip(digits[:10], weights)
        )
        # The outer modulo maps a complement of 10 back to check digit 0.
        expected_check_digit = (10 - weighted_sum % 10) % 10

        return expected_check_digit == digits[10]
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
ItPassportRecognizer,
ItIdentityCardRecognizer,
InPanRecognizer,
PlPeselRecognizer,
)

logger = logging.getLogger("presidio-analyzer")
Expand Down Expand Up @@ -109,6 +110,7 @@ def load_predefined_recognizers(
ItIdentityCardRecognizer,
ItPassportRecognizer,
],
"pl": [PlPeselRecognizer],
"ALL": [
CreditCardRecognizer,
CryptoRecognizer,
Expand Down
37 changes: 37 additions & 0 deletions presidio-analyzer/tests/test_pl_pesel_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import PlPeselRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured PlPeselRecognizer shared by this module."""
    pesel_recognizer = PlPeselRecognizer()
    return pesel_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the entity labels requested from the recognizer in this module."""
    requested_entities = ["PL_PESEL"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_positions",
    [
        # fmt: off
        # valid PESEL (date part 44-05-14, weighted sum 105, check digit 5)
        ("44051401755", 1, ((0, 11),),),
        ("My pesel is 44051401755.", 1, ((12, 23), )),
        # invalid: only ten digits
        ("1111321111", 0, ()),
        # invalid: day field is "00"
        ("11110021111", 0, ()),
        # invalid: digits broken up by separators
        ("11-11-11-11114", 0, ()),
        # invalid: well-formed date but wrong check digit (should be 5)
        ("44051401750", 0, ()),
        # fmt: on
    ],
)
def test_when_all_pl_pesels_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """
    Check that PESEL numbers are found at the expected spans and scores.

    :param text: Input text handed to the recognizer
    :param expected_len: Number of results the recognizer should return
    :param expected_positions: (start, end) span for each expected result
    :param recognizer: Module-scoped PlPeselRecognizer fixture
    :param entities: Entity labels to request from the recognizer
    :param max_score: Expected score of a validated match; fixture is
        defined outside this file (presumably in conftest - confirm)
    """
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len
    for res, (st_pos, fn_pos) in zip(results, expected_positions):
        assert_result(res, entities[0], st_pos, fn_pos, max_score)

0 comments on commit 2f30b37

Please sign in to comment.