diff --git a/CHANGELOG.md b/CHANGELOG.md index 70048e6..405001e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## [1.0.8] - Unreleased - Import from pydantic v2 lib first - Fixed prefix extraction during normalization +- Improved canonical ISCC string validation ## [1.0.7] - 2024-01-07 - Support pydantic v1 & v2 diff --git a/docs/changelog.md b/docs/changelog.md index 70048e6..405001e 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -3,6 +3,7 @@ ## [1.0.8] - Unreleased - Import from pydantic v2 lib first - Fixed prefix extraction during normalization +- Improved canonical ISCC string validation ## [1.0.7] - 2024-01-07 - Support pydantic v1 & v2 diff --git a/iscc_core/codec.py b/iscc_core/codec.py index b1ab987..51d6ed3 100644 --- a/iscc_core/codec.py +++ b/iscc_core/codec.py @@ -19,7 +19,7 @@ def encode_component(mtype, stype, version, bit_length, digest): # type: (MainType, SubType, Version, Length, bytes) -> str """ - Encode an ISCC unit inlcuding header and body with standard base32 encoding. + Encode an ISCC-UNIT including header and body with standard base32 encoding. !!! note The `length` value must be the **length in number of bits** for the component. @@ -67,11 +67,10 @@ def encode_header(mtype, stype, version=0, length=1): :rtype: bytes """ - # TODO verify that all header params and there combination is valid header = bitarray() for n in (mtype, stype, version, length): header += encode_varnibble(n) - # Append zero-padding if required (right side, least significant bits). + # Append zero-padding if required (right side, least-significant bits). 
header.fill() return header.tobytes() @@ -238,7 +237,7 @@ def encode_length(mtype, length): if 0 <= length <= 7: return length raise ValueError(error) - # counter byte lenght encoding + # counter byte length encoding elif mtype == MT.ID: if 64 <= length <= 96: return (length - 64) // 8 @@ -503,7 +502,7 @@ def iscc_validate(iscc, strict=True): - an ISCC-CODE or ISCC-UNIT - encoded with base32 upper without padding - has a valid combination of header values - - is represented in its canonical URI form + - is represented in its canonical form :param str iscc: ISCC string :param bool strict: Raise an exeption if validation fails (default True) @@ -519,7 +518,18 @@ def iscc_validate(iscc, strict=True): else: return False + # Base32 encoding test + try: + decode_base32(iscc.split(":")[1]) + except Exception as e: + if strict: + raise ValueError(e) + else: + return False + cleaned = iscc_clean(iscc) + + # Prefix test prefix = cleaned[:2] if prefix not in PREFIXES: if strict: @@ -527,6 +537,7 @@ def iscc_validate(iscc, strict=True): else: return False + # Version test m, s, v, l, t = decode_header(decode_base32(cleaned)) if v != 0: if strict: @@ -534,6 +545,15 @@ def iscc_validate(iscc, strict=True): else: return False + # Length test + expected_nbyptes = decode_length(m, l).value // 8 + actual_nbyptes = len(t) + if expected_nbyptes != actual_nbyptes: + if strict: + raise ValueError(f"Header expects {expected_nbyptes} but got {actual_nbyptes} bytes") + else: + return False + return True diff --git a/iscc_core/models.py b/iscc_core/models.py index f6a5db2..dcf226f 100644 --- a/iscc_core/models.py +++ b/iscc_core/models.py @@ -246,7 +246,6 @@ def rnd(cls, mt=None, st=None, bits=64, data=None): # Length ln_bits = bits or cls.rgen.choice(list(LN)).value if mt == MT.ISCC: - # TODO fix ramdom ISCC with custom SubType generation ln_code = encode_units(units) else: ln_code = encode_length(mt, bits) @@ -290,15 +289,6 @@ def mf_base64url(self) -> str: """Multiformats base64url 
encoded.""" return "u" + encode_base64(self.mc_bytes) - # TODO: bech32m support - # @property - # def bech32m(self): - # """Encode as bech32m with hrp `iscc`""" - # data = [bech32.CHARSET.find(c) for c in self.code.lower()] - # return bech32.bech32_encode( - # "iscc", data, bech32.Encoding.BECH32M - # ) - def __xor__(self, other) -> int: """Use XOR operator for hamming distance calculation.""" return count_xor(self._body, other._body) diff --git a/tests/test_code_meta.py b/tests/test_code_meta.py index 88a9576..a953366 100644 --- a/tests/test_code_meta.py +++ b/tests/test_code_meta.py @@ -23,7 +23,6 @@ def test_gen_meta_code_name_only(): def test_gen_meta_code_name_and_desc(): - # TODO handle cases with consecutive newlines and spaces in between result = ic.gen_meta_code_v0("Hello World", "# Some\n\n\n description") assert result == { "iscc": "ISCC:AAAWN77F72MBZZK3", diff --git a/tests/test_codec.py b/tests/test_codec.py index 0c77093..fd35b93 100644 --- a/tests/test_codec.py +++ b/tests/test_codec.py @@ -617,3 +617,23 @@ def test_iscc_code_no_content_code(): def test_models_Code_rnd_custom_subtype_raises(): with pytest.raises(ValueError): ic.Code.rnd(ic.MT.ISCC, st=ic.ST_ISCC.TEXT) + + +def test_iscc_validate_base_encoding_error(): + sample = "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI" + assert ic.iscc_validate(sample, strict=False) is True + + assert ic.iscc_validate(sample[:-1], strict=False) is False + + with pytest.raises(ValueError) as excinfo: + ic.iscc_validate(sample[:-1], strict=True) + assert str(excinfo.value) == "Incorrect padding" + + +def test_iscc_validate_bad_length(): + sample = "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ" + assert ic.iscc_validate(sample, strict=False) is False + + with pytest.raises(ValueError) as excinfo: + ic.iscc_validate(sample, strict=True) + assert str(excinfo.value) == "Header expects 32 but got 26 bytes" diff --git a/tests/test_iscc_code.py b/tests/test_iscc_code.py index 6f40006..84acd44 
100644 --- a/tests/test_iscc_code.py +++ b/tests/test_iscc_code.py @@ -41,7 +41,6 @@ def test_gen_iscc_code_v0_no_meta(): def test_gen_iscc_code_v0_no_meta_content(): icode = ic.gen_iscc_code_v0([DID_128, IID_256]) assert icode == {"iscc": "ISCC:KUAFVC5DMJJGYKZ4MQV6B6THIBBUG"} - # TODO mabye show length for SubType SUM as we now the unit composition. # we may also get a ISCC-SUM-V0-256 version assert ic.iscc_explain(icode["iscc"]) == "ISCC-SUM-V0-DI-5a8ba362526c2b3c642be0fa67404343"