From b16072b66e387ceb9d326bbb233c35c8e344efab Mon Sep 17 00:00:00 2001 From: Titusz Pan Date: Mon, 18 Nov 2024 11:57:47 +0100 Subject: [PATCH] Add multiformats support for codec convenience functions --- CHANGELOG.md | 1 + docs/changelog.md | 1 + iscc_core/codec.py | 57 ++++++++++++++++++++++++-------------- tests/test_multiformats.py | 23 +++++++++++++++ 4 files changed, 61 insertions(+), 21 deletions(-) create mode 100644 tests/test_multiformats.py diff --git a/CHANGELOG.md b/CHANGELOG.md index be815a4..a28bfc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - Set CDC index to size if size is smaller than minimum chunk size - Added Python 3.13 support - Added .env to .gitignore +- Added multiformats support for codec convenience functions - Updeted locked dependencies - Removed EOL Python 3.8/3.9 support diff --git a/docs/changelog.md b/docs/changelog.md index be815a4..a28bfc1 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -4,6 +4,7 @@ - Set CDC index to size if size is smaller than minimum chunk size - Added Python 3.13 support - Added .env to .gitignore +- Added multiformats support for codec convenience functions - Updeted locked dependencies - Removed EOL Python 3.8/3.9 support diff --git a/iscc_core/codec.py b/iscc_core/codec.py index 2b373c7..a78214d 100644 --- a/iscc_core/codec.py +++ b/iscc_core/codec.py @@ -325,6 +325,33 @@ def decode_base32hex(code): return decode_base32(b32) +def normalize_multiformat(iscc_code): + """ + Transcode a multiformat encoded ISCC to standard base32 encoding. + Returns the input unchanged (but cleaned) if it's not multiformat encoded. 
+ """ + decoders = { + MULTIBASE.base16.value: bytes.fromhex, # f + MULTIBASE.base32.value: decode_base32, # b + MULTIBASE.base32hex.value: decode_base32hex, # v + MULTIBASE.base58btc.value: base58.b58decode, # z + MULTIBASE.base64url.value: decode_base64, # u + } + + # Clean the ISCC code first + iscc_code = iscc_clean(iscc_code) + + # Check for multibase prefix (slice, so empty input cannot raise IndexError) + multibase_prefix = iscc_code[:1] + if multibase_prefix in decoders: + decoder = decoders[multibase_prefix] + decoded = decoder(iscc_code[1:]) + if not decoded.startswith(MC_PREFIX): + raise ValueError(f"Malformed multiformat codec: {decoded[:2]}") + return encode_base32(decoded[2:]) + return iscc_code + + + def iscc_decompose(iscc_code): # type: (str) -> List[str] """ @@ -333,9 +360,10 @@ def iscc_decompose(iscc_code): A valid ISCC sequence is a string concatenation of ISCC-UNITS optionally seperated by a hyphen. """ - iscc_code = iscc_clean(iscc_code) - components = [] + # Handle multiformat encoding first + iscc_code = normalize_multiformat(iscc_code) + components = [] raw_code = decode_base32(iscc_code) while raw_code: mt, st, vs, ln, body = decode_header(raw_code) @@ -400,26 +428,13 @@ def iscc_normalize(iscc_code): """ from iscc_core.iscc_code import gen_iscc_code_v0 - decoders = { - MULTIBASE.base16.value: bytes.fromhex, # f - MULTIBASE.base32.value: decode_base32, # b - MULTIBASE.base32hex.value: decode_base32hex, # v - MULTIBASE.base58btc.value: base58.b58decode, # z - MULTIBASE.base64url.value: decode_base64, # u - } + # Handle multiformat encoding first + iscc_code = normalize_multiformat(iscc_code) - # Transcode to base32 if encoded - multibase_prefix = iscc_code[0] - if multibase_prefix in decoders.keys(): - decoder = decoders[multibase_prefix] - decoded = decoder(iscc_code[1:]) - if not decoded.startswith(MC_PREFIX): - raise ValueError(f"Malformed multiformat codec: {decoded[:2]}") - iscc_code = encode_base32(decoded[2:]) - else: - prefix = iscc_code.upper().replace("ISCC:", "")[:2] - if 
prefix not in PREFIXES: - raise ValueError(f"ISCC starts with invalid prefix {prefix}") + # Validate prefix + prefix = iscc_code.upper()[:2] + if prefix not in PREFIXES: + raise ValueError(f"ISCC starts with invalid prefix {prefix}") decomposed = iscc_decompose(iscc_code) recomposed = gen_iscc_code_v0(decomposed)["iscc"] if len(decomposed) >= 2 else decomposed[0] diff --git a/tests/test_multiformats.py b/tests/test_multiformats.py new file mode 100644 index 0000000..1e53197 --- /dev/null +++ b/tests/test_multiformats.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +import iscc_core as ic + +CANONICAL = "ISCC:EAAWFH3PX3MCYB6N" +MF_B32H = "vpg0i00b2jtnrtm1c0v6g" +MF_B32H_P = "iscc:vpg0i00b2jtnrtm1c0v6g" +ISCC_OBJ = ic.Code(CANONICAL) + + +def test_iscc_clean(): + assert ic.iscc_clean(MF_B32H) == MF_B32H + assert ic.iscc_clean(MF_B32H_P) == MF_B32H + + +def test_iscc_decode(): + assert ic.iscc_decode(MF_B32H) == ISCC_OBJ._head + (ISCC_OBJ._body.tobytes(),) + assert ic.iscc_decode(MF_B32H_P) == ISCC_OBJ._head + (ISCC_OBJ._body.tobytes(),) + + +def test_iscc_decompose(): + decomposed = ic.iscc_decompose(ISCC_OBJ.uri) + assert ic.iscc_decompose(MF_B32H) == decomposed + assert ic.iscc_decompose(MF_B32H_P) == decomposed