Skip to content

Commit

Permalink
Improve canonical ISCC string validation
Browse files Browse the repository at this point in the history
  • Loading branch information
titusz committed Jan 29, 2024
1 parent 2b10240 commit c5bc58b
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## [1.0.8] - Unreleased
- Import from pydantic v2 lib first
- Fixed prefix extraction during normalization
- Improved canonical ISCC string validation

## [1.0.7] - 2024-01-07
- Support pydantic v1 & v2
Expand Down
1 change: 1 addition & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## [1.0.8] - Unreleased
- Import from pydantic v2 lib first
- Fixed prefix extraction during normalization
- Improved canonical ISCC string validation

## [1.0.7] - 2024-01-07
- Support pydantic v1 & v2
Expand Down
30 changes: 25 additions & 5 deletions iscc_core/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def encode_component(mtype, stype, version, bit_length, digest):
# type: (MainType, SubType, Version, Length, bytes) -> str
"""
Encode an ISCC unit including header and body with standard base32 encoding.
Encode an ISCC-UNIT including header and body with standard base32 encoding.
!!! note
The `length` value must be the **length in number of bits** for the component.
Expand Down Expand Up @@ -67,11 +67,10 @@ def encode_header(mtype, stype, version=0, length=1):
:rtype: bytes
"""
# TODO verify that all header params and their combination are valid
header = bitarray()
for n in (mtype, stype, version, length):
header += encode_varnibble(n)
# Append zero-padding if required (right side, least significant bits).
# Append zero-padding if required (right side, least-significant bits).
header.fill()
return header.tobytes()

Expand Down Expand Up @@ -238,7 +237,7 @@ def encode_length(mtype, length):
if 0 <= length <= 7:
return length
raise ValueError(error)
# counter byte lenght encoding
# counter byte length encoding
elif mtype == MT.ID:
if 64 <= length <= 96:
return (length - 64) // 8
Expand Down Expand Up @@ -503,7 +502,7 @@ def iscc_validate(iscc, strict=True):
- an ISCC-CODE or ISCC-UNIT
- encoded with base32 upper without padding
- has a valid combination of header values
- is represented in its canonical URI form
- is represented in its canonical form
:param str iscc: ISCC string
:param bool strict: Raise an exception if validation fails (default True)
Expand All @@ -519,21 +518,42 @@ def iscc_validate(iscc, strict=True):
else:
return False

# Base32 encoding test
try:
decode_base32(iscc.split(":")[1])
except Exception as e:
if strict:
raise ValueError(e)
else:
return False

cleaned = iscc_clean(iscc)

# Prefix test
prefix = cleaned[:2]
if prefix not in PREFIXES:
if strict:
raise ValueError(f"Header starts with invalid sequence {prefix}")
else:
return False

# Version test
m, s, v, l, t = decode_header(decode_base32(cleaned))
if v != 0:
if strict:
raise ValueError(f"Unknown version {v} in version header")
else:
return False

# Length test
expected_nbyptes = decode_length(m, l).value // 8
actual_nbyptes = len(t)
if expected_nbyptes != actual_nbyptes:
if strict:
raise ValueError(f"Header expects {expected_nbyptes} but got {actual_nbyptes} bytes")
else:
return False

return True


Expand Down
10 changes: 0 additions & 10 deletions iscc_core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,6 @@ def rnd(cls, mt=None, st=None, bits=64, data=None):
# Length
ln_bits = bits or cls.rgen.choice(list(LN)).value
if mt == MT.ISCC:
# TODO fix random ISCC with custom SubType generation
ln_code = encode_units(units)
else:
ln_code = encode_length(mt, bits)
Expand Down Expand Up @@ -290,15 +289,6 @@ def mf_base64url(self) -> str:
"""Multiformats base64url encoded."""
return "u" + encode_base64(self.mc_bytes)

# TODO: bech32m support
# @property
# def bech32m(self):
# """Encode as bech32m with hrp `iscc`"""
# data = [bech32.CHARSET.find(c) for c in self.code.lower()]
# return bech32.bech32_encode(
# "iscc", data, bech32.Encoding.BECH32M
# )

def __xor__(self, other) -> int:
    """Support `a ^ b` as shorthand for the hamming distance between two code bodies."""
    own_body, other_body = self._body, other._body
    return count_xor(own_body, other_body)
Expand Down
1 change: 0 additions & 1 deletion tests/test_code_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def test_gen_meta_code_name_only():


def test_gen_meta_code_name_and_desc():
# TODO handle cases with consecutive newlines and spaces in between
result = ic.gen_meta_code_v0("Hello World", "# Some\n\n\n description")
assert result == {
"iscc": "ISCC:AAAWN77F72MBZZK3",
Expand Down
20 changes: 20 additions & 0 deletions tests/test_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,3 +617,23 @@ def test_iscc_code_no_content_code():
def test_models_Code_rnd_custom_subtype_raises():
    """Requesting a random ISCC-CODE with an explicit SubType must raise ValueError."""
    with pytest.raises(ValueError):
        ic.Code.rnd(mt=ic.MT.ISCC, st=ic.ST_ISCC.TEXT)


def test_iscc_validate_base_encoding_error():
    """A well-formed ISCC validates; dropping its last character must fail validation."""
    valid = "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI"
    truncated = valid[:-1]

    # Non-strict mode reports the outcome as a boolean.
    assert ic.iscc_validate(valid, strict=False) is True
    assert ic.iscc_validate(truncated, strict=False) is False

    # Strict mode raises instead; the message is checked after the block exits.
    with pytest.raises(ValueError) as err:
        ic.iscc_validate(truncated, strict=True)
    assert str(err.value) == "Incorrect padding"


def test_iscc_validate_bad_length():
    """An ISCC whose body length disagrees with its header must fail validation."""
    short_body = "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ"
    expected_message = "Header expects 32 but got 26 bytes"

    # Non-strict mode reports the mismatch as a plain False.
    assert ic.iscc_validate(short_body, strict=False) is False

    # Strict mode raises; the message states expected vs. actual byte counts.
    with pytest.raises(ValueError) as err:
        ic.iscc_validate(short_body, strict=True)
    assert str(err.value) == expected_message
1 change: 0 additions & 1 deletion tests/test_iscc_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def test_gen_iscc_code_v0_no_meta():
def test_gen_iscc_code_v0_no_meta_content():
icode = ic.gen_iscc_code_v0([DID_128, IID_256])
assert icode == {"iscc": "ISCC:KUAFVC5DMJJGYKZ4MQV6B6THIBBUG"}
# TODO maybe show length for SubType SUM as we know the unit composition.
# we may also get a ISCC-SUM-V0-256 version
assert ic.iscc_explain(icode["iscc"]) == "ISCC-SUM-V0-DI-5a8ba362526c2b3c642be0fa67404343"

Expand Down

0 comments on commit c5bc58b

Please sign in to comment.