diff --git a/iscc_core/cdc.pxd b/iscc_core/cdc.pxd index f884a3a..145bc6d 100644 --- a/iscc_core/cdc.pxd +++ b/iscc_core/cdc.pxd @@ -6,4 +6,4 @@ cdef uint32_t CDC_READ_SIZE = 262144 cdef uint32_t[256] GEAR @cython.locals(pattern=uint32_t, i=uint32_t, size=uint32_t, barrier=uint32_t) -cdef uint32_t cdc_offset(const uint8_t[:], uint32_t mi, uint32_t ma, uint32_t cs, uint32_t mask_s, uint32_t mask_l) +cdef uint32_t alg_cdc_offset(const uint8_t[:], uint32_t mi, uint32_t ma, uint32_t cs, uint32_t mask_s, uint32_t mask_l) diff --git a/iscc_core/cdc.py b/iscc_core/cdc.py index 52d4618..54d6261 100644 --- a/iscc_core/cdc.py +++ b/iscc_core/cdc.py @@ -6,10 +6,10 @@ import iscc_core as ic -__all__ = ["cdc_data_chunks"] +__all__ = ["alg_cdc_chunks"] -def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size): +def alg_cdc_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size): # type: (Data, bool, int) -> Generator[bytes, None, None] """ A generator that yields data-dependent chunks for `data`. @@ -33,13 +33,13 @@ def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size if not buffer: yield b"" - mi, ma, cs, mask_s, mask_l = cdc_params(avg_chunk_size) + mi, ma, cs, mask_s, mask_l = alg_cdc_params(avg_chunk_size) buffer = memoryview(buffer) while buffer: if len(buffer) <= ma: buffer = memoryview(bytes(buffer) + stream.read(ic.core_opts.io_read_size)) - cut_point = cdc_offset(buffer, mi, ma, cs, mask_s, mask_l) + cut_point = alg_cdc_offset(buffer, mi, ma, cs, mask_s, mask_l) # Make sure cut points are at 4-byte aligned for utf32 encoded text if utf32: @@ -49,7 +49,7 @@ def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size buffer = buffer[cut_point:] -def cdc_offset(buffer, mi, ma, cs, mask_s, mask_l): +def alg_cdc_offset(buffer, mi, ma, cs, mask_s, mask_l): # type: (ic.Data, int, int, int, int, int) -> int """ Find breakpoint offset for a given buffer. 
@@ -82,7 +82,7 @@ def cdc_offset(buffer, mi, ma, cs, mask_s, mask_l): return i -def cdc_params(avg_size: int) -> tuple: +def alg_cdc_params(avg_size: int) -> tuple: """ Calculate CDC parameters diff --git a/iscc_core/code_content_audio.py b/iscc_core/code_content_audio.py index 9d144c7..113193a 100644 --- a/iscc_core/code_content_audio.py +++ b/iscc_core/code_content_audio.py @@ -75,13 +75,13 @@ def soft_hash_audio_v0(cv): return b"\x00" * 32 # Calculate simhash of digests as first 32-bit chunk of the hash - parts = [ic.similarity_hash(digests)] + parts = [ic.alg_simhash(digests)] # Calculate separate 32-bit simhashes for each quarter of features (original order) for bucket in divide(4, digests): features = list(bucket) if features: - parts.append(ic.similarity_hash(features)) + parts.append(ic.alg_simhash(features)) else: parts.append(b"\x00\x00\x00\x00") @@ -91,7 +91,7 @@ def soft_hash_audio_v0(cv): for bucket in divide(3, digests): features = list(bucket) if features: - parts.append(ic.similarity_hash(features)) + parts.append(ic.alg_simhash(features)) else: parts.append(b"\x00\x00\x00\x00") return b"".join(parts) diff --git a/iscc_core/code_content_image.py b/iscc_core/code_content_image.py index bd91a80..cddb7b5 100644 --- a/iscc_core/code_content_image.py +++ b/iscc_core/code_content_image.py @@ -78,13 +78,13 @@ def soft_hash_image_v0(pixels, bits=ic.core_opts.image_bits): # DCT per row dct_row_lists = [] for pixel_list in chunked(pixels, 32): - dct_row_lists.append(ic.dct(pixel_list)) + dct_row_lists.append(ic.alg_dct(pixel_list)) # DCT per col dct_row_lists_t = list(map(list, zip(*dct_row_lists))) dct_col_lists_t = [] for dct_list in dct_row_lists_t: - dct_col_lists_t.append(ic.dct(dct_list)) + dct_col_lists_t.append(ic.alg_dct(dct_list)) dct_matrix = list(map(list, zip(*dct_col_lists_t))) diff --git a/iscc_core/code_content_mixed.py b/iscc_core/code_content_mixed.py index c6495d7..c63be30 100644 --- a/iscc_core/code_content_mixed.py +++ 
b/iscc_core/code_content_mixed.py @@ -94,4 +94,4 @@ def soft_hash_codes_v0(cc_digests, bits=ic.core_opts.mixed_bits): # Retain the first byte of the header and strip body to mixed_bits length for full, code_tuple in zip(cc_digests, code_tuples): hash_bytes.append(full[:1] + code_tuple[-1][: nbytes - 1]) - return ic.similarity_hash(hash_bytes) + return ic.alg_simhash(hash_bytes) diff --git a/iscc_core/code_content_text.py b/iscc_core/code_content_text.py index f714268..81903c4 100644 --- a/iscc_core/code_content_text.py +++ b/iscc_core/code_content_text.py @@ -84,7 +84,7 @@ def soft_hash_text_v0(text): - Slide over text with a [`text_ngram_size`][iscc_core.options.CoreOptions.text_ngram_size] wide window and create [`xxh32`](https://cyan4973.github.io/xxHash/) hashes - - Create a [`minhash_256`][iscc_core.minhash.minhash_256] from the hashes generated + - Create a [`alg_minhash_256`][iscc_core.minhash.alg_minhash_256] from the hashes generated in the previous step. !!! note @@ -102,7 +102,7 @@ def soft_hash_text_v0(text): """ ngrams = ic.sliding_window(text, ic.core_opts.text_ngram_size) features = [xxhash.xxh32_intdigest(s.encode("utf-8")) for s in ngrams] - hash_digest = ic.minhash_256(features) + hash_digest = ic.alg_minhash_256(features) return hash_digest diff --git a/iscc_core/code_content_video.py b/iscc_core/code_content_video.py index 799956b..9ad6eee 100644 --- a/iscc_core/code_content_video.py +++ b/iscc_core/code_content_video.py @@ -67,5 +67,5 @@ def soft_hash_video_v0(frame_sigs, bits=ic.core_opts.video_bits): frame_sigs = [tuple(sig) for sig in frame_sigs] sigs = set(frame_sigs) vecsum = [sum(col) for col in zip(*sigs)] - video_hash_digest = ic.wtahash(vecsum, bits) + video_hash_digest = ic.alg_wtahash(vecsum, bits) return video_hash_digest diff --git a/iscc_core/code_data.py b/iscc_core/code_data.py index 7a66b2f..29498ce 100644 --- a/iscc_core/code_data.py +++ b/iscc_core/code_data.py @@ -89,7 +89,7 @@ def push(self, data): if self.tail: data = 
self.tail + data - for chunk in ic.cdc_data_chunks( + for chunk in ic.alg_cdc_chunks( data, utf32=False, avg_chunk_size=ic.core_opts.data_avg_chunk_size ): self.chunk_sizes.append(len(chunk)) @@ -103,7 +103,7 @@ def digest(self): # type: () -> bytes """Calculate 256-bit minhash digest from feature hashes.""" self._finalize() - return ic.minhash_256(self.chunk_features) + return ic.alg_minhash_256(self.chunk_features) def code(self, bits=ic.core_opts.data_bits): # type: (int) -> str diff --git a/iscc_core/code_meta.py b/iscc_core/code_meta.py index ff12a46..417415a 100644 --- a/iscc_core/code_meta.py +++ b/iscc_core/code_meta.py @@ -185,7 +185,7 @@ def soft_hash_meta_v0(name, extra=None): name = ic.text_collapse(name) name_n_grams = ic.sliding_window(name, width=ic.core_opts.meta_ngram_size_text) name_hash_digests = [blake3(s.encode("utf-8")).digest() for s in name_n_grams] - simhash_digest = ic.similarity_hash(name_hash_digests) + simhash_digest = ic.alg_simhash(name_hash_digests) if extra in {None, "", b""}: return simhash_digest @@ -203,7 +203,7 @@ def soft_hash_meta_v0(name, extra=None): else: raise ValueError("parameter `extra` must be of type str or bytes!") - extra_simhash_digest = ic.similarity_hash(extra_hash_digests) + extra_simhash_digest = ic.alg_simhash(extra_hash_digests) # Interleave first half of name and extra simhashes in 32-bit chunks chunks_simhash_digest = sliced(simhash_digest[:16], 4) diff --git a/iscc_core/dct.py b/iscc_core/dct.py index 2a641b2..49d241a 100644 --- a/iscc_core/dct.py +++ b/iscc_core/dct.py @@ -3,7 +3,7 @@ from typing import List, Sequence -def dct(v): +def alg_dct(v): # type: (Sequence[float]) -> List """ Discrete cosine transform. 
@@ -26,8 +26,8 @@ def dct(v): beta = [ (v[i] - v[-(i + 1)]) / (math.cos((i + 0.5) * math.pi / n) * 2.0) for i in range(half) ] - alpha = dct(alpha) - beta = dct(beta) + alpha = alg_dct(alpha) + beta = alg_dct(beta) result = [] for i in range(half - 1): result.append(alpha[i]) diff --git a/iscc_core/iscc_id.py b/iscc_core/iscc_id.py index e520824..5c057ca 100644 --- a/iscc_core/iscc_id.py +++ b/iscc_core/iscc_id.py @@ -82,7 +82,7 @@ def soft_hash_iscc_id_v0(iscc_code, uc=0): # first byte of header + first 7 bytes of body digests.append(dec[:1] + unp[-1][:7]) - iscc_id_digest = ic.similarity_hash(digests) + iscc_id_digest = ic.alg_simhash(digests) if uc: iscc_id_digest += uvarint.encode(uc) diff --git a/iscc_core/minhash.pxd b/iscc_core/minhash.pxd index c4ddbb5..bb997ba 100644 --- a/iscc_core/minhash.pxd +++ b/iscc_core/minhash.pxd @@ -9,11 +9,11 @@ cdef uint64_t[64] MPA cdef uint64_t[64] MPB @cython.locals(a=uint64_t, b=uint64_t, f=uint64_t) -cpdef list minhash(list features) +cpdef list alg_minhash(list features) -cpdef bytes minhash_64(list features) +cpdef bytes alg_minhash_64(list features) -cpdef bytes minhash_256(list features) +cpdef bytes alg_minhash_256(list features) @cython.locals(bits=str, bitpos=uint8_t, h=uint64_t) -cpdef bytes compress(list mhash, int lsb=*) +cpdef bytes alg_minhash_compress(list mhash, int lsb=*) diff --git a/iscc_core/minhash.py b/iscc_core/minhash.py index 70bc6ca..523121b 100644 --- a/iscc_core/minhash.py +++ b/iscc_core/minhash.py @@ -2,7 +2,7 @@ from typing import List -def minhash(features): +def alg_minhash(features): # type: (List[int]) -> List[int] """ Calculate a 64 dimensional minhash integer vector. @@ -16,7 +16,7 @@ def minhash(features): ] -def minhash_64(features): +def alg_minhash_64(features): # type: (List[int]) -> bytes """ Create 64-bit minimum hash digest. 
@@ -25,10 +25,10 @@ def minhash_64(features): :return: 64-bit binary from the least significant bits of the minhash values :rtype: bytes """ - return minhash_compress(minhash(features), 1) + return alg_minhash_compress(alg_minhash(features), 1) -def minhash_256(features): +def alg_minhash_256(features): # type: (List[int]) -> bytes """ Create 256-bit minimum hash digest. @@ -37,15 +37,15 @@ def minhash_256(features): :return: 256-bit binary from the least significant bits of the minhash values :rtype: bytes """ - return minhash_compress(minhash(features), 4) + return alg_minhash_compress(alg_minhash(features), 4) -def minhash_compress(mhash, lsb=4): +def alg_minhash_compress(mhash, lsb=4): # type: (List[int], int) -> bytes """ Compress minhash vector to byte hash-digest. - Concatenates `lsb` number of least significant bits from each integer in `mhash`. + Concatenates `lsb` number of least-significant bits from each integer in `mhash`. For example an `mhash` with 64 integers and `lsb=4` will produce a 256-bit summary of the minhash vector. diff --git a/iscc_core/simhash.pxd b/iscc_core/simhash.pxd index 78f6096..caa2921 100644 --- a/iscc_core/simhash.pxd +++ b/iscc_core/simhash.pxd @@ -1 +1 @@ -cpdef bytes similarity_hash(list hash_digests) +cpdef bytes alg_simhash(list hash_digests) diff --git a/iscc_core/simhash.py b/iscc_core/simhash.py index c94b636..89d20cf 100644 --- a/iscc_core/simhash.py +++ b/iscc_core/simhash.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -def similarity_hash(hash_digests): +def alg_simhash(hash_digests): # type: (list[bytes]) -> bytes """ Creates a similarity preserving hash from a sequence of equal sized hash digests. 
diff --git a/iscc_core/wtahash.py b/iscc_core/wtahash.py index 699a2c6..a4c84e5 100644 --- a/iscc_core/wtahash.py +++ b/iscc_core/wtahash.py @@ -3,7 +3,7 @@ from bitarray import bitarray -def wtahash(vec: Sequence[float], bits) -> bytes: +def alg_wtahash(vec: Sequence[float], bits) -> bytes: """Calculate WTA Hash for vector with 380 values (MP7 frame signature).""" h = [] for perm in WTA_VIDEO_ID_PERMUTATIONS: diff --git a/tests/test_cdc.py b/tests/test_cdc.py index d684870..112fbfc 100644 --- a/tests/test_cdc.py +++ b/tests/test_cdc.py @@ -5,46 +5,46 @@ def test_get_params(): - assert iscc_core.cdc.cdc_params(1024) == (256, 8192, 640, 2047, 511) - assert iscc_core.cdc.cdc_params(8192) == (2048, 65536, 5120, 16383, 4095) + assert iscc_core.cdc.alg_cdc_params(1024) == (256, 8192, 640, 2047, 511) + assert iscc_core.cdc.alg_cdc_params(8192) == (2048, 65536, 5120, 16383, 4095) def test_data_chunks_empty(): - assert list(iscc_core.cdc.cdc_data_chunks(b"", False)) == [b""] + assert list(iscc_core.cdc.alg_cdc_chunks(b"", False)) == [b""] def test_data_chunks_1byte(): - assert list(iscc_core.cdc.cdc_data_chunks(b"\x00", False)) == [b"\x00"] + assert list(iscc_core.cdc.alg_cdc_chunks(b"\x00", False)) == [b"\x00"] def test_data_chunks_below_min(): data = static_bytes(256 - 1) - assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data] + assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data] def test_data_chunks_min(): data = static_bytes(256) - assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data] + assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data] def test_data_chunks_above_min(): data = static_bytes(256 + 1) - assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data] + assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data] def test_data_chunks_avg(): data = static_bytes(1024) - assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data] + assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data] 
def test_data_chunks_avg_above(): data = static_bytes(1024 + 1) - assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data] + assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data] def test_data_chunks_two_chunks(): data = static_bytes(1024 + 309) - assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data[:-1], data[-1:]] + assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data[:-1], data[-1:]] def test_data_chunks_max_odd(): @@ -58,7 +58,7 @@ def test_data_chunks_max_odd(): "5fae55e1aee84705fc3dc6e831d4f7981677e03338343bd6a783c45e333a55fe", ] data = static_bytes(8192) - hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, False)] + hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, False)] assert len(hashes) == 7 assert hashes == expected @@ -75,7 +75,7 @@ def test_data_chunks_max_even(): "2032f28cfcdad86090b60fa5cfd8cc44b972df47d5f7e3637001d8e03b8fbc07", ] data = static_bytes(8192 + 1000) - hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, False)] + hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, False)] assert len(hashes) == 8 assert hashes == expected @@ -93,6 +93,6 @@ def test_data_chunks_utf32(): "f249cbe070bba6b689251074ddb75aa3ddfc02caa357f2f8f714cfeb39523d96", ] data = static_bytes(8192 + 1000) - hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, True)] + hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, True)] assert len(hashes) == 9 assert hashes == expected diff --git a/tests/test_code_content_image.py b/tests/test_code_content_image.py index 85bf467..cd4e253 100644 --- a/tests/test_code_content_image.py +++ b/tests/test_code_content_image.py @@ -71,19 +71,19 @@ def test_get_code_image(): def test_dct_empty(): with pytest.raises(ValueError): - ic.dct([]) + ic.alg_dct([]) def test_dct_zeros(): - assert ic.dct([0] * 64) == [0] * 64 + assert ic.alg_dct([0] * 64) == [0] 
* 64 def test_dct_ones(): - assert ic.dct([1] * 64) == [64] + [0] * 63 + assert ic.alg_dct([1] * 64) == [64] + [0] * 63 def test_dct_range(): - assert ic.dct(range(64))[0] == 2016 + assert ic.alg_dct(range(64))[0] == 2016 def test_gen_image_code_schema_conformance(): diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 0772ac1..9173842 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -5,11 +5,11 @@ def test_minhash_empty(): with pytest.raises(ValueError): - ic.minhash([]) + ic.alg_minhash([]) def test_minhash_single_feature(): - mh = ic.minhash([2**16]) + mh = ic.alg_minhash([2**16]) assert isinstance(mh, list) assert len(mh) == 64 assert mh[0] == 1968499307 @@ -18,22 +18,22 @@ def test_minhash_single_feature(): def test_minhash_32bit_features(): i32 = 2**32 - 1 - mh = ic.minhash([2**64 - 1]) + mh = ic.alg_minhash([2**64 - 1]) for n in mh: assert n <= i32 def test_minhash_compress(): - mh = ic.minhash([2**16]) - digest = ic.minhash_compress(mh) + mh = ic.alg_minhash([2**16]) + digest = ic.alg_minhash_compress(mh) assert len(digest) == 32 assert digest.hex() == "a18e2fb2bd663d21db9c7dcc9ae78380253cae5bf089766d87a6b51fcb3f8f8e" mh = [0b10100001, 0b11000010, 0b10110100, 0b10011000] - compressed = ic.minhash_compress(mh) + compressed = ic.alg_minhash_compress(mh) as_int = int.from_bytes(compressed, "big", signed=False) assert as_int == 0b1000_0100_0010_0001 def test_minhash_64(): - mh = ic.minhash_64([2**16]) + mh = ic.alg_minhash_64([2**16]) assert mh.hex() == "a18e2fb2bd663d21" diff --git a/tests/test_simhash.py b/tests/test_simhash.py index 7b59999..11d123f 100644 --- a/tests/test_simhash.py +++ b/tests/test_simhash.py @@ -4,40 +4,40 @@ def test_similarity_hash(): all_zero = 0b0.to_bytes(8, "big") - assert iscc_core.simhash.similarity_hash([all_zero, all_zero]) == all_zero + assert iscc_core.simhash.alg_simhash([all_zero, all_zero]) == all_zero all_ones = 0b11111111.to_bytes(1, "big") - assert iscc_core.simhash.similarity_hash([all_ones, 
all_ones]) == all_ones + assert iscc_core.simhash.alg_simhash([all_ones, all_ones]) == all_ones a = 0b0110.to_bytes(1, "big") b = 0b1100.to_bytes(1, "big") r = 0b1110.to_bytes(1, "big") - assert iscc_core.simhash.similarity_hash([a, b]) == r + assert iscc_core.simhash.alg_simhash([a, b]) == r a = 0b01101001.to_bytes(1, "big") b = 0b00111000.to_bytes(1, "big") c = 0b11100100.to_bytes(1, "big") r = 0b01101000.to_bytes(1, "big") - assert iscc_core.simhash.similarity_hash([a, b, c]) == r + assert iscc_core.simhash.alg_simhash([a, b, c]) == r a = 0b01100101.to_bytes(1, "big") b = 0b01011001.to_bytes(1, "big") c = 0b10010101.to_bytes(1, "big") d = 0b10101001.to_bytes(1, "big") r = 0b11111101.to_bytes(1, "big") - assert iscc_core.simhash.similarity_hash([a, b, c, d]) == r + assert iscc_core.simhash.alg_simhash([a, b, c, d]) == r a = 0b0110100101101001.to_bytes(2, "big") b = 0b0011100000111000.to_bytes(2, "big") c = 0b1110010011100100.to_bytes(2, "big") r = 0b0110100001101000.to_bytes(2, "big") - assert iscc_core.simhash.similarity_hash([a, b, c]) == r + assert iscc_core.simhash.alg_simhash([a, b, c]) == r def test_similarity_hash_256_bit(): a = bytes.fromhex("84f6a7413bb26202c515fccedfaabfa2f2fc46a69a28a08f7c2a1a12a390ca50") b = bytes.fromhex("dc0edccb8e7bff663699f89139af1ff99c276275c4dabd775311bde43c13b34d") assert ( - iscc_core.simhash.similarity_hash([a, b]).hex() + iscc_core.simhash.alg_simhash([a, b]).hex() == "dcfeffcbbffbff66f79dfcdfffafbffbfeff66f7defabdff7f3bbff6bf93fb5d" ) diff --git a/tests/test_wtahash.py b/tests/test_wtahash.py index 670ef8a..3691203 100644 --- a/tests/test_wtahash.py +++ b/tests/test_wtahash.py @@ -5,26 +5,26 @@ def test_wtahash(): vec = tuple([0] * 379) + (1,) assert ( - ic.wtahash(vec, 256).hex() + ic.alg_wtahash(vec, 256).hex() == "0000000000000000000000000000000000000200000000000000000000000000" ) vec = (1,) + tuple([0] * 379) assert ( - ic.wtahash(vec, 256).hex() + ic.alg_wtahash(vec, 256).hex() == 
"0000000000000000000000000000000000000000000000000000000000000000" ) vec = (1,) + tuple([0] * 378) + (1,) assert ( - ic.wtahash(vec, 256).hex() + ic.alg_wtahash(vec, 256).hex() == "0000000000000000000000000000000000000200000000000000000000000000" ) vec = (0,) + tuple([2] * 378) + (0,) assert ( - ic.wtahash(vec, 256).hex() + ic.alg_wtahash(vec, 256).hex() == "0000000000000000000000000000000000000000000000000000000000000000" ) vec = tuple(range(380)) assert ( - ic.wtahash(vec, 256).hex() + ic.alg_wtahash(vec, 256).hex() == "528f91431f7c4ad26932fc073a28cac93f21a3071a152fc2925bdaed1d190061" )