Common prefix for basic algorithms

iscc · Feb 11, 2022 · fdbe66a · fdbe66a
1 parent d30d091
commit fdbe66a
Show file tree

Hide file tree

Showing 21 changed files with 74 additions and 74 deletions.
diff --git a/iscc_core/cdc.pxd b/iscc_core/cdc.pxd
@@ -6,4 +6,4 @@ cdef uint32_t CDC_READ_SIZE = 262144
 cdef uint32_t[256] GEAR
 
 @cython.locals(pattern=uint32_t, i=uint32_t, size=uint32_t, barrier=uint32_t)
-cdef uint32_t cdc_offset(const uint8_t[:], uint32_t mi, uint32_t ma, uint32_t cs, uint32_t mask_s, uint32_t mask_l)
+cdef uint32_t alg_cdc_offset(const uint8_t[:], uint32_t mi, uint32_t ma, uint32_t cs, uint32_t mask_s, uint32_t mask_l)
diff --git a/iscc_core/cdc.py b/iscc_core/cdc.py
@@ -6,10 +6,10 @@
 import iscc_core as ic
 
 
-__all__ = ["cdc_data_chunks"]
+__all__ = ["alg_cdc_chunks"]
 
 
-def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size):
+def alg_cdc_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size):
     # type: (Data, bool, int) -> Generator[bytes, None, None]
     """
     A generator that yields data-dependent chunks for `data`.
@@ -33,13 +33,13 @@ def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size
     if not buffer:
         yield b""
 
-    mi, ma, cs, mask_s, mask_l = cdc_params(avg_chunk_size)
+    mi, ma, cs, mask_s, mask_l = alg_cdc_params(avg_chunk_size)
 
     buffer = memoryview(buffer)
     while buffer:
         if len(buffer) <= ma:
             buffer = memoryview(bytes(buffer) + stream.read(ic.core_opts.io_read_size))
-        cut_point = cdc_offset(buffer, mi, ma, cs, mask_s, mask_l)
+        cut_point = alg_cdc_offset(buffer, mi, ma, cs, mask_s, mask_l)
 
         # Make sure cut points are at 4-byte aligned for utf32 encoded text
         if utf32:
@@ -49,7 +49,7 @@ def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size
         buffer = buffer[cut_point:]
 
 
-def cdc_offset(buffer, mi, ma, cs, mask_s, mask_l):
+def alg_cdc_offset(buffer, mi, ma, cs, mask_s, mask_l):
     # type: (ic.Data, int, int, int, int, int) -> int
     """
     Find breakpoint offset for a given buffer.
@@ -82,7 +82,7 @@ def cdc_offset(buffer, mi, ma, cs, mask_s, mask_l):
     return i
 
 
-def cdc_params(avg_size: int) -> tuple:
+def alg_cdc_params(avg_size: int) -> tuple:
     """
     Calculate CDC parameters
 

diff --git a/iscc_core/code_content_audio.py b/iscc_core/code_content_audio.py
@@ -75,13 +75,13 @@ def soft_hash_audio_v0(cv):
         return b"\x00" * 32
 
     # Calculate simhash of digests as first 32-bit chunk of the hash
-    parts = [ic.similarity_hash(digests)]
+    parts = [ic.alg_simhash(digests)]
 
     # Calculate separate 32-bit simhashes for each quarter of features (original order)
     for bucket in divide(4, digests):
         features = list(bucket)
         if features:
-            parts.append(ic.similarity_hash(features))
+            parts.append(ic.alg_simhash(features))
         else:
             parts.append(b"\x00\x00\x00\x00")
 
@@ -91,7 +91,7 @@ def soft_hash_audio_v0(cv):
     for bucket in divide(3, digests):
         features = list(bucket)
         if features:
-            parts.append(ic.similarity_hash(features))
+            parts.append(ic.alg_simhash(features))
         else:
             parts.append(b"\x00\x00\x00\x00")
     return b"".join(parts)
diff --git a/iscc_core/code_content_image.py b/iscc_core/code_content_image.py
@@ -78,13 +78,13 @@ def soft_hash_image_v0(pixels, bits=ic.core_opts.image_bits):
     # DCT per row
     dct_row_lists = []
     for pixel_list in chunked(pixels, 32):
-        dct_row_lists.append(ic.dct(pixel_list))
+        dct_row_lists.append(ic.alg_dct(pixel_list))
 
     # DCT per col
     dct_row_lists_t = list(map(list, zip(*dct_row_lists)))
     dct_col_lists_t = []
     for dct_list in dct_row_lists_t:
-        dct_col_lists_t.append(ic.dct(dct_list))
+        dct_col_lists_t.append(ic.alg_dct(dct_list))
 
     dct_matrix = list(map(list, zip(*dct_col_lists_t)))
 

diff --git a/iscc_core/code_content_mixed.py b/iscc_core/code_content_mixed.py
@@ -94,4 +94,4 @@ def soft_hash_codes_v0(cc_digests, bits=ic.core_opts.mixed_bits):
     # Retain the first byte of the header and strip body to mixed_bits length
     for full, code_tuple in zip(cc_digests, code_tuples):
         hash_bytes.append(full[:1] + code_tuple[-1][: nbytes - 1])
-    return ic.similarity_hash(hash_bytes)
+    return ic.alg_simhash(hash_bytes)
diff --git a/iscc_core/code_content_text.py b/iscc_core/code_content_text.py
@@ -84,7 +84,7 @@ def soft_hash_text_v0(text):
     - Slide over text with a
       [`text_ngram_size`][iscc_core.options.CoreOptions.text_ngram_size] wide window
       and create [`xxh32`](https://cyan4973.github.io/xxHash/) hashes
-    - Create a [`minhash_256`][iscc_core.minhash.minhash_256] from the hashes generated
+    - Create a [`alg_minhash_256`][iscc_core.minhash.alg_minhash_256] from the hashes generated
       in the previous step.
 
     !!! note
@@ -102,7 +102,7 @@ def soft_hash_text_v0(text):
     """
     ngrams = ic.sliding_window(text, ic.core_opts.text_ngram_size)
     features = [xxhash.xxh32_intdigest(s.encode("utf-8")) for s in ngrams]
-    hash_digest = ic.minhash_256(features)
+    hash_digest = ic.alg_minhash_256(features)
     return hash_digest
 
 

diff --git a/iscc_core/code_content_video.py b/iscc_core/code_content_video.py
@@ -67,5 +67,5 @@ def soft_hash_video_v0(frame_sigs, bits=ic.core_opts.video_bits):
         frame_sigs = [tuple(sig) for sig in frame_sigs]
     sigs = set(frame_sigs)
     vecsum = [sum(col) for col in zip(*sigs)]
-    video_hash_digest = ic.wtahash(vecsum, bits)
+    video_hash_digest = ic.alg_wtahash(vecsum, bits)
     return video_hash_digest
diff --git a/iscc_core/code_data.py b/iscc_core/code_data.py
@@ -89,7 +89,7 @@ def push(self, data):
         if self.tail:
             data = self.tail + data
 
-        for chunk in ic.cdc_data_chunks(
+        for chunk in ic.alg_cdc_chunks(
             data, utf32=False, avg_chunk_size=ic.core_opts.data_avg_chunk_size
         ):
             self.chunk_sizes.append(len(chunk))
@@ -103,7 +103,7 @@ def digest(self):
         # type: () -> bytes
         """Calculate 256-bit minhash digest from feature hashes."""
         self._finalize()
-        return ic.minhash_256(self.chunk_features)
+        return ic.alg_minhash_256(self.chunk_features)
 
     def code(self, bits=ic.core_opts.data_bits):
         # type: (int) -> str

diff --git a/iscc_core/code_meta.py b/iscc_core/code_meta.py
@@ -185,7 +185,7 @@ def soft_hash_meta_v0(name, extra=None):
     name = ic.text_collapse(name)
     name_n_grams = ic.sliding_window(name, width=ic.core_opts.meta_ngram_size_text)
     name_hash_digests = [blake3(s.encode("utf-8")).digest() for s in name_n_grams]
-    simhash_digest = ic.similarity_hash(name_hash_digests)
+    simhash_digest = ic.alg_simhash(name_hash_digests)
 
     if extra in {None, "", b""}:
         return simhash_digest
@@ -203,7 +203,7 @@ def soft_hash_meta_v0(name, extra=None):
         else:
             raise ValueError("parameter `extra` must be of type str or bytes!")
 
-        extra_simhash_digest = ic.similarity_hash(extra_hash_digests)
+        extra_simhash_digest = ic.alg_simhash(extra_hash_digests)
 
         # Interleave first half of name and extra simhashes in 32-bit chunks
         chunks_simhash_digest = sliced(simhash_digest[:16], 4)

diff --git a/iscc_core/dct.py b/iscc_core/dct.py
@@ -3,7 +3,7 @@
 from typing import List, Sequence
 
 
-def dct(v):
+def alg_dct(v):
     # type: (Sequence[float]) -> List
     """
     Discrete cosine transform.
@@ -26,8 +26,8 @@ def dct(v):
         beta = [
             (v[i] - v[-(i + 1)]) / (math.cos((i + 0.5) * math.pi / n) * 2.0) for i in range(half)
         ]
-        alpha = dct(alpha)
-        beta = dct(beta)
+        alpha = alg_dct(alpha)
+        beta = alg_dct(beta)
         result = []
         for i in range(half - 1):
             result.append(alpha[i])

diff --git a/iscc_core/iscc_id.py b/iscc_core/iscc_id.py
@@ -82,7 +82,7 @@ def soft_hash_iscc_id_v0(iscc_code, uc=0):
             # first byte of header + first 7 bytes of body
             digests.append(dec[:1] + unp[-1][:7])
 
-    iscc_id_digest = ic.similarity_hash(digests)
+    iscc_id_digest = ic.alg_simhash(digests)
 
     if uc:
         iscc_id_digest += uvarint.encode(uc)

diff --git a/iscc_core/minhash.pxd b/iscc_core/minhash.pxd
@@ -9,11 +9,11 @@ cdef uint64_t[64] MPA
 cdef uint64_t[64] MPB
 
 @cython.locals(a=uint64_t, b=uint64_t, f=uint64_t)
-cpdef list minhash(list features)
+cpdef list alg_minhash(list features)
 
-cpdef bytes minhash_64(list features)
+cpdef bytes alg_minhash_64(list features)
 
-cpdef bytes minhash_256(list features)
+cpdef bytes alg_minhash_256(list features)
 
 @cython.locals(bits=str, bitpos=uint8_t, h=uint64_t)
-cpdef bytes compress(list mhash, int lsb=*)
+cpdef bytes alg_minhash_compress(list mhash, int lsb=*)
diff --git a/iscc_core/minhash.py b/iscc_core/minhash.py
@@ -2,7 +2,7 @@
 from typing import List
 
 
-def minhash(features):
+def alg_minhash(features):
     # type: (List[int]) -> List[int]
     """
     Calculate a 64 dimensional minhash integer vector.
@@ -16,7 +16,7 @@ def minhash(features):
     ]
 
 
-def minhash_64(features):
+def alg_minhash_64(features):
     # type: (List[int]) -> bytes
     """
     Create 64-bit minimum hash digest.
@@ -25,10 +25,10 @@ def minhash_64(features):
     :return: 64-bit binary from the least significant bits of the minhash values
     :rtype: bytes
     """
-    return minhash_compress(minhash(features), 1)
+    return alg_minhash_compress(alg_minhash(features), 1)
 
 
-def minhash_256(features):
+def alg_minhash_256(features):
     # type: (List[int]) -> bytes
     """
     Create 256-bit minimum hash digest.
@@ -37,15 +37,15 @@ def minhash_256(features):
     :return: 256-bit binary from the least significant bits of the minhash values
     :rtype: bytes
     """
-    return minhash_compress(minhash(features), 4)
+    return alg_minhash_compress(alg_minhash(features), 4)
 
 
-def minhash_compress(mhash, lsb=4):
+def alg_minhash_compress(mhash, lsb=4):
     # type: (List[int], int) -> bytes
     """
     Compress minhash vector to byte hash-digest.
 
-    Concatenates `lsb` number of  least significant bits from each integer in `mhash`.
+    Concatenates `lsb` number of least-significant bits from each integer in `mhash`.
     For example an `mhash` with 64 integers and `lsb=4` will produce a 256-bit summary
     of the minhash vector.
 

diff --git a/iscc_core/simhash.pxd b/iscc_core/simhash.pxd
@@ -1 +1 @@
-cpdef bytes similarity_hash(list hash_digests)
+cpdef bytes alg_simhash(list hash_digests)
diff --git a/iscc_core/simhash.py b/iscc_core/simhash.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 
-def similarity_hash(hash_digests):
+def alg_simhash(hash_digests):
     # type: (list[bytes]) -> bytes
     """
     Creates a similarity preserving hash from a sequence of equal sized hash digests.

diff --git a/iscc_core/wtahash.py b/iscc_core/wtahash.py
@@ -3,7 +3,7 @@
 from bitarray import bitarray
 
 
-def wtahash(vec: Sequence[float], bits) -> bytes:
+def alg_wtahash(vec: Sequence[float], bits) -> bytes:
     """Calculate WTA Hash for vector with 380 values (MP7 frame signature)."""
     h = []
     for perm in WTA_VIDEO_ID_PERMUTATIONS:

diff --git a/tests/test_cdc.py b/tests/test_cdc.py
@@ -5,46 +5,46 @@
 
 
 def test_get_params():
-    assert iscc_core.cdc.cdc_params(1024) == (256, 8192, 640, 2047, 511)
-    assert iscc_core.cdc.cdc_params(8192) == (2048, 65536, 5120, 16383, 4095)
+    assert iscc_core.cdc.alg_cdc_params(1024) == (256, 8192, 640, 2047, 511)
+    assert iscc_core.cdc.alg_cdc_params(8192) == (2048, 65536, 5120, 16383, 4095)
 
 
 def test_data_chunks_empty():
-    assert list(iscc_core.cdc.cdc_data_chunks(b"", False)) == [b""]
+    assert list(iscc_core.cdc.alg_cdc_chunks(b"", False)) == [b""]
 
 
 def test_data_chunks_1byte():
-    assert list(iscc_core.cdc.cdc_data_chunks(b"\x00", False)) == [b"\x00"]
+    assert list(iscc_core.cdc.alg_cdc_chunks(b"\x00", False)) == [b"\x00"]
 
 
 def test_data_chunks_below_min():
     data = static_bytes(256 - 1)
-    assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
+    assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]
 
 
 def test_data_chunks_min():
     data = static_bytes(256)
-    assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
+    assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]
 
 
 def test_data_chunks_above_min():
     data = static_bytes(256 + 1)
-    assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
+    assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]
 
 
 def test_data_chunks_avg():
     data = static_bytes(1024)
-    assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
+    assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]
 
 
 def test_data_chunks_avg_above():
     data = static_bytes(1024 + 1)
-    assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
+    assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]
 
 
 def test_data_chunks_two_chunks():
     data = static_bytes(1024 + 309)
-    assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data[:-1], data[-1:]]
+    assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data[:-1], data[-1:]]
 
 
 def test_data_chunks_max_odd():
@@ -58,7 +58,7 @@ def test_data_chunks_max_odd():
         "5fae55e1aee84705fc3dc6e831d4f7981677e03338343bd6a783c45e333a55fe",
     ]
     data = static_bytes(8192)
-    hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, False)]
+    hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, False)]
     assert len(hashes) == 7
     assert hashes == expected
 
@@ -75,7 +75,7 @@ def test_data_chunks_max_even():
         "2032f28cfcdad86090b60fa5cfd8cc44b972df47d5f7e3637001d8e03b8fbc07",
     ]
     data = static_bytes(8192 + 1000)
-    hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, False)]
+    hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, False)]
     assert len(hashes) == 8
     assert hashes == expected
 
@@ -93,6 +93,6 @@ def test_data_chunks_utf32():
         "f249cbe070bba6b689251074ddb75aa3ddfc02caa357f2f8f714cfeb39523d96",
     ]
     data = static_bytes(8192 + 1000)
-    hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, True)]
+    hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, True)]
     assert len(hashes) == 9
     assert hashes == expected
diff --git a/tests/test_code_content_image.py b/tests/test_code_content_image.py
@@ -71,19 +71,19 @@ def test_get_code_image():
 
 def test_dct_empty():
     with pytest.raises(ValueError):
-        ic.dct([])
+        ic.alg_dct([])
 
 
 def test_dct_zeros():
-    assert ic.dct([0] * 64) == [0] * 64
+    assert ic.alg_dct([0] * 64) == [0] * 64
 
 
 def test_dct_ones():
-    assert ic.dct([1] * 64) == [64] + [0] * 63
+    assert ic.alg_dct([1] * 64) == [64] + [0] * 63
 
 
 def test_dct_range():
-    assert ic.dct(range(64))[0] == 2016
+    assert ic.alg_dct(range(64))[0] == 2016
 
 
 def test_gen_image_code_schema_conformance():
Original file line number	Diff line number	Diff line change
@@ -1 +1 @@
-cpdef bytes similarity_hash(list hash_digests)
+cpdef bytes alg_simhash(list hash_digests)