Skip to content

Commit

Permalink
Common prefix for basic algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
titusz committed Feb 11, 2022
1 parent d30d091 commit fdbe66a
Show file tree
Hide file tree
Showing 21 changed files with 74 additions and 74 deletions.
2 changes: 1 addition & 1 deletion iscc_core/cdc.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ cdef uint32_t CDC_READ_SIZE = 262144
cdef uint32_t[256] GEAR

@cython.locals(pattern=uint32_t, i=uint32_t, size=uint32_t, barrier=uint32_t)
cdef uint32_t cdc_offset(const uint8_t[:], uint32_t mi, uint32_t ma, uint32_t cs, uint32_t mask_s, uint32_t mask_l)
cdef uint32_t alg_cdc_offset(const uint8_t[:], uint32_t mi, uint32_t ma, uint32_t cs, uint32_t mask_s, uint32_t mask_l)
12 changes: 6 additions & 6 deletions iscc_core/cdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
import iscc_core as ic


__all__ = ["cdc_data_chunks"]
__all__ = ["alg_cdc_chunks"]


def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size):
def alg_cdc_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size):
# type: (Data, bool, int) -> Generator[bytes, None, None]
"""
A generator that yields data-dependent chunks for `data`.
Expand All @@ -33,13 +33,13 @@ def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size
if not buffer:
yield b""

mi, ma, cs, mask_s, mask_l = cdc_params(avg_chunk_size)
mi, ma, cs, mask_s, mask_l = alg_cdc_params(avg_chunk_size)

buffer = memoryview(buffer)
while buffer:
if len(buffer) <= ma:
buffer = memoryview(bytes(buffer) + stream.read(ic.core_opts.io_read_size))
cut_point = cdc_offset(buffer, mi, ma, cs, mask_s, mask_l)
cut_point = alg_cdc_offset(buffer, mi, ma, cs, mask_s, mask_l)

# Make sure cut points are 4-byte aligned for utf32 encoded text
if utf32:
Expand All @@ -49,7 +49,7 @@ def cdc_data_chunks(data, utf32, avg_chunk_size=ic.core_opts.data_avg_chunk_size
buffer = buffer[cut_point:]


def cdc_offset(buffer, mi, ma, cs, mask_s, mask_l):
def alg_cdc_offset(buffer, mi, ma, cs, mask_s, mask_l):
# type: (ic.Data, int, int, int, int, int) -> int
"""
Find breakpoint offset for a given buffer.
Expand Down Expand Up @@ -82,7 +82,7 @@ def cdc_offset(buffer, mi, ma, cs, mask_s, mask_l):
return i


def cdc_params(avg_size: int) -> tuple:
def alg_cdc_params(avg_size: int) -> tuple:
"""
Calculate CDC parameters
Expand Down
6 changes: 3 additions & 3 deletions iscc_core/code_content_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ def soft_hash_audio_v0(cv):
return b"\x00" * 32

# Calculate simhash of digests as first 32-bit chunk of the hash
parts = [ic.similarity_hash(digests)]
parts = [ic.alg_simhash(digests)]

# Calculate separate 32-bit simhashes for each quarter of features (original order)
for bucket in divide(4, digests):
features = list(bucket)
if features:
parts.append(ic.similarity_hash(features))
parts.append(ic.alg_simhash(features))
else:
parts.append(b"\x00\x00\x00\x00")

Expand All @@ -91,7 +91,7 @@ def soft_hash_audio_v0(cv):
for bucket in divide(3, digests):
features = list(bucket)
if features:
parts.append(ic.similarity_hash(features))
parts.append(ic.alg_simhash(features))
else:
parts.append(b"\x00\x00\x00\x00")
return b"".join(parts)
4 changes: 2 additions & 2 deletions iscc_core/code_content_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,13 @@ def soft_hash_image_v0(pixels, bits=ic.core_opts.image_bits):
# DCT per row
dct_row_lists = []
for pixel_list in chunked(pixels, 32):
dct_row_lists.append(ic.dct(pixel_list))
dct_row_lists.append(ic.alg_dct(pixel_list))

# DCT per col
dct_row_lists_t = list(map(list, zip(*dct_row_lists)))
dct_col_lists_t = []
for dct_list in dct_row_lists_t:
dct_col_lists_t.append(ic.dct(dct_list))
dct_col_lists_t.append(ic.alg_dct(dct_list))

dct_matrix = list(map(list, zip(*dct_col_lists_t)))

Expand Down
2 changes: 1 addition & 1 deletion iscc_core/code_content_mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,4 @@ def soft_hash_codes_v0(cc_digests, bits=ic.core_opts.mixed_bits):
# Retain the first byte of the header and strip body to mixed_bits length
for full, code_tuple in zip(cc_digests, code_tuples):
hash_bytes.append(full[:1] + code_tuple[-1][: nbytes - 1])
return ic.similarity_hash(hash_bytes)
return ic.alg_simhash(hash_bytes)
4 changes: 2 additions & 2 deletions iscc_core/code_content_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def soft_hash_text_v0(text):
- Slide over text with a
[`text_ngram_size`][iscc_core.options.CoreOptions.text_ngram_size] wide window
and create [`xxh32`](https://cyan4973.github.io/xxHash/) hashes
- Create a [`minhash_256`][iscc_core.minhash.minhash_256] from the hashes generated
- Create a [`alg_minhash_256`][iscc_core.minhash.alg_minhash_256] from the hashes generated
in the previous step.
!!! note
Expand All @@ -102,7 +102,7 @@ def soft_hash_text_v0(text):
"""
ngrams = ic.sliding_window(text, ic.core_opts.text_ngram_size)
features = [xxhash.xxh32_intdigest(s.encode("utf-8")) for s in ngrams]
hash_digest = ic.minhash_256(features)
hash_digest = ic.alg_minhash_256(features)
return hash_digest


Expand Down
2 changes: 1 addition & 1 deletion iscc_core/code_content_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,5 @@ def soft_hash_video_v0(frame_sigs, bits=ic.core_opts.video_bits):
frame_sigs = [tuple(sig) for sig in frame_sigs]
sigs = set(frame_sigs)
vecsum = [sum(col) for col in zip(*sigs)]
video_hash_digest = ic.wtahash(vecsum, bits)
video_hash_digest = ic.alg_wtahash(vecsum, bits)
return video_hash_digest
4 changes: 2 additions & 2 deletions iscc_core/code_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def push(self, data):
if self.tail:
data = self.tail + data

for chunk in ic.cdc_data_chunks(
for chunk in ic.alg_cdc_chunks(
data, utf32=False, avg_chunk_size=ic.core_opts.data_avg_chunk_size
):
self.chunk_sizes.append(len(chunk))
Expand All @@ -103,7 +103,7 @@ def digest(self):
# type: () -> bytes
"""Calculate 256-bit minhash digest from feature hashes."""
self._finalize()
return ic.minhash_256(self.chunk_features)
return ic.alg_minhash_256(self.chunk_features)

def code(self, bits=ic.core_opts.data_bits):
# type: (int) -> str
Expand Down
4 changes: 2 additions & 2 deletions iscc_core/code_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def soft_hash_meta_v0(name, extra=None):
name = ic.text_collapse(name)
name_n_grams = ic.sliding_window(name, width=ic.core_opts.meta_ngram_size_text)
name_hash_digests = [blake3(s.encode("utf-8")).digest() for s in name_n_grams]
simhash_digest = ic.similarity_hash(name_hash_digests)
simhash_digest = ic.alg_simhash(name_hash_digests)

if extra in {None, "", b""}:
return simhash_digest
Expand All @@ -203,7 +203,7 @@ def soft_hash_meta_v0(name, extra=None):
else:
raise ValueError("parameter `extra` must be of type str or bytes!")

extra_simhash_digest = ic.similarity_hash(extra_hash_digests)
extra_simhash_digest = ic.alg_simhash(extra_hash_digests)

# Interleave first half of name and extra simhashes in 32-bit chunks
chunks_simhash_digest = sliced(simhash_digest[:16], 4)
Expand Down
6 changes: 3 additions & 3 deletions iscc_core/dct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import List, Sequence


def dct(v):
def alg_dct(v):
# type: (Sequence[float]) -> List
"""
Discrete cosine transform.
Expand All @@ -26,8 +26,8 @@ def dct(v):
beta = [
(v[i] - v[-(i + 1)]) / (math.cos((i + 0.5) * math.pi / n) * 2.0) for i in range(half)
]
alpha = dct(alpha)
beta = dct(beta)
alpha = alg_dct(alpha)
beta = alg_dct(beta)
result = []
for i in range(half - 1):
result.append(alpha[i])
Expand Down
2 changes: 1 addition & 1 deletion iscc_core/iscc_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def soft_hash_iscc_id_v0(iscc_code, uc=0):
# first byte of header + first 7 bytes of body
digests.append(dec[:1] + unp[-1][:7])

iscc_id_digest = ic.similarity_hash(digests)
iscc_id_digest = ic.alg_simhash(digests)

if uc:
iscc_id_digest += uvarint.encode(uc)
Expand Down
8 changes: 4 additions & 4 deletions iscc_core/minhash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ cdef uint64_t[64] MPA
cdef uint64_t[64] MPB

@cython.locals(a=uint64_t, b=uint64_t, f=uint64_t)
cpdef list minhash(list features)
cpdef list alg_minhash(list features)

cpdef bytes minhash_64(list features)
cpdef bytes alg_minhash_64(list features)

cpdef bytes minhash_256(list features)
cpdef bytes alg_minhash_256(list features)

@cython.locals(bits=str, bitpos=uint8_t, h=uint64_t)
cpdef bytes compress(list mhash, int lsb=*)
cpdef bytes alg_minhash_compress(list mhash, int lsb=*)
14 changes: 7 additions & 7 deletions iscc_core/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List


def minhash(features):
def alg_minhash(features):
# type: (List[int]) -> List[int]
"""
Calculate a 64 dimensional minhash integer vector.
Expand All @@ -16,7 +16,7 @@ def minhash(features):
]


def minhash_64(features):
def alg_minhash_64(features):
# type: (List[int]) -> bytes
"""
Create 64-bit minimum hash digest.
Expand All @@ -25,10 +25,10 @@ def minhash_64(features):
:return: 64-bit binary from the least significant bits of the minhash values
:rtype: bytes
"""
return minhash_compress(minhash(features), 1)
return alg_minhash_compress(alg_minhash(features), 1)


def minhash_256(features):
def alg_minhash_256(features):
# type: (List[int]) -> bytes
"""
Create 256-bit minimum hash digest.
Expand All @@ -37,15 +37,15 @@ def minhash_256(features):
:return: 256-bit binary from the least significant bits of the minhash values
:rtype: bytes
"""
return minhash_compress(minhash(features), 4)
return alg_minhash_compress(alg_minhash(features), 4)


def minhash_compress(mhash, lsb=4):
def alg_minhash_compress(mhash, lsb=4):
# type: (List[int], int) -> bytes
"""
Compress minhash vector to byte hash-digest.
Concatenates `lsb` number of least significant bits from each integer in `mhash`.
Concatenates `lsb` number of least-significant bits from each integer in `mhash`.
For example an `mhash` with 64 integers and `lsb=4` will produce a 256-bit summary
of the minhash vector.
Expand Down
2 changes: 1 addition & 1 deletion iscc_core/simhash.pxd
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cpdef bytes similarity_hash(list hash_digests)
cpdef bytes alg_simhash(list hash_digests)
2 changes: 1 addition & 1 deletion iscc_core/simhash.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-


def similarity_hash(hash_digests):
def alg_simhash(hash_digests):
# type: (list[bytes]) -> bytes
"""
Creates a similarity preserving hash from a sequence of equal sized hash digests.
Expand Down
2 changes: 1 addition & 1 deletion iscc_core/wtahash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from bitarray import bitarray


def wtahash(vec: Sequence[float], bits) -> bytes:
def alg_wtahash(vec: Sequence[float], bits) -> bytes:
"""Calculate WTA Hash for vector with 380 values (MP7 frame signature)."""
h = []
for perm in WTA_VIDEO_ID_PERMUTATIONS:
Expand Down
26 changes: 13 additions & 13 deletions tests/test_cdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,46 +5,46 @@


def test_get_params():
assert iscc_core.cdc.cdc_params(1024) == (256, 8192, 640, 2047, 511)
assert iscc_core.cdc.cdc_params(8192) == (2048, 65536, 5120, 16383, 4095)
assert iscc_core.cdc.alg_cdc_params(1024) == (256, 8192, 640, 2047, 511)
assert iscc_core.cdc.alg_cdc_params(8192) == (2048, 65536, 5120, 16383, 4095)


def test_data_chunks_empty():
assert list(iscc_core.cdc.cdc_data_chunks(b"", False)) == [b""]
assert list(iscc_core.cdc.alg_cdc_chunks(b"", False)) == [b""]


def test_data_chunks_1byte():
assert list(iscc_core.cdc.cdc_data_chunks(b"\x00", False)) == [b"\x00"]
assert list(iscc_core.cdc.alg_cdc_chunks(b"\x00", False)) == [b"\x00"]


def test_data_chunks_below_min():
data = static_bytes(256 - 1)
assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]


def test_data_chunks_min():
data = static_bytes(256)
assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]


def test_data_chunks_above_min():
data = static_bytes(256 + 1)
assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]


def test_data_chunks_avg():
data = static_bytes(1024)
assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]


def test_data_chunks_avg_above():
data = static_bytes(1024 + 1)
assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data]
assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data]


def test_data_chunks_two_chunks():
data = static_bytes(1024 + 309)
assert list(iscc_core.cdc.cdc_data_chunks(data, False)) == [data[:-1], data[-1:]]
assert list(iscc_core.cdc.alg_cdc_chunks(data, False)) == [data[:-1], data[-1:]]


def test_data_chunks_max_odd():
Expand All @@ -58,7 +58,7 @@ def test_data_chunks_max_odd():
"5fae55e1aee84705fc3dc6e831d4f7981677e03338343bd6a783c45e333a55fe",
]
data = static_bytes(8192)
hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, False)]
hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, False)]
assert len(hashes) == 7
assert hashes == expected

Expand All @@ -75,7 +75,7 @@ def test_data_chunks_max_even():
"2032f28cfcdad86090b60fa5cfd8cc44b972df47d5f7e3637001d8e03b8fbc07",
]
data = static_bytes(8192 + 1000)
hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, False)]
hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, False)]
assert len(hashes) == 8
assert hashes == expected

Expand All @@ -93,6 +93,6 @@ def test_data_chunks_utf32():
"f249cbe070bba6b689251074ddb75aa3ddfc02caa357f2f8f714cfeb39523d96",
]
data = static_bytes(8192 + 1000)
hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.cdc_data_chunks(data, True)]
hashes = [blake3(c).hexdigest() for c in iscc_core.cdc.alg_cdc_chunks(data, True)]
assert len(hashes) == 9
assert hashes == expected
8 changes: 4 additions & 4 deletions tests/test_code_content_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,19 @@ def test_get_code_image():

def test_dct_empty():
with pytest.raises(ValueError):
ic.dct([])
ic.alg_dct([])


def test_dct_zeros():
assert ic.dct([0] * 64) == [0] * 64
assert ic.alg_dct([0] * 64) == [0] * 64


def test_dct_ones():
assert ic.dct([1] * 64) == [64] + [0] * 63
assert ic.alg_dct([1] * 64) == [64] + [0] * 63


def test_dct_range():
assert ic.dct(range(64))[0] == 2016
assert ic.alg_dct(range(64))[0] == 2016


def test_gen_image_code_schema_conformance():
Expand Down
Loading

0 comments on commit fdbe66a

Please sign in to comment.