From dd4f13e7f507ec7fd248e997cda9a85e835a8245 Mon Sep 17 00:00:00 2001 From: Titusz Pan Date: Thu, 7 Dec 2023 12:31:20 +0100 Subject: [PATCH] Improve simhash performance --- build.py | 6 ++---- iscc_core/simhash.pxd | 1 + iscc_core/simhash.py | 15 ++++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) create mode 100644 iscc_core/simhash.pxd diff --git a/build.py b/build.py index 6051570..63c6a33 100644 --- a/build.py +++ b/build.py @@ -5,6 +5,7 @@ The shared library can also be built manually using the command: $ cythonize -X language_level=3 -a -i ./iscc_core/cdc.py $ cythonize -X language_level=3 -a -i ./iscc_core/minhash.py +$ cythonize -X language_level=3 -a -i ./iscc_core/simhash.py """ try: from Cython.Build import cythonize, build_ext @@ -43,10 +44,7 @@ def build(setup_kwargs): setup_kwargs.update( dict( ext_modules=cythonize( - [ - "iscc_core/cdc.py", - "iscc_core/minhash.py", - ] + ["iscc_core/cdc.py", "iscc_core/minhash.py", "iscc_core/simhash.py"] ), cmdclass=dict(build_ext=build_ext_gracefull), ) diff --git a/iscc_core/simhash.pxd b/iscc_core/simhash.pxd new file mode 100644 index 0000000..0a12c37 --- /dev/null +++ b/iscc_core/simhash.pxd @@ -0,0 +1 @@ +cpdef bytes alg_simhash(list[bytes] hash_digests) diff --git a/iscc_core/simhash.py b/iscc_core/simhash.py index 89d20cf..32f032d 100644 --- a/iscc_core/simhash.py +++ b/iscc_core/simhash.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from bitarray import bitarray def alg_simhash(hash_digests): @@ -16,16 +17,16 @@ def alg_simhash(hash_digests): vector = [0] * n_bits for digest in hash_digests: - h = int.from_bytes(digest, "big", signed=False) - + h = bitarray() + h.frombytes(digest) for i in range(n_bits): - vector[i] += h & 1 - h >>= 1 + vector[i] += h[i] - minfeatures = len(hash_digests) * 1.0 / 2 + minfeatures = len(hash_digests) / 2 shash = 0 for i in range(n_bits): - shash |= int(vector[i] >= minfeatures) << i + if vector[i] >= minfeatures: + shash |= 1 << (n_bits - 1 - i) - return shash.to_bytes(n_bytes, "big", signed=False) + return shash.to_bytes(n_bytes, "big")