Skip to content

Commit

Permalink
Improve simhash performance
Browse files Browse the repository at this point in the history
  • Loading branch information
titusz committed Dec 7, 2023
1 parent 1521026 commit dd4f13e
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 11 deletions.
6 changes: 2 additions & 4 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
The shared library can also be built manually using the following commands:
$ cythonize -X language_level=3 -a -i ./iscc_core/cdc.py
$ cythonize -X language_level=3 -a -i ./iscc_core/minhash.py
$ cythonize -X language_level=3 -a -i ./iscc_core/simhash.py
"""
try:
from Cython.Build import cythonize, build_ext
Expand Down Expand Up @@ -43,10 +44,7 @@ def build(setup_kwargs):
setup_kwargs.update(
dict(
ext_modules=cythonize(
[
"iscc_core/cdc.py",
"iscc_core/minhash.py",
]
["iscc_core/cdc.py", "iscc_core/minhash.py", "iscc_core/simhash.py"]
),
cmdclass=dict(build_ext=build_ext_gracefull),
)
Expand Down
1 change: 1 addition & 0 deletions iscc_core/simhash.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cpdef bytes alg_simhash(list[bytes] hash_digests)
15 changes: 8 additions & 7 deletions iscc_core/simhash.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
from bitarray import bitarray


def alg_simhash(hash_digests):
Expand All @@ -16,16 +17,16 @@ def alg_simhash(hash_digests):
vector = [0] * n_bits

for digest in hash_digests:
h = int.from_bytes(digest, "big", signed=False)

h = bitarray()
h.frombytes(digest)
for i in range(n_bits):
vector[i] += h & 1
h >>= 1
vector[i] += h[i]

minfeatures = len(hash_digests) * 1.0 / 2
minfeatures = len(hash_digests) / 2
shash = 0

for i in range(n_bits):
shash |= int(vector[i] >= minfeatures) << i
if vector[i] >= minfeatures:
shash |= 1 << (n_bits - 1 - i)

return shash.to_bytes(n_bytes, "big", signed=False)
return shash.to_bytes(n_bytes, "big")

0 comments on commit dd4f13e

Please sign in to comment.