Use shelve to reduce RAM usage
cau777 committed Mar 28, 2023
1 parent 03d1a32 commit 4601ec7
Showing 6 changed files with 32 additions and 11 deletions.
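
What the change does: app.py previously obtained the document frequencies through lazy_data.load_document(); the diff below replaces that with shelve, which keeps the pickled mapping on disk and unpickles only the entries that are actually looked up. A minimal sketch of that access pattern (the word key "example" is hypothetical; "__count__" is the corpus-size entry used in app.py):

import shelve

# Sketch of the on-demand access pattern introduced by this commit:
# the shelf stays on disk, and each lookup unpickles only one record.
with shelve.open("./documents.shelf") as shelf:
    document_count = shelf["__count__"]   # small scalar entry read on its own
    example_df = shelf.get("example", 0)  # hypothetical word key; 0 if absent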
2 changes: 1 addition & 1 deletion key_terms_extractor/Dockerfile
@@ -14,7 +14,7 @@ RUN pip install -r requirements.txt
COPY nltk_setup.py ./
RUN python nltk_setup.py

COPY app.py dataset_loader.py lazy_data.py text_processing.py documents.json ./
COPY app.py dataset_loader.py lazy_data.py text_processing.py documents.json documents.shelf.bak documents.shelf.dat documents.shelf.dir ./

EXPOSE 5000
#ENTRYPOINT ["tail", "-f", "/dev/null"]
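
The extra documents.shelf.bak / .dat / .dir files on the COPY line are the on-disk database behind the shelf: this file triple is what shelve produces when it uses Python's dbm.dumb backend (.dat holds the data, .dir the index, .bak an index backup). Copying them into the image lets app.py open the shelf without regenerating it. A quick check of which backend will open the files (a sketch, not part of the commit):

import dbm

# Reports the dbm backend that shelve.open("./documents.shelf") would use,
# e.g. "dbm.dumb" for the .bak/.dat/.dir triple copied above.
print(dbm.whichdb("./documents.shelf"))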
21 changes: 11 additions & 10 deletions key_terms_extractor/app.py
@@ -1,8 +1,8 @@
import shelve
from collections import defaultdict
from math import log
from flask import Flask, jsonify, request
from werkzeug.exceptions import BadRequest
from lazy_data import load_document
from text_processing import extract_pos_tokens, lemmatize_tokens, normalize_text, ENGLISH_STOPWORDS

app = Flask(__name__)
@@ -32,15 +32,16 @@ def hello_world():
word_count += 1

words_scores = dict()
document_count, document_frequencies = load_document()
for word, freq in frequencies.items():
word_document_count = 0
if word in document_frequencies:
word_document_count = document_frequencies[word]

tf = freq / word_count
idf = log((document_count + 1) / (1 + word_document_count))
words_scores[word] = tf * idf
with shelve.open("./documents.shelf") as document_frequencies:
document_count = document_frequencies["__count__"]
for word, freq in frequencies.items():
word_document_count = 0
if word in document_frequencies:
word_document_count = document_frequencies[word]

tf = freq / word_count
idf = log((document_count + 1) / (1 + word_document_count))
words_scores[word] = tf * idf

# Only get the count words with the highest score
s = sorted(words_scores.items(), key=lambda item: -item[1])[:min(count, len(words_scores))]
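
The scoring itself is unchanged by this commit: tf is the word's share of the request text, and idf applies add-one smoothing to the document counts now read from the shelf. A worked example with hypothetical numbers (not taken from the dataset):

from math import log

# Hypothetical figures: a word occurring 3 times in a 100-word request,
# appearing in 20 of 1000 corpus documents.
freq, word_count = 3, 100
document_count, word_document_count = 1000, 20

tf = freq / word_count                                       # 0.03
idf = log((document_count + 1) / (1 + word_document_count))  # log(1001 / 21) ≈ 3.86
score = tf * idf                                             # ≈ 0.116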
3 changes: 3 additions & 0 deletions key_terms_extractor/documents.shelf.bak
@@ -0,0 +1,3 @@
'__count__', (0, 5)
'count', (512, 17)
'frequencies', (1024, 5602906)
Binary file added key_terms_extractor/documents.shelf.dat
3 changes: 3 additions & 0 deletions key_terms_extractor/documents.shelf.dir
@@ -0,0 +1,3 @@
'__count__', (0, 5)
'count', (512, 17)
'frequencies', (1024, 5602906)
14 changes: 14 additions & 0 deletions key_terms_extractor/shelf_writer.py
@@ -0,0 +1,14 @@
import json
import shelve


def main():
with open(r"./documents.json", "r", encoding="utf-8") as f, \
shelve.open(r"./documents.shelf", "c") as dest:
source: dict = json.load(f)
for key, value in source.items():
dest[key] = value


if __name__ == '__main__':
main()
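
shelf_writer.py is a one-off conversion step: it copies the top-level keys of documents.json into documents.shelf before the image is built. A read-back check along these lines (a sketch, assuming the keys listed in documents.shelf.dir above) confirms the conversion:

import shelve

# Sketch of a post-conversion sanity check; key names follow the
# documents.shelf.dir index above ("__count__", "count", "frequencies").
with shelve.open("./documents.shelf") as shelf:
    print(shelf["__count__"])   # corpus document count used by app.py
    print(list(shelf.keys()))   # top-level keys copied from documents.json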
