Use shelve to reduce RAM usage
cau777 committed Mar 28, 2023
1 parent 03d1a32 commit 4601ec7
Showing 6 changed files with 32 additions and 11 deletions.
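
What the change does: app.py previously obtained the document frequencies through lazy_data.load_document(); the diff below replaces that with shelve, which keeps the pickled mapping on disk and unpickles only the entries that are actually looked up. A minimal sketch of that access pattern (the word key "example" is hypothetical; "__count__" is the corpus-size entry used in app.py):

import shelve

# Sketch of the on-demand access pattern introduced by this commit:
# the shelf stays on disk, and each lookup unpickles only one record.
with shelve.open("./documents.shelf") as shelf:
    document_count = shelf["__count__"]   # small scalar entry read on its own
    example_df = shelf.get("example", 0)  # hypothetical word key; 0 if absent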
2 changes: 1 addition & 1 deletion key_terms_extractor/Dockerfile
@@ -14,7 +14,7 @@ RUN pip install -r requirements.txt
COPY nltk_setup.py ./
RUN python nltk_setup.py

COPY app.py dataset_loader.py lazy_data.py text_processing.py documents.json ./
COPY app.py dataset_loader.py lazy_data.py text_processing.py documents.json documents.shelf.bak documents.shelf.dat documents.shelf.dir ./

EXPOSE 5000
#ENTRYPOINT ["tail", "-f", "/dev/null"]
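
The extra documents.shelf.bak / .dat / .dir files on the COPY line are the on-disk database behind the shelf: this file triple is what shelve produces when it uses Python's dbm.dumb backend (.dat holds the data, .dir the index, .bak an index backup). Copying them into the image lets app.py open the shelf without regenerating it. A quick check of which backend will open the files (a sketch, not part of the commit):

import dbm

# Reports the dbm backend that shelve.open("./documents.shelf") would use,
# e.g. "dbm.dumb" for the .bak/.dat/.dir triple copied above.
print(dbm.whichdb("./documents.shelf"))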
21 changes: 11 additions & 10 deletions key_terms_extractor/app.py
@@ -1,8 +1,8 @@
import shelve
from collections import defaultdict
from math import log
from flask import Flask, jsonify, request
from werkzeug.exceptions import BadRequest
from lazy_data import load_document
from text_processing import extract_pos_tokens, lemmatize_tokens, normalize_text, ENGLISH_STOPWORDS

app = Flask(__name__)
@@ -32,15 +32,16 @@ def hello_world():
word_count += 1

words_scores = dict()
document_count, document_frequencies = load_document()
for word, freq in frequencies.items():
word_document_count = 0
if word in document_frequencies:
word_document_count = document_frequencies[word]

tf = freq / word_count
idf = log((document_count + 1) / (1 + word_document_count))
words_scores[word] = tf * idf
with shelve.open("./documents.shelf") as document_frequencies:
document_count = document_frequencies["__count__"]
for word, freq in frequencies.items():
word_document_count = 0
if word in document_frequencies:
word_document_count = document_frequencies[word]

tf = freq / word_count
idf = log((document_count + 1) / (1 + word_document_count))
words_scores[word] = tf * idf

# Only get the count words with the highest score
s = sorted(words_scores.items(), key=lambda item: -item[1])[:min(count, len(words_scores))]
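
The scoring itself is unchanged by this commit: tf is the word's share of the request text, and idf applies add-one smoothing to the document counts now read from the shelf. A worked example with hypothetical numbers (not taken from the dataset):

from math import log

# Hypothetical figures: a word occurring 3 times in a 100-word request,
# appearing in 20 of 1000 corpus documents.
freq, word_count = 3, 100
document_count, word_document_count = 1000, 20

tf = freq / word_count                                       # 0.03
idf = log((document_count + 1) / (1 + word_document_count))  # log(1001 / 21) ≈ 3.86
score = tf * idf                                             # ≈ 0.116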
3 changes: 3 additions & 0 deletions key_terms_extractor/documents.shelf.bak
@@ -0,0 +1,3 @@
'__count__', (0, 5)
'count', (512, 17)
'frequencies', (1024, 5602906)
Binary file added key_terms_extractor/documents.shelf.dat
3 changes: 3 additions & 0 deletions key_terms_extractor/documents.shelf.dir
@@ -0,0 +1,3 @@
'__count__', (0, 5)
'count', (512, 17)
'frequencies', (1024, 5602906)
14 changes: 14 additions & 0 deletions key_terms_extractor/shelf_writer.py
@@ -0,0 +1,14 @@
import json
import shelve


def main():
with open(r"./documents.json", "r", encoding="utf-8") as f, \
shelve.open(r"./documents.shelf", "c") as dest:
source: dict = json.load(f)
for key, value in source.items():
dest[key] = value


if __name__ == '__main__':
main()
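
shelf_writer.py is a one-off conversion step: it copies the top-level keys of documents.json into documents.shelf before the image is built. A read-back check along these lines (a sketch, assuming the keys listed in documents.shelf.dir above) confirms the conversion:

import shelve

# Sketch of a post-conversion sanity check; key names follow the
# documents.shelf.dir index above ("__count__", "count", "frequencies").
with shelve.open("./documents.shelf") as shelf:
    print(shelf["__count__"])   # corpus document count used by app.py
    print(list(shelf.keys()))   # top-level keys copied from documents.json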
