-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcsim.py
137 lines (109 loc) · 4.25 KB
/
csim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import sys
import json
import nltk
import string
import indexer
import requests
import makeuplink
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Directory holding the scraped lyric files used as the training corpus.
TRAIN_DATA_DIR = 'youtubeScraper/ok/'
# File containing the user's free-text query.
USERINPUT = 'input1.txt'
# Main index database: one JSON object per line (read line-by-line below),
# each expected to carry at least 'sentiment' and 'filename' keys.
INDEXDB = 'INDEX.db'
def stem_tokens(tokens):
    ''' Stem each token with the module-level Porter stemmer and return the list. '''
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed
def normalize(text):
    ''' Lowercase the text, strip punctuation, then tokenize and stem it. '''
    cleaned = text.lower().translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(cleaned)
    return stem_tokens(tokens)
def cosine_sim(text1, text2):
    ''' Return the cosine similarity between two non-zero vectors of an inner product space.

    The shared TF-IDF vectorizer is refit on just these two documents; because
    TF-IDF rows are L2-normalized, the off-diagonal entry of the Gram matrix
    (tfidf * tfidf.T) is exactly the cosine similarity of the two texts.
    '''
    tfidf = vectorizer.fit_transform([text1, text2])
    # Use .toarray() instead of the .A shorthand, which is deprecated and
    # removed for scipy sparse arrays in recent scipy releases.
    return (tfidf * tfidf.T).toarray()[0, 1]
def get_related_words(word):
    ''' Deprecated no-op kept for backward compatibility: echoes the word unchanged. '''
    return word
def read_file(filename):
    ''' Return the full text content of the given file. '''
    with open(filename, 'r') as handle:
        contents = handle.read()
    return contents
def get_score(userKeys, databaseKeys):
    ''' Score the user-defined terms against one training file's terms.

    Thin wrapper around cosine_sim so the scoring strategy can be swapped
    in one place.
    '''
    similarity = cosine_sim(userKeys, databaseKeys)
    return similarity
if __name__ == '__main__':
    # Make sure the nltk tokenizer models are present.
    nltk.download('punkt')
    # Porter stemmer plus a translation map that strips all punctuation.
    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    # TF-IDF vectorizer that runs our normalize() (lowercase, de-punctuate, stem).
    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

    # Read the user input file and reduce it to lowercase, stopword-free terms.
    userinput = read_file(USERINPUT)
    tokenizer = RegexpTokenizer(r"[\w']+")
    # Hoisted: stopwords.words() re-reads the corpus file on every call, and a
    # set gives O(1) membership tests inside the per-token filters below.
    english_stopwords = set(stopwords.words('english'))
    words = [get_related_words(w.lower())
             for w in tokenizer.tokenize(userinput)
             if w not in english_stopwords]
    userinput = ' '.join(words)

    # Score every file in the training database against the user input.
    _GLOBALDICTIONARY = []
    scores = []
    for filename in os.listdir(TRAIN_DATA_DIR):
        train_data_content = read_file(os.path.join(TRAIN_DATA_DIR, filename))
        words = [get_related_words(w.lower())
                 for w in tokenizer.tokenize(train_data_content)
                 if w not in english_stopwords]
        words = ' '.join(words)
        try:
            # Calculate the score between the two documents.
            score = get_score(words, userinput)
        except ValueError:
            # e.g. the vectorizer's vocabulary is empty after stopword
            # removal -- skip this file rather than abort the run.
            continue
        scores.append(score)
        # Record the filename together with its score.
        _GLOBALDICTIONARY.append({'rating': score, 'filename': filename})

    # Map the numeric sentiment of the query onto the labels the index uses.
    THISSENTIMENT = float(makeuplink.get_sentiment(USERINPUT))
    THISSENTIMENT = 'hpy' if THISSENTIMENT > 0.5 else 'sad'

    # Collect every index entry whose sentiment matches the query's.
    sentimentdictlist = []
    with open(INDEXDB, 'r') as mainindex:
        for line in mainindex:
            jsondict = json.loads(line)
            if jsondict['sentiment'] == THISSENTIMENT:
                sentimentdictlist.append(jsondict)

    # Ratings of the scored files that also match the detected sentiment.
    matching_filenames = set(sdict['filename'] for sdict in sentimentdictlist)
    finalscores = [gdict['rating'] for gdict in _GLOBALDICTIONARY
                   if gdict['filename'] in matching_filenames]

    # Print the best-rated match (guarded: the original sorted(...)[0] raised
    # IndexError when nothing matched the sentiment).
    if finalscores:
        highscore = max(finalscores)
        for gdict in _GLOBALDICTIONARY:
            if gdict['rating'] == highscore:
                for sdict in sentimentdictlist:
                    if gdict['filename'] == sdict['filename']:
                        print()
                        print(gdict)
                        print(sdict)