-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis2.py
executable file
·59 lines (50 loc) · 1.78 KB
/
analysis2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python
## ex: analysis2.py <train.csv> <test.csv>
import os
import sys
from collections import Counter
from datetime import datetime as dt
from time import time

import numpy as np
import pandas as pd

from MyVecs import MyTfidfVectorizer
def main():
    """Sum the TF-IDF dot products of each test document against all training documents.

    Command-line arguments:
        sys.argv[1]: path to the training CSV.
        sys.argv[2]: path to the test CSV.

    Both CSV files are read with pandas and must expose a ``content`` column.
    The result (one summed score per test row) is written with ``np.save`` to
    ``<train file name up to its first dot>.npy`` next to the training file.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: analysis2.py <train.csv> <test.csv>")
    train = sys.argv[1]
    test = sys.argv[2]

    # Derive the output stem from the file name only. Splitting the whole
    # path on '.' breaks for inputs such as "./data/train.csv" (empty stem)
    # or "runs.v1/train.csv" (stem cut inside the directory name).
    directory, filename = os.path.split(train)
    stem = filename.split(sep='.')[0] or filename  # keep dot-files usable
    savefile = os.path.join(directory, stem)

    #### import file ####
    print("---- reading data files ----", dt.now())
    df_train = pd.read_csv(train)
    df_test = pd.read_csv(test)

    #### make training vectorizer ####
    print("---- constructing training vector ----", dt.now())
    # The vocabulary is restricted to whitespace tokens that occur more than
    # once in the training content (see bow()).
    vec_train = MyTfidfVectorizer(ngram_range=(1, 2),
                                  stop_words='english',
                                  vocabulary=bow(df_train.content))
    tfidf_train = vec_train.fit_transform(df_train.content)

    #### make test vectorizer ####
    print("---- make test vectorizer ----", dt.now())
    # Reuse the training feature names so both matrices share the same
    # columns. NOTE(review): fit_transform on the test set presumably
    # recomputes IDF weights from the test data -- confirm this is intended.
    vec_test = MyTfidfVectorizer(ngram_range=(1, 2),
                                 stop_words='english',
                                 vocabulary=vec_train.get_feature_names())
    tfidf_test = vec_test.fit_transform(df_test.content)

    #### calculate dot products ####
    print("---- calculate dot products ----", dt.now())
    # Presumably a (n_test, n_train) matrix of pairwise dot products --
    # depends on what MyTfidfVectorizer returns; the shapes are printed below.
    dots = tfidf_test.dot(tfidf_train.T)
    # Multiplying by a column of ones sums each row, i.e. the total
    # similarity of a test document to every training document.
    one = np.ones((len(df_train), 1))
    print("dots", np.shape(dots))
    print("one", np.shape(one))
    sums = dots.dot(one)
    print(np.shape(sums))
    np.save(savefile + ".npy", sums)
def bow(series):
    """Return the set of whitespace-separated tokens that occur more than once.

    Args:
        series: iterable of strings (e.g. a pandas Series of tweet texts).
            Each item is tokenised with ``str.split()``; no lower-casing or
            punctuation stripping is applied.

    Returns:
        A ``set`` of tokens whose total count across all items is at least 2.
        Empty input yields an empty set.
    """
    counts = Counter(word for tweet in series for word in tweet.split())
    # Tokens seen only once are dropped from the vocabulary.
    return {word for word, count in counts.items() if count > 1}
if __name__ == "__main__":
    # Run the analysis and report the elapsed wall-clock time in seconds.
    started = time()
    main()
    elapsed = time() - started
    print(elapsed)