-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUser-user Cosine.py
136 lines (115 loc) · 2.86 KB
/
User-user Cosine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from math import sqrt
import codecs
def loadMovieLens(path=''):
    """Load the ``u5.base`` training ratings into a nested dictionary.

    The file is read as ASCII, one rating per line, with tab-separated
    fields. Only the first three fields are used: user id, movie id and
    rating. Any further fields on the line are ignored.

    Args:
        path: Prefix prepended verbatim to ``'u5.base'``; include the
            trailing separator yourself (e.g. ``'ml-100k/'``).

    Returns:
        A dict mapping user id (string) to a dict of
        ``{movie id (string): rating (int)}``. If a user rates the same
        movie more than once, the last line wins.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError, ValueError: if a non-blank line has fewer than three
            fields or a non-integer rating.
    """
    data = {}
    # ``with`` guarantees the handle is closed even if a line fails to parse.
    with codecs.open(path + 'u5.base', 'r', 'ascii') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (typically a trailing newline at EOF).
                continue
            fields = line.split('\t')
            user = fields[0]
            movie = fields[1]
            # The rating may be wrapped in double quotes; drop them first.
            rating = int(fields[2].strip().strip('"'))
            data.setdefault(user, {})[movie] = rating
    return data
def calcSim(user, userOther, data):
    """Return the cosine similarity between two users' rating vectors.

    The dot product runs over the movies both users have rated, while each
    vector norm runs over *all* of that user's ratings (unrated movies
    count as zero).

    Args:
        user: Key of the first user in ``data``.
        userOther: Key of the second user in ``data``.
        data: Mapping of user -> {movie: numeric rating}.

    Returns:
        The similarity as a float, or the integer ``0`` when the users
        share no rated movie or either norm is zero.

    Raises:
        KeyError: if either user is missing from ``data``.
    """
    ratings_a = data[user]
    ratings_b = data[userOther]

    shared = [movie for movie in ratings_a if movie in ratings_b]
    if not shared:
        return 0

    dot = sum(ratings_a[movie] * ratings_b[movie] for movie in shared)
    # ``values()`` (not the Python-2-only ``itervalues()``) so this runs on
    # both Python 2 and Python 3.
    norm_a = sqrt(sum(value ** 2 for value in ratings_a.values()))
    norm_b = sqrt(sum(value ** 2 for value in ratings_b.values()))

    denominator = norm_a * norm_b
    if denominator == 0:
        return 0
    return dot / denominator
def createSimMatrix(data):
    """Build the full user-by-user similarity table.

    Every ordered pair of users in ``data`` (including each user paired
    with themself) is scored with ``calcSim``.

    Args:
        data: Mapping of user -> {movie: rating}.

    Returns:
        A dict mapping each user to a dict of ``{other user: similarity}``.
        Inner dicts are filled in the iteration order of ``data``.
    """
    simMatrix = {}
    for user in data:
        simMatrix[user] = {
            userOther: calcSim(user, userOther, data) for userOther in data
        }
    return simMatrix
def computeNearestNeighbor(username, movie, data, simMatrix, i):
    """Return up to ``i`` most similar users who have rated ``movie``.

    Candidates are taken from ``simMatrix[username]`` in order of
    decreasing similarity (ties keep the mapping's own order). The user
    themself is never returned, and users with no rating for ``movie``
    are skipped.

    Args:
        username: User for whom neighbours are sought.
        movie: Movie the neighbours must have rated.
        data: Mapping of user -> {movie: rating}.
        simMatrix: Mapping of user -> {other user: similarity}.
        i: Maximum number of neighbours to return.

    Returns:
        A list of ``(neighbour, similarity)`` tuples, best first. The list
        is empty when ``username`` has no row in ``simMatrix`` or ``i`` is
        not positive.
    """
    similarities = simMatrix.get(username)
    if not similarities:
        # Unknown user: there is nothing to rank.
        return []

    ranked = sorted(similarities.items(), key=lambda pair: -pair[1])
    neighbours = []
    for other, similarity in ranked:
        # Compare counts by value; identity (``is``) on ints only works by
        # accident of CPython's small-integer cache.
        if len(neighbours) >= i:
            break
        # Exclude the user by name instead of assuming they sort first:
        # a neighbour with a tied similarity can precede them.
        if other == username:
            continue
        if data.get(other, {}).get(movie) is not None:
            neighbours.append((other, similarity))
    return neighbours
def calculateRating(username, movie, data, simMatrix, i):
    """Predict a rating as the similarity-weighted mean of neighbour ratings.

    The neighbours come from ``computeNearestNeighbor`` (at most ``i`` of
    them, all of whom have rated ``movie``).

    Args:
        username: User to predict for.
        movie: Movie to predict.
        data: Mapping of user -> {movie: rating}.
        simMatrix: Mapping of user -> {other user: similarity}.
        i: Maximum number of neighbours to use.

    Returns:
        The weighted average as a float, or the integer ``0`` when no
        prediction can be made (no neighbours, or the similarity weights
        sum to no more than 1e-14).
    """
    neighbours = computeNearestNeighbor(username, movie, data, simMatrix, i)

    # Float accumulators keep the final division a true division on
    # Python 2 even if the similarity values happen to be integers.
    weighted_total = 0.0
    weight_sum = 0.0
    for neighbour, similarity in neighbours:
        weighted_total += similarity * data[neighbour][movie]
        weight_sum += similarity

    # Guard against dividing by a zero or vanishingly small weight sum.
    if weight_sum > 0.00000000000001:
        return weighted_total / weight_sum
    return 0
def mainFunction(path=''):
    """Print the test-set RMSE for neighbourhood sizes 1 through 50.

    Ratings are loaded from ``path + 'u5.base'``, a user-user similarity
    table is built from them, and every row of ``path + 'u5.test'`` is
    predicted with ``calculateRating``. One line ``"<i> <rmse>"`` is
    printed per neighbourhood size ``i``.

    Only rows that received a prediction (``calculateRating`` returned a
    non-zero value) contribute to the RMSE. If no row could be predicted
    for some ``i``, ``nan`` is printed instead of raising
    ZeroDivisionError.

    NOTE(review): the nesting of the accumulation under the "prediction
    was made" check was reconstructed from a copy of this file that had
    lost its indentation -- confirm against the original.

    Args:
        path: Prefix prepended verbatim to both file names (include the
            trailing separator, e.g. ``'ml-100k/'``).
    """
    # Honour ``path`` for the training file as well as the test file.
    data = loadMovieLens(path)
    simMatrix = createSimMatrix(data)

    # The test set does not depend on ``i``: parse it once, not 50 times.
    test_rows = []
    with codecs.open(path + 'u5.test', 'r', 'ascii') as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.split('\t')
            rating = int(fields[2].strip().strip('"'))
            test_rows.append((fields[0], fields[1], rating))

    for i in range(1, 51):
        squared_error = 0.0
        predicted = 0
        for user, movie, rating in test_rows:
            calculated = calculateRating(user, movie, data, simMatrix, i)
            # Value comparison: ``is not 0`` tested object identity.
            if calculated != 0:
                squared_error += (calculated - rating) ** 2
                predicted += 1
        if predicted:
            rmse = sqrt(squared_error / predicted)
        else:
            rmse = float('nan')
        # Single-argument print call: same output on Python 2 and 3.
        print('%s %s' % (i, rmse))
# Script entry point: run the evaluation with 'ml-100k/' as the file prefix.
# NOTE(review): there is no __main__ guard, so this also runs on import.
mainFunction('ml-100k/')