-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans_plus.py
42 lines (34 loc) · 1.99 KB
/
kmeans_plus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# 208539270 yardenbakish
# 208983700 tamircohen1
"""kmeans_plus - This module chooses K initial centroids out of N observations
and returns a list whose i'th entry is the index of the observation chosen
as the i'th centroid. The function is called from the NSC and main modules."""
import EuclideanDistance as EC
import numpy as np
def kmeans_pp(observations, K, D, N):
    """Choose K initial centroids out of N observations (k-means++ seeding).

    The first centroid is drawn uniformly at random. Every following
    centroid is drawn with probability proportional to each observation's
    squared distance (as computed by ``EC.sqEucDist``) to the nearest
    centroid chosen so far.

    NumPy's global random generator is re-seeded with 0 on every call, so
    the selection is reproducible for identical inputs.

    Args:
        observations: Flat sequence in which observation ``i`` occupies the
            slice ``[i*D:(i+1)*D]``.
        K: Number of centroids to select. For ``K < 2`` only the first,
            uniformly drawn index is returned.
        D: Number of coordinates per observation.
        N: Number of observations.

    Returns:
        A list of ``int`` observation indices; entry ``i`` is the
        observation chosen as the i'th centroid.
    """
    np.random.seed(0)  # fixed seed: repeated calls give the same selection
    indices = np.arange(0, N)  # indices of all observations

    # The first centroid is picked uniformly at random.
    selected_indices = [int(np.random.choice(indices))]

    # D_list[i] is the smallest squared distance from observation i to any
    # centroid chosen so far. It starts at +inf so that the first update
    # below simply stores the distance to the first centroid.
    D_list = np.full(N, np.inf)

    # Select the K-1 remaining centroids.
    for _ in range(K - 1):
        # Only the most recently added centroid can lower a minimum, so it
        # is the only one compared against; its slice is taken once.
        newest = selected_indices[-1]
        centroid = observations[newest * D:(newest + 1) * D]
        for i in range(N):
            dist = EC.sqEucDist(observations[i * D:(i + 1) * D], centroid)
            D_list[i] = np.minimum(dist, D_list[i])

        total = np.sum(D_list)
        if total == 0:
            # Every observation coincides with an already chosen centroid;
            # dividing by zero would hand NaN probabilities to
            # np.random.choice (ValueError), so draw uniformly instead.
            index = np.random.choice(indices)
        else:
            # Probability of each observation being picked as next centroid.
            index = np.random.choice(indices, p=D_list / total)
        selected_indices.append(int(index))
    return selected_indices