/**
* Created by harry7 on 30/8/17.
*
* This file contains the code for the main functionality of the Apriori algorithm.
* The Apriori implemented here assumes two things:
* 1. All the data given will fit into memory
* 2. The number of unique items is not very small
* This code uses a trie (more like a hash tree) data structure to store the counts for frequent item sets.
* The configuration is read from config.csv, which must be present in the same folder as the executable.
*
*/
#include <bits/stdc++.h>
#include <tr1/unordered_map>
#include "trie.h"
#include "utilities.h"
using namespace std;
using namespace std::tr1;
/* ---------------- Global Variables ---------------- */
vector<int> frequencies; /* counts for all the items in the dataset */
unordered_map<string, int> map1; /* map from item names to integer ids */
vector<vector<int> > table; /* table which contains the transactions of the database in compressed (integer) format */
unordered_map<int, int> map2; /* map from frequent item ids to their 1-based rank among the frequent items */
vector<int> single_item_sets; /* Vector to hold all the frequent 1-item sets */
vector<vector<int> > tri_array; /* Triangular array to hold the counts for 2-item sets */
Trie tree; /* Hash tree to hold the counts for all item sets */
vector<string> item_map; /* Reverse map from integer id to the item */
vector<vector<vector<int> > > freq_item_sets; /* List of all frequent item sets of the data */
int min_count; /* minimum count of an item set for it to be frequent */
Configuration config; /* struct to hold the configuration given in `config.csv` */
/* ---------------- Function Declarations ---------------- */
/**
* Removes the infrequent items from each record of the table
*/
void purgeTable();
/**
* Makes a pass over all the records in the table, counting the frequency of
* each pair that occurs in a record and updating the triangular array
*/
void updatePairFrequencies();
/**
* Generates the next length (k+1) item sets from given length (k) item sets
*
* @param k current length of item sets
* @return candidate item sets of length k+1 which can be frequent
*/
vector<vector<int> > aprioriGen(int k);
/**
* Generates association rules from the frequent item sets present
*
* @param lhs left hand parts of association rules
* @param rhs right hand parts of association rules
* @return number of association rules generated
*/
int getAssociationRules(vector<vector<int> > &lhs, vector<vector<int> > &rhs);
/**
* Writes the frequent item sets to the output file specified in the config file.
* Uses @link{getAssociationRules} to generate the association rules and writes
* them to the output file if requested.
*
* @param freq_item_count number of frequent item sets in the data
*/
void writeOutput(int freq_item_count);
/* ---------------- Driver program code ---------------- */
int main() {
config = readConfig(); // Read the config from the config file
/*
* Read the input from the input file and prepare a map from items to integers
* and a reverse map from integers back to the items
*/
int num_records = readInput(map1, item_map, config.input_file);
table.resize((unsigned long) num_records);
frequencies.resize(item_map.size());
/*
* Build the table from the input using the mapped values and count the frequencies
* of all the items
*/
makeTable(table, map1, frequencies, config.input_file);
int num_items = int(item_map.size());
tree.root = tree.newNode(0, num_items + 1);
vector<vector<int> > empty_set;
vector<int> temp;
freq_item_sets.push_back(empty_set); // 0 length item-sets
freq_item_sets.push_back(empty_set); // 1 length item-sets
min_count = int(ceil(num_records * config.support));
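// An item set is frequent iff it occurs in at least min_count = ceil(support * num_records) transactions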
// Generate all the frequent 1-item sets by checking frequencies and update the tree
int counter = 0;
for (int i = 1; i < num_items; i++) {
if (frequencies[i] >= min_count) {
//cout << item_map[i] <<" " << frequencies[i] << " " << min_count << "\n";
single_item_sets.push_back(i);
temp.push_back(i);
freq_item_sets[1].push_back(temp);
tree.insert(temp, num_items + 1, frequencies[i]);
temp.pop_back();
map2[i] = counter + 1;
counter++;
}
}
/*
* Iterate over all the records in the table, remove the infrequent items
* from each record and update the records in the table
*/
purgeTable();
// Generate the triangular array used for counting pairs when building the 2-item sets
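// tri_array[i][j] (with 1 <= j < i <= counter) holds the count of the pair whose map2 ranks are j and i, so row i only needs i entries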
tri_array.resize((unsigned long) (counter + 1));
for (int i = 2; i <= counter; i++) {
for (int j = 0; j < i; j++) {
tri_array[i].push_back(0);
}
}
// Go through the updated records and update the frequency count of each pair
updatePairFrequencies();
// Generate the frequent 2 item sets by going through the pair frequencies
freq_item_sets.push_back(empty_set); // for 2 item sets
vector<int> pair_temp;
for (int j = 1; j < counter; j++) {
for (int i = j + 1; i <= counter; i++) {
if (tri_array[i][j] >= min_count) {
pair_temp.push_back(single_item_sets[j - 1]);
pair_temp.push_back(single_item_sets[i - 1]);
freq_item_sets[2].push_back(pair_temp);
tree.insert(pair_temp, num_items + 1, tri_array[i][j]);
pair_temp.pop_back();
pair_temp.pop_back();
}
}
}
// Run Apriori for the remaining item set sizes using the tree
int freq_item_count = int(freq_item_sets[1].size() + freq_item_sets[2].size());
// cerr << "2-item sets done with total count " << freq_item_count << "\n";
int current = 3;
vector<int> k_set_frequencies;
while (freq_item_sets[current - 1].size() > 1) {
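// at least two frequent (current-1)-item sets are needed for aprioriGen to join into a new candidate, so stop once fewer remain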
vector<vector<int> > gen = aprioriGen(current - 1);
// count frequencies for all the item sets returned by the aprioriGen
k_set_frequencies.clear();
for (int i = 0; i < gen.size(); i++) {
k_set_frequencies.push_back(0);
}
// Iterate through all records and check if that record contains the item set
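// note: this forward scan assumes the record and the candidate list their items in the same (ascending) order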
int gen_size = int(gen.size());
for (int i = 0; i < num_records; i++) {
for (int j = 0; j < gen_size; j++) {
int ind1 = 0;
int ind2 = 0;
int ind1_max = int(table[i].size());
while (ind2 < current) {
while (ind1 < ind1_max && table[i][ind1] != gen[j][ind2]) {
ind1++;
}
if (ind1 == ind1_max) {
break;
}
ind2++;
}
if (ind2 == current) {
// All the items are present so increase the count
k_set_frequencies[j]++;
}
}
}
// Push the frequent item sets into the answer and insert into tree
freq_item_sets.push_back(empty_set);
int k_freq_item_count = 0;
for (int j = 0; j < gen_size; j++) {
if (k_set_frequencies[j] >= min_count) {
freq_item_sets[current].push_back(gen[j]);
tree.insert(gen[j], num_items + 1, k_set_frequencies[j]);
k_freq_item_count++;
}
}
freq_item_count += k_freq_item_count;
current++;
}
writeOutput(freq_item_count);
return 0;
}
/* ---------------- Implementation of above declared methods ---------------- */
void purgeTable() {
vector<int> temp;
for (int i = 0; i < table.size(); i++) {
// Go through each record of the table and keep track of the items which are frequent
for (vector<int>::iterator it = table[i].begin(); it != table[i].end(); it++) {
if (frequencies[*it] >= min_count) {
temp.push_back(*it);
}
}
// Modify the record so that it contains only frequent items
table[i].clear();
for (vector<int>::iterator it = temp.begin(); it != temp.end(); it++) {
table[i].push_back(*it);
}
temp.clear();
}
}
void updatePairFrequencies() {
/*
* Go through each record in table and update the frequency for all
* the pairs present in the record
*/
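// tri_array is indexed as [map2 rank of the later item][map2 rank of the earlier item] within each record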
for (int i = 0; i < table.size(); i++) {
for (int j = 0; j < table[i].size(); j++) {
int ind2 = map2[table[i][j]];
for (int k = j + 1; k < table[i].size(); k++) {
int ind1 = map2[table[i][k]];
tri_array[ind1][ind2]++;
}
}
}
}
vector<vector<int> > aprioriGen(int k) {
int len = int(freq_item_sets[k].size());
int i = 0, j = 1;
vector<vector<int> > gen;
/*
* Iterate through the frequent k-item sets. For each index, find the maximal range of
* item sets that agree with the current one on all items except the last. Then form
* the union of each pair of sets in that range and check whether any immediate subset
* is not frequent. If that is the case, prune that candidate. Finally, return all the
* candidate frequent item sets of size k + 1.
*
*/
for (; i < len && j < len;) {
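// the inner loop advances j to the end of the block [i, j) of item sets that share their first k - 1 items with set i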
for (; j < len; j++) {
bool match = true;
// check if the two sets are the same except for the last item
for (int l = 0; l < k - 1; l++) {
if (freq_item_sets[k][i][l] != freq_item_sets[k][j][l]) {
// found a mismatch before the last item
match = false;
break;
}
}
if (!match) {
break;
}
}
vector<int> temp;
for (int l = 0; l < k - 1; l++) {
temp.push_back(freq_item_sets[k][i][l]);
}
/*
* Iterate over all pairs of item sets in this range (identical except for the last item)
* and generate the candidate item sets of size k + 1.
*/
for (int ind1 = i; ind1 < j; ind1++) {
for (int ind2 = ind1 + 1; ind2 < j; ind2++) {
temp.push_back(freq_item_sets[k][ind1][k - 1]);
temp.push_back(freq_item_sets[k][ind2][k - 1]);
vector<int> tmp;
bool frequent = true;
/* Prune out the item set if any immediate subset is not frequent */
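// only the k - 1 subsets that drop one of the shared prefix items need checking; dropping temp[k-1] or temp[k] yields one of the two generating sets, which are already known to be frequent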
for (int l = 0; l < k - 1; l++) {
for (int m = 0; m < k - 1; m++) {
if (l != m) {
tmp.push_back(temp[m]);
}
}
tmp.push_back(temp[k - 1]);
tmp.push_back(temp[k]);
if (tree.getCount(tmp) < min_count) {
frequent = false;
break;
}
tmp.clear();
}
if (frequent) {
// every immediate subset is frequent, so add this candidate to the list
gen.push_back(temp);
}
temp.pop_back();
temp.pop_back();
}
}
i = j;
j = i + 1;
}
return gen;
}
void writeOutput(int freq_item_count) {
ofstream output(config.output_file.c_str());
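// output format: first the number of frequent item sets, then one comma-separated item set per line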
output << freq_item_count << endl;
for (int i = 0; i < freq_item_sets.size(); i++) {
for (int j = 0; j < freq_item_sets[i].size(); j++) {
int item_sz = int(freq_item_sets[i][j].size());
for (int k = 0; k < item_sz - 1; k++) {
output << item_map[freq_item_sets[i][j][k]] << ",";
}
output << item_map[freq_item_sets[i][j][item_sz - 1]] << "\n";
}
}
if (config.flag == 0) {
output.close();
return;
}
// get the association rules from frequent item_sets
vector<vector<int> > assoc_rules_lhs;
vector<vector<int> > assoc_rules_rhs;
int num_assoc_rules = getAssociationRules(assoc_rules_lhs, assoc_rules_rhs);
output << num_assoc_rules << "\n";
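// each rule is written as the lhs items, the token "=>", then the rhs items, all comma-separated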
for (int i = 0; i < num_assoc_rules; i++) {
int lhs_len = int(assoc_rules_lhs[i].size());
for (int j = 0; j < lhs_len; j++) {
output << item_map[assoc_rules_lhs[i][j]] << ",";
}
output << "=>,";
int rhs_len = int(assoc_rules_rhs[i].size());
for (int j = 0; j < rhs_len - 1; j++) {
output << item_map[assoc_rules_rhs[i][j]] << ",";
}
output << item_map[assoc_rules_rhs[i][rhs_len - 1]] << "\n";
}
output.close();
}
int getAssociationRules(vector<vector<int> > &lhs, vector<vector<int> > &rhs) {
/*
* Go through each frequent item set, pick a subset of it as the lhs and the
* remaining elements as the rhs, and calculate the confidence of the resulting
* rule. If it meets the minimum confidence, add the two parts to lhs and rhs
* respectively.
*/
int num_assoc_rules = 0;
vector<int> lhs_items;
vector<int> rhs_items;
for (int item_sz = 2; item_sz < freq_item_sets.size(); item_sz++) {
int num_sets = int(freq_item_sets[item_sz].size());
int lim = (1 << item_sz) - 1; // bitmask with all item_sz bits set; j in [1, lim) enumerates the proper non-empty subsets
for (int i = 0; i < num_sets; i++) {
int sup = tree.getCount(freq_item_sets[item_sz][i]);
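// sup is the support count of the full item set; it is the numerator of the confidence for every rule split of this set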
for (int j = 1; j < lim; j++) {
for (int k = 0; k < item_sz; k++) {
// if bit k of j is set, the k-th item goes to the lhs; otherwise it goes to the rhs
if (j >> k & 1) {
lhs_items.push_back(freq_item_sets[item_sz][i][k]);
} else {
rhs_items.push_back(freq_item_sets[item_sz][i][k]);
}
}
// check the confidence: the rule qualifies iff support(set) / support(lhs) >= config.confidence, i.e. sup >= ceil(confidence * lhs_sup)
int lhs_sup = tree.getCount(lhs_items);
int min_sup = int(ceil(config.confidence * lhs_sup));
if (sup >= min_sup) {
num_assoc_rules++;
lhs.push_back(lhs_items);
rhs.push_back(rhs_items);
}
lhs_items.clear();
rhs_items.clear();
}
}
}
return num_assoc_rules;
}
void printTable() {
// For debugging purposes
for (int i = 0; i < table.size(); i++) {
for (int j = 0; j < table[i].size(); j++) {
cout << item_map[table[i][j]] << ", ";
}
cout << "\n";
}
}