-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaggregate.awk
210 lines (195 loc) · 6.09 KB
/
aggregate.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/bin/awk -f
##
# Aggregate a CSV file and expose the COUNT, SUM and DISTINCT methods
# @param string header New header line
# @param int omitHeader If 1, exclude the first line
# @param int distinctLine If, aggregate by the entire line
# @param string avgColumns List of columns by index to use to calculate average, separated by space
# @param string distinctColumns List of columns by index to use in single mode, separated by space
# @param string minColumns List of columns by index where find the min value, separated by space
# @param string maxColumns List of columns by index where find the max value, separated by space
# @param string countColumns List of columns by index to count, separated by space
# @param string sumColumns List of columns by index to sum, separated by space
# @param string groupByColumns List of columns by index to use to group, separated by space
#
# @copyright 2016 Hervé Gouchet
# @license http://www.apache.org/licenses/LICENSE-2.0
# @source https://github.com/rvflash/termtables
BEGIN {
# Separators (input)
FS=",";
# Skip header line
start=1;
if (1 == omitHeader) {
start++;
}
# Declare default values
groupSize=1;
split("", count);
split("", order);
# Convert string args to array
splitFlip(avgColumns, averages, " ");
splitFlip(distinctColumns, distincts, " ");
splitFlip(countColumns, counts, " ");
splitFlip(minColumns, minimums, " ");
splitFlip(maxColumns, maximums, " ");
splitFlip(sumColumns, sums, " ");
splitFlip(groupByColumns, groups, " ");
}
(NR >= start) {
# Split line with comma separated values and deal with comma inside quotes
numberFields=splitLine($0, columns);
# Group by (single or multiple columns)
if (1 == distinctLine) {
group=$0;
} else {
group="";
for (column in groups) {
if ("" == group) {
group=columns[column];
} else {
group=group FS columns[column];
}
}
}
# If no group has been defined but first column requested as distinct value, use it for grouping
if ("" == group && 1 in distincts) {
group=columns[1];
}
# Keep original order by
if (group in count) {
# awk seems to not have pattern for "not in array"
} else {
order[groupSize++]=group;
}
aggregating[group]=$0;
count[group]++;
for (column=1; column <= numberFields; column++) {
# Count distinct
distinct[column FS group FS columns[column]]++;
# Sum
sum[column FS group]+=columns[column];
# Max
if ("" == max[column FS group]) {
max[column FS group]=columns[column]
}
if (columns[column] > max[column FS group]) {
max[column FS group]=columns[column];
}
# Min
if ("" == min[column FS group]) {
min[column FS group]=columns[column]
}
if (columns[column] < min[column FS group]) {
min[column FS group]=columns[column];
}
}
}
END {
# Empty file
if (NR < start) {
exit;
}
# Output the new header line
if ("" != header) {
print header;
}
for (pos=1; pos < groupSize; pos++) {
group=order[pos];
numberFields=splitLine(aggregating[group], columns);
for (column=1; column <= numberFields; column++) {
if (column in counts) {
if (column in distincts) {
printf("%d", countKeyWith(column FS group FS, distinct))
} else {
printf("%d", count[group])
}
} else if (column in sums) {
printNumber(sum[column FS group], decimal);
} else if (column in averages) {
printNumber((sum[column FS group] / count[group]), decimal);
} else if (column in minimums) {
printNumber(min[column FS group], decimal);
} else if (column in maximums) {
printNumber(max[column FS group], decimal);
} else {
# Protect column containing comma with quotes
printf("%s", (columns[column] ~ FS ? "\"" columns[column] "\"" : columns[column]))
}
if (column < numberFields) {
printf(FS)
} else {
printf(RS)
}
}
}
}
##
# Count all elements in an array with key beginning by needle
# @param mixed needle
# @param array haystack
# @return int
function countKeyWith (needle, haystack) {
countKey=0
for (key in haystack) {
if (match(key, "^" needle)) {
countKey++;
}
}
return countKey
}
##
# Prints number with float format with 4 decimals only if necessary
# @param int|float number
# @param int number, by default 6
# @param string
function printNumber (number, decimal)
{
if ("" == decimal) {
decimal=6
}
if (number == int(number)) {
printf("%d", number);
} else {
number=sprintf("%.*f", decimal, number);
printf("%s", (number ~ FS ? "\"" number "\"" : number));
}
}
##
# Split string and exchanges all keys with their associated values in an array, return length of array
# @param string source
# @param array destination
# @param string separator
# @return int
function splitFlip (string, array, separator)
{
# Declare array to manage empty string
split("", array);
split(string, arr, separator);
for (key in arr) {
array[arr[key]]=key;
}
return length(array);
}
##
# Split line and deal with escaping separator within double quotes
# Cheating with CSV file that contains comma inside a quoted field
# @param string line
# @param array columns
# @return int
function splitLine (line, columns)
{
numberFields=0;
line=line FS;
while(line) {
match(line, / *"[^"]*" *,|[^,]*,/);
field=substr(line, RSTART, RLENGTH);
# Remove extra data
gsub(/^ *"?|"? *,$/, "", field);
numberFields++;
columns[numberFields]=field;
# So, next ?
line=substr(line, RLENGTH+1);
}
return numberFields
}