-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVector Search - Fast KNN.sas
427 lines (333 loc) · 16.5 KB
/
Vector Search - Fast KNN.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/* -----------------------------------------------------------------------------------------*
Vector Search - Fast KNN
Version: 1.1 (09FEB2024)
Created: Sundaresh Sankaran(sundaresh.sankaran@sas.com)
*------------------------------------------------------------------------------------------ */
/* Start a CAS session named ss and assign a libref for every available caslib. */
cas ss;
caslib _ALL_ assign;

/*-----------------------------------------------------------------------------------------*
   Step parameters.
   The values below are placeholders for illustration; replace them with your own.
*------------------------------------------------------------------------------------------*/

/* Base table: libref and table name. */
%let baseTable_lib             = PUBLIC;
%let baseTable_name_base       = BASETABLE;

/* Query table: libref and table name. */
%let queryTable_lib            = PUBLIC;
%let queryTable_name_base      = QUERYTABLE;

/* Output table: libref and table name. */
%let outputTable_lib           = PUBLIC;
%let outputTable_name_base     = OUTPUTTABLE;

/* Output distance table: libref and table name. */
%let outputDistTable_lib       = PUBLIC;
%let outputDistTable_name_base = OUTPUTDIST;

/* Search settings handed to the fastknn action by the execution macro. */
%let idCol                     = Doc_ID;
%let parallelMethod            = QUERY;
%let numMatches                = 10;
%let thresholdDistance         = 100;
%let searchMethod              = APPROXIMATE;

/* Tuning values that accompany the APPROXIMATE search method. */
%let mTrees                    = 10;
%let maxPoints                 = 100;
/*-----------------------------------------------------------------------------------------*
START MACRO DEFINITIONS.
*------------------------------------------------------------------------------------------*/
/* -----------------------------------------------------------------------------------------*
Error flag for capture during code execution.
*------------------------------------------------------------------------------------------ */
/* Error state shared by the macros of this step: flag (0 = no error, 1 = error) and a message. */
%global _fnn_error_flag _fnn_error_desc;
%let _fnn_error_flag = 0;

/* -----------------------------------------------------------------------------------------*
   Run trigger for this step. When upstream code has not supplied a value, the trigger is
   set to 1 (enabled). A value of 0 supplied upstream disables the step.
*------------------------------------------------------------------------------------------ */
%global _fnn_run_trigger;
%if %sysevalf(%superq(_fnn_run_trigger)=, boolean) %then %do;
    %put NOTE: Trigger macro variable _fnn_run_trigger does not exist. Creating it now.;
    %let _fnn_run_trigger = 1;
%end;

/*-----------------------------------------------------------------------------------------*
   Globals holding the CAS session indicator and the UUID of the active CAS session.
*------------------------------------------------------------------------------------------*/
%global casSessionExists _current_uuid_;
/*-----------------------------------------------------------------------------------------*
Macro to capture indicator and UUID of any currently active CAS session.
UUID is not expensive and can be used in future to consider graceful reconnect.
*------------------------------------------------------------------------------------------*/
%macro _fnn_checkSession;
    /*
       Sets casSessionExists from SESSFOUND for the session named in _SESSREF_ and, when a
       session is found, stores the first entry of the session.sessionId result in the
       macro variable _current_uuid_.
       Takes no parameters. Does nothing when _SESSREF_ does not exist.
    */
    %if %sysfunc(symexist(_SESSREF_)) %then %do;
        %let casSessionExists = %sysfunc(sessfound(&_SESSREF_.));
        %if &casSessionExists. = 1 %then %do;
            proc cas;
                session.sessionId result = sessresults;
                call symputx("_current_uuid_", sessresults[1]);
            quit;
            /* The NOTE is written after QUIT: macro statements placed inside the step are
               resolved before the step runs, so _current_uuid_ would not yet hold the
               value assigned by symputx. */
            %put NOTE: A CAS session &_SESSREF_. is currently active with UUID &_current_uuid_. ;
        %end;
    %end;
%mend _fnn_checkSession;
/*-----------------------------------------------------------------------------------------*
Macro to capture indicator and UUID of any currently active CAS session.
UUID is not expensive and can be used in future to consider graceful reconnect.
Input:
1. errorFlagName: name of an error flag that gets populated in case the connection is
not active. Provide this value in quotes when executing the macro.
Define this as a global macro variable in order to use downstream.
2. errorFlagDesc: Name of a macro variable which can hold a descriptive message output
from the check.
Output:
1. Informational note as required. We explicitly don't provide an error note since
there is an easy recourse(of being able to connect to CAS)
2. UUID of the session: macro variable which gets created if a session exists.
3. errorFlagName: populated
4. errorFlagDesc: populated
*------------------------------------------------------------------------------------------*/
%macro _env_cas_checkSession(errorFlagName, errorFlagDesc);
    /*
       Checks for an active CAS session and reports the outcome through two macro variables.
         errorFlagName : quoted name of the macro variable that receives 0 (session active)
                         or 1 (no usable session).
         errorFlagDesc : quoted name of the macro variable that receives the message text.
       When a session is active, its UUID is also stored in the global _current_uuid_.
    */
    %local _ecs_status _ecs_message;

    /* Discard any UUID left behind by an earlier check. */
    %if %sysfunc(symexist(_current_uuid_)) %then %do;
        %symdel _current_uuid_;
    %end;

    %if not %sysfunc(symexist(_SESSREF_)) %then %do;
        /* No session reference has been defined at all. */
        %put NOTE: No active CAS session ;
        %let _ecs_status  = 1;
        %let _ecs_message = No active CAS session. Connect to a CAS session upstream.;
    %end;
    %else %do;
        %let casSessionExists = %sysfunc(sessfound(&_SESSREF_.));
        %if &casSessionExists. = 1 %then %do;
            %global _current_uuid_;
            %let _current_uuid_ = ;
            proc cas;
                session.sessionId result = sessresults;
                call symputx("_current_uuid_", sessresults[1]);
            quit;
            %put NOTE: A CAS session &_SESSREF_. is currently active with UUID &_current_uuid_. ;
            %let _ecs_status  = 0;
            %let _ecs_message = CAS session is active.;
        %end;
        %else %do;
            /* A session reference exists but the session itself was not found. */
            %put NOTE: Unable to find a currently active CAS session. Reconnect or connect to a CAS session upstream. ;
            %let _ecs_status  = 1;
            %let _ecs_message = Unable to find a currently active CAS session. Reconnect or connect to a CAS session upstream.;
        %end;
    %end;

    /* Publish the outcome into the caller-named macro variables. */
    data _null_;
        call symputx(&errorFlagName., &_ecs_status.);
        call symput(&errorFlagDesc., "&_ecs_message.");
    run;
%mend _env_cas_checkSession;
/*-----------------------------------------------------------------------------------------*
This macro creates a global macro variable called _usr_nameCaslib
that contains the caslib name (aka. caslib-reference-name) associated with the libname
and assumes that the libname is using the CAS engine.
As sysvalue has a length of 1024 chars, we use the trimmed option in proc sql
to remove leading and trailing blanks in the caslib name.
*------------------------------------------------------------------------------------------*/
%macro _usr_getNameCaslib(_usr_LibrefUsingCasEngine);
    /*
       Looks up the caslib name behind a libref and returns it in the global macro
       variable _usr_nameCaslib. The variable is reset to blank first, so it stays blank
       when dictionary.libnames has no CASLIB entry for the libref.
         _usr_LibrefUsingCasEngine : libref to look up (matched in upper case).
    */
    %global _usr_nameCaslib;
    %let _usr_nameCaslib = ;

    proc sql noprint;
        select sysvalue
            into :_usr_nameCaslib trimmed
            from dictionary.libnames
            where upcase(sysname) = "CASLIB"
              and libname = upcase("&_usr_LibrefUsingCasEngine.");
    quit;
%mend _usr_getNameCaslib;
/*-----------------------------------------------------------------------------------------*
This macro generates additional codepieces based on a condition provided.
*------------------------------------------------------------------------------------------*/
%macro _gac_generate_additional_code(conditionVar, conditionOperator, conditionVal, desiredVar, desiredVal);
    /*
       Builds an optional parameter fragment and returns it in the global macro variable
       _gac_generated_string.
         conditionVar      : left operand of the condition.
         conditionOperator : comparison operator placed between the operands (for example =).
         conditionVal      : right operand of the condition.
         desiredVar        : parameter name to emit when the condition holds.
         desiredVal        : parameter value to emit when the condition holds.
       Result: desiredVar=desiredVal followed by a comma when the condition is true,
       otherwise blank.
       Enclosing quotes are removed from both operands before they are compared, so that
       APPROXIMATE and "APPROXIMATE" are treated as equal. Without this, the macro
       comparison sees the quotation marks as part of the text and never matches.
    */
    %global _gac_generated_string;
    %local _gac_lhs _gac_rhs;

    /* DEQUOTE needs a non-empty argument, so blank operands are left blank. */
    %if %length(&conditionVar.) %then %let _gac_lhs = %sysfunc(dequote(&conditionVar.));
    %if %length(&conditionVal.) %then %let _gac_rhs = %sysfunc(dequote(&conditionVal.));

    %put NOTE: Evaluating condition: &_gac_lhs. &conditionOperator. &_gac_rhs.;

    %if &_gac_lhs. &conditionOperator. &_gac_rhs. %then %do;
        %let _gac_generated_string = &desiredVar.=&desiredVal.,;
    %end;
    %else %do;
        %let _gac_generated_string = ;
    %end;
%mend _gac_generate_additional_code;
/*--------------------------------------------------------------------------------------*
Macro variable to hold the selected input columns to use as matching criteria.
*---------------------------------------------------------------------------------------*/
/* The resulting list is substituted into the inputs option of the fastknn call in the
   execution macro.
   NOTE(review): the macro _flw_get_column_list is not defined in this file and no
   inputColumns value is set in the parameter section; presumably both are supplied by the
   environment that hosts this step - confirm before running this file standalone. */
%let blankSeparatedCols = %_flw_get_column_list(_flw_prefix=inputColumns);
/*-----------------------------------------------------------------------------------------*
EXECUTION CODE MACRO
*------------------------------------------------------------------------------------------*/
%macro _fnn_main_execution_code;
    /*
       Main execution macro of the step. Takes no parameters; reads the step parameters
       from macro variables.
         1. Verifies that a CAS session is active.
         2. Resolves the base, query, output and output-distance librefs to caslib names
            and stops at the first libref that does not resolve.
         3. Calls the fastknn.fastknn action and prints its result.
       Errors are reported through _fnn_error_flag (set to 1) and ERROR lines in the log.
    */
    %local mTreesString maxPointsString _fnn_parallelization;

/*-----------------------------------------------------------------------------------------*
   Check for an active CAS session
*------------------------------------------------------------------------------------------*/
    %_env_cas_checkSession("_fnn_error_flag","_fnn_error_desc");
    %if &_fnn_error_flag. = 1 %then %do;
        %put ERROR: &_fnn_error_desc.;
    %end;
    %else %do;
/*-----------------------------------------------------------------------------------------*
   Check Input (base) table libref to ensure it points to a valid caslib.
*------------------------------------------------------------------------------------------*/
        %global baseCaslib;
        %_usr_getNameCaslib(&baseTable_lib.);
        %let baseCaslib=&_usr_nameCaslib.;
        %put NOTE: &baseCaslib. is the caslib for the base table.;
        %let _usr_nameCaslib=;
        %if "&baseCaslib." = "" %then %do;
            %put ERROR: Base table caslib is blank. Check if Base table is a valid CAS table. ;
            %let _fnn_error_flag=1;
        %end;
    %end;
/*-----------------------------------------------------------------------------------------*
   Check Input (query) table libref to ensure it points to a valid caslib.
*------------------------------------------------------------------------------------------*/
    %if &_fnn_error_flag. = 0 %then %do;
        %global queryCaslib;
        %_usr_getNameCaslib(&queryTable_lib.);
        %let queryCaslib=&_usr_nameCaslib.;
        %put NOTE: &queryCaslib. is the caslib for the query table.;
        %let _usr_nameCaslib=;
        %if "&queryCaslib." = "" %then %do;
            %put ERROR: Query table caslib is blank. Check if Query table is a valid CAS table. ;
            %let _fnn_error_flag=1;
        %end;
    %end;
/*-----------------------------------------------------------------------------------------*
   Check Output table libref to ensure it points to a valid caslib.
*------------------------------------------------------------------------------------------*/
    %if &_fnn_error_flag. = 0 %then %do;
        %global outputCaslib;
        %_usr_getNameCaslib(&outputTable_lib.);
        %let outputCaslib=&_usr_nameCaslib.;
        %put NOTE: &outputCaslib. is the output caslib.;
        %let _usr_nameCaslib=;
        %if "&outputCaslib." = "" %then %do;
            %put ERROR: Output table caslib is blank. Check if Output table is a valid CAS table. ;
            %let _fnn_error_flag=1;
        %end;
    %end;
/*-----------------------------------------------------------------------------------------*
   Check Output (distance) table libref to ensure it points to a valid caslib.
*------------------------------------------------------------------------------------------*/
    %if &_fnn_error_flag. = 0 %then %do;
        %global outputDistCaslib;
        %_usr_getNameCaslib(&outputDistTable_lib.);
        %let outputDistCaslib=&_usr_nameCaslib.;
        %put NOTE: &outputDistCaslib. is the output distance table caslib.;
        %let _usr_nameCaslib=;
        %if "&outputDistCaslib." = "" %then %do;
            %put ERROR: Output distance table caslib is blank. Check if Output distance table is a valid CAS table. ;
            %let _fnn_error_flag=1;
        %end;
    %end;
/*-----------------------------------------------------------------------------------------*
   Run CAS statements
*------------------------------------------------------------------------------------------*/
    %if &_fnn_error_flag. = 0 %then %do;
        /* Optional fragments: mTrees and maxPoints are emitted only for the APPROXIMATE
           method. The comparison value is passed without quotes because the macro
           comparison is textual, and the method is upper-cased so that case does not
           matter. */
        %_gac_generate_additional_code(%upcase(&searchMethod.),=,APPROXIMATE,mTrees,&mTrees.);
        %let mTreesString=&_gac_generated_string.;
        %let _gac_generated_string=;
        %_gac_generate_additional_code(%upcase(&searchMethod.),=,APPROXIMATE,maxPoints,&maxPoints.);
        %let maxPointsString=&_gac_generated_string.;
        %let _gac_generated_string=;

        /* The parameter section of this file names the setting parallelMethod. The earlier
           name parallelTable is still honoured when only that variable exists. */
        %if %symexist(parallelMethod) %then %let _fnn_parallelization=&parallelMethod.;
        %else %if %symexist(parallelTable) %then %let _fnn_parallelization=&parallelTable.;

        proc cas;
/*-----------------------------------------------------------------------------------------*
   Obtain inputs from UI.
*------------------------------------------------------------------------------------------*/
            baseTableName       = symget("baseTable_name_base");
            baseTableLib        = symget("baseCaslib");
            queryTableName      = symget("queryTable_name_base");
            queryTableLib       = symget("queryCaslib");
            outputTableName     = symget("outputTable_name_base");
            outputTableLib      = symget("outputCaslib");
            outputDistTableName = symget("outputDistTable_name_base");
            outputDistTableLib  = symget("outputDistCaslib");
            idCol               = symget("idCol");
            numMatches          = symget("numMatches");
            parallelMethod      = "&_fnn_parallelization.";
            thresholdDistance   = symget("thresholdDistance");
            searchMethod        = symget("searchMethod");
/*-----------------------------------------------------------------------------------------*
   Run Fast KNN action
   Note from the original author: the default parallelization setting (QUERY table) is
   being kept for now because PARALLELIZATION=INPUT has led to some session hangups.
   This is temporary and will be revisited.
*------------------------------------------------------------------------------------------*/
            fastknn.fastknn result=r /
                table = {name=baseTableName, caslib=baseTableLib},
                query = {name=queryTableName, caslib=queryTableLib},
                inputs = ${&blankSeparatedCols.},
                id = idCol,
                k = numMatches,
                method = searchMethod,
                parallelization = parallelMethod,
                &mTreesString.
                &maxPointsString.
                output = { casout= {name=outputTableName, caslib=outputTableLib, replace=True}},
                outDist = { name=outputDistTableName, caslib=outputDistTableLib, replace=True},
                threshDist = thresholdDistance
            ;
/*-----------------------------------------------------------------------------------------*
   Print summary results to output window;
*------------------------------------------------------------------------------------------*/
            print r;
        quit;
    %end;
%mend _fnn_main_execution_code;
/*-----------------------------------------------------------------------------------------*
END MACRO DEFINITIONS.
*------------------------------------------------------------------------------------------*/
/*-----------------------------------------------------------------------------------------*
EXECUTION CODE
The execution code is controlled by the trigger variable defined in this custom step. This
trigger variable is in an "enabled" (value of 1) state by default, but in some cases, as
dictated by logic, could be set to a "disabled" (value of 0) state.
*------------------------------------------------------------------------------------------*/
/* Trigger value 0: the step is disabled, so only a note is written to the log. */
%if &_fnn_run_trigger. = 0 %then %do;
    %put NOTE: This step has been disabled. Nothing to do.;
%end;
/* Trigger value 1: run the main execution macro. Any other value does nothing. */
%if &_fnn_run_trigger. = 1 %then %do;
    %_fnn_main_execution_code;
%end;
/*-----------------------------------------------------------------------------------------*
Clean up existing macro variables and macro definitions.
*------------------------------------------------------------------------------------------*/
/* Delete the global macro variables used by this step. NOWARN suppresses the warning for
   any variable that does not exist, which replaces the individual %symexist guards. */
%symdel _fnn_error_flag
        _fnn_error_desc
        _fnn_run_trigger
        _current_uuid_
        _usr_nameCaslib
        _gac_generated_string
        blankSeparatedCols
        baseCaslib
        queryCaslib
        outputCaslib
        outputDistCaslib
        / nowarn;

/* Delete every macro defined by this step. _fnn_checkSession is defined in this file as
   well and was previously left behind in the session. */
%sysmacdelete _fnn_checkSession / nowarn;
%sysmacdelete _env_cas_checkSession / nowarn;
%sysmacdelete _usr_getNameCaslib / nowarn;
%sysmacdelete _fnn_main_execution_code / nowarn;
%sysmacdelete _gac_generate_additional_code / nowarn;

/* End the CAS session named ss that was started at the top of this program. */
cas ss terminate;