Merge pull request #1251 from BlazingDB/branch-0.17
Branch 0.17
William Malpica authored Dec 10, 2020
2 parents 31c31d7 + 98b4893 commit e139eca
Showing 253 changed files with 15,553 additions and 8,403 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -24,6 +24,7 @@ __pycache__/
# Log file
*.log
*.xlsx
rmm_log.txt

# Tmp files
*.swp
90 changes: 86 additions & 4 deletions CHANGELOG.md
@@ -1,3 +1,84 @@
# BlazingSQL 0.17.0 (December 10, 2020)


## New Features
- #1105 Implement to_date/to_timestamp functions
- #1077 Allow to create tables from compressed files
- #1126 Add DAYOFWEEK function
- #981 Added PowerPC building script and instructions
- #912 Added UCX support to how the engine runs
- #1125 Implement new TCP and UCX comms layer, exposed graph to python
- #1122 Add ConfigOptionsTest, a test with different config_options values
- #1110 Adding local logging directory to BlazingContext
- #1148 Add e2e test for DAYOFWEEK
- #1130 Infer hive folder partition
- #1188 Implement upper/lower operators
- #1193 Implement string REPLACE
- #1218 Added smiles test set
- #1201 Implement string TRIM
- #1216 Add unit test for DAYOFWEEK
- #1205 Implement string REVERSE
- #1220 Implement string LEFT and RIGHT
- #1250 Updated README.md, CHANGELOG, and other files preparing for the 0.17 release


## Improvements
- #878 Adding calcite rule for window functions. (Window functions not supported yet)
- #1081 Add validation for the kwargs when bc API is called
- #1082 Validate s3 bucket
- #1093 Logs configurable to have max size and be rotated
- #1091 Improves the error message when validating any GCP bucket
- #1102 Add option to read csv files in chunks
- #1090 Add tests for Uri Data provider for local uri
- #1119 Add tests for transform json tree and get json plan
- #1117 Add error logging in DataSourceSequence
- #1111 output compile json for cppcheck
- #1132 Refactoring new comms
- #1078 Bump junit from 4.12 to 4.13.1 in /algebra
- #1144 update with changes from main
- #1156 Added scheduler file support for e2e testing framework
- #1158 Deprecated bc.partition
- #1154 Recompute the avg_bytes_per_row value
- #1155 Removing comms subproject and cleaning some related code
- #1170 Improve gpuCI scripts
- #1194 Powerpc building scripts
- #1186 Removing cuda labels from install due to cudatoolkit version
- #1187 Enable MySQL-specific SQL operators in addition to Standard and Oracle
- #1206 Improved contribution documentation
- #1224 Added cudaSetDevice to thread initialization so that the cuda context is available to UCX
- #1229 Change hardcoded version from setup.py
- #1231 Adding docker support for gpuCI scripts
- #1248 Jenkins and Docker scripts were improved for building


## Bug Fixes
- #1064 Fixed issue when loading parquet files with local_files=True
- #1086 Showing an appropriate error to indicate that we don't support opening directories with wildcards
- #1088 Fixed issue caused by cudf changing from one .so file to multiple
- #1094 Fixed logging directory setup
- #1100 Showing an appropriate error for invalid or unsupported expressions on the logical plan
- #1115 Fixed changes to RMM api using cuda_stream_view instead of cudaStream_t now
- #1120 Fix missing valid kwargs in create_table
- #1118 Fixed issue with config_options and adding local_files to valid params
- #1133 Fixed addressing issue in float columns when parsing parquet metadata
- #1163 added empty line to trigger build
- #1108 Remove temp files when an error occurs
- #1165 E2e tests, distributed mode, again tcp
- #1171 Don't log timeout in output/input caches
- #1168 Fix SSL errors for conda
- #1164 MergeAggr when single node has multiple batches
- #1191 Fix graph thread pool hang when exception is thrown
- #1181 Remove unnecessary prints (cluster and logging info)
- #1185 Create table in distributed mode crash with an InferFolderPartitionMetadata error
- #1179 Fix ignoring headers when multiple CSV files were provided
- #1199 Fix non thread-safe access to map containing tag to message_metadata for ucx
- #1196 Fix column_names (table) always as list of string
- #1203 Changed code back so that parquet is not read a single rowgroup at a time
- #1207 Calcite uses literal as int32 if no explicit CAST was provided
- #1212 Fixed issue when building third-party dependencies, cmake version set to 3.18.4
- #1225 Fixed issue due to change in gather API


# BlazingSQL 0.16.0 (October 22, 2020)

## Improvements
@@ -10,6 +91,7 @@
- #1065 Remove thrift from build process as it's no longer used
- #1067 Upload conda packages to both rapidsai and blazingsql conda channels


## Bug Fixes
- #918 Activate validation for GPU_CI tests results.
- #975 Fixed issue due to cudf orc api change
@@ -32,11 +114,11 @@
- #1007 Fix arrow and spdlog compilation issues
- #1068 Adds important docs links and avoids the message about filesystem authority not found
- #1073 Fixed parseSchemaPython so it can throw exceptions
- #1074 Remove lock inside grow() method from PinnedBufferProvider
- #1071 Fix crash when loading an empty folder
- #1085 Fixed intra-query memory leak in joins. Fixed by clearing array caches after PartwiseJoin is done
- #1096 Backport from branch-0.17 with these PRs: #1094, #1086, #1093 and #1091
- #1099 Fixed issue with config_options


# BlazingSQL 0.15.0 (August 31, 2020)
@@ -167,4 +249,4 @@
- #785 Add script for Manual Testing Artifacts.
- #931 Add script for error messages validation.
- #932 Import pydrill and pyspark only when its generator or full mode.

- #1031 adding notebooks into BlazingSQL Tests
37 changes: 32 additions & 5 deletions CONTRIBUTING.md
@@ -20,15 +20,42 @@ Contributions to blazingSQL fall into the following three categories.
## Code contributions

1. Follow the guide in our documentation for [Building From Source](https://github.com/BlazingDB/blazingsql#buildinstall-from-source-conda-environment).
2. Comment on the issue stating that you are going to work on it.
3. Code! Make sure to update unit tests!
4. When done, [create your pull request](https://github.com/blazingdb/blazingsql/compare).
5. Wait for other developers to review your code and update code as needed.
6. Once reviewed and approved, a BlazingSQL developer will merge your pull request.
2. Find an issue to work on (one that has not already been assigned to someone). The best way is to look for the `good first issue` or `help wanted` labels.
3. Comment on the issue stating that you are going to work on it and assign it to yourself.
4. When you start working on it, please place the issue on the WIP column in the project board.
5. All work should be done on your own fork and on a new branch on your fork.
6. Code! Make sure to update unit tests!
7. If applicable (i.e. when adding a new SQL function), add new [End-To-End tests](#adding-end-to-end-tests).
8. When done, [create your pull request](https://github.com/blazingdb/blazingsql/compare).
9. Verify that CI passes all [status checks](https://help.github.com/articles/about-status-checks/). Fix if needed.
10. When all the work is done, please place the issue in the _Needs Review_ column of the project board. Wait for other developers to review your code and update code as needed.
11. Once reviewed and approved, a BlazingSQL developer will merge your pull request.
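The fork-and-branch part of the workflow (steps 5-6) can be sketched as below. This is a minimal illustration using a local scratch repository in place of a real GitHub fork; the branch name and author identity are placeholders.

```shell
# Sketch of the fork-and-branch workflow; all names are illustrative.
WORKDIR=$(mktemp -d)
cd "$WORKDIR"
git init -q myfork && cd myfork          # stand-in for cloning your fork
git config user.email "dev@example.com"  # placeholder identity
git config user.name "Example Dev"
git commit -q --allow-empty -m "baseline"
# Do all work on a new branch of your fork:
git checkout -q -b fix/my-issue          # hypothetical branch name
git branch --show-current                # prints: fix/my-issue
```

When the work on the branch is done, you would push it to your fork and open the pull request from there.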

Remember, if you are unsure about anything, don't hesitate to comment on issues
and ask for clarifications!

## Adding End to End Tests

Dependencies and instructions for how to run the End to End testing framework can be found [here](tests/README.md).

To add new End-to-End test queries, please do the following:
- If the new query is related to an existing script, you can add it **at the end** of the existing script:
*$CONDA_PREFIX/blazingsql/test/BlazingSQLTest/EndToEndTests/existingScript.py*
- If the new query is not part of an existing test suite, create a new script and add the new queries there:
*$CONDA_PREFIX/blazingsql/test/BlazingSQLTest/EndToEndTests/newScript.py*
- If you added a new script, you also have to add it in `allE2ETest.py`
- After you add the new queries, you will want to run the testing framework in generator mode, so you will need to set the following environment variable:
`export BLAZINGSQL_E2E_EXEC_MODE="generator"`
- You will also need to have an instance of Apache Drill running. You need to install apache-drill and run it like so:
`apache-drill-1.17.0/bin/drill-embedded`
- Go to `$CONDA_PREFIX/blazingsql` and execute:
`./test.sh e2e tests=<new_suiteTest>`
- New query validation files will be generated in the repo located at:
`$CONDA_PREFIX/blazingsql-testing-files/`
- Please create a PR in the `blazingsql-testing-files` repo and reference the PR in the BlazingSQL repo that the new end to end tests correspond to.
- Once the `blazingsql-testing-files` PR is merged, then you can run the GPU_CI tests in the `blazingsql` PR.
- Make sure that all the GPU_CI tests are passing.
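The generator-mode setup described above can be condensed into a short sketch. The suite name is a placeholder for your new script, and the Drill and `test.sh` commands are only echoed here rather than executed:

```shell
# Sketch of preparing a generator-mode E2E run; suite name and
# Drill path are placeholders, commands are echoed, not executed.
export BLAZINGSQL_E2E_EXEC_MODE="generator"  # generator mode writes new validation files
SUITE="newSuiteTest"                         # hypothetical new test suite
DRILL_CMD="apache-drill-1.17.0/bin/drill-embedded"
E2E_CMD="./test.sh e2e tests=${SUITE}"
echo "1) start Drill:   $DRILL_CMD"
echo "2) run the suite: $E2E_CMD"
```

After this run, the generated validation files under `$CONDA_PREFIX/blazingsql-testing-files/` are what you submit in the `blazingsql-testing-files` PR.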


## Attribution
Portions adopted from https://github.com/rapidsai/cudf/CONTRIBUTING.md
31 changes: 17 additions & 14 deletions README.md
@@ -1,6 +1,6 @@
> A lightweight, GPU accelerated, SQL engine built on the [RAPIDS.ai](https://rapids.ai) ecosystem.
<a href='https://app.blazingsql.com/jupyter/user-redirect/lab/workspaces/auto-b/tree/Welcome_to_BlazingSQL_Notebooks/welcome.ipynb'><img src="https://github.com/BlazingDB/blazingsql/blob/roaramburu-readme-update/img/bsql_rapids.PNG"/></a>
<a href='https://app.blazingsql.com/jupyter/user-redirect/lab/workspaces/auto-b/tree/Welcome_to_BlazingSQL_Notebooks/welcome.ipynb'>Get Started on app.blazingsql.com</a>

[Getting Started](#getting-started) | [Documentation](https://docs.blazingdb.com) | [Examples](#examples) | [Contributing](#contributing) | [License](LICENSE) | [Blog](https://blog.blazingdb.com) | [Try Now](https://app.blazingsql.com/jupyter/user-redirect/lab/workspaces/auto-b/tree/Welcome_to_BlazingSQL_Notebooks/welcome.ipynb)

@@ -90,22 +90,22 @@ BlazingSQL can be installed with conda ([miniconda](https://conda.io/miniconda.h

## Stable Version
```bash
conda install -c blazingsql/label/cuda$CUDA_VERSION -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
conda install -c blazingsql/label/cuda$CUDA_VERSION -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION cudatoolkit=$CUDA_VERSION
```
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda install -c blazingsql/label/cuda10.1 -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=3.7
conda install -c blazingsql -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=3.7 cudatoolkit=10.1
```

## Nightly Version
```bash
conda install -c blazingsql-nightly/label/cuda$CUDA_VERSION -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION
conda install -c blazingsql-nightly -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=$PYTHON_VERSION cudatoolkit=$CUDA_VERSION
```
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda install -c blazingsql-nightly/label/cuda10.1 -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=3.7
conda install -c blazingsql-nightly -c rapidsai-nightly -c nvidia -c conda-forge -c defaults blazingsql python=3.7 cudatoolkit=10.1
```
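As a concrete check of the placeholder substitution, the CUDA 10.1 / Python 3.7 stable command above is just the template with the two variables expanded. A minimal sketch (the command is echoed rather than run, since the actual install requires a conda environment):

```shell
# Sketch: expand $CUDA_VERSION / $PYTHON_VERSION into the concrete
# stable-channel install command (echoed, not executed).
CUDA_VERSION=10.1
PYTHON_VERSION=3.7
CMD="conda install -c blazingsql -c rapidsai -c nvidia -c conda-forge -c defaults blazingsql python=${PYTHON_VERSION} cudatoolkit=${CUDA_VERSION}"
echo "$CMD"
```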

# Build/Install from Source (Conda Environment)
Expand All @@ -117,16 +117,18 @@ This is the recommended way of building all of the BlazingSQL components and dep
```bash
conda create -n bsql python=$PYTHON_VERSION
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=$CUDA_VERSION
conda install --yes -c conda-forge spdlog=1.7.0 google-cloud-cpp=1.16 ninja
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults dask-cuda=0.17 dask-cudf=0.17 cudf=0.17 ucx-py=0.17 ucx-proc=*=gpu python=$PYTHON_VERSION cudatoolkit=$CUDA_VERSION
conda install --yes -c conda-forge cmake=3.18 gtest gmock cppzmq cython=0.29 openjdk=8.0 maven jpype1 netifaces pyhive
```
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda create -n bsql python=3.7
conda activate bsql
conda install --yes -c conda-forge openjdk=8.0 maven cmake gtest gmock rapidjson cppzmq cython=0.29 jpype1 netifaces pyhive
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults cudf=0.15 dask-cudf=0.15 dask-cuda=0.15 cudatoolkit=10.1
conda install --yes -c conda-forge spdlog=1.7.0 google-cloud-cpp=1.16 ninja
conda install --yes -c rapidsai -c nvidia -c conda-forge -c defaults dask-cuda=0.17 dask-cudf=0.17 cudf=0.17 ucx-py=0.17 ucx-proc=*=gpu python=3.7 cudatoolkit=10.1
conda install --yes -c conda-forge cmake=3.18 gtest gmock cppzmq cython=0.29 openjdk=8.0 maven jpype1 netifaces pyhive
```

### Build
@@ -150,19 +152,18 @@ $CONDA_PREFIX now has a folder for the blazingsql repository.
```bash
conda create -n bsql python=$PYTHON_VERSION
conda activate bsql

conda install --yes -c conda-forge spdlog=1.7.0 google-cloud-cpp=1.16 ninja
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.16 dask-cudf=0.16 cudf=0.16 python=3.7 cudatoolkit=$CUDA_VERSION
conda install --yes -c conda-forge cmake gtest gmock cppzmq cython=0.29 openjdk=8.0 maven jpype1 netifaces pyhive
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.18 dask-cudf=0.18 cudf=0.18 ucx-py=0.18 ucx-proc=*=gpu python=$PYTHON_VERSION cudatoolkit=$CUDA_VERSION
conda install --yes -c conda-forge cmake=3.18 gtest gmock cppzmq cython=0.29 openjdk=8.0 maven jpype1 netifaces pyhive
```
Where $CUDA_VERSION is 10.1, 10.2 or 11.0 and $PYTHON_VERSION is 3.7 or 3.8
*For example for CUDA 10.1 and Python 3.7:*
```bash
conda create -n bsql python=3.7
conda activate bsql
conda install --yes -c conda-forge spdlog=1.7.0 google-cloud-cpp=1.16 ninja
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.16 dask-cudf=0.16 cudf=0.16 python=3.7 cudatoolkit=10.1
conda install --yes -c conda-forge cmake gtest gmock cppzmq cython=0.29 openjdk=8.0 maven jpype1 netifaces pyhive
conda install --yes -c rapidsai-nightly -c nvidia -c conda-forge -c defaults dask-cuda=0.18 dask-cudf=0.18 cudf=0.18 ucx-py=0.18 ucx-proc=*=gpu python=3.7 cudatoolkit=10.1
conda install --yes -c conda-forge cmake=3.18 gtest gmock cppzmq cython=0.29 openjdk=8.0 maven jpype1 netifaces pyhive
```

### Build
@@ -177,6 +178,8 @@ export CUDACXX=/usr/local/cuda/bin/nvcc
```
NOTE: You can do `./build.sh -h` to see more build options.

NOTE: You can perform static analysis with cppcheck with the command `cppcheck --project=compile_commands.json` in any of the cpp project build directories.

$CONDA_PREFIX now has a folder for the blazingsql repository.

#### Storage plugins
@@ -29,13 +29,14 @@
import org.apache.calcite.rel.rules.ProjectRemoveRule;
import org.apache.calcite.rel.rules.AggregateReduceFunctionsRule;
import org.apache.calcite.rel.rules.ReduceExpressionsRule;
import org.apache.calcite.rel.rules.ProjectToWindowRule;
import org.apache.calcite.rex.RexExecutorImpl;
import org.apache.calcite.rel.type.RelDataTypeSystem;
import org.apache.calcite.schema.SchemaPlus;
import org.apache.calcite.sql.SqlNode;
import org.apache.calcite.sql.SqlOperatorTable;
import org.apache.calcite.sql.fun.OracleSqlOperatorTable;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.calcite.sql.fun.SqlLibrary;
import org.apache.calcite.sql.fun.SqlLibraryOperatorTableFactory;
import org.apache.calcite.sql.parser.SqlParseException;
import org.apache.calcite.sql.parser.SqlParser;
import org.apache.calcite.sql.util.ChainedSqlOperatorTable;
@@ -54,6 +55,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.EnumSet;

/**
* <h1>Generate Relational Algebra</h1>
@@ -114,8 +116,8 @@ public RelationalAlgebraGenerator(BlazingSchema newSchema) {
Properties props = new Properties();
props.setProperty("defaultSchema", newSchema.getName());
List<SqlOperatorTable> sqlOperatorTables = new ArrayList<>();
sqlOperatorTables.add(SqlStdOperatorTable.instance());
sqlOperatorTables.add(OracleSqlOperatorTable.instance());
sqlOperatorTables.add(SqlLibraryOperatorTableFactory.INSTANCE.getOperatorTable(
EnumSet.of(SqlLibrary.STANDARD, SqlLibrary.ORACLE, SqlLibrary.MYSQL)));
sqlOperatorTables.add(new CalciteCatalogReader(CalciteSchema.from(schema.getSubSchema(newSchema.getName())),
defaultSchema,
new JavaTypeFactoryImpl(RelDataTypeSystem.DEFAULT),
@@ -180,6 +182,7 @@ public RelationalAlgebraGenerator(FrameworkConfig frameworkConfig, HepProgram he
getOptimizedRelationalAlgebra(RelNode nonOptimizedPlan) throws RelConversionException {
if(rules == null) {
program = new HepProgramBuilder()
.addRuleInstance(ProjectToWindowRule.PROJECT)
.addRuleInstance(AggregateExpandDistinctAggregatesRule.JOIN)
.addRuleInstance(FilterAggregateTransposeRule.INSTANCE)
.addRuleInstance(FilterJoinRule.JoinConditionPushRule.FILTER_ON_JOIN)
2 changes: 1 addition & 1 deletion algebra/pom.xml
100755 → 100644
@@ -24,7 +24,7 @@
<project.reporting.outputEncoding>${sourceEncoding}</project.reporting.outputEncoding>
<java.version>1.8</java.version>
<org.slf4j-version>1.7.29</org.slf4j-version>
<junit.version>4.12</junit.version>
<junit.version>4.13.1</junit.version>
<org.testng.version>6.14.3</org.testng.version>
</properties>

