# SparkR-0.01m.R
# Random forest classification on the 0.01m dataset using sparklyr.
library(sparklyr)
# Open the Spark connection using the "yarn-client" master.
sc <- spark_connect(master = "yarn-client")

# Azure blob storage (wasb) folder on the cluster that holds the CSV inputs.
azure_storage_path <- file.path(
  "wasb://bdaclusterstore@bdaclusterhdistorage1.blob.core.windows.net",
  "user/sshuser/csvfiles"
)

# Load the test and train tables (0.01m dataset) from Azure storage and
# register them in Spark under the names "test" and "train001m".
# NOTE(review): both reads point at the same folder, so the two tables
# presumably contain the same rows -- confirm whether each read should
# target its own file inside that folder.
test_data <- spark_read_csv(
  sc,
  name = "test",
  path = azure_storage_path,
  delimiter = ","
)
train_data <- spark_read_csv(
  sc,
  name = "train001m",
  path = azure_storage_path,
  delimiter = ","
)
# Model training, wrapped in system.time() to measure how long the fit takes.
# Random forest parameters:
## num_trees = 100
## max_bins  = 20
## max_depth = 30  (Spark's tree learners reject depths above 30, so the
##                  previously requested depth of 50 could not be trained)
## impurity  = gini
training_time <- system.time({
  model <- train_data %>%
    ml_random_forest(
      dep_delayed_15min ~ .,
      type = "classification",
      impurity = "gini",
      max_bins = 20,
      max_depth = 30,
      num_trees = 100
    )
})
# Print explicitly so the timing is shown even when the script is source()d.
print(training_time)
# Prediction with the test dataset.
# ml_predict() replaces the deprecated sdf_predict(); the result is stored
# as `predictions` so it does not shadow stats::predict().
predictions <- ml_predict(model, test_data)

# The metrics below compare against the `label` column rather than the raw
# `dep_delayed_15min` column, so they also work when the response is stored
# as text.
# NOTE(review): assumes the formula interface adds the indexed response as
# `label` (plus `prediction` and `rawPrediction`) to the scored table --
# confirm against the sparklyr version in use.

# Calculating the accuracy of the model.
accuracy <- ml_multiclass_classification_evaluator(
  predictions,
  label_col = "label",
  prediction_col = "prediction",
  metric_name = "accuracy"
)
print(accuracy)

# Calculating the AUC of the model.
# The ROC curve is built from the raw scores; feeding the hard 0/1
# `prediction` column would collapse the curve to a single threshold.
auc <- ml_binary_classification_evaluator(
  predictions,
  label_col = "label",
  raw_prediction_col = "rawPrediction",
  metric_name = "areaUnderROC"
)
print(auc)

# Calculating the MSE and RMSE of the model.
# The mean squared difference between `label` and `prediction` is computed
# inside Spark; SparkR-style transform()/`$` column access does not work on
# a sparklyr table.
MSE <- ml_regression_evaluator(
  predictions,
  label_col = "label",
  prediction_col = "prediction",
  metric_name = "mse"
)
RMSE <- sqrt(MSE)