## Set working directory
```{r}
setwd('.../Into to Data Science/hw1/')
```
## All package installations:
##############
```{r}
install.packages("dplyr")
install.packages("corrplot")
install.packages("class")
install.packages("rpart")
install.packages("pROC")
install.packages("DMwR")
install.packages("e1071")
install.packages("randomForest")
install.packages("fpc")
install.packages("cluster")
install.packages("caTools") # needed for sample.split below
library(dplyr)
library(corrplot)
library(class)
library(rpart)
library(pROC)
library(DMwR)
library(e1071)
library(randomForest)
library(cluster)
library(fpc)
library(caTools)
```
###########
# CARVANA #
###########
# PREPARATION
##############
## Load the data
```{r}
data.c <- read.csv('Carvana/CARVANA.csv', na.strings = c("", "NA"))
data.c <- data.c[,-1] # remove the first (index) column
# Remove rows with NA in Trim or Transmission - a negligible fraction of the data
data.c <- data.c[!(data.c$Trim %>% is.na() | data.c$Transmission %>% is.na()),]
```
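As a quick sanity check (added for illustration, not part of the assignment), we can confirm that no missing values remain after the filtering above:
```{r}
# count remaining NAs per column - should be all zeros after the filtering
colSums(is.na(data.c))
```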
## Take a sample of 30,000 rows.
```{r}
set.seed(0)
data.c<- data.c[sample.int(dim(data.c)[1], size=30000, replace = F),]
```
## Split the data into test (30%) and train (70%) sets with respect to the target variable
```{r}
train.index = sample.split(data.c$IsBadBuy, SplitRatio = 0.7)
train.c <-(subset(data.c, train.index == TRUE))
test.c <- (subset(data.c, train.index == FALSE))
```
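Since `sample.split` stratifies on the target, both sets should show (nearly) the same class balance; a quick check, added here for illustration:
```{r}
# the proportions of IsBadBuy should match across the two sets
prop.table(table(train.c$IsBadBuy))
prop.table(table(test.c$IsBadBuy))
```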
# FEATURE SELECTION AND CORRELATION
## Convert all factorial features to numeric
```{r}
factor.columns <- train.c[,sapply(train.c, is.factor)] %>% names
train.c[, factor.columns] <- sapply(train.c[,factor.columns], as.numeric)
test.c[, factor.columns] <- sapply(test.c[,factor.columns], as.numeric)
data.c[,factor.columns] <- sapply(data.c[,factor.columns], as.numeric)
train.c$IsBadBuy <- as.factor(train.c$IsBadBuy)
test.c$IsBadBuy <- as.factor(test.c$IsBadBuy)
```
## Display the correlation plot of the features
```{r}
# Normalize it first
data.c[,-1] <- scale(data.c[,-1], center = T, scale = T)
train.c[,-1] <- scale(train.c[,-1], center = T, scale = T)
test.c[,-1] <- scale(test.c[,-1], center = T, scale = T)
par(mfrow = c(1,1))
corr_matrix = cor(data.c[,-1])
corrplot(corr_matrix, method="color", tl.pos = "td", type="upper", tl.cex = 0.6)
```
## Features that have a correlation of over 0.65
```{r}
High.corr.columns <- which(corr_matrix > 0.65 & corr_matrix < 1, arr.ind = TRUE) %>% row.names()
cat("The correlated features are:", High.corr.columns)
```
## Save new data frames without the highly correlated features, naming them with the suffix .noHighCor
```{r}
data.c.noHighCor <- data.c[,!colnames(data.c) %in% High.corr.columns]
train.c.noHighCor <- train.c[,!colnames(train.c) %in% High.corr.columns]
test.c.noHighCor <- test.c[,!colnames(test.c) %in% High.corr.columns]
```
# KNN
## With the new train and test data frames - predict the test.c outcomes using knn, with k=1.
```{r}
train.c.noHighCor.label <- train.c.noHighCor$IsBadBuy
train.c.noHighCor.features <- train.c.noHighCor[,-1]
test.c.noHighCor.features <- test.c.noHighCor[,-1]
test.c.noHighCor.label <- test.c.noHighCor$IsBadBuy
# run KNN with k=1
knn.model <- knn(train.c.noHighCor.features, test.c.noHighCor.features, train.c.noHighCor.label, k=1)
get_accuracy <- function(preds, labels){
  tbl <- table(preds, labels)
  acc <- sum(diag(tbl)) / sum(tbl)
  return(acc)
}
```
## Display the confusion matrix:
```{r}
table(knn.model, test.c.noHighCor.label)
get_accuracy(knn.model, test.c.noHighCor.label)
```
## Using cross-validation, train a knn model
```{r}
knn3.model <- knn.cv(train.c.noHighCor.features, train.c.noHighCor.label, k=3)
acc.k3 <- get_accuracy(knn3.model, train.c.noHighCor.label)
knn5.model <- knn.cv(train.c.noHighCor.features, train.c.noHighCor.label, k=5)
acc.k5 <- get_accuracy(knn5.model, train.c.noHighCor.label)
knn10.model <- knn.cv(train.c.noHighCor.features, train.c.noHighCor.label, k=10)
acc.k10 <- get_accuracy(knn10.model, train.c.noHighCor.label)
which.max(c(acc.k3, acc.k5, acc.k10))
# The third option wins, i.e. the best model is the one with k=10
```
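The same selection step generalizes to an arbitrary grid of k values. A compact sketch, equivalent to the three calls above (the candidate list `ks` is an arbitrary choice):
```{r}
# leave-one-out CV accuracy over a grid of k values; pick the best k
ks <- c(3, 5, 10, 15, 20) # illustrative grid
cv.acc <- sapply(ks, function(k) {
  preds <- knn.cv(train.c.noHighCor.features, train.c.noHighCor.label, k = k)
  get_accuracy(preds, train.c.noHighCor.label)
})
ks[which.max(cv.acc)]
```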
## Predict the test data's labels
```{r}
knn10.model <- knn(train.c.noHighCor.features, test.c.noHighCor.features, train.c.noHighCor.label, k=10, prob = TRUE)
table(knn10.model, test.c.noHighCor.label)
get_accuracy(knn10.model, test.c.noHighCor.label)
```
# ROC
## Display the ROC of the model you trained
```{r}
# attr(, "prob") holds the vote share of the *winning* class; convert it into
# the score for the positive class ("1") before computing the ROC
knn10.prob <- attr(knn10.model, "prob")
knn10.score <- ifelse(knn10.model == "1", knn10.prob, 1 - knn10.prob)
ROC <- roc(test.c.noHighCor.label, knn10.score)
plot(ROC, col = "blue", main = "ROC curve for KNN model with k=10")
```
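pROC can also summarize the curve in a single number; calling `auc()` on the fitted `roc` object:
```{r}
# area under the ROC curve plotted above
auc(ROC)
```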
# PCA
## Use train.c to find its principal components
```{r}
# our data is already centered and scaled
pc.model <- prcomp(train.c[,-1])
```
## Plot the drop in the variance explained by the PC's
```{r}
pcs.var <- pc.model$sdev^2
names(pcs.var) <- 1:length(pcs.var)
barplot(pcs.var, main="Variance explained as a function of PCs", xlab="PCs")
```
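A common way to decide how many PCs to keep is the cumulative proportion of variance explained; a short sketch (the 90% cutoff is an arbitrary example):
```{r}
# cumulative proportion of variance explained by the first m components
cum.var <- cumsum(pcs.var) / sum(pcs.var)
plot(cum.var, type = "b", xlab = "Number of PCs",
     ylab = "Cumulative proportion of variance explained")
which(cum.var >= 0.9)[1] # smallest number of PCs covering 90% of the variance
```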
## Using the PC's you created above, create two new data frames named train.c.pca and test.c.pca in which the features are replaced by PCs
```{r}
train.c.pca <- pc.model$x
pcs <- pc.model$rotation
test.c.pca <- as.matrix(test.c[,-1]) %*% pcs # project the test data onto the training PCs
```
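For reference, `predict()` on a `prcomp` object performs essentially the same rotation, additionally applying the centering stored in the model:
```{r}
# built-in alternative to the manual projection above
test.c.pca.alt <- predict(pc.model, newdata = test.c[,-1])
```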
## Using only the first 3 PC's - fit a simple knn model (like the first one we did) with k=7
```{r}
train.c.label <- train.c$IsBadBuy
test.c.label <- test.c$IsBadBuy
knn.pc.model <- knn(train = train.c.pca[,1:3], test = test.c.pca[,1:3], cl = train.c.label, k=7)
```
## Show the confusion matrix
```{r}
table( knn.pc.model, test.c.label)
get_accuracy(preds = knn.pc.model, test.c.label)
```
############
# DIABETES #
############
# PREPARATIONS
##############
## Load the data
```{r}
data.d <- read.csv('Diabetes/diabetes.csv', header = TRUE)
# Transform all 0 values in this feature to NA.
data.d[data.d$SkinThickness==0, 'SkinThickness'] <- NA
# Impute the missing values for the feature SkinThickness using the mean
data.d[is.na(data.d$SkinThickness), 'SkinThickness'] <- mean(data.d$SkinThickness, na.rm = TRUE)
```
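A quick check (illustrative) that the imputation covered every missing value:
```{r}
# no NAs should remain in SkinThickness after the mean imputation
stopifnot(!anyNA(data.d$SkinThickness))
summary(data.d$SkinThickness)
```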
# LOF
## Plot the density of the LOF scores using all features
```{r}
data.d.label <- data.d$Outcome
# Let's scale the data first
data.d.scaled.features <- as.data.frame(scale(data.d[,!colnames(data.d) == 'Outcome'], center = T, scale = T))
lof.model <- lofactor(data.d.scaled.features, k=5)
plot(density(lof.model), main = "LOF density plot", xlab= "lof values")
```
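To make the cutoff chosen in the next step visible, the same density can be redrawn with the threshold marked (a purely illustrative replot; the 1.5 value anticipates the next chunk):
```{r}
plot(density(lof.model), main = "LOF density plot", xlab = "lof values")
abline(v = 1.5, col = "red", lty = 2) # cutoff applied in the next chunk
```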
## Based on the plot above - remove outliers above a certain LOF score threshold
```{r}
sum(lof.model < 1.5) / length(lof.model)
# A cutoff of 1.5 keeps about 95.9% of the data; the density plot above already thins out past ~1.3
data.d.scaled.features <- data.d.scaled.features[lof.model < 1.5,]
data.d.label <- data.d.label[lof.model < 1.5]
```
# SVM
## Split the data into test (30%) and train (70%) sets with respect to the target variable
```{r}
set.seed(0)
train.index = sample.split(data.d$Outcome, SplitRatio = 0.7)
train.d <-(subset(data.d, train.index == TRUE))
test.d <- (subset(data.d, train.index == FALSE))
train.d.features <- scale(train.d[,!colnames(train.d) == 'Outcome'], center = T, scale = T)
test.d.features <- scale(test.d[,!colnames(test.d)== 'Outcome'], center = T, scale = T)
train.d$Outcome <- as.factor(train.d$Outcome)
test.d$Outcome <- as.factor(test.d$Outcome)
```
## Create an SVM model with as many features as possible
```{r}
model.features <- c("Pregnancies", "Glucose" , "BloodPressure" ,"Insulin" , "BMI" ,"DiabetesPedigreeFunction")
svm.model <- svm(x = train.d.features[,model.features],y = train.d$Outcome, cost=1, gamma=1, type = 'C-classification', kernel ="radial")
res <- predict(svm.model, test.d.features[,model.features])
```
## Compute the error rate
```{r}
1-get_accuracy(res, test.d$Outcome)
# Error is 29.1% on test set
```
## Tune the SVM model using no more than 5 different costs, 5 different gammas and 5 CV
```{r}
svm_tune <- tune(svm, train.x = train.d.features[,model.features], train.y = train.d$Outcome,
                 kernel = "radial", ranges = list(cost = 10^(-2:2), gamma = c(.01, .1, .5, 2, 10)),
                 tunecontrol = tune.control(cross = 5)) # 5-fold CV, as the task requires
print(svm_tune)
# Best parameters are: cost = 1, gamma = 0.01
svm_tune$best.performance
# We achieved a 21.7% CV error rate, which is less than 23% :)
```
## Display the best model found (its parameters) and use it to predict the test values
```{r}
svm_tune$best.parameters
# The kernel we chose is radial basis
svm.model2 <- svm(x = train.d.features[,model.features],y = train.d$Outcome, cost=svm_tune$best.parameters["cost"][[1]], gamma = svm_tune$best.parameters["gamma"][[1]])
res2 <- predict(svm.model2, test.d.features[,model.features])
## Show if it improved by computing the new error rate
1-get_accuracy(res2, test.d$Outcome)
```
# RANDOM FOREST
## Create a random forest model with as many features as possible (but choose with logic and looking at the data)
```{r}
# Let's take a look at the feature coefficients from a simple logistic regression
lr.model <- glm(Outcome ~ ., data = train.d, family = "binomial")
lr.model$coefficients
# We will keep all features with abs(beta) above a threshold
features.model <- names(which(abs(lr.model$coefficients) > 0.01))[-1]
features.model
rf.model <- randomForest(x = train.d.features[,features.model], y = train.d$Outcome, importance = TRUE, ntree = 2000)
# Use the model to predict the test outcome (restricting the test set to the training features)
resForest <- predict(rf.model, test.d.features[,features.model])
# Display the error rate
1-get_accuracy(preds = resForest, labels = test.d$Outcome)
# Random forest outperformed the tuned SVM model :)
```
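`randomForest` also tracks the out-of-bag error as trees are added; plotting the fitted object (a built-in diagnostic) shows whether 2000 trees were actually needed:
```{r}
# OOB error (and per-class error) as a function of the number of trees
plot(rf.model, main = "Random forest: OOB error vs. number of trees")
```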
## Feature importance
```{r}
# Rank features by mean decrease in Gini; keep the names aligned with the sorted values
fi <- sort(rf.model$importance[, "MeanDecreaseGini"], decreasing = TRUE)
barplot(fi, names.arg = names(fi), cex.names = 0.8,
        main = "Feature importance (mean decrease in Gini)")
```
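For comparison, the package's own `varImpPlot()` shows the same ranking, with both the accuracy-based and the Gini-based measures:
```{r}
# built-in importance plot: mean decrease in accuracy and in Gini
varImpPlot(rf.model, main = "randomForest variable importance")
```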
###########
# MOVIES #
###########
# KMEANS
## Load the data
```{r}
data.m <- read.csv('Movies/movies.csv', header = TRUE)
```
## Run kmeans using 6 centers.
```{r}
data.m.scaled <- scale(data.m[,c("rating", "year", "votes", "length")], center = TRUE, scale = TRUE)
data.m.scaled <- data.m.scaled %>% as.data.frame()
kmean.model <- kmeans(data.m.scaled, 6, nstart = 20)
# Plot the clusters:
plot(data.m.scaled[c('rating','year')], col = kmean.model$cluster, main = 'Kmeans using 6 centroids')
points(kmean.model$centers[,c('rating','year')], col = 1:6, pch = 20, cex = 3)
# It looks like the data could be represented by fewer centroids, since the clusters overlap in this projection
```
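To test the hunch that fewer centroids would suffice, a standard check is an elbow plot of the total within-cluster sum of squares over a range of k (the range 1..10 is an arbitrary choice):
```{r}
# elbow method: total within-cluster SS for k = 1..10
wss <- sapply(1:10, function(k) kmeans(data.m.scaled, k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")
```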