hw1_preprocessing_EDA.Rmd



## Set working directory
```{r}
# Directory that holds movies.csv; the relative paths used below resolve
# against it.
project_dir <- '.../Into to Data Science/hw1/'
if (isTRUE(getOption("knitr.in.progress"))) {
  # While knitting, knitr restores the working directory at the end of every
  # chunk, so a setwd() here would not reach the chunk that reads the data.
  # root.dir is the knitr setting that applies to the chunks that follow.
  knitr::opts_knit$set(root.dir = project_dir)
} else {
  # Interactive run (console / chunk by chunk): change directory as before.
  setwd(project_dir)
}
```


## Loading the data
```{r}
# Read the movies table; empty strings and the literal "NA" both become NA.
data <- read.csv(file = 'movies.csv', na.strings = c("", "NA"))
```

# Preprocessing

## Drop the irrelevant columns: X and r1..r10
```{r}
# Columns to discard: 'X' plus whatever sits at positions 8 to 17
# (the r1..r10 columns, per the section heading).
drop_cols <- c('X', colnames(data)[8:17])
keep_col <- !(colnames(data) %in% drop_cols)
data <- data[, keep_col]

# Show the class of every remaining column and the first rows.
sapply(X = data, FUN = class)
head(data)

```


## Filter out movies with fewer than 100 votes
```{r}
# Keep only the movies that received at least 100 votes.
has_enough_votes <- data$votes >= 100
data <- data[has_enough_votes, ]
```

## Convert genre columns to factors
```{r}
# Positions of the genre columns at this point of the pipeline.
genre_cols <- 8:14
data[, genre_cols] <- lapply(data[, genre_cols], as.factor)
```

## Describe the data
```{r}
# Descriptive statistics for the numeric features.
df_stat <- data[, c("length", "budget", "rating", "votes", "year")]

# Apply `stat` to every column of df_stat and return one number per column.
# Missing values are always ignored: previously only the quantiles used
# na.rm, so every other statistic was NA for any column containing an NA.
col_stat <- function(stat, ...) {
  vapply(df_stat, stat, numeric(1), ..., na.rm = TRUE)
}

# Per-column quantile at probability `p`.
col_quantile <- function(p) {
  col_stat(quantile, probs = p, names = FALSE)
}

# One row per feature, one column per statistic.
tmp <- data.frame(
  mean = col_stat(mean),
  sd = col_stat(sd),
  variance = col_stat(var),
  min = col_stat(min),
  max = col_stat(max),
  median = col_stat(median),
  range = col_stat(max) - col_stat(min),
  quantile.20 = col_quantile(0.2),
  quantile.40 = col_quantile(0.4),
  quantile.60 = col_quantile(0.6),
  quantile.80 = col_quantile(0.8)
)

# Transpose for display: statistics as rows, features as columns.
data.frame(t(tmp))
```

## Missing values counter
```{r}
# Number of missing values per column, then the same as a share of all rows.
na_counts <- colSums(is.na(data))
na_counts
round(na_counts / nrow(data), 3)

# Filter out "mpaa" and "budget" due to abundance in missing values.
# %in% tests set membership; `==` against a length-2 vector is recycled
# element by element and only matches when the column positions happen to
# line up with the alternating pattern.
mpaa_budget_indices <- which(colnames(data) %in% c("mpaa", "budget"))
# Guard the negative index: data[, -integer(0)] would select no columns at all.
if (length(mpaa_budget_indices) > 0) {
  data <- data[, -mpaa_budget_indices]
}
```


## Make a qq plot for each of the following features: year,rating,votes.
```{r}
# Three panels side by side, one normal QQ plot (with reference line) each.
par(mfrow = c(1, 3))
for (feature in c("year", "rating", "votes")) {
  values <- data[, feature]
  qqnorm(y = values, main = feature)
  qqline(y = values)
}
# Author's reading of the plots:
# - 'year' and 'votes' are clearly not normally distributed. That is expected:
#   there is no reason for the release year to follow a normal distribution.
# - 'votes' looks heavily skewed: many movies with few votes and a few movies
#   with a very large number of votes.
# - 'rating' is the only one of the three that looks approximately normal.

```


## Normalize relevant features 
```{r}
par(mfrow = c(1, 2))
summary(data$votes)

# votes: shift so the minimum becomes 1, take logs to tame the skew, then
# standardise (subtract the mean, divide by the standard deviation).
shifted_votes <- data$votes - min(data$votes) + 1
log_votes <- log(shifted_votes)
data$votes.norm <- (log_votes - mean(log_votes)) / sd(log_votes)

hist(data$votes.norm)
qqnorm(y = data$votes.norm, main = 'votes.norm')
qqline(y = data$votes.norm)
# Author's note: after the transformation votes is rescaled and looks normal.

# rating: standardise only, no log transformation.
rating_mean <- mean(data$rating)
rating_sd <- sd(data$rating)
data$rating.norm <- (data$rating - rating_mean) / rating_sd

hist(data$rating.norm)
qqnorm(y = data$rating.norm, main = 'rating.norm')
qqline(y = data$rating.norm)
```




## Creating one plot containing all box plots for each of the numeric features of the data
```{r}
# Single panel holding one box per numeric feature.
par(mfrow = c(1, 1))
boxplot_features <- c("year", "length", "rating.norm", "votes.norm")
df_outl <- data[, boxplot_features]
boxplot(df_outl)
```



## Remove suspected 'length' outliers from the data using the box plot.
```{r}
# boxplot() draws the plot and also returns the points it flags in $out.
bx.length <- boxplot(df_outl$length)
# Drop every movie whose length equals one of the flagged values.
is_length_outlier <- is.element(data$length, bx.length$out)
data <- data[!is_length_outlier, ]
```



## Using the LOF measure to remove outliers using the following features: "votes","length","rating".
```{r}
# install.packages("DMwR")
library(DMwR)

# Features used for the LOF score. length is rescaled to mean 0 / sd 1;
# as.numeric() drops the one-column matrix that scale() returns so the
# column stays a plain numeric vector.
lof.data <- data[, c("votes", "length", "rating")]
lof.data$length <- as.numeric(scale(data$length, center = TRUE, scale = TRUE))

# Score the prepared frame. Previously the unscaled columns of `data` were
# passed here, so the rescaling above had no effect on the result.
l <- lofactor(lof.data, k = 20)

# Flag rows whose LOF exceeds 1.5. A score that is NA/NaN is treated as
# "not an outlier" so it cannot turn into an all-NA row when subsetting.
# NOTE(review): presumably such scores arise for duplicated points; confirm.
outliers <- !is.na(l) & l > 1.5

data <- data[!outliers, ]
```

## Display a bar chart plotting the number of movies per genre. 
```{r}
par(mfrow = c(1, 1))

# Sum the 0/1 flags of each genre column (positions 6 to 12 at this point).
# apply() turns the frame into a matrix first, so each column arrives here as
# a vector that as.numeric() can convert.
count_flags <- function(flags) {
  sum(as.numeric(flags))
}
df_generes <- apply(data[, 6:12], MARGIN = 2, FUN = count_flags)

# One bar per genre, with a legend built from the genre names.
genre_colours <- c("red", "yellow", "blue", "orange", "black", "gray", "brown")
barplot(df_generes, col = genre_colours, legend.text = TRUE)
```



## Make 2 density plots
```{r}
# Both plots rely on the discretised columns length.desc and votes.desc.
# Fail early with a clear message when they are absent, instead of letting
# density() fail on an empty selection.
required_cols <- c("length.desc", "votes.desc")
missing_cols <- setdiff(required_cols, colnames(data))
if (length(missing_cols) > 0) {
  stop("Missing discretised column(s): ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}

# Kernel density of 'rating' for the rows where `group_col` equals `level`.
rating_density <- function(group_col, level) {
  density(data[data[[group_col]] == level, 'rating'])
}

# density plot of ratings by length
# Titles and axis labels belong to plot() only; lines() just adds curves to
# the existing panel (xlab/main are not graphical parameters there).
plot(rating_density("length.desc", 'Long'), xlab = "ranks",
     main = "density of ranks by length", col = 'blue')
lines(rating_density("length.desc", 'Medium'), col = 'red')
lines(rating_density("length.desc", 'Short'), col = 'brown')
legend(8, 0.4, c("Long", "Medium", "Short"), lty = 1, lwd = 2.5,
       col = c("blue", "red", "brown"))

# density plot of ratings by votes
plot(rating_density("votes.desc", 'Many'), xlab = "ranks",
     main = "density of ranks by votes", col = 'blue')
lines(rating_density("votes.desc", 'Few'), col = 'red')
legend(8.5, 0.3, c("Many", "Few"), lty = 1, lwd = 2.5,
       col = c("blue", "red"))
```

## Display the correlation plot of the features. 
```{r}
# Installation is a one-off, manual step (same convention as the DMwR chunk);
# running it unconditionally would reinstall the package on every knit.
# install.packages("corrplot")
library(corrplot)

# Standardise the numeric features (mean 0, sd 1) before correlating them.
scaled.data <- scale(data[, c('year', 'length', 'rating', 'votes')],
                     center = TRUE, scale = TRUE)

# Pairwise correlations between the four features.
corr_matrix <- cor(scaled.data)

# Coloured tiles, upper triangle only.
corrplot(corr_matrix, method = "color", type = "upper")
```