# DAI_ASSIGNMENT_1_Q1.py

# -*- coding: utf-8 -*-
"""DAI_ASSIGNMENT_1_Q1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JfW0yXYGNERN6Ny-_aggrbctoTUNqkdW

Q1. Analysis of a machine learning model for bias. [50 Marks]
● Use the SVM model. (You can use the sklearn library.)
● Train the model on the Bollywood celebrity dataset [Download here: https://www.kaggle.com/havingfun/100-bollywood-celebrity-faces/download]
● Choose any 10 classes of your choice from the dataset
● Report / show:
○ Class-wise accuracy [10 Marks]
○ Overall accuracy [10 Marks]
○ Training loss vs. testing loss curve w.r.t. epochs [10 Marks]
○ Check whether your model is biased by using at least 2 metrics, e.g. confusion matrix [15 Marks]
○ What type of bias do you see (if any)? Explain. [5 Marks]
"""

# Colab setup for pulling a Kaggle dataset through Google Drive, following
# https://medium.com/analytics-vidhya/how-to-fetch-kaggle-datasets-into-google-colab-ea682569851a
import os

from google.colab import drive

# Make Google Drive visible under /content/gdrive for the rest of the session.
drive.mount('/content/gdrive')

# Point KAGGLE_CONFIG_DIR at the Drive folder used for the Kaggle download.
os.environ.update({'KAGGLE_CONFIG_DIR': "/content/gdrive/My Drive/Kaggle"})

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/gdrive/My Drive/Kaggle

!kaggle datasets download -d havingfun/100-bollywood-celebrity-faces

!ls

!unzip \*.zip

!ls

!pwd

!ls

# Selecting any 10 classes: Shraddha_Kapoor, Shahid_Kapoor, Richa_Chadda, Randeep_Hooda, Taapsee_Pannu, Suniel_Shetty, Shruti_Haasan, Sidharth_Malhotra, Disha_Patani, Arjun_Rampal

!pwd

#Folders
# Google Drive directory of each selected celebrity inside the unzipped Kaggle
# archive. The two parent directory spellings ('bollywood_celeb_faces2' and
# 'bollywood_celeb_faces_0') are kept exactly as this script already used them.
_FACES_2 = '/content/gdrive/My Drive/Kaggle/bollywood_celeb_faces2'
_FACES_0 = '/content/gdrive/My Drive/Kaggle/bollywood_celeb_faces_0'

Shraddha_Kapoor = _FACES_2 + '/Shraddha_Kapoor'
Shahid_Kapoor = _FACES_2 + '/Shahid_Kapoor'
Richa_Chadda = _FACES_2 + '/Richa_Chadda'
# Fixed: the path previously ended in 'Randeep_Hood' (last letter missing).
Randeep_Hooda = _FACES_2 + '/Randeep_Hooda'
# Fixed: the folder is spelled 'Taapsee_Pannu' everywhere else in this file.
# The variable name keeps its original spelling so existing references work.
Tapsee_Pannu = _FACES_2 + '/Taapsee_Pannu'
Suniel_Shetty = _FACES_2 + '/Suniel_Shetty'
Shruti_Haasan = _FACES_2 + '/Shruti_Haasan'
Sidharth_Malhotra = _FACES_2 + '/Sidharth_Malhotra'
Disha_Patani = _FACES_0 + '/Disha_Patani'
Arjun_Rampal = _FACES_0 + '/Arjun_Rampal'

!ls

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/gdrive/My Drive/Kaggle/bollywood_celeb_faces_0

!ls

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/gdrive/My Drive/Kaggle/bollywood_celeb_faces_1

!ls

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/gdrive/My Drive/Kaggle/bollywood_celeb_faces2

!ls

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/gdrive/My Drive/Kaggle/bollywood_celeb_faces2/Shraddha_Kapoor

!ls | wc -l # Number of images in Shraddha_Kapoor Dataset

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/gdrive/My Drive/Kaggle/question_one_dataset

!pwd

"""# Run the program from here

# Overall accuracy: 0.35833333333333334
"""

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Root folder holding one sub-directory of images per selected celebrity.
# NOTE: the name shadows the builtin dir(); it is kept because later code in
# this file reads the global `dir`.
dir = "/content/gdrive/My Drive/Kaggle/question_one_dataset"

# The position of a name in this list is the integer label used for training.
classes = [   'Shraddha_Kapoor',#0
              'Shahid_Kapoor',#1
              'Richa_Chadda',#2
              'Randeep_Hooda',#3
              'Taapsee_Pannu',#4
              'Suniel_Shetty',#5
              'Shruti_Haasan',#6
              'Sidharth_Malhotra',#7
              'Disha_Patani',#8
              'Arjun_Rampal'#9
              ]

from google.colab.patches import cv2_imshow

# cv2_imshow(cv2.imread("/content/gdrive/My Drive/Kaggle/question_one_dataset/Shraddha_Kapoor/1.jpg")) # Worked!!!!

# This implies the data is fetched from the google drive. Now we just have to do the learning.

# Build `data` as a list of [flattened 250x250 grayscale pixels, label] pairs.
data = []
skipped = 0  # files that OpenCV could not decode as an image
for label, clas in enumerate(classes):
  path = os.path.join(dir, clas)
  print(label)
  for img in os.listdir(path): # Gets the list of all files in the directory
    imagepath = os.path.join(path,img)
    # Flag 0 loads the image as single-channel grayscale.
    star_image = cv2.imread(imagepath, 0)
    if star_image is None:
      # imread signals an unreadable / non-image file by returning None.
      # Count it instead of hiding the failure behind a blanket `except`.
      skipped += 1
      continue
    star_image = cv2.resize(star_image,(250,250))
    image_array = np.array(star_image).flatten()
    data.append([image_array, label])

print(len(data))
if skipped:
  print("Skipped unreadable files:", skipped)

import random
from sklearn.model_selection import train_test_split

# Randomise the sample order in place, then separate features from labels.
random.shuffle(data)
X_features = [pixels for pixels, _ in data]
Y_labels = [target for _, target in data]

# Hold out 10% of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_features, Y_labels, test_size=0.1)

print("Sizes of X_train, Y_train, X_test, Y_test")
# The sizes are printed as X_train, X_test, Y_train, Y_test, which is a
# different order from the header text above.
for part in (X_train, X_test, Y_train, Y_test):
  print(len(part))

from sklearn.svm import SVC

# Fit a single RBF-kernel SVM and evaluate it on both splits.
# The script previously built and fitted a second, identical model on the same
# training data just to score the training set. That doubled the training time
# for no change in results, so the fitted model is now reused.
model = SVC(decision_function_shape='ovo', kernel='rbf')
model.fit(X_train, Y_train)

# Held-out performance.
prediction_test = model.predict(X_test)
accu = model.score(X_test, Y_test)

print("Testing accuracy:",accu)

# Performance on the data the model was fitted on.
prediction_train = model.predict(X_train)
train_accu = model.score(X_train, Y_train)

print("Training accuracy:",train_accu)

import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Pass the full label set explicitly. Without it, scikit-learn infers the
# labels from the data, so a class missing from a split would make
# `target_names=classes` mismatch (classification_report raises) and would
# shrink the confusion matrix so the tick labels no longer line up.
label_ids = list(range(len(classes)))

# Testing Metrics
print(classification_report(Y_test, prediction_test,
                            labels=label_ids,
                            target_names=classes))

# Training Metrics
print(classification_report(Y_train, prediction_train,
                            labels=label_ids,
                            target_names=classes))

# confusion_matrix has true labels on rows; transposing puts the true label on
# the x axis and the predicted label on the y axis, matching the axis titles.
matrix = confusion_matrix(Y_test, prediction_test, labels=label_ids)
sns.heatmap(matrix.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=classes,
            yticklabels=classes)
plt.xlabel('true label')
plt.ylabel('predicted label')

def _cumulative_error_counts(predicted, actual):
  """Return [1, 2, ..., k], where k is how many predictions differ from `actual`.

  This is the same list the original per-sample loop built by appending the
  running error total each time a sample was misclassified.
  """
  mistakes = sum(1 for guess, truth in zip(predicted, actual) if guess != truth)
  return list(range(1, mistakes + 1))


# Predict each split in one batch call instead of one `model.predict` call per
# sample; the resulting lists and totals are the same.
# NOTE: these lists hold running misclassification counts (one entry per
# error). They are not plotted anywhere in this file.
training_error = _cumulative_error_counts(model.predict(X_train), Y_train)
test_error = _cumulative_error_counts(model.predict(X_test), Y_test)

# Total number of misclassified samples in each split.
loss_train = len(training_error)
loss_test = len(test_error)


def _show_and_classify(samples, index):
  """Display samples[index] as a 250x250 image and return the model's label for it.

  Args:
    samples: sequence of flattened 250*250 pixel arrays (e.g. X_test).
    index: position of the sample to display and classify.

  Returns:
    The integer class label predicted by the global `model`.
  """
  plt.imshow(samples[index].reshape(250,250))
  return model.predict(samples[index].reshape(1,-1))[0]


# Spot-check a few individual samples: show the image, then print the name of
# the predicted class.
a = _show_and_classify(X_test, 9)

Y_test[9]  # true label of the sample (echoed when run as a notebook cell)

print(classes[a])

print (a)

# Fixed: this and the next check displayed X_test[70] / X_test[25] but
# classified X_test[9], so the printed name did not belong to the shown image.
b = _show_and_classify(X_test, 70)

print(classes[b])

c = _show_and_classify(X_test, 25)

print(classes[c])

d = _show_and_classify(X_train, 400)

print(classes[d])

e = _show_and_classify(X_train, 363)
print(classes[e])

# so on the training dataset it is performing very well

f = _show_and_classify(X_test, 51)
print(classes[f])

g = _show_and_classify(X_test, 67)
print(classes[g])


"""# Training Loss vs Testing Loss Curve"""


"""# Class Wise
Let's perform class-wise classification and check the accuracy. Each training run uses exactly two classes. Below, the class-wise accuracy is checked between Shraddha_Kapoor and Taapsee_Pannu; accuracy: 0.6666666666666666
"""

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Root folder holding one sub-directory of images per celebrity.
# NOTE: the name shadows the builtin dir(); it is kept because
# important_function below reads the global `dir`.
dir = "/content/gdrive/My Drive/Kaggle/question_one_dataset"

# Binary problem: the position in this list is the label (0 or 1).
classes = [   'Shraddha_Kapoor',#0
              'Taapsee_Pannu',#1

              ]

# Build `data_0` as [flattened 250x250 grayscale pixels, label] pairs.
data_0 = []
skipped = 0  # files that OpenCV could not decode as an image
for label, clas in enumerate(classes):
  path = os.path.join(dir, clas)
  print(label)
  for img in os.listdir(path): # Gets the list of all files in the directory
    imagepath = os.path.join(path,img)
    # Flag 0 loads the image as single-channel grayscale.
    star_image = cv2.imread(imagepath, 0)
    if star_image is None:
      # imread returns None for unreadable / non-image files. Count them
      # rather than swallowing every exception silently.
      skipped += 1
      continue
    star_image = cv2.resize(star_image,(250,250))
    image_array = np.array(star_image).flatten()
    data_0.append([image_array, label])

print(len(data_0))
if skipped:
  print("Skipped unreadable files:", skipped)

import random
random.shuffle(data_0)
X_features = [pixels for pixels, _ in data_0]
Y_labels = [target for _, target in data_0]

from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for testing.
X_train,X_test, Y_train, Y_test = train_test_split(X_features, Y_labels, test_size=0.1)

print("Sizes of X_train, Y_train, X_test, Y_test")
# Printed as X_train, X_test, Y_train, Y_test.
print(len(X_train))
print(len(X_test))
print(len(Y_train))
print(len(Y_test))

from sklearn.svm import SVC
# Linear-kernel SVM with C=0.1 for the two-class problem.
model = SVC(C=0.1, kernel='linear')
model.fit(X_train, Y_train)
prediction = model.predict(X_test)
accu = model.score(X_test, Y_test)

print("accuracy:",accu)


"""What we actually want is to compare each actor's images against every other actor's: one binary training run per ordered pair of distinct actors, i.e. 10 x 9 = 90 runs, each reporting its own accuracy."""

# The ten selected actors; the trailing number is the position in the list.
array1 = [
    'Shraddha_Kapoor',  # 0
    'Shahid_Kapoor',  # 1
    'Richa_Chadda',  # 2
    'Randeep_Hooda',  # 3
    'Taapsee_Pannu',  # 4
    'Suniel_Shetty',  # 5
    'Shruti_Haasan',  # 6
    'Sidharth_Malhotra',  # 7
    'Disha_Patani',  # 8
    'Arjun_Rampal',  # 9
]

# Second, independent list with the same names in the same order.
array2 = list(array1)


def important_function(actor1, actor2, data_dir=None):
  """Train a linear SVM to tell two actors apart and report its test accuracy.

  Images are read as grayscale from `<data_dir>/<actor>`, resized to 250x250,
  flattened, shuffled and split 90/10 into train and test sets. Progress, split
  sizes and the accuracy are printed.

  Args:
    actor1: folder name of the first actor (label 0).
    actor2: folder name of the second actor (label 1).
    data_dir: dataset root containing one folder per actor. Defaults to the
      module-level `dir` variable, which is what this function always used.

  Returns:
    The accuracy on the held-out test split, as returned by `SVC.score`.
    (Previously the value was only printed and the function returned None.)

  Raises:
    ValueError: if both arguments name the same actor. A two-class model
      cannot be trained on a single class, so this is reported up front.
  """
  from sklearn.model_selection import train_test_split
  from sklearn.svm import SVC

  classes = [str(actor1), str(actor2)]
  if classes[0] == classes[1]:
    raise ValueError("actor1 and actor2 must be different: " + classes[0])

  print("------------------------------------------------")
  print("-----------Doing for ---------")
  print(str(actor1)+" and "+str(actor2))
  print("------------------------------------------------")

  root = dir if data_dir is None else data_dir

  # Collect [flattened pixels, label] pairs for both actors.
  data_0 = []
  skipped = 0  # files that OpenCV could not decode as an image
  for label, clas in enumerate(classes):
    path = os.path.join(root, clas)
    print(label)
    for img in os.listdir(path): # Gets the list of all files in the directory
      imagepath = os.path.join(path,img)
      # Flag 0 loads the image as single-channel grayscale.
      star_image = cv2.imread(imagepath, 0)
      if star_image is None:
        # imread returns None for unreadable / non-image files. Count them
        # rather than swallowing every exception silently.
        skipped += 1
        continue
      star_image = cv2.resize(star_image,(250,250))
      data_0.append([np.array(star_image).flatten(), label])

  print(len(data_0))
  if skipped:
    print("Skipped unreadable files:", skipped)

  random.shuffle(data_0)
  X_features = [pixels for pixels, _ in data_0]
  Y_labels = [target for _, target in data_0]

  # Hold out 10% of the samples for testing.
  X_train,X_test, Y_train, Y_test = train_test_split(X_features, Y_labels, test_size=0.1)
  print("Sizes of X_train, Y_train, X_test, Y_test")
  # Printed as X_train, X_test, Y_train, Y_test.
  print(len(X_train))
  print(len(X_test))
  print(len(Y_train))
  print(len(Y_test))

  model = SVC(C=0.1, kernel='linear')
  model.fit(X_train, Y_train)
  accu = model.score(X_test, Y_test)

  print("accuracy:",accu)
  return accu


# Run the two-actor experiment for every ordered pair of distinct actors.
# Both (A, B) and (B, A) are visited; an actor is never paired with itself.
for element1 in array1:
  for element2 in array2:
    if str(element1) != str(element2):
      important_function(element1, element2)