#****************************************************************
# import libraries
from flask import Flask
from flask import request
from flask import make_response
from flask import jsonify
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# NLP libraries
import nltk
import re
from textblob import TextBlob, Word
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords
nltk.download('stopwords')
# TextBlob's POS tagging and lemmatization below also need these NLTK corpora
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Deep learning libraries
import keras
from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM, GlobalMaxPool1D, Dropout
#****************************************************************
# Functions definition
## 1. DATA PREPARATION
### 1.1 function for text cleaning
def preprocess_text(text):
    print('preprocessing text...')
    stop_words = set(stopwords.words('english'))
    text = text.lower()  # lowercase
    # expand common English contractions
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\n", " ", text)  # line breaks
    #text = re.sub(r"\xa0", " ", text)  # \xa0 is Unicode for a non-breaking space
    #text = re.sub(r"\s+", " ", text)  # one or more whitespace characters
    text = text.strip(' ')  # leading/trailing spaces
    # remove any remaining apostrophes
    text = re.sub(r"\'", "", text)
    # remove everything except letters
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # remove stopwords
    no_stopword_text = [w for w in text.split() if w not in stop_words]
    text = ' '.join(no_stopword_text)
    return text
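# A quick illustration of the cleaning step (hypothetical input, output derived
# from the rules above): preprocess_text("What's happening? I can't believe it!")
# expands the contractions, strips punctuation and stopwords, and returns
# 'happening believe'.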
### 1.2 function for lemmatization
def lemma(text):  # lemmatization of the cleaned synopsis
    print('lemmatizing...')
    sent = TextBlob(text)
    # map Penn Treebank tag prefixes to WordNet POS tags
    tag_dict = {"J": 'a',
                "N": 'n',
                "V": 'v',
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    separator = ' '
    lemma = separator.join(lemmatized_list)
    return lemma
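# Continuing the example above, lemma('happening believe') POS-tags the two
# words and lemmatizes them with WordNet, typically returning 'happen believe'.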
## 2. MODEL
def build_model(max_features, maxlen, inp, embed_size):
    ### 2.1 embedding + LSTM feature extractor
    x = Embedding(max_features, embed_size)(inp)
    x = LSTM(64, return_sequences=True, name='lstm_layer')(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    ### 2.2 dense classification head
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    # sigmoid rather than softmax: the 19 genre outputs are independent
    # probabilities in this multi-label task, matching binary_crossentropy
    x = Dense(19, activation="sigmoid")(x)
    ### 2.3 build the model
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print("LSTM neural network compiled")
    return model
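# A minimal usage sketch, with the hyperparameter values that the endpoint
# below actually uses: model = build_model(5000, 150, Input(shape=(150,)), 128)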
## 3. Extraction of the 5 most probable genre tags from the predictions matrix
def top_5_predictions(df):
    N = 5
    cols = df.columns.tolist()  # every column of df is a genre probability
    a = df[cols].to_numpy().argsort()[:, :-N-1:-1]
    c = np.array(cols)[a]
    #d = df[cols].to_numpy()[np.arange(a.shape[0])[:, None], a]
    df1 = pd.DataFrame(c).rename(columns=lambda x: f'max_{x+1}_col')
    predicted_genres = (df1["max_1_col"] + ' ' + df1["max_2_col"] + ' ' + df1["max_3_col"]
                        + ' ' + df1["max_4_col"] + ' ' + df1["max_5_col"])
    return predicted_genres
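# How the top-N slice works on a single row: with probabilities [0.1, 0.7, 0.3]
# and N=2, argsort() gives [0, 2, 1] (ascending), and the [:, :-N-1:-1] slice
# reads it backwards to yield [1, 2], the column indices of the two highest scores.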
#****************************************************************
# run the API
## 1. Define train and predict folders
app = Flask(__name__)
UPLOAD_FOLDER = '/home/marco/Documents/CV/home assignments/Radix/challenge/challenge/genres/train'
UPLOAD_FOLDER1 = '/home/marco/Documents/CV/home assignments/Radix/challenge/challenge/genres/predict'
## 2. Upload endpoints
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
@app.route('/genres/train', methods=['POST', 'PUT'])
def upload_train():
    file = request.files['csv']
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], "train.csv"))
    resp = jsonify({'message': 'File successfully uploaded'})
    resp.status_code = 201
    return resp

app.config['UPLOAD_FOLDER1'] = UPLOAD_FOLDER1
@app.route('/genres/predict', methods=['POST', 'PUT'])
def upload_test():
    file = request.files['csv']
    file.save(os.path.join(app.config['UPLOAD_FOLDER1'], "test.csv"))
    resp = jsonify({'message': 'File successfully uploaded'})
    resp.status_code = 201
    return resp
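# Example upload calls (assuming the default Flask host/port; the form field
# must be named 'csv', matching request.files['csv'] above):
#   curl -X POST -F "csv=@train.csv" http://localhost:5000/genres/train
#   curl -X POST -F "csv=@test.csv"  http://localhost:5000/genres/predict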
## 3. Process and output endpoint
@app.route('/', methods=['POST', 'PUT'])
def endpoint_process():
    # Create the train dataframe from the uploaded csv file
    print('train.csv uploaded to /genres/train')
    train = pd.read_csv('genres/train/train.csv')
    print("train.csv transformed into Pandas dataframe")
    ## 1. Preprocess text for the training matrix
    train['clean_plot'] = train['synopsis'].apply(lambda x: preprocess_text(x))
    train['lemma'] = train['clean_plot'].apply(lambda x: lemma(x))
    X = train['lemma']
    ## 2. Define the train matrix parameters and tokenize the text
    max_features = 5000
    maxlen = 150
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X))
    list_tokenized_train = tokenizer.texts_to_sequences(X)
    X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)  # the final training matrix
    ## 3. Apply the one-hot transformation to the targets
    y = train['genres']
    one_hot = MultiLabelBinarizer()  # encoder for the tags
    y_onehot = one_hot.fit_transform(y.str.split(' '))  # the target matrix for training
    y_bin = pd.DataFrame(y_onehot, columns=one_hot.classes_)  # as a Pandas dataframe
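    # Small illustration of the binarizer: for genres ['drama comedy', 'drama'],
    # fit_transform on the split lists learns classes ['comedy', 'drama'] and
    # returns the multi-hot rows [[1, 1], [0, 1]].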
    ## 4. Define the model parameters
    inp = Input(shape=(maxlen,))  # maxlen defined earlier
    embed_size = 128
    batch_size = 16
    epochs = 3
    ## 5. Compile the model
    model = build_model(max_features, maxlen, inp, embed_size)
    ## 6. Train the model
    print('training the LSTM model...')
    model.fit(X_t, y_onehot, batch_size=batch_size, epochs=epochs, validation_split=0.1)
    print('LSTM neural network weights updated, model trained!')
    # Create the test dataframe from the uploaded test.csv file
    print('test.csv uploaded to /genres/predict')
    test = pd.read_csv('genres/predict/test.csv')
    print("test.csv transformed into Pandas dataframe")
    ## 7. Preprocess the test text
    test['clean_plot'] = test['synopsis'].apply(lambda x: preprocess_text(x))
    test['lemma'] = test['clean_plot'].apply(lambda x: lemma(x))
    print('preprocessing and lemmatization done!')
    X_test = test['lemma']
    list_tokenized_test = tokenizer.texts_to_sequences(X_test)
    X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)
    ## 8. Predict genre tags by applying the model to the test set
    print('prediction...')
    y_pred = model.predict(X_te, batch_size=16, verbose=0)
    print(y_pred.shape)
    print('obtained probability matrix')
    ## 9. Build a dataframe with the predictions
    df_probs_all = pd.DataFrame(y_pred, columns=y_bin.columns)
    pred_gen = top_5_predictions(df_probs_all)  # the 5 most probable genres per movie
    submission = pd.DataFrame(data={'movie_id': test.movie_id, 'predicted_genres': pred_gen})
    ## 10. Return a csv file with the predictions as the response to the user
    csv = submission.to_csv(index=False)  # real csv text, matching the text/csv content type
    response = make_response(csv)
    response.headers["Content-Disposition"] = "attachment; filename=submission.csv"
    response.headers["Content-type"] = "text/csv"
    return response
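# Example call that triggers training + prediction and saves the returned csv
# (assuming the default Flask host/port, and that both files were uploaded first):
#   curl -X POST http://localhost:5000/ -o submission.csv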
# run command
if __name__ == "__main__":
    app.debug = True
    app.run()