-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathdata_helpers.py
57 lines (49 loc) · 1.96 KB
/
data_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import re, html
import numpy as np
re1 = re.compile(r' +')
# functions for pre-processing text
def fixup(x):
x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
'<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
' @-@ ','-').replace('\\', ' \\ ')
return re1.sub(' ', html.unescape(x))
def get_texts(df, n_lbls=1):
labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
texts = list(texts.apply(fixup).values)
tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
return tok, list(labels)
def get_all(df, n_lbls):
tok, labels = [], []
for i, r in enumerate(df):
print(i)
tok_, labels_ = get_texts(r, n_lbls)
tok += tok_;
labels += labels_
return tok, labels
'''
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
(This function is copied from the scikit docs.)
"""
plt.figure()
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm)
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
'''