-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdm.py
204 lines (162 loc) · 6.94 KB
/
dm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#-*- coding: utf-8 -*-
# pylint: disable=E1101
from random import shuffle
import matplotlib.pyplot as plt
import pandas as pd
# from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.models import Sequential
from scipy.interpolate import lagrange
# from sklearn.externals import joblib
import joblib
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.tree import DecisionTreeClassifier
"""
cm_plot-->自定义混淆矩阵可视化
programmer_1-->使用拉格朗日插值法进行插值
programmer_2-->构建CART决策树模型,进行预测给出训练结果,并且绘制ROC曲线
programmer_3-->使用神经网络模型,进行预测给出训练结果,并且绘制ROC曲线
"""
def cm_plot(y, yp):
cm = confusion_matrix(y, yp)
plt.matshow(cm, cmap=plt.cm.Greens)
plt.colorbar()
for x in range(len(cm)):
for y in range(len(cm)):
plt.annotate(
cm[x, y],
xy=(x, y),
horizontalalignment='center',
verticalalignment='center')
plt.ylabel('True label')
plt.xlabel('Predicted label')
return plt
def programmer_1():
inputfile = 'data/missing_data.xls'
outputfile = 'tmp/missing_data_processed.xls'
data = pd.read_excel(inputfile, header=None) # 读入数据
# 自定义列向量插值函数
# s为列向量,n为被插值的位置,k为取前后的数据个数,默认为5
def ployinterp_column(s, n, k=5):
y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))] # 取数
y = y[y.notnull()] # 剔除空值
return lagrange(y.index, list(y))(n) # 插值并返回插值结果
# 逐个元素判断是否需要插值
for i in data.columns:
for j in range(len(data)):
if (data[i].isnull())[j]:
data[i][j] = ployinterp_column(data[i], j)
data.to_excel(outputfile, header=None, index=False)
def programmer_2():
datafile = 'data/model.xls'
data = pd.read_excel(datafile)
data = data.as_matrix()
shuffle(data) # 随机打乱数据
# 设置训练数据比8:2
p = 0.8
train = data[:int(len(data) * p), :]
test = data[int(len(data) * p):, :]
# 构建CART决策树模型
treefile = 'tmp/tree.pkl'
tree = DecisionTreeClassifier()
tree.fit(train[:, :3], train[:, 3])
joblib.dump(tree, treefile)
cm_plot(train[:, 3], tree.predict(train[:, :3])).show() # 显示混淆矩阵可视化结果
# 注意到Scikit-Learn使用predict方法直接给出预测结果。
fpr, tpr, thresholds = roc_curve(
test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# 设定边界范围
plt.ylim(0, 1.05)
plt.xlim(0, 1.05)
plt.legend(loc=4)
plt.show()
print(thresholds)
def programmer_3():
datafile = 'data/model.xls'
data = pd.read_excel(datafile)
data = data.as_matrix()
shuffle(data)
p = 0.8
train = data[:int(len(data) * p), :]
test = data[int(len(data) * p):, :]
# 构建LM神经网络模型
netfile = 'tmp/net.model'
net = Sequential() # 建立神经网络
# net.add(Dense(input_dim = 3, units = 10))
# 添加输入层(3节点)到隐藏层(10节点)的连接
net.add(Dense(10, input_shape=(3, )))
net.add(Activation('relu')) # 隐藏层使用relu激活函数
# net.add(Dense(input_dim = 10, units = 1))
#添加隐藏层(10节点)到输出层(1节点)的连接
net.add(Dense(1, input_shape=(10, )))
net.add(Activation('sigmoid')) # 输出层使用sigmoid激活函数
net.compile(
loss='binary_crossentropy',
optimizer='adam',
sample_weight_mode="binary") # 编译模型,使用adam方法求解
net.fit(train[:, :3], train[:, 3], epochs=100, batch_size=1)
net.save_weights(netfile)
predict_result = net.predict_classes(train[:, :3]).reshape(
len(train)) # 预测结果变形
'''这里要提醒的是,keras用predict给出预测概率,predict_classes才是给出预测类别,而且两者的预测结果都是n x 1维数组,而不是通常的 1 x n'''
cm_plot(train[:, 3], predict_result).show()
predict_result = net.predict(test[:, :3]).reshape(len(test))
fpr, tpr, thresholds = roc_curve(test[:, 3], predict_result, pos_label=1)
plt.plot(fpr, tpr, linewidth=2, label='ROC of LM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.ylim(0, 1.05)
plt.xlim(0, 1.05)
plt.legend(loc=4)
plt.show()
print(thresholds)
def sklearn_():
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_gaussian_quantiles
from sklearn.datasets import make_hastie_10_2
plt.figure(figsize=(8, 8))
plt.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95)
plt.subplot(421)
plt.title("One informative feature, one cluster per class", fontsize='small')
X1, Y1 = make_classification(n_samples=1000,n_features=2, n_redundant=0, n_informative=1,
n_clusters_per_class=1)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.subplot(422)
plt.title("Two informative features, one cluster per class", fontsize='small')
X1, Y1 = make_classification(n_samples=1000,n_features=2, n_redundant=0, n_informative=2,
n_clusters_per_class=1)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.subplot(423)
plt.title("Two informative features, two clusters per class", fontsize='small')
X2, Y2 = make_classification(n_samples=1000,n_features=2, n_redundant=0, n_informative=2)
plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2)
plt.subplot(424)
plt.title("Multi-class, two informative features, one cluster",
fontsize='small')
X1, Y1 = make_classification(n_samples=1000,n_features=2, n_redundant=0, n_informative=2,
n_clusters_per_class=1, n_classes=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.subplot(425)
plt.title("Three blobs", fontsize='small')
X1, Y1 = make_blobs(n_samples=1000,n_features=2, centers=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.subplot(426)
plt.title("Gaussian divided into four quantiles", fontsize='small')
X1, Y1 = make_gaussian_quantiles(n_samples=1000,n_features=2, n_classes=4)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.subplot(427)
plt.title("hastie data ", fontsize='small')
X1, Y1 = make_hastie_10_2(n_samples=1000)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
plt.show()
if __name__ == "__main__":
# programmer_1()
# programmer_2()
# programmer_3()
sklearn_()
pass