chapter 12: 建立算法的管道模型
# 常规步骤:数据预处理,用交叉验证评估模型,使用网格搜索找到最优参数。
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
# Build a toy binary-classification set: 200 samples, 2 centers, cluster std of 5.
X, y = make_blobs(n_samples=200, centers=2, cluster_std=5)
# Hold out a test split (train_test_split defaults, fixed seed for the split).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=38)
# Preprocessing: the scaler learns its statistics from the training split only,
# then the same transformation is applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)
# The recorded run printed (150, 2) (50, 2).
print("shape of the datasets: ", X_train_scaled.shape, X_test_scaled.shape)
shape of the datasets: (150, 2) (50, 2)
# Neural networks are a typical example of a model that needs preprocessed input.
# Raw training set (default marker).
plt.scatter(X_train[:,0], X_train[:,1])
# Standardized training set, drawn with triangles so the two clouds can be told apart.
plt.scatter(X_train_scaled[:,0], X_train_scaled[:,1], marker='^', edgecolors='k')
plt.title("training set & scaled training set")
plt.show()
# After scaling the points look more tightly "gathered" than the raw ones.
# Fit a neural network and use grid search to pick the best hyper-parameters.
from sklearn.model_selection import GridSearchCV
# Parameter grid: 5 hidden-layer layouts x 5 alpha values = 25 combinations.
params={"hidden_layer_sizes":[(50,), (100,), (100,100), (50,100), (100,50)],
"alpha":[0.0001, 0.001,0.01, 0.1, 1]}
grid=GridSearchCV(MLPClassifier(max_iter=3000, random_state=38),
param_grid=params, cv=3)
# Fit on the training data that was scaled beforehand.
grid.fit(X_train_scaled, y_train) # took about 60 s in the original run
# Report cross-validation and test scores.
print("best score:{:0.3f}".format(grid.best_score_))
print("best params_:{}".format(grid.best_params_))
print("\ntest score:{}".format(grid.score(X_test_scaled, y_test)) )
# NOTE: this procedure is actually flawed (information leakage).
# The StandardScaler was fit on the whole of X_train, but GridSearchCV then splits
# X_train_scaled again into inner train / validation folds and picks the parameter
# combination with the best validation score. The validation folds were therefore
# already seen by the scaler.
# Outside the search, the chosen MLP is fit on X_train_scaled and scored on X_test_scaled.
# Inside the search, the scaler should be fit on each inner training fold only,
# not on the full training set.
best score:0.747 best params_:{'alpha': 1, 'hidden_layer_sizes': (100,)} test score:0.76
# Re-doing the preprocessing by hand for every split and every parameter
# combination is far too tedious.
# A Pipeline bundles the steps so the whole chain can be treated as one estimator.
from sklearn.pipeline import Pipeline
# Two steps on the "assembly line": a StandardScaler for preprocessing and an
# MLP classifier with max_iter=1600.
pipeline = Pipeline([('scaler', StandardScaler()),
('mlp', MLPClassifier(max_iter=1600, random_state=38))])
# Fit the whole pipeline on the raw (unscaled) training set.
pipeline.fit(X_train, y_train)
# Score on the raw test set; the pipeline applies the fitted scaler itself.
# make_blobs is called without a seed, so the value differs between runs
# (an earlier note said 0.880, the recorded run printed 0.760).
print("test score:{:0.3f}".format(pipeline.score(X_test, y_test)))
test score:0.760
# Note: inside a pipeline each parameter name is prefixed with the step name,
# joined by a double underscore "__" (e.g. "mlp__alpha").
# This grid has 5 layouts x 4 alpha values = 20 combinations.
params={"mlp__hidden_layer_sizes":[(50,), (100,), (100,100), (50,100), (100,50)],
"mlp__alpha":[0.0001, 0.001,0.01, 0.1]}
grid=GridSearchCV(pipeline, param_grid=params, cv=3)
# Fit on the raw training data; scaling now happens inside every CV fold.
grid.fit(X_train, y_train) # took about 60 s in the original run
# Report cross-validation and test scores.
print("best score:{:0.3f}".format(grid.best_score_))
print("best params_:{}".format(grid.best_params_))
print("\ntest score:{}".format(grid.score(X_test, y_test)) )
best score:0.733 best params_:{'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (100,)} test score:0.76
# Inspect the (name, estimator) steps the pipeline was built with.
pipeline.steps
[('scaler', StandardScaler()), ('mlp', MLPClassifier(max_iter=1600, random_state=38))]
# (1) Download the raw quote list from the Shanghai Stock Exchange quote service.
#     The endpoint answers in JSONP: the JSON payload is wrapped in a call to the
#     callback name passed in the query string.
import json
import re
import time

import numpy as np
import pandas as pd
import requests

timestamp = str(round(time.time() * 1000))  # millisecond timestamp sent as the "_" parameter
begin = str(0)
end = str(2060)  # an earlier note recorded 1515 here
# Single source of truth for the JSONP callback name: it is sent in the URL and
# must be stripped from the response afterwards.
callback = "jQuery111208015895779126387_1560941576071"
# Human-readable page for the same data: http://www.sse.com.cn/market/price/report/
url = (
    "http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity"
    "?callback=" + callback
    + "&select=date%2Ccode%2Cname%2Copen%2Chigh%2Clow%2Clast%2Cprev_close"
    "%2Cchg_rate%2Cvolume%2Camount%2Ctradephase%2Cchange%2Camp_rate"
    "%2Ccpxxsubtype%2Ccpxxprodusta"
    "&order=&begin=" + begin + "&end=" + end + "&_=" + timestamp
)
print("url=", url)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    'Referer': 'http://www.sse.com.cn/market/price/report/'
}
# A timeout keeps the cell from hanging forever; raise_for_status turns an HTTP
# error page into an explicit exception instead of a confusing parse failure.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
# r.json() cannot be used directly: the body is JSONP, not bare JSON.
rs4 = r.text  # still a plain string at this point

# (2) Strip the JSONP wrapper and parse the payload into Python objects.
rs = re.sub(re.escape(callback) + r"\(", "", rs4)
rs = re.sub(r"\)\s*$", "", rs)
# SECURITY: the text comes from the network, so it is parsed with json.loads.
# The previous eval() would have executed whatever expression the server
# (or anyone tampering with the plain-HTTP response) sent back.
rs = json.loads(rs)
print(len(rs['list']))  # number of rows returned (2053 in the recorded run)

# (3) Name every column; the order follows the "select" list in the URL.
print('date code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate cpxxsubtype cpxxprodusta')
titles = 'date,code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate,cpxxsubtype,cpxxprodusta'.split(",")

# Show the first row field by field as a sanity check of the column mapping.
alists = rs['list']
print(alists[0])
alist = alists[0]
for i, title in enumerate(titles):
    print(i, title, "=", alist[i])

# Going through np.array turns the mixed-type rows into one string array, so
# every numeric column has to be converted back explicitly below.
alists2 = np.array(alists)
print(alists2.shape)  # (rows, 16); (2053, 16) in the recorded run

stock = pd.DataFrame(alists2, columns=titles)
stock['amount'] = stock['amount'].astype(np.float64) / 10000  # unit: 10,000 yuan
stock['volume'] = stock['volume'].astype(np.float64) / 100  # unit: lots (100 shares)
# np.number is an abstract type; converting it to a dtype is deprecated in NumPy
# (it used to resolve to float64), so the concrete float64 is requested instead.
numeric_columns = ['chg_rate', 'last', 'amp_rate', 'open', 'high', 'low',
                   'prev_close', 'change']
stock[numeric_columns] = stock[numeric_columns].astype(np.float64)
stock.head()
# chg_rate: price change in percent
# amp_rate: amplitude (intraday swing)
url= http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity?callback=jQuery111208015895779126387_1560941576071&select=date%2Ccode%2Cname%2Copen%2Chigh%2Clow%2Clast%2Cprev_close%2Cchg_rate%2Cvolume%2Camount%2Ctradephase%2Cchange%2Camp_rate%2Ccpxxsubtype%2Ccpxxprodusta&order=&begin=0&end=2060&_=1637155887391 2053 date code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate cpxxsubtype cpxxprodusta [20211117, '600000', '浦发银行', 8.67, 8.72, 8.65, 8.65, 8.73, -0.92, 19301599, 167446824, 'E110', -0.08, 0.8, 'ASH', ' D F N '] 0 date = 20211117 1 code = 600000 2 name = 浦发银行 3 open = 8.67 4 high = 8.72 5 low = 8.65 6 last = 8.65 7 prev_close = 8.73 8 chg_rate = -0.92 9 volume = 19301599 10 amount = 167446824 11 tradephase = E110 12 change = -0.08 13 amp_rate = 0.8 14 cpxxsubtype = ASH 15 cpxxprodusta = D F N (2053, 16)
date | code | name | open | high | low | last | prev_close | chg_rate | volume | amount | tradephase | change | amp_rate | cpxxsubtype | cpxxprodusta | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20211117 | 600000 | 浦发银行 | 8.67 | 8.72 | 8.65 | 8.65 | 8.73 | -0.92 | 193015.99 | 16744.6824 | E110 | -0.08 | 0.80 | ASH | D F N |
1 | 20211117 | 600004 | 白云机场 | 12.38 | 12.52 | 12.30 | 12.40 | 12.39 | 0.08 | 82691.87 | 10249.4920 | E110 | 0.01 | 1.78 | ASH | D F N |
2 | 20211117 | 600006 | 东风汽车 | 6.82 | 6.87 | 6.78 | 6.85 | 6.80 | 0.74 | 166517.32 | 11379.6372 | E110 | 0.05 | 1.32 | ASH | D F N |
3 | 20211117 | 600007 | 中国国贸 | 14.56 | 14.61 | 14.33 | 14.39 | 14.62 | -1.57 | 25977.16 | 3745.6189 | E110 | -0.23 | 1.92 | ASH | D F N |
4 | 20211117 | 600008 | 首创环保 | 3.15 | 3.18 | 3.14 | 3.17 | 3.15 | 0.63 | 614585.99 | 19445.9376 | E110 | 0.02 | 1.27 | ASH | D F N |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Features: every column from "open" through "amp_rate", minus the target
# ("chg_rate") and the non-numeric "tradephase" code.
X = stock.loc[:, "open":"amp_rate"].drop(columns=["chg_rate", "tradephase"])
# Target: the percentage change.
y = stock["chg_rate"]
print("size:", X.shape, y.shape)
size: (2053, 9) (2053,)
# Predict the percentage change from the 9 feature columns with an MLP regressor.
# Cross-validation helper.
from sklearn.model_selection import cross_val_score
# MLP neural-network regressor.
from sklearn.neural_network import MLPRegressor
# 3-fold cross-validation on the raw, unscaled features.
scores=cross_val_score(MLPRegressor(random_state=38, max_iter=800), X, y, cv=3)
print("mean score:{:0.3f}".format( scores.mean()) )
# Why is the score below 0, and by such a margin?
# Because there is no preprocessing: the value ranges of the features differ enormously.
mean score:-430.876
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
# Compare the two ways of building the same pipeline.
# Pipeline: step names are given explicitly.
pipeline=Pipeline([ ("scaler",StandardScaler()),
("mlp", MLPRegressor(random_state=38, max_iter=800) )])
# make_pipeline: step names are generated automatically.
pipe = make_pipeline(StandardScaler(), MLPRegressor(random_state=38, max_iter=800))
# make_pipeline reads more concisely.
print(pipeline.steps)
print(pipe.steps)
[('scaler', StandardScaler()), ('mlp', MLPRegressor(max_iter=800, random_state=38))] [('standardscaler', StandardScaler()), ('mlpregressor', MLPRegressor(max_iter=800, random_state=38))]
# Run cross-validation.
# This time the score is computed on the pipeline `pipe`, which means that in
# every cross-validation fold the data is first preprocessed with StandardScaler
# and only then is the MLP regressor fitted.
scores = cross_val_score(pipe, X, y, cv=3)
print( "mean score:{:0.3f}".format(scores.mean()) )
# Still not a great score, but at least it looks sane now.
mean score:0.720
# Try a random-forest model to select features from the data set.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
# Three steps: scale -> keep the features the random forest selects -> MLP regressor.
pipe = make_pipeline(StandardScaler(),
SelectFromModel(RandomForestRegressor(random_state=38)),
MLPRegressor(random_state=38, max_iter=800))
# Show the auto-generated step names.
pipe.steps
[('standardscaler', StandardScaler()), ('selectfrommodel', SelectFromModel(estimator=RandomForestRegressor(random_state=38))), ('mlpregressor', MLPRegressor(max_iter=800, random_state=38))]
# Run 3-fold cross-validation on the three-step pipeline.
scores = cross_val_score(pipe, X, y, cv=3)
print( "mean score:{:0.3f}".format(scores.mean()) )
# The score changes only slightly.
/home/wangjl/anaconda3/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (800) reached and the optimization hasn't converged yet. ConvergenceWarning,
mean score:0.725
# Inspect an attribute of an individual step, e.g. which features did step 2 keep?
pipe.fit(X, y)
# Boolean mask over the input features: True means the feature was selected.
mask = pipe.named_steps["selectfrommodel"].get_support()
mask # in the recorded run only the last 2 features were passed on to the model
array([False, False, False, False, False, False, False, True, True])
# True relation: chg_rate is (up to rounding) change / prev_close * 100.
# The printed differences show how closely the two agree.
print( X["change"] / X["prev_close"]*100 - stock['chg_rate'] )
0 0.003620 1 0.000710 2 -0.004706 3 -0.003187 4 0.004921 ... 2048 -0.001634 2049 -0.004177 2050 -0.004074 2051 0.000328 2052 -0.002552 Length: 2053, dtype: float64
下面的代码要复用上文得到的数据(X、y),因此需要在同一会话中接着运行。
# Goal: find out whether the random forest or the MLP neural network does better.
# The MLP needs preprocessed data, so scaling is part of the search as well.
from sklearn.model_selection import GridSearchCV
# Parameter grid: each dict is one family of candidates.
# Setting a step to None leaves that step out.
params=[
{"reg":[MLPRegressor(random_state=38, max_iter=1000)],
"scaler":[StandardScaler(), None]},
{"reg":[RandomForestRegressor(random_state=38)],
"scaler":[None]}
]
# The estimators given here only define the step names "scaler" and "reg";
# the grid replaces them with the candidates above.
pipe = Pipeline([("scaler", StandardScaler()), ("reg", MLPRegressor())])
grid=GridSearchCV(pipe, params, cv=3)
# Fit on the data.
grid.fit(X, y)
# Report the winning combination and its cross-validation score.
print( "best model:{}".format( grid.best_params_) )
print( "best score:{:0.2f}".format(grid.best_score_) )
best model:{'reg': RandomForestRegressor(random_state=38), 'scaler': None} best score:0.88
# Extend the grid with the MLP hidden-layer layout and the number of estimators
# in the random forest.
params= [
{"reg":[MLPRegressor(random_state=38, max_iter=1000)],
"scaler":[StandardScaler(), None],
"reg__hidden_layer_sizes":[(50,), (100,), (100,100)]},
{"reg": [RandomForestRegressor(random_state=38)],
"scaler":[None],
"reg__n_estimators":[10, 50, 100]}
]
# Build the pipeline (the estimators only define the step names).
pipe = Pipeline([("scaler", StandardScaler()),
("reg", MLPRegressor())])
# Build the grid search.
grid = GridSearchCV(pipe, params, cv=3)
# Fit the grid.
grid.fit(X, y)
# Report the winning combination and its cross-validation score.
print( "best model:{}".format( grid.best_params_) )
print( "best score:{:0.2f}".format(grid.best_score_) )
# The result flips: in the recorded run the best model is the MLP again.
best model:{'reg': MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=38), 'reg__hidden_layer_sizes': (100, 100), 'scaler': StandardScaler()} best score:0.90
grid.score(X, y) # score on the training data, i.e. the same data the grid was fitted on
0.98843420836218