# 常规步骤：数据预处理，交叉验证模型评估模型，使用网格搜索找到最优参数。

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Synthetic data: 200 samples around 2 centres, cluster standard deviation 5.
X, y = make_blobs(n_samples=200, centers=2, cluster_std=5)

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=38)

# Preprocessing: learn the scaling statistics on the training split only,
# then apply that same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

# The recorded run printed: (150, 2) (50, 2)
print("shape of the datasets: ", X_train_scaled.shape, X_test_scaled.shape)

shape of the datasets:  (150, 2) (50, 2)


# Neural networks are a typical model family that needs data preprocessing.

# Overlay the original training set (default markers) and the preprocessed
# training set (black-edged triangles) on the same axes.
for points, style in (
    (X_train, {}),
    (X_train_scaled, {"marker": "^", "edgecolors": "k"}),
):
    plt.scatter(points[:, 0], points[:, 1], **style)

plt.title("training set & scaled training set")
plt.show()

# After preprocessing the points are packed much more tightly together.


# Fit a neural network and use a grid search to pick the best parameters.
from sklearn.model_selection import GridSearchCV

# Search space: 5 layer layouts x 5 alpha values = 25 combinations.
params = {
    "hidden_layer_sizes": [(50,), (100,), (100, 100), (50, 100), (100, 50)],
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
}
base_mlp = MLPClassifier(max_iter=3000, random_state=38)
grid = GridSearchCV(base_mlp, param_grid=params, cv=3)

# Fit on the already-scaled training data (author's note: takes about 60 s).
grid.fit(X_train_scaled, y_train)

# Report the cross-validation result and the held-out test score.
print("best score:{:0.3f}".format(grid.best_score_))
print("best params_:{}".format(grid.best_params_))
print("\ntest score:{}".format(grid.score(X_test_scaled, y_test)) )

# NOTE: this procedure is actually wrong.
#   The scaler was fitted on the whole of X_train, yet GridSearchCV receives
#   X_train_scaled and splits it again into an inner train part and a
#   validation part, choosing the parameters with the best validation score.
#   Outside, the MLP with those parameters is fitted on X_train_scaled and
#   scored on X_test_scaled.
#   Inside the search the scaler should be fitted on the inner train part
#   only, never on data that also contains the validation part.

best score:0.747
best params_:{'alpha': 1, 'hidden_layer_sizes': (100,)}

test score:0.76


# Preprocessing by hand for every split, once per parameter combination, is
# far too tedious.

# A Pipeline simplifies the form of this workflow.
from sklearn.pipeline import Pipeline

# Two stages on the assembly line: a StandardScaler for preprocessing and an
# MLP (multi-layer perceptron) capped at 1600 iterations.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("mlp", MLPClassifier(max_iter=1600, random_state=38)),
    ]
)

# Fit the whole pipeline on the raw training set.
pipeline.fit(X_train, y_train)

# Score on the raw test set. The original inline note said 0.880, but the
# recorded run printed 0.760.
test_accuracy = pipeline.score(X_test, y_test)
print("test score:{:0.3f}".format(test_accuracy))

test score:0.760


# Note: inside a pipeline each parameter name takes the step name as a
# prefix, joined with a double underscore "__".
params = {
    "mlp__hidden_layer_sizes": [(50,), (100,), (100, 100), (50, 100), (100, 50)],
    "mlp__alpha": [0.0001, 0.001, 0.01, 0.1],
}
grid = GridSearchCV(estimator=pipeline, param_grid=params, cv=3)

# Fit on the raw training data (author's note: takes about 60 s).
grid.fit(X_train, y_train)

# Report the cross-validation result and the held-out test score.
print("best score:{:0.3f}".format(grid.best_score_))
print("best params_:{}".format(grid.best_params_))
print("\ntest score:{}".format(grid.score(X_test, y_test)) )

best score:0.733
best params_:{'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (100,)}

test score:0.76


# Inspect the (name, estimator) steps registered in the pipeline.
pipeline.steps

[('scaler', StandardScaler()),
 ('mlp', MLPClassifier(max_iter=1600, random_state=38))]


# (1) Download the raw quote data (JSON wrapped in a jQuery callback).
import time

import requests

# Current time in milliseconds, sent as the `_` query parameter.
timestamp = str(round(time.time() * 1000))
# Requested row window; the author's earlier note mentions 1515 for `end`.
begin = str(0)
end = str(2060)

# Source page: http://www.sse.com.cn/market/price/report/
url = (
    "http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity?callback=jQuery111208015895779126387_1560941576071&select=date%2Ccode%2Cname%2Copen%2Chigh%2Clow%2Clast%2Cprev_close%2Cchg_rate%2Cvolume%2Camount%2Ctradephase%2Cchange%2Camp_rate%2Ccpxxsubtype%2Ccpxxprodusta&order=&begin="
    + begin
    + "&end="
    + end
    + "&_="
    + timestamp
)
print("url=", url)

headers = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
  'Referer': 'http://www.sse.com.cn/market/price/report/'
}

# Without a timeout requests can block forever on an unresponsive server.
r = requests.get(url, headers=headers, timeout=30)
# Fail here on an HTTP error instead of feeding an error page to the parser.
r.raise_for_status()

# Debugging aids noted by the author: r.status_code, r.headers['content-type']
# and r.encoding (which can be reassigned to change the decoding).
# r.json() is not usable here: per the author's note it only works when the
# content-type is JSON, so the body is kept as a plain string.
rs4 = r.text

# (2) Decode the JSONP response text into Python objects.
import json
import re

# The body is JSON wrapped in a callback call: `callbackName(<json>)`.
# Strip the wrapper whatever the callback name is, and tolerate surrounding
# whitespace or a trailing semicolon. If no wrapper is found, treat the
# text as plain JSON.
_jsonp = re.fullmatch(r"\s*[\w$.]+\((.*)\)\s*;?\s*", rs4, flags=re.DOTALL)

# SECURITY: the payload comes from the network, so it must never be passed
# to eval(), which would execute arbitrary code embedded in the response.
# json.loads only parses data, and it also understands true/false/null.
rs = json.loads(_jsonp.group(1) if _jsonp else rs4)
print( len(rs['list']) )  # the recorded run printed 2053

# (3) Name every column of a row.
# Fields requested via `select`: code,name,open,high,low,last,prev_close,
# chg_rate,volume,amount,tradephase,change,amp_rate
print('date code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate cpxxsubtype cpxxprodusta')
titles='date,code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate,cpxxsubtype,cpxxprodusta'.split(",")

# Row data: one list per stock (the author's earlier note mentions 1515 rows).
alists = rs['list']
alist = alists[0]
print(alist)

# Show the first row field by field next to its column name.
for i, title in enumerate(titles):
    print(i, title, "=", alist[i])

import numpy as np
import pandas as pd

# The rows mix ints, floats and strings (see the first row printed above),
# which is presumably why every numeric column needs an explicit cast below.
alists2 = np.array(alists)
print(alists2.shape)  # the recorded run printed (2053, 16)

stock = pd.DataFrame(alists2, columns=titles)

# Cast the numeric columns explicitly. np.float64 is used rather than
# np.number: np.number is an abstract scalar type, and NumPy has deprecated
# converting it to a dtype (it merely happened to resolve to float64).
# Each column is cast once; the original cast 'high' twice.
numeric_columns = [
    "amount", "volume",
    "chg_rate", "last", "amp_rate",
    "open", "high", "low",
    "prev_close", "change",
]
stock = stock.astype({column: np.float64 for column in numeric_columns})

stock['amount'] = stock['amount'] / 10000  # unit: 10,000 yuan ("万元")
stock['volume'] = stock['volume'] / 100  # unit: lots ("手")

stock.head()

# chg_rate: price change rate (%)
# amp_rate: amplitude

url= http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity?callback=jQuery111208015895779126387_1560941576071&select=date%2Ccode%2Cname%2Copen%2Chigh%2Clow%2Clast%2Cprev_close%2Cchg_rate%2Cvolume%2Camount%2Ctradephase%2Cchange%2Camp_rate%2Ccpxxsubtype%2Ccpxxprodusta&order=&begin=0&end=2060&_=1637155887391
2053
date code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate cpxxsubtype cpxxprodusta
[20211117, '600000', '浦发银行', 8.67, 8.72, 8.65, 8.65, 8.73, -0.92, 19301599, 167446824, 'E110', -0.08, 0.8, 'ASH', '   D  F  N          ']
0 date = 20211117
1 code = 600000
2 name = 浦发银行
3 open = 8.67
4 high = 8.72
5 low = 8.65
6 last = 8.65
7 prev_close = 8.73
8 chg_rate = -0.92
9 volume = 19301599
10 amount = 167446824
11 tradephase = E110
12 change = -0.08
13 amp_rate = 0.8
14 cpxxsubtype = ASH
15 cpxxprodusta =    D  F  N          
(2053, 16)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Features: every column from "open" through "amp_rate", minus the target
# ("chg_rate") and the "tradephase" code column.
X = stock.loc[:, "open":"amp_rate"].drop(columns=["chg_rate", "tradephase"])
# Target: the change rate.
y = stock["chg_rate"]

print("size:", X.shape, y.shape)

size: (2053, 9) (2053,)


# Predict the change rate from the 9 feature columns with an MLP
# (multi-layer perceptron) network.

# Cross-validation helper.
from sklearn.model_selection import cross_val_score
# MLP neural-network regressor.
from sklearn.neural_network import MLPRegressor

raw_mlp = MLPRegressor(random_state=38, max_iter=800)
scores = cross_val_score(raw_mlp, X, y, cv=3)
print("mean score:{:0.3f}".format( scores.mean()) )
# Why is this score below 0, and by so much?
# Because nothing was preprocessed: the value ranges of the individual
# features differ far too much.

mean score:-430.876


from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

# Compare the syntax of the two ways of building the same pipeline.
# Explicit form: every step gets a hand-picked name.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("mlp", MLPRegressor(random_state=38, max_iter=800)),
    ]
)
# make_pipeline looks more concise; the step names are generated for you.
pipe = make_pipeline(
    StandardScaler(),
    MLPRegressor(random_state=38, max_iter=800),
)
for built in (pipeline, pipe):
    print(built.steps)

[('scaler', StandardScaler()), ('mlp', MLPRegressor(max_iter=800, random_state=38))]
[('standardscaler', StandardScaler()), ('mlpregressor', MLPRegressor(max_iter=800, random_state=38))]


# Run the cross-validation.

# This time the score is computed on the pipeline `pipe`. In other words,
# within cross-validation every round first applies the StandardScaler
# preprocessing and then fits the MLP regressor.
scores = cross_val_score(estimator=pipe, X=X, y=y, cv=3)
mean_score = scores.mean()
print( "mean score:{:0.3f}".format(mean_score) )
# Still not a great score, but at least it looks sane now.

mean score:0.720


# Try a random-forest model to select features from the data set.

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# Scale, keep the features the forest-based selector picks, then fit the MLP.
forest_selector = SelectFromModel(RandomForestRegressor(random_state=38))
pipe = make_pipeline(
    StandardScaler(),
    forest_selector,
    MLPRegressor(random_state=38, max_iter=800),
)
pipe.steps

[('standardscaler', StandardScaler()),
 ('selectfrommodel',
  SelectFromModel(estimator=RandomForestRegressor(random_state=38))),
 ('mlpregressor', MLPRegressor(max_iter=800, random_state=38))]


# Run the cross-validation on the feature-selecting pipeline.

scores = cross_val_score(estimator=pipe, X=X, y=y, cv=3)
mean_score = scores.mean()
print( "mean score:{:0.3f}".format(mean_score) )

# The score changes slightly.

/home/wangjl/anaconda3/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (800) reached and the optimization hasn't converged yet.
  ConvergenceWarning,

mean score:0.725


# Inspect the attributes of an individual step, e.g. which features did the
# second step keep?
selector_step = pipe.fit(X, y).named_steps["selectfrommodel"]
mask = selector_step.get_support()
mask  # in the recorded run only the last 2 features reach the model

array([False, False, False, False, False, False, False,  True,  True])


# True relation: change / prev_close * 100 reproduces chg_rate; the recorded
# residuals are only a few thousandths in magnitude.
residual = X["change"] / X["prev_close"] * 100 - stock['chg_rate']
print( residual )

0       0.003620
1       0.000710
2      -0.004706
3      -0.003187
4       0.004921
          ...   
2048   -0.001634
2049   -0.004177
2050   -0.004074
2051    0.000328
2052   -0.002552
Length: 2053, dtype: float64


# Goal: find out whether a random forest or an MLP (multi-layer perceptron)
# works better. The MLP needs data preprocessing.

from sklearn.model_selection import GridSearchCV

# Candidate configurations: the MLP with and without scaling, and the
# random forest without scaling.
mlp_candidates = {
    "reg": [MLPRegressor(random_state=38, max_iter=1000)],
    "scaler": [StandardScaler(), None],
}
forest_candidates = {
    "reg": [RandomForestRegressor(random_state=38)],
    "scaler": [None],
}
params = [mlp_candidates, forest_candidates]

# The pipeline steps are placeholders; the grid swaps in the real ones.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("reg", MLPRegressor())])
grid = GridSearchCV(pipe, params, cv=3)

# Fit the data.
grid.fit(X, y)

# Report the winner.
print( "best model:{}".format( grid.best_params_) )
print( "best score:{:0.2f}".format(grid.best_score_) )

best model:{'reg': RandomForestRegressor(random_state=38), 'scaler': None}
best score:0.88


# Extend the parameter dictionaries with options for the MLP hidden layers
# and for the number of estimators in the random forest.
mlp_candidates = {
    "reg": [MLPRegressor(random_state=38, max_iter=1000)],
    "scaler": [StandardScaler(), None],
    "reg__hidden_layer_sizes": [(50,), (100,), (100, 100)],
}
forest_candidates = {
    "reg": [RandomForestRegressor(random_state=38)],
    "scaler": [None],
    "reg__n_estimators": [10, 50, 100],
}
params = [mlp_candidates, forest_candidates]

# Build the pipeline (its steps are placeholders replaced by the grid).
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("reg", MLPRegressor())])

# Build the grid search.
grid = GridSearchCV(pipe, params, cv=3)

# Fit the grid.
grid.fit(X, y)

# Report the winner.
print( "best model:{}".format( grid.best_params_) )
print( "best score:{:0.2f}".format(grid.best_score_) )

# The result flips: the best model is the multi-layer neural network again.

best model:{'reg': MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=38), 'reg__hidden_layer_sizes': (100, 100), 'scaler': StandardScaler()}
best score:0.90


# Score on the training set (the very data the grid was fitted on).
grid.score(X, y)

0.98843420836218

管道模型的概念与用法

基本概念

管道模型

使用管道模型进行网格搜索

使用管道模型对股票涨幅进行回归分析

下载数据

预处理和MLP模型的管道

添加特征选择步骤

管道进行模型选择和参数调优

模型选择

参数调优

	date	code	name	open	high	low	last	prev_close	chg_rate	volume	amount	tradephase	change	amp_rate	cpxxsubtype	cpxxprodusta
0	20211117	600000	浦发银行	8.67	8.72	8.65	8.65	8.73	-0.92	193015.99	16744.6824	E110	-0.08	0.80	ASH	D F N
1	20211117	600004	白云机场	12.38	12.52	12.30	12.40	12.39	0.08	82691.87	10249.4920	E110	0.01	1.78	ASH	D F N
2	20211117	600006	东风汽车	6.82	6.87	6.78	6.85	6.80	0.74	166517.32	11379.6372	E110	0.05	1.32	ASH	D F N
3	20211117	600007	中国国贸	14.56	14.61	14.33	14.39	14.62	-1.57	25977.16	3745.6189	E110	-0.23	1.92	ASH	D F N
4	20211117	600008	首创环保	3.15	3.18	3.14	3.17	3.15	0.63	614585.99	19445.9376	E110	0.02	1.27	ASH	D F N