# Build a small demo table with one numeric column and one string column.
import pandas as pd

fruits = pd.DataFrame(
    {
        "value": list(range(5, 10)),
        "type": ["waterMelon", "banana", "orange", "apple", "grape"],
    }
)
# `display` is not imported in this file; presumably the notebook (IPython)
# rich-output helper -- confirm when running outside a notebook.
display(fruits)


# One-hot encode the string columns of the table; numeric columns are passed
# through unchanged.
# dtype="uint8" pins the indicator columns to 0/1 integers. That was the
# implicit default before pandas 2.0; newer versions emit True/False unless a
# dtype is given, which would no longer be the 0-1 matrix these notes describe.
fruits_dum = pd.get_dummies(fruits, dtype="uint8")
fruits_dum

# 数值列并没有变化，分类列变成了0-1矩阵


# Make get_dummies treat the numeric column as a categorical one too.
# Casting to str first is marked optional-but-recommended in the original note;
# the explicit columns=[...] below already names the column to encode.
fruits["value"] = fruits["value"].astype(str)
# Encode only the "value" column. dtype="uint8" keeps the 0/1 output on
# pandas >= 2.0, where the default indicator dtype became bool.
pd.get_dummies(fruits, columns=["value"], dtype="uint8")


# 产生数据
import numpy as np
import matplotlib.pyplot as plt
# Seeded generator so the toy dataset is reproducible.
rnd = np.random.RandomState(38)
# 50 sample positions drawn uniformly from [-5, 5).
x = rnd.uniform(low=-5, high=5, size=50)

# Noise-free signal: a cosine wave riding on a linear trend.
y_no_noise = x + np.cos(6 * x)
# scikit-learn expects a 2-D feature matrix: one column, one row per sample.
X = x.reshape(-1, 1)
# Add standard-normal noise, then halve the result.
y = (y_no_noise + rnd.normal(size=x.shape[0])) / 2

# Scatter the noisy samples as red circles (format "o" draws markers only).
plt.plot(X, y, "o", color="r")
plt.show()


# Fit an MLP regressor and a KNN regressor on the raw single-feature data and
# compare their prediction curves.

from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

# Evenly spaced evaluation grid over [-5, 5), shaped as a single feature column.
line = np.linspace(-5, 5, num=1000, endpoint=False).reshape(-1, 1)
mlpr = MLPRegressor().fit(X, y)
knr = KNeighborsRegressor().fit(X, y)

# One prediction curve per model, then the training points on top.
for curve_label, fitted in (("MLP", mlpr), ("KNN", knr)):
    plt.plot(line, fitted.predict(line), label=curve_label)
plt.plot(X, y, "o", c="r")
plt.legend(loc="best")
plt.show()

# 肉眼可见，MLP更平滑，而KNN覆盖更多的点。
# 该采用哪一个呢？


# Binning (also called discretization) of the single feature.

# 11 evenly spaced edges from -5 to 5, i.e. 10 unit-wide bins.
bins = np.linspace(-5, 5, num=11)
# Map every sample to the index of the bin it falls into.
target_bin = np.digitize(X, bins)
print("装箱范围:\n",bins)

# Show the first ten samples next to the bin index assigned to each.
first_ten = slice(None, 10)
print("前10个数据点的特征值:\n", X[first_ten])
print("前10个数据点的箱子:\n", target_bin[first_ten])

装箱范围:
 [-5. -4. -3. -2. -1.  0.  1.  2.  3.  4.  5.]
前10个数据点的特征值:
 [[-1.1522688 ]
 [ 3.59707847]
 [ 4.44199636]
 [ 2.02824894]
 [ 1.33634097]
 [ 1.05961282]
 [-2.99873157]
 [-1.12612112]
 [-2.41016836]
 [-4.25392719]]
前10个数据点的箱子:
 [[ 4]
 [ 9]
 [10]
 [ 8]
 [ 7]
 [ 7]
 [ 3]
 [ 4]
 [ 3]
 [ 1]]


# One-hot encode the bin indices with scikit-learn's OneHotEncoder, which does
# much the same job as pandas.get_dummies.
# NOTE(review): the original note said OneHotEncoder only accepts integer
# categories; that reflects older scikit-learn releases -- confirm against the
# installed version before relying on it.

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old spelling was removed in 1.4. Try the current name first and fall back
# for older installations. Dense output is required because the result is
# printed and later combined with np.hstack.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)
onehot.fit(target_bin)
X_in_bin = onehot.transform(target_bin)

print("装箱前的数据形态:\n", target_bin.shape )  # 50 rows, 1 column
print("装箱后的数据形态:\n", X_in_bin.shape )  # 50 rows, one column per occupied bin
print("装箱后的前10个数据点:\n", X_in_bin[:10])

装箱前的数据形态:
 (50, 1)
装箱后的数据形态:
 (50, 10)
装箱后的前10个数据点:
 [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


# Fit MLP and KNN again, this time on the one-hot encoded bins.

# Encode the evaluation grid with the already-fitted encoder.
line2 = onehot.transform(np.digitize(line, bins=bins))

mlpr2 = MLPRegressor(max_iter=500).fit(X_in_bin, y)
knr2 = KNeighborsRegressor().fit(X_in_bin, y)

# Curves are drawn against the raw grid but predicted from its encoded form.
for curve_label, fitted in (("MLP", mlpr2), ("KNN", knr2)):
    plt.plot(line, fitted.predict(line2), label=curve_label)
plt.plot(X, y, "o", c="r")
plt.legend(loc="best")
plt.title("OneHotEncode")
plt.show()

# 在x>0部分，2个拟合几乎完全重合。
# 对比 OneHotEncode 编码前，MLP回归模型变得更复杂；而KNN模型变得更简单。


# 交叉式特征（Interaction Features） 就是在原始特征中添加交互项。

# Quick check of np.hstack(): two flat sequences are joined end to end into a
# single 1-D array.
import numpy as np

arr1 = [1, 2, 3, 4]
arr2 = [10, 30, 40, 50]
arr3 = np.hstack((arr1, arr2))
arr3

array([ 1,  2,  3,  4, 10, 30, 40, 50])


# Stack the raw feature next to its one-hot encoded bins.
# The toy data is regenerated with the same seed so this cell stands on its own.
import numpy as np
import matplotlib.pyplot as plt
rnd = np.random.RandomState(38)
x = rnd.uniform(-5, 5, size=50)

# Noise-free signal, then the noisy target (standard-normal noise, halved).
y_no_noise = np.cos(6*x) + x
X = x.reshape(-1, 1)
y = (y_no_noise + rnd.normal(size=len(x)))/2
print("X\n",X[:3])


# Binning (discretization): 11 edges from -5 to 5 give 10 unit-wide bins.
bins = np.linspace(-5, 5, 11)
# Index of the bin each sample falls into.
target_bin = np.digitize(X, bins=bins)
print("\ntarget_bin\n",target_bin[:3])

# One-hot encode the bin indices.
from sklearn.preprocessing import OneHotEncoder
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the current keyword and fall back for older releases. Dense output
# is needed because the result goes straight into np.hstack below.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)
onehot.fit(target_bin)
X_in_bin = onehot.transform(target_bin)
print("\nX_in_bin\n", X_in_bin[:3])

# Column-wise concatenation: the raw value first, then the encoded columns.
X_stack = np.hstack([X, X_in_bin])
print(X.shape, X_stack.shape)  # the raw column plus the one-hot columns

X
 [[-1.1522688 ]
 [ 3.59707847]
 [ 4.44199636]]

target_bin
 [[ 4]
 [ 9]
 [10]]

X_in_bin
 [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
(50, 1) (50, 11)


# Fit MLP and KNN on the stacked (raw value + one-hot bin) features and plot
# the MLP prediction with the bin boundaries.

from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

# Evaluation grid over [-5, 5), its one-hot encoding, and the two stacked.
line = np.linspace(-5, 5, num=1000, endpoint=False).reshape(-1, 1)
line2 = onehot.transform(np.digitize(line, bins=bins))
line_stack = np.hstack((line, line2))

# Train both models on the stacked training features.
mlpr3 = MLPRegressor().fit(X_stack, y)
knr3 = KNeighborsRegressor().fit(X_stack, y)

# Plot: only the MLP curve is drawn; the KNN curve is disabled.
plt.plot(line, mlpr3.predict(line_stack), linewidth=3, label="MLP")
#plt.plot(line, knr3.predict(line_stack), label="KNN")
plt.ylim(-4, 4)
# Dotted vertical line at every bin edge.
for edge in bins:
    plt.plot((edge, edge), (-5, 5), ":", c="k")

plt.legend(loc="best")
plt.plot(X, y, "o", c="r")

plt.title("After adding interaction")
plt.show()

# 每个数据所在的箱体中，MLP增加了斜率。
# 模型复杂度提高了。


# 每个箱体中的斜率基本一致了，但是这不是我们需要的。我们希望每个箱体都有各自的截距和斜率。
# Per-bin interaction terms: the raw value multiplied into each bin indicator,
# so it is non-zero only in the column of the sample's own bin.
value_per_bin = X * X_in_bin
# Indicators first, then the products: two columns per bin.
X_multi = np.hstack((X_in_bin, value_per_bin))
print(X.shape, X_in_bin.shape)

print(X_multi.shape)
print(X_multi[0])

(50, 1) (50, 10)
(50, 20)
[ 0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.        -0.        -0.
 -0.        -1.1522688 -0.        -0.        -0.        -0.
 -0.        -0.       ]


# Give every bin its own slope: train on X_multi (bin indicators plus
# value-times-indicator products).

mlpr4 = MLPRegressor().fit(X_multi, y)
knr4 = KNeighborsRegressor().fit(X_multi, y)

# Build the same feature layout for the evaluation grid.
line_multi = np.hstack((line2, line * line2))

# Plot the MLP prediction. The KNN line below is disabled and still refers to
# the previous model and features (knr3 / line_stack).
plt.plot(line, mlpr4.predict(line_multi), linewidth=3, label="MLP")
#plt.plot(line, knr3.predict(line_stack), label="KNN")
plt.ylim(-4, 4)
# Dotted grey vertical line at every bin edge.
for edge in bins:
    plt.plot((edge, edge), (-5, 5), ":", c="grey")

plt.plot(X, y, "o", c="r")

plt.legend(loc="lower right")
plt.title("Adding multiply")
plt.show()

# 线性模型在低维度表现不好，在高维度表现良好。所以可以升维后使用glm模型。


# Polynomial feature expansion of the single input column.
from sklearn.preprocessing import PolynomialFeatures

# Powers 1..20 of the feature; include_bias=False drops the constant column.
poly = PolynomialFeatures(degree=20, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)

# With one input column, degree=20 yields 20 output columns.
print(X.shape, X_poly.shape)

(50, 1) (50, 20)


# Sanity check of the expansion: column k of X_poly should be the k-th power
# of the original value (1st power first, 2nd power second, and so on).
first_sample = X[0]
squared = np.power(first_sample, 2)
higher_powers = np.power(first_sample, [3, 4, 5, 6])
print(first_sample, squared, higher_powers)
print(X_poly[0])

print("\nfeature_names:\n", poly.get_feature_names_out())

[-1.1522688] [1.3277234] [-1.52989425  1.76284942 -2.0312764   2.34057643]
[ -1.1522688    1.3277234   -1.52989425   1.76284942  -2.0312764
   2.34057643  -2.6969732    3.10763809  -3.58083443   4.1260838
  -4.75435765   5.47829801  -6.3124719    7.27366446  -8.38121665
   9.65741449 -11.12793745  12.82237519 -14.77482293  17.02456756]

feature_names:
 ['x0' 'x0^2' 'x0^3' 'x0^4' 'x0^5' 'x0^6' 'x0^7' 'x0^8' 'x0^9' 'x0^10'
 'x0^11' 'x0^12' 'x0^13' 'x0^14' 'x0^15' 'x0^16' 'x0^17' 'x0^18' 'x0^19'
 'x0^20']


# Ordinary linear regression on the polynomial features.
from sklearn.linear_model import LinearRegression
lr_poly = LinearRegression().fit(X_poly, y)

# Expand the evaluation grid with the same fitted transformer.
line_poly = poly.transform(line)

# Plot the fitted curve, clipped to a 0.5 margin around the data range.
x_low, x_high = np.min(X) - 0.5, np.max(X) + 0.5
y_low, y_high = np.min(y) - 0.5, np.max(y) + 0.5
plt.plot(line, lr_poly.predict(line_poly), label="Linear Regressor")
plt.xlim(x_low, x_high)
plt.ylim(y_low, y_high)
plt.plot(X, y, "o", c="r")
plt.legend(loc="lower right")
plt.show()

# 这个线性拟合可不是直线！低维数据集，线性拟合通常欠拟合，但是进行多项式扩展后，可以一定程度解决欠拟合问题。

# 除了将特征转为多项式的方法之外，我们还可以用类似正弦函数 sin(), 对数函数 log(), 或指数函数 exp() 等来进行相似的操作。


# 导入某天中国A股全部股票交易信息 
# 上交所: http://www.sse.com.cn/market/price/report/
# 深交所: http://www.szse.cn/market/trend/index.html


# (1) Fetch the raw JSONP response from the SSE quote service.
import time
# Current time in milliseconds, sent as the `_` query parameter.
timestamp = str(round(time.time()*1000))
# Row window requested from the service via `begin` / `end`.
begin = str(0)
end = str(2060)

# Source page: http://www.sse.com.cn/market/price/report/
url = "http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity?callback=jQuery111208015895779126387_1560941576071&select=date%2Ccode%2Cname%2Copen%2Chigh%2Clow%2Clast%2Cprev_close%2Cchg_rate%2Cvolume%2Camount%2Ctradephase%2Cchange%2Camp_rate%2Ccpxxsubtype%2Ccpxxprodusta&order=&begin="+begin+"&end="+end+"&_="+timestamp
print("url=", url)

headers = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
  'Referer': 'http://www.sse.com.cn/market/price/report/'
}

import requests
# A timeout keeps the call from blocking forever on an unresponsive server;
# raise_for_status() surfaces HTTP errors here instead of as a confusing parse
# failure further down.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
# Response body as text: a JSONP wrapper of the form callback({...}).
rs4 = r.text

# (2) Parse the JSON payload into Python objects.
import json
import re
# Strip the JSONP wrapper: everything up to the first "(" and the closing ")".
payload = re.sub(r"^[^(]*\(", "", rs4.strip())
payload = re.sub(r"\)\s*;?\s*$", "", payload)
# json.loads instead of eval(): the text comes from the network and must never
# be executed as Python code.
rs = json.loads(payload)
print( len(rs['list']) )  # number of rows returned (2050 in the recorded run)

# (3) Column names, in the same order as the `select` parameter of the request.
print('date code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate cpxxsubtype cpxxprodusta')
titles = 'date,code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate,cpxxsubtype,cpxxprodusta'.split(",")

# Data rows: one list per stock.
alists = rs['list']
print(alists[0])

# Print the first row field by field next to its column name.
alist = alists[0]
for i, title in enumerate(titles):
    print(i, title, "=", alist[i])

import numpy as np
# Rows mix ints, floats and strings, so NumPy coerces the whole array to a
# single string dtype; later cells convert columns back with astype(float).
alists2 = np.array(alists)
print(alists2.shape)  # (rows, 16): one column per entry in `titles`

import pandas as pd
stock = pd.DataFrame(alists2, columns=titles)
stock.head()

# 涨跌幅 chg_rate(%)
# 振幅 amp_rate

url= http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity?callback=jQuery111208015895779126387_1560941576071&select=date%2Ccode%2Cname%2Copen%2Chigh%2Clow%2Clast%2Cprev_close%2Cchg_rate%2Cvolume%2Camount%2Ctradephase%2Cchange%2Camp_rate%2Ccpxxsubtype%2Ccpxxprodusta&order=&begin=0&end=2060&_=1636988436812
2050
date code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate cpxxsubtype cpxxprodusta
[20211115, '600000', '浦发银行', 8.66, 8.75, 8.65, 8.75, 8.65, 1.16, 27625801, 240602624, 'E110', 0.1, 1.16, 'ASH', '   D  F  N          ']
0 date = 20211115
1 code = 600000
2 name = 浦发银行
3 open = 8.66
4 high = 8.75
5 low = 8.65
6 last = 8.75
7 prev_close = 8.65
8 chg_rate = 1.16
9 volume = 27625801
10 amount = 240602624
11 tradephase = E110
12 change = 0.1
13 amp_rate = 1.16
14 cpxxsubtype = ASH
15 cpxxprodusta =    D  F  N          
(2050, 16)


# Show the distinct values of the two categorical flag columns.
for flag_column in ("cpxxsubtype", "cpxxprodusta"):
    print(stock[flag_column].unique())

['ASH' 'KSH' 'BSH']
['   D  F  N          ' '   S  F  N          ' '   D  FU N          '
 'N  D  F  N          ' '   D  F WN          ' '   D  FUWN          '
 '   DY F WN          ']


# Print min and max of the change rate, then of the amplitude rate.
# Both columns are converted to float before taking the extremes.
for rate_column in ("chg_rate", "amp_rate"):
    tmp = stock[rate_column].astype(float)
    print(tmp.min(), "\t", tmp.max())

-10.31 	 71.16
0.0 	 37.82


# Regression target: the change-rate column converted to floats.
y = stock.loc[:, "chg_rate"].astype(float)
print(y.shape)
# First target value, echoed as the cell result.
y[0]

(2050,)

1.16


# Feature matrix: the nine numeric quote columns, converted to floats.
# NOTE(review): `change` and `prev_close` together appear to determine
# chg_rate (0.1 / 8.65 is about 1.16% in the first row), so the target is
# largely contained in these features -- confirm before trusting the scores.
feature_columns = [
    "open", "high", "low", "last", "prev_close",
    "volume", "amount", "change", "amp_rate",
]
features = stock[feature_columns].astype(float)
X = features.to_numpy()

print(X.shape)
X[:1]

(2050, 9)

array([[8.66000000e+00, 8.75000000e+00, 8.65000000e+00, 8.75000000e+00,
        8.65000000e+00, 2.76258010e+07, 2.40602624e+08, 1.00000000e-01,
        1.16000000e+00]])


# The feature columns differ widely in scale, so standardise before training.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=62)

# Preprocessing: fit the scaler on the training split only, then apply the
# same transformation to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Two hidden layers of 100 units each, with an explicit alpha.
from sklearn.neural_network import MLPRegressor
mlpr = MLPRegressor(
    random_state=62,
    hidden_layer_sizes=[100, 100],
    alpha=0.001,
    max_iter=500,
)
mlpr.fit(X_train_scaled, y_train)

# Model score on each split.
train_score = mlpr.score(X_train_scaled, y_train)
test_score = mlpr.score(X_test_scaled, y_test)
print("training set:{:0.3f}".format(train_score))
print("tesing set:{:0.3f}".format(test_score))

# 打分挺高了

training set:0.992
tesing set:0.982


# List the stocks near the daily price limit: change rate >= 9 (gainers) and
# <= -9 (losers). The original heading says 10%, but the threshold used is 9.
wanted_columns = ["date", "code", "name", "last", "prev_close", "chg_rate"]
# Convert the numeric columns to float before sorting. In the recorded output
# the rows were ordered as text (9.99 ranked ahead of 71.16), which is what
# happens when the sort keys are strings.
wanted = stock.loc[:, wanted_columns].astype(
    {"last": float, "prev_close": float, "chg_rate": float}
)
# Largest change first; ties broken by the lower last price.
sort_spec = {"by": ["chg_rate", "last"], "ascending": [False, True]}
print("涨停榜\n", wanted[y>=9].sort_values(**sort_spec))

print("\n跌停榜\n", wanted[y<= -9].sort_values(**sort_spec))

涨停榜
           date    code    name    last prev_close chg_rate
515   20211115  600640   新国脉     14.86      13.51     9.99
1110  20211115  603138    海量数据   18.71      17.01     9.99
1612  20211115  605300    佳禾食品   21.13      19.21     9.99
470   20211115  600587    新华医疗    22.8      20.73     9.99
1617  20211115  605333    沪光股份   24.77      22.52     9.99
1604  20211115  605277    新亚电子    37.1      33.73     9.99
1056  20211115  603050    科林电气   19.28      17.53     9.98
1176  20211115  603286    日盈电子   19.51      17.74     9.98
1453  20211115  603869    新智认知    9.92       9.02     9.98
312   20211115  600379    宝光股份   12.46      11.33     9.97
1312  20211115  603603    博天环境     7.4       6.73     9.96
175   20211115  600215    长春经开    9.16       8.33     9.96
1231  20211115  603377    东方时尚    9.28       8.44     9.95
1762  20211115  688185   康希诺     272.8      248.3     9.87
1714  20211115  688097    博众精工    48.4      44.33     9.18
1777  20211115  688212  N澳华      38.51       22.5    71.16
1720  20211115  688105   N诺唯赞    85.35       55.0    55.18
1717  20211115  688100    威胜信息    29.5       25.3     16.6
1954  20211115  688668    鼎通科技    66.0       58.0    13.79
1957  20211115  688676    金盘科技   35.42       31.4     12.8
1830  20211115  688330   宏力达    135.88      122.5    10.92
1173  20211115  603278    大业股份    9.74       8.85    10.06
77    20211115  600101    明星电力     6.9       6.27    10.05
234   20211115  600285    羚锐制药   11.84      10.76    10.04
1254  20211115  603458    勘设股份   11.95      10.86    10.04
1327  20211115  603626    科森科技   14.58      13.25    10.04
359   20211115  600452    涪陵电力   18.87      17.15    10.03
570   20211115  600706    曲江文旅    7.02       6.38    10.03
1085  20211115  603099   长白山     11.75      10.68    10.02
1285  20211115  603556    海兴电力   13.95      12.68    10.02
1373  20211115  603703    盛洋科技   17.35      15.77    10.02
1146  20211115  603213    镇洋发展   10.44       9.49    10.01
1086  20211115  603100    川仪股份   19.34      17.58    10.01
1593  20211115  605198    德利股份   19.46      17.69    10.01
839   20211115  601126    四方股份   20.23      18.39    10.01
1165  20211115  603258    电魂网络   30.21      27.46    10.01
1601  20211115  605259    绿田机械   35.72      32.47    10.01
1472  20211115  603897    长城科技   52.99      48.17    10.01
1321  20211115  603613    国联股份   120.4     109.45     10.0
1573  20211115  605133    嵘泰股份   22.66       20.6     10.0
738   20211115  600892    大晟文化    3.74        3.4     10.0
1043  20211115  603031   安德利     39.82       36.2     10.0
321   20211115  600389    江山股份   42.24       38.4     10.0
1384  20211115  603719    良品铺子   46.07      41.88     10.0
196   20211115  600237    铜峰电子    6.93        6.3     10.0
1560  20211115  605089   味知香     77.25      70.23     10.0

跌停榜
           date    code   name   last prev_close chg_rate
1505  20211115  603959   百利科技  18.12      20.13    -9.99
1499  20211115  603948   建业股份  35.03      38.92    -9.99
1264  20211115  603505   金石资源  38.84      43.15    -9.99
1788  20211115  688233   神工股份   81.5      90.55    -9.99
120   20211115  600151   航天机电  11.84      13.15    -9.96
550   20211115  600683   京投发展    6.8       7.55    -9.93
542   20211115  600673  东阳光     9.52      10.57    -9.93
865   20211115  601218   吉鑫科技   7.03       7.79    -9.76
83    20211115  600108   亚盛集团   3.56       3.94    -9.64
1858  20211115  688390  固德威    422.5      467.0    -9.53
1401  20211115  603766   隆鑫通用   5.71       6.31    -9.51
1842  20211115  688359   三孚新科   58.0      64.67   -10.31
139   20211115  600172   黄河旋风  11.04      12.27   -10.02
72    20211115  600096  云天化    23.55      26.17   -10.01
1078  20211115  603088   宁波精达  13.77       15.3    -10.0
1488  20211115  603922  金鸿顺     23.4       26.0    -10.0


from sklearn.feature_selection import SelectPercentile, f_regression
# Univariate feature selection keeping the top 50% of the features.
# score_func must be given explicitly: the default (f_classif) is an ANOVA
# test for classification labels, while the target here is continuous, so the
# regression F-test is the appropriate scorer.
# NOTE: recorded outputs produced with the old default may differ from what
# this cell now prints.
select = SelectPercentile(score_func=f_regression, percentile=50)
select.fit(X_train_scaled, y_train)
X_train_selected = select.transform(X_train_scaled)

# Shapes before and after selection (about half of the columns are kept).
print(X_train_scaled.shape, X_train_selected.shape)

# Boolean mask of the retained features.
mask = select.get_support()
print(mask)

(1537, 9) (1537, 4)
[False False False False False  True  True  True  True]


# Visualise the selection mask as a single-row image.
import matplotlib.pyplot as plt
mask_row = mask.reshape(1, -1)
plt.matshow(mask_row, cmap=plt.cm.cool)
plt.xlabel("Featuers Selected")
# Selected (True) cells take the colour at the high end of the `cool` colormap.
plt.show()


# Retrain the neural network on the features kept by SelectPercentile.
X_test_selected = select.transform(X_test_scaled)

# Same architecture and alpha as the baseline model.
from sklearn.neural_network import MLPRegressor
mlpr_sp = MLPRegressor(
    random_state=62,
    hidden_layer_sizes=[100, 100],
    alpha=0.001,
    max_iter=500,
)
mlpr_sp.fit(X_train_selected, y_train)

# Model score on each split.
train_score = mlpr_sp.score(X_train_selected, y_train)
test_score = mlpr_sp.score(X_test_selected, y_test)
print("training set:{:0.3f}".format(train_score))
print("tesing set:{:0.3f}".format(test_score))

# 打分降低了。
# 说明我们的数据不包括噪音，去掉的都是有用的信息。

# 单一变量法进行特征筛选，不依赖于具体建模的算法。

training set:0.974
tesing set:0.943


# Model-based feature selection driven by a random forest.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# threshold="median" keeps the features whose importance reaches the median.
forest = RandomForestRegressor(n_estimators=100, random_state=38)
sfm = SelectFromModel(forest, threshold="median")

# Fit the selector on the scaled training data, then reduce the columns.
sfm.fit(X_train_scaled, y_train)
X_train_sfm = sfm.transform(X_train_scaled)

# Shapes before and after selection.
print(X_train_scaled.shape, X_train_sfm.shape)


# Boolean mask of the retained features.
mask_sfm = sfm.get_support()
print(mask_sfm)

(1537, 9) (1537, 5)
[ True False False False  True  True False  True  True]


# Visualise the SelectFromModel mask as a single-row image.
import matplotlib.pyplot as plt
mask_sfm_row = mask_sfm.reshape(1, -1)
plt.matshow(mask_sfm_row, cmap=plt.cm.cool)
plt.xlabel("Featuers Selected")
plt.show()

# 红色是保留的。随机森林法 和 单一变量法，选择的特征不同。


# Retrain the neural network on the features kept by SelectFromModel.
X_test_sfm = sfm.transform(X_test_scaled)

# Same architecture and alpha as the baseline model.
from sklearn.neural_network import MLPRegressor
mlpr_sfm = MLPRegressor(
    random_state=62,
    hidden_layer_sizes=[100, 100],
    alpha=0.001,
    max_iter=500,
)
mlpr_sfm.fit(X_train_sfm, y_train)

# Model score on each split.
train_score = mlpr_sfm.score(X_train_sfm, y_train)
test_score = mlpr_sfm.score(X_test_sfm, y_test)
print("training set:{:0.3f}".format(train_score))
print("tesing set:{:0.3f}".format(test_score))

# 打分比原始的降低了。但是比单一变量法略高。
# 说明我们的数据不包括噪音，去掉的都是有用的信息。

training set:0.989
tesing set:0.976


# Recursive feature elimination with a random forest, keeping 5 features.
from sklearn.feature_selection import RFE
rfe_forest = RandomForestRegressor(n_estimators=100, random_state=38)
rfe = RFE(rfe_forest, n_features_to_select=5)

rfe.fit(X_train_scaled, y_train)
# Boolean mask of the retained features (rebinds the earlier `mask`).
mask = rfe.get_support()
mask

array([ True, False,  True, False,  True, False, False,  True,  True])


# Visualise the RFE mask as a single-row image.
import matplotlib.pyplot as plt
rfe_mask_row = mask.reshape(1, -1)
plt.matshow(rfe_mask_row, cmap=plt.cm.cool)
plt.xlabel("Featuers Selected")
plt.show()


# Retrain the neural network on the features kept by RFE.
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)

# Same architecture and alpha as the baseline model.
from sklearn.neural_network import MLPRegressor
mlpr_rfe = MLPRegressor(
    random_state=62,
    hidden_layer_sizes=[100, 100],
    alpha=0.001,
    max_iter=500,
)
mlpr_rfe.fit(X_train_rfe, y_train)

# Model score on each split.
train_score = mlpr_rfe.score(X_train_rfe, y_train)
test_score = mlpr_rfe.score(X_test_rfe, y_test)
print("training set:{:0.3f}".format(train_score))
print("tesing set:{:0.3f}".format(test_score))

# 打分比原始的略高，最好的结果了。

training set:0.992
tesing set:0.986

	value	type_apple	type_banana	type_grape	type_orange	type_waterMelon
0	5	0	0	0	0	1
1	6	0	1	0	0	0
2	7	0	0	0	1	0
3	8	1	0	0	0	0
4	9	0	0	1	0	0

数据表达¶

使用哑变量转化类型特征¶

对数据进行装箱处理¶

OneHotEncoder¶

数据“升维”¶

交叉式特征（Interaction Features）¶

交叉项¶

多项式特征(Polynomial Features)¶

自动特征选择¶

单一变量法进行特征选择¶

使用 SelectPercentile 进行特征选择¶

基于模型的特征选择¶

迭代式特征选择¶

	type	value_5	value_6	value_7	value_8	value_9
0	waterMelon	1	0	0	0	0
1	banana	0	1	0	0	0
2	orange	0	0	1	0	0
3	apple	0	0	0	1	0
4	grape	0	0	0	0	1

	date	code	name	open	high	low	last	prev_close	chg_rate	volume	amount	tradephase	change	amp_rate	cpxxsubtype	cpxxprodusta
0	20211115	600000	浦发银行	8.66	8.75	8.65	8.75	8.65	1.16	27625801	240602624	E110	0.1	1.16	ASH	D F N
1	20211115	600004	白云机场	12.65	12.71	12.5	12.51	12.73	-1.73	13532530	170390064	E110	-0.22	1.65	ASH	D F N
2	20211115	600006	东风汽车	7.07	7.16	6.9	6.94	7.09	-2.12	32560993	227359342	E110	-0.15	3.67	ASH	D F N
3	20211115	600007	中国国贸	14.6	14.9	14.25	14.81	14.6	1.44	5293865	77362085	E110	0.21	4.45	ASH	D F N
4	20211115	600008	首创环保	3.18	3.22	3.15	3.21	3.19	0.63	79820965	254427365	E110	0.02	2.19	ASH	D F N