Only the samples lying closest to the boundary determine where the boundary goes; these samples are called support vectors.
# Maximum Margin Separating Hyperplane: the separating hyperplane whose distance (margin) to the support vectors on either side is as large as possible.
import numpy as np
import matplotlib.pyplot as plt
# Generate toy data: 50 points, 2 classes
from sklearn.datasets import make_blobs
X,y= make_blobs(n_samples=50, centers=2, random_state=6)
# Import the support vector machine module
from sklearn import svm
clf=svm.SVC(kernel="linear", C=1000)
clf.fit(X, y)
print("score:", clf.score(X, y))
# Visualization
plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired)
# Get the current axes and their limits
ax=plt.gca()
xlim=ax.get_xlim()
ylim=ax.get_ylim()
# Generate two evenly spaced sequences
xx=np.linspace(xlim[0], xlim[1], 30)
yy=np.linspace(ylim[0], ylim[1], 30)
YY, XX=np.meshgrid(yy, xx)
xy=np.vstack([XX.ravel(), YY.ravel()]).T  # ravel() flattens; vstack stacks the two arrays into 2 rows; transposing gives 2 columns (one grid point per row)
Z=clf.decision_function(xy).reshape(XX.shape)
# Draw the decision boundary and the margins
ax.contour(XX, YY, Z, colors="k", levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100, linewidth=2, facecolors="none", edgecolors="k")  # hollow circles marking the support vectors; without edgecolors they would be invisible
plt.title("SVM: kernel='linear'")
plt.show()
score: 1.0
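With a linear kernel the fitted model exposes the hyperplane directly. A minimal sketch (assuming it runs right after the cell above, so `clf` is still the linear-kernel model) that reads coef_ and intercept_ and computes the margin width 2/||w||; the dashed contours drawn at levels=[-1, 0, 1] above are exactly these margin lines.
# Inspect the fitted linear-kernel SVC: decision function f(x) = w.x + b
w = clf.coef_[0]        # weight vector w (available only for kernel="linear")
b = clf.intercept_[0]   # bias term b
print("w =", w, " b =", b)
# The margins are the lines f(x) = +1 and f(x) = -1, so the margin width is 2 / ||w||
print("margin width:", 2 / np.linalg.norm(w))
# Support vectors lie on the margins, so their decision values are close to +/-1
print("decision values of the support vectors:", clf.decision_function(clf.support_vectors_))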
clf=svm.SVC(kernel="rbf", C=1000)
clf.fit(X, y)
print("score:", clf.score(X, y))
# Plot the data points
plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired)
# Get the current axes and their limits
ax=plt.gca()
xlim=ax.get_xlim()
ylim=ax.get_ylim()
# Generate two evenly spaced sequences
xx=np.linspace(xlim[0], xlim[1], 30)
yy=np.linspace(ylim[0], ylim[1], 30)
YY, XX=np.meshgrid(yy, xx)
xy=np.vstack([XX.ravel(), YY.ravel()]).T  # ravel() flattens; vstack stacks the two arrays into 2 rows; transposing gives 2 columns (one grid point per row)
Z=clf.decision_function(xy).reshape(XX.shape)
# Draw the decision boundary and the margins
ax.contour(XX, YY, Z, colors="k", levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100, linewidth=1, facecolors="none", edgecolors="k")  # hollow circles marking the support vectors; without edgecolors they would be invisible
plt.title("SVM: kernel='rbf'")
plt.show()
# With the RBF kernel the decision boundary is a curve, and the two margin boundaries are curves as well.
# The similarity measure changes to K_rbf(x1, x2) = exp(-gamma * ||x1 - x2||^2),
# where ||x1 - x2|| is the Euclidean distance between the two points and gamma controls the width of the RBF kernel.
score: 1.0
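To sanity-check the RBF formula above, a small sketch comparing a manual computation of exp(-gamma * ||x1 - x2||^2) with sklearn's rbf_kernel; the gamma value here is purely illustrative, not the one the SVC above used (its default is gamma="scale").
from sklearn.metrics.pairwise import rbf_kernel
x1 = X[0].reshape(1, -1)
x2 = X[1].reshape(1, -1)
gamma = 0.5  # illustrative value
# Manual RBF kernel: K(x1, x2) = exp(-gamma * ||x1 - x2||^2)
manual = np.exp(-gamma * np.sum((x1 - x2) ** 2))
# The same formula as implemented in sklearn
from_sklearn = rbf_kernel(x1, x2, gamma=gamma)[0, 0]
print(manual, from_sklearn)  # the two numbers should match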
import numpy as np
import matplotlib.pyplot as plt
# Import the support vector machine module
from sklearn import svm
# Define a helper that builds a mesh grid covering the data range
def make_meshgrid(x, y, h=0.02):
    x_min, x_max = x.min()-1, x.max()+1
    y_min, y_max = y.min()-1, y.max()+1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy
# Define a helper that draws filled contours of the classifier's predictions
def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
# Load the wine dataset
from sklearn.datasets import load_wine
wine=load_wine()
# Use only the first 2 features so the result can be shown in a 2D plot
X=wine.data[:, :2]
y=wine.target
# Fit the data with several SVM models
C = 1.0  # regularization parameter
models = [svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C, max_iter=8000),  # with the default max_iter=1000 (and even 7000) liblinear raises
          # ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
          svm.SVC(kernel='rbf', gamma=0.7, C=C),
          svm.SVC(kernel='poly', degree=4, C=C)]
[ clf.fit(X, y) for clf in models ]
# Print the training scores
scores= [ clf.score(X, y) for clf in models ]
print(scores)
# Subplot titles
titles = ("SVC with linear kernel",
          "LinearSVC (linear kernel)",
          "SVC with RBF kernel (g=0.7)",
          "SVC with polynomial kernel (d=4)")
# Set the number and layout of subplots
fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)
# Plot using the helper functions defined above
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy, cmap=plt.cm.plasma, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.plasma, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)
plt.show()
[0.7808988764044944, 0.7640449438202247, 0.8370786516853933, 0.8202247191011236]
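The scores above are computed on the same data the models were fitted on, so they are optimistic. A hedged sketch of 5-fold cross-validation on the same four models, reusing the models and titles defined above, for a fairer comparison:
from sklearn.model_selection import cross_val_score
# Cross-validation refits each model on 4/5 of the data and scores it on the held-out fold
for clf, title in zip(models, titles):
    cv_scores = cross_val_score(clf, X, y, cv=5)
    print("{}: mean CV score = {:0.3f}".format(title, cv_scores.mean()))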
Conclusion: with only the first two features, the RBF kernel (0.837) and the degree-4 polynomial kernel (0.820) fit the training data better than the two linear models (0.781 and 0.764).
Next, explore how the gamma value affects the RBF-kernel SVC classifier.
C = 1.0
models = [svm.SVC(kernel="rbf", gamma=0.1, C=C),
          svm.SVC(kernel="rbf", gamma=1, C=C),
          svm.SVC(kernel="rbf", gamma=10, C=C)]
[clf.fit(X, y) for clf in models]
# Print the training scores
scores= [ clf.score(X, y) for clf in models ]
print(scores)
# Subplot titles
titles=("gamma=0.1", "gamma=1", "gamma=10")
# Set the number and layout of subplots
fig, sub=plt.subplots(1, 3, figsize=(10, 3))
X0, X1=X[:,0], X[:,1]
xx, yy = make_meshgrid(X0, X1)
# Plot
for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy, cmap=plt.cm.plasma, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.plasma, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)
plt.show()
[0.8314606741573034, 0.8426966292134831, 0.8932584269662921]
from sklearn.model_selection import train_test_split
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
print("data size: ", X_train.shape, X_test.shape)
scores=[]
for gm in np.linspace(0.01, 25, 200):
    clf = svm.SVC(kernel="rbf", gamma=gm, C=1)
    clf.fit(X_train, y_train)
    scores.append([gm, clf.score(X_train, y_train), clf.score(X_test, y_test)])
scores=np.array(scores)
# print(scores)
# Plot
plt.plot(scores[:,0], scores[:,1], label="training score")
plt.plot(scores[:,0], scores[:,2], label="testing score")
plt.xlabel("gamma")
plt.ylabel("Score")
plt.legend()
plt.show()
# For this dataset, gamma around 5 is enough; beyond that the model just keeps overfitting.
data size: (133, 2) (45, 2)
from sklearn.model_selection import train_test_split
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
print("data size: ", X_train.shape, X_test.shape)
scores=[]
for c2 in np.linspace(0.01, 1, 100):
    clf = svm.SVC(kernel="rbf", gamma=5, C=c2)
    clf.fit(X_train, y_train)
    scores.append([c2, clf.score(X_train, y_train), clf.score(X_test, y_test)])
scores=np.array(scores)
# print(scores)
# Plot
plt.plot(scores[:,0], scores[:,1], label="training score")
plt.plot(scores[:,0], scores[:,2], label="testing score")
plt.xlabel("C")
plt.ylabel("Score")
plt.legend()
plt.show()
# For this dataset, C around 0.5 is enough; increasing it further brings little benefit.
data size: (133, 2) (45, 2)
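Scanning gamma and C one at a time, as above, can miss interactions between the two parameters. A hedged sketch that searches them jointly with GridSearchCV on the same train/test split; the parameter grid here is illustrative.
from sklearn.model_selection import GridSearchCV
# Joint grid search over gamma and C with 5-fold cross-validation on the training set
param_grid = {"gamma": [0.1, 0.5, 1, 5, 10], "C": [0.1, 0.5, 1, 10, 100]}
grid = GridSearchCV(svm.SVC(kernel="rbf"), param_grid, cv=5)
grid.fit(X_train, y_train)
print("best params:", grid.best_params_)
print("test score: {:0.3f}".format(grid.score(X_test, y_test)))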
from sklearn.datasets import load_boston
boston=load_boston()
X, y = boston.data, boston.target  # reuse the object loaded above instead of calling load_boston() again
print(boston.keys())
# print(boston.DESCR)  # 506 rows, 13 feature columns; the 14th column, MEDV, is the median owner-occupied home value in $1000s
print(boston.data.shape)
boston.target[1:5]
dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])
(506, 13)
FutureWarning: load_boston is deprecated in scikit-learn 1.0 and will be removed in 1.2 because the dataset has an ethical problem; the maintainers recommend fetching the data from the original source (http://lib.stat.cmu.edu/datasets/boston) or switching to fetch_california_housing / the Ames housing dataset.
array([21.6, 34.7, 33.4, 36.2])
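The FutureWarning above says load_boston will eventually be removed; a sketch of the replacement route quoted in that warning, fetching the raw data from the original source, in case load_boston is no longer available:
import pandas as pd
# Fetch the Boston housing data directly, as suggested by the deprecation warning
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # 506 x 13 feature matrix
y = raw_df.values[1::2, 2]                                       # MEDV target
print(X.shape, y.shape)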
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=8)
print("data size:", X_train.shape, X_test.shape)
data size: (379, 13) (127, 13)
# Try which kernel works better: linear or rbf
# Import the support vector regression model
from sklearn.svm import SVR
# Test the two kernels one by one
for kernel in ["linear", "rbf"]:
    svr = SVR(kernel=kernel)
    svr.fit(X_train, y_train)
    print("{} score: {:0.3f} {:0.3f}".format(kernel, svr.score(X_train, y_train), svr.score(X_test, y_test)))
# Neither result is good, and rbf is much worse.
# The likely reason is that the feature scales differ too much and the data has not been preprocessed.
linear score: 0.709 0.696
rbf score: 0.192 0.222
import matplotlib.pyplot as plt
# Visualize the magnitude of each feature
plt.plot(X.min(axis=0), 'v', label="min")
plt.plot(X.max(axis=0), '^', label="max")
# Use a log scale on the y axis
plt.yscale("log")  # try removing this line to see the raw values
# Legend
plt.legend(loc="best")
plt.xlabel("features")
plt.ylabel("feature magnitude")
#plt.ylim(1e-3, 1e3)
plt.show()
# Indeed, the column ranges differ widely: some features run into the hundreds while others stay between 0 and 1.
# The minimum of the second column does not show because it is 0, which a log axis cannot display.
# Feature standardization
# Import the preprocessing tool
from sklearn.preprocessing import StandardScaler
# Preprocess the training and test sets (fit the scaler on the training set only)
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
# By inspection, the extremes all fall roughly within [-4, 10]
print( X_train_scaled.min(axis=0) )
print( X_train_scaled.max(axis=0) )
# Visualize the extremes
# Plot the per-feature minima and maxima of the preprocessed data as scatter points
plt.plot(X_train_scaled.min(axis=0), 'v', label="train set min")
plt.plot(X_train_scaled.max(axis=0), '^', label="train set max")
plt.plot(X_test_scaled.min(axis=0), 'v', label="test set min")
plt.plot(X_test_scaled.max(axis=0), '^', label="test set max")
plt.yscale("log")
plt.legend(loc="best")
plt.xlabel("scaled features")
plt.ylabel("scaled feature magnitude")
#plt.ylim(-10, 10)
plt.show()
[-0.40393121 -0.47427135 -1.48527127 -0.27139342 -1.45192638 -3.9803001 -2.25664549 -1.27029459 -0.99500294 -1.34902986 -2.73912108 -3.79208852 -1.55741869]
[9.37760702 3.86958662 2.40814478 3.68468765 2.76118253 3.54589817 1.11032825 4.02183941 1.65986105 1.78063621 1.64253792 0.44420826 3.61301908]
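StandardScaler does nothing more than subtract each column's training-set mean and divide by its training-set standard deviation; a small sketch verifying that against the transform above:
# Manual standardization should reproduce X_train_scaled exactly
manual_scaled = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
print(np.allclose(manual_scaled, X_train_scaled))  # expected: True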
for kernel in ["linear", "rbf"]:
    svr = SVR(kernel=kernel)
    svr.fit(X_train_scaled, y_train)
    print("{} score: {:0.3f} {:0.3f}".format(kernel, svr.score(X_train_scaled, y_train), svr.score(X_test_scaled, y_test)))
# The linear score barely changes, while the rbf score improves a lot.
linear score: 0.706 0.698
rbf score: 0.665 0.695
# Like SVC, the SVR model also has gamma and C parameters.
svr=SVR(kernel='rbf', gamma=0.1, C=100)
svr.fit(X_train_scaled, y_train)
print("rbf score: {:0.3f} {:0.3f}".format( svr.score(X_train_scaled, y_train), svr.score(X_test_scaled, y_test)))
# These scores are much higher and acceptable.
rbf score: 0.966 0.894
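To keep the scaling and the regressor together (so the scaler is always fit on the training portion only), the two steps can be chained in a Pipeline; a hedged sketch that reproduces the StandardScaler + SVR(gamma=0.1, C=100) combination above and cross-validates it:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# Chain scaling and regression; each CV fold refits the scaler on its own training part
pipe = make_pipeline(StandardScaler(), SVR(kernel="rbf", gamma=0.1, C=100))
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5)
print("mean CV score: {:0.3f}".format(cv_scores.mean()))
pipe.fit(X_train, y_train)
print("test score: {:0.3f}".format(pipe.score(X_test, y_test)))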