线性模型的概念¶

画一条直线¶

import numpy as np
import matplotlib.pyplot as plt

# Sample 100 evenly spaced x values on the interval [-5, 5].
x = np.linspace(-5, 5, num=100)
# Evaluate the line y = 0.5 * x + 3 at every sample.
slope, intercept = 0.5, 3
y = slope * x + intercept

plt.plot(x, y, color="orange")
plt.title("Straight line")
plt.show()

通过2点确定一条直线¶

# Fit a straight line through the two points (1, 3) and (4, 5).

# Linear regression estimator.
from sklearn.linear_model import LinearRegression

# Abscissas as single-feature samples, ordinates as a flat list.
X = [[1], [4]]
y = [3, 5]

# Fit the linear model to the two points.
lr = LinearRegression()
lr.fit(X, y)

# Draw the two points and the fitted line evaluated at 20 abscissas in [0, 5].
z = np.linspace(0, 5, num=20)
plt.scatter(X, y, s=80)
plt.plot(z, lr.predict(z.reshape(-1, 1)), color='k')

plt.title("Straight line")
plt.show()

# Print the equation of the fitted line (slope w, intercept b).
w, b = lr.coef_[0], lr.intercept_
print(f"y = {w:.3f} x  + {b:.3f}")

y = 0.667 x  + 2.333

# Echo the fitted estimator so the notebook displays its repr.
lr

LinearRegression()

如果是3个点呢？¶

# Fit a straight line to the three points (1, 3), (4, 5) and (3, 3).
from sklearn.linear_model import LinearRegression

X = [[1], [4], [3]]
y = [3, 5, 3]

# Fit the linear model to the three points.
lr = LinearRegression()
lr.fit(X, y)

# Draw the three points and the fitted line evaluated at 20 abscissas in [0, 5].
z = np.linspace(0, 5, num=20)
plt.scatter(X, y, s=80)
plt.plot(z, lr.predict(z.reshape(-1, 1)), color='k')
plt.title("Straight line")
plt.show()

# Print the equation of the fitted line (slope w, intercept b).
w, b = lr.coef_[0], lr.intercept_
print(f"y = {w:.3f} x  + {b:.3f}")

y = 0.571 x  + 2.143

生成更多点，做线性拟合¶

from sklearn.datasets import make_regression

# Generate synthetic regression data: 50 samples, one informative feature, noise=50.
X, y = make_regression(
    n_samples=50,
    n_features=1,
    n_informative=1,
    noise=50,
    random_state=1,
)

# Fit a linear regression to the generated data.
reg = LinearRegression().fit(X, y)

# Use 200 evenly spaced abscissas in [-3, 3] to draw the fitted model.
z = np.linspace(-3, 3, num=200).reshape(-1, 1)
plt.scatter(X, y, c='b', s=60)
plt.plot(z, reg.predict(z), c='k')  # predicted y for every abscissa in z
plt.title("Linear regression")
plt.show()

# Print the equation of the fitted line (slope w, intercept b).
w, b = reg.coef_[0], reg.intercept_
print(f"y = {w:.3f} x  + {b:.3f}")

y = 79.525 x  + 10.922

最基本的线性模型 - 线性回归¶

无噪音模拟数据¶

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
# Utility for splitting data into training and test subsets.
from sklearn.model_selection import train_test_split

# Two informative features; no noise argument is passed.
X, y = make_regression(
    n_samples=100, n_features=2, n_informative=2, random_state=38
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
lr = LinearRegression()
lr.fit(X_train, y_train)

# Print the fitted model y = w1*x1 + w2*x2 + b.
print("coef:", lr.coef_)
print("intercept:", lr.intercept_)

coef: [70.38592453  7.43213621]
intercept: -7.105427357601002e-15

# Score the model on the training split, then on the test split.
for split_label, split_X, split_y in (
    ("training set:", X_train, y_train),
    ("tesing set:", X_test, y_test),
):
    print(split_label, lr.score(split_X, split_y))

# Both scores are perfect because no noise was added; real-world data are much noisier.

training set: 1.0
tesing set: 1.0

载入真实数据 - 糖尿病数据¶

# Utilities for splitting the data and fitting ordinary least squares.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the diabetes data once: return_X_y=True returns the (data, target)
# pair directly instead of building the dataset twice.
from sklearn.datasets import load_diabetes
X, y = load_diabetes(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
lr = LinearRegression().fit(X_train, y_train)

# Print the fitted model y = w1*x1 + ... + wk*xk + b (one weight per feature).
print("coef:", lr.coef_[:])
print("intercept:", lr.intercept_)

# Score the model on both splits.
print()
print("training set:", lr.score(X_train, y_train))
print("tesing set:", lr.score(X_test, y_test))

# The scores are about 0.53 and 0.46: much lower than on the noise-free data.

coef: [   11.5106203   -282.51347161   534.20455671   401.73142674
 -1043.89718398   634.92464089   186.43262636   204.93373199
   762.47149733    91.9460394 ]
intercept: 152.5624877455247

training set: 0.5303814759709331
tesing set: 0.4593440496691642

使用L2正则化的线性模型 - 岭回归¶

保留全部特征变量，只是降低特征变量的系数来避免过拟合的方法，称为L2正则化。

糖尿病数据集 - 岭回归¶

# Utility for splitting data into training and test subsets.
from sklearn.model_selection import train_test_split

# Load the diabetes data once: return_X_y=True returns the (data, target)
# pair directly instead of building the dataset twice.
from sklearn.datasets import load_diabetes
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

# Ridge regression estimator.
from sklearn.linear_model import Ridge

# Fit ridge regression with its default parameters.
ridge = Ridge().fit(X_train, y_train)

# Print the fitted model y = w1*x1 + ... + wk*xk + b (one weight per feature).
print("coef:", ridge.coef_[:])
print("intercept:", ridge.intercept_)

# Score the model on both splits.
print()
print("training set:", ridge.score(X_train, y_train))
print("tesing set:", ridge.score(X_test, y_test))

# Both scores are about 0.43, i.e. training and test scores are close.

coef: [  36.8262072   -75.80823733  282.42652716  207.39314972   -1.46580263
  -27.81750835 -134.3740951    98.97724793  222.67543268  117.97255343]
intercept: 152.553545058867

training set: 0.4326376676137663
tesing set: 0.4325217769068186

可以说，复杂度越低的模型，在训练集上表现越差，但是其泛化能力会更好。

如果在意泛化能力，则应该选择岭回归，而不是线性回归模型

岭回归的参数调节¶

from sklearn.linear_model import Ridge

# Fit ridge regression with alpha=10.
ridge = Ridge(alpha=10)
ridge.fit(X_train, y_train)

# Print the fitted coefficients and intercept.
print("coef:", ridge.coef_)
print("intercept:", ridge.intercept_)

# Score the model on the training split, then on the test split.
print()
for split_label, split_X, split_y in (
    ("training set:", X_train, y_train),
    ("tesing set:", X_test, y_test),
):
    print(split_label, ridge.score(split_X, split_y))

# The scores are about 0.15 and 0.16: the test score now exceeds the training score.
# In other words, raising alpha is a way to reduce overfitting.

coef: [ 15.08676646  -1.9586191   60.69903425  47.11843221  14.72337546
   9.87779644 -35.56015266  35.74603575  54.27193163  37.42095846]
intercept: 152.7585777843719

training set: 0.15119962367011153
tesing set: 0.16202013428866247

降低 alpha 值会让系数的限制变得不那么严格。当alpha很小时，限制可以忽略不计，非常接近线性回归。

from sklearn.linear_model import Ridge

# Fit ridge regression with alpha=0.1.
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)

# Print the fitted coefficients and intercept.
print("coef:", ridge.coef_)
print("intercept:", ridge.intercept_)

# Score the model on the training split, then on the test split.
print()
for split_label, split_X, split_y in (
    ("training set:", X_train, y_train),
    ("tesing set:", X_test, y_test),
):
    print(split_label, ridge.score(split_X, split_y))

# The scores are about 0.52 and 0.47.
# Compared with plain linear regression, a small alpha gives a slightly lower
# training score and a slightly higher test score.

coef: [  24.77802114 -228.33364296  495.54594378  361.21481169 -109.82542594
  -78.3286822  -190.69780344  108.24040795  383.72269392  107.42593373]
intercept: 152.48093836963517

training set: 0.521564605524134
tesing set: 0.4734019500945309

alpha值对模型的影响¶

画图展示不同 alpha 值对应的模型的 coef_ 属性。
较高的 alpha 值表示模型的限制更加严格。
所以我们认为，alpha值越高，coef属性的数值会更小，反之 coef 属性的值更大。

import matplotlib.pyplot as plt

# Coefficients of ridge models fitted with alpha = 0.1, 1 and 10, plotted in
# that order with one marker style each.
for ridge_alpha, marker in ((0.1, 'o'), (1, 's'), (10, '^')):
    plt.plot(
        Ridge(alpha=ridge_alpha).fit(X_train, y_train).coef_,
        marker,
        label=f"Ridge alpha={ridge_alpha}",
    )

# Coefficients of plain linear regression, for comparison.
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)
plt.plot(lr.coef_, "o", label="linear regression")

plt.xlabel("coefficient index")
plt.ylabel("coefficient magnitude")  # spelling fixed (was "coefficent")
plt.hlines(0, 0, len(lr.coef_))  # horizontal reference line at y = 0
plt.legend()
plt.show()

数据集大小对岭回归的影响 - 学习曲线¶

另一个理解正则化对模型影响的方法，就是固定alpha值，改变训练集的数据量。

import numpy as np

from sklearn.model_selection import learning_curve, KFold
# 定义一个绘制学习曲线的函数
def plot_learning_curve(est, X, y):
    """Plot the mean training and test scores of ``est`` against training-set size.

    Scores come from ``learning_curve`` evaluated at 20 training-set
    fractions between 0.1 and 1, using a shuffled 20-fold ``KFold`` split
    (``random_state=1``). The training curve is dashed, the test curve is
    solid, and both share one colour. Draws on the current pyplot axes and
    returns nothing.
    """
    splitter = KFold(20, shuffle=True, random_state=1)
    fractions = np.linspace(0.1, 1, 20)
    sizes, train_scores, test_scores = learning_curve(
        est, X, y, train_sizes=fractions, cv=splitter)

    name = est.__class__.__name__
    # Keep the training line so the test line can reuse its colour.
    train_line = plt.plot(
        sizes, train_scores.mean(axis=1), '--', label="training " + name)[0]
    plt.plot(sizes, test_scores.mean(axis=1), '-',
             label="test " + name, c=train_line.get_color())
    plt.xlabel("Training set size")
    plt.ylabel("Score")
    plt.ylim(0, 1.1)

# Draw the learning curves of ridge (alpha=1) and plain linear regression together.
for estimator in (Ridge(alpha=1), LinearRegression()):
    plot_learning_curve(estimator, X, y)
plt.legend(loc=(0, 1.05), ncol=2, fontsize=11)

<matplotlib.legend.Legend at 0x7f5c2bec9630>

可见，数据量小的时候，岭回归的训练集和测试集表现差不多，而普通线性回归则差异很大。
当数据量很大时，正则化就没那么重要了，两者表现一致。
随着数据量的增大，线性回归在训练集上的得分是下降的；说明数据量越大，线性回归越不容易过拟合，或者越难记住已知数据。

岭迹图 x=alpha, y=coef¶

# Four values spaced evenly on a log scale from 1e-10 to 1e2.
np.logspace(-10,2,4)

array([1.e-10, 1.e-06, 1.e-02, 1.e+02])

# 100 alpha values spaced evenly on a log scale from 1e-3 to 1e2.
alphas = np.logspace(-3, 2, num=100)
# Collect the coefficient vector fitted at each alpha.
coefs = []
for alpha in alphas:
    # fit_intercept=False turns the intercept term off, so only the
    # coefficients are fitted.
    rr = Ridge(alpha=alpha, fit_intercept=False).fit(X_train, y_train)
    coefs.append(rr.coef_)

# One curve per coefficient as a function of alpha.
plt.plot(alphas, coefs)
# Logarithmic x axis, because the alphas span several powers of ten.
plt.xscale('log')
plt.xlabel("Alpha")
plt.ylabel("Coef")
plt.show()

使用 L1 正则化的线性模型 - 套索回归 lasso¶

套索回归 - 默认参数¶

# Lasso regression on the diabetes data with default parameters.
import numpy as np
import matplotlib.pyplot as plt

# Utility for splitting data into training and test subsets.
from sklearn.model_selection import train_test_split

# Load the diabetes data once: return_X_y=True returns the (data, target)
# pair directly instead of building the dataset twice.
from sklearn.datasets import load_diabetes
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)


# Lasso regression estimator.
from sklearn.linear_model import Lasso
# Fit lasso with its default parameters.
lasso = Lasso().fit(X_train, y_train)

# Print both scores and the number of non-zero coefficients.
print("training score:", lasso.score(X_train, y_train))
print("testing score:", lasso.score(X_test, y_test))
print("特征数:", np.sum(lasso.coef_ != 0))

# Both scores are only about 0.36, and just 3 features are used.
# The training score is poor as well, which indicates underfitting.

training score: 0.36242428249291325
testing score: 0.36561858962128
特征数: 3

套索回归的参数调节¶

# Fit lasso with alpha=0.1 and a raised iteration limit.
lasso = Lasso(alpha=0.1, max_iter=100000)
lasso.fit(X_train, y_train)

# Print both scores and the number of non-zero coefficients.
print("training score:", lasso.score(X_train, y_train))
print("testing score:", lasso.score(X_test, y_test))
print("特征数:", (lasso.coef_ != 0).sum())

training score: 0.519480608218357
testing score: 0.47994757514558173
特征数: 7

降低alpha值可以拟合出更复杂的模型，从而在训练集和测试集都能获得良好的表现。
该结果比岭回归稍好，且只用了10个特征中的7个特征。
但是，alpha 设置的太低，就去掉了正则化效果，模型就会像线性回归一样，出现过拟合现象。

# Fit lasso with alpha=0.0001 and a raised iteration limit.
lasso = Lasso(alpha=0.0001, max_iter=100000)
lasso.fit(X_train, y_train)

# Print both scores and the number of non-zero coefficients.
print("training score:", lasso.score(X_train, y_train))
print("testing score:", lasso.score(X_test, y_test))
print("特征数:", (lasso.coef_ != 0).sum())

# With alpha this small every feature is used, and the test score is about
# 7 percentage points below the training score, which indicates overfitting.

training score: 0.5303811330981303
testing score: 0.4594509683706016
特征数: 10

套索回归与岭回归的对比¶

画出不同alpha值的套索回归与岭回归的系数。

# Lasso coefficients for alpha = 1, 0.1 and 0.001, plotted in that order.
for lasso_alpha, marker in ((1, 's'), (0.1, '^'), (0.001, 'v')):
    plt.plot(
        Lasso(alpha=lasso_alpha).fit(X_train, y_train).coef_,
        marker,
        label=f"Lasso alpha={lasso_alpha}",
    )

# Ridge coefficients for alpha = 0.1, for comparison.
from sklearn.linear_model import Ridge
ridge_coefs = Ridge(alpha=0.1).fit(X_train, y_train).coef_
plt.plot(ridge_coefs, "o", label="Ridge alpha=0.1")

plt.legend(ncol=2, loc=(0, 1.05))
plt.hlines(0, 0, 10)  # horizontal reference line at y = 0
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.show()

系数收缩图 x=alpha, y=coef¶

# 100 alpha values spaced evenly on a log scale from 1e-4 to 1e1.
alphas = np.logspace(-4, 1, num=100)
# Collect the coefficient vector fitted at each alpha.
coefs = []
for alpha in alphas:
    # Lasso with its default settings apart from alpha.
    rr = Lasso(alpha=alpha).fit(X_train, y_train)
    coefs.append(rr.coef_)

# One curve per coefficient as a function of alpha.
plt.plot(alphas, coefs)
# Logarithmic x axis, because the alphas span several powers of ten.
plt.xscale('log')
plt.xlabel("Alpha")
plt.ylabel("Coef")
plt.show()

弹性网模型 Elastic net: 套索回归 + 岭回归 //todo¶

普通线性回归、岭回归与lasso回归比较¶

import numpy as np
import matplotlib.pyplot as plt

# (1) Build a synthetic regression problem with sparse true weights.
np.random.seed(10)  # fix the RNG seed so the data are reproducible
samples = 50  # number of rows (one per sample)
features = 100  # number of columns (one per feature)
# Feature matrix of standard-normal draws with shape (samples, features).
X = np.random.randn(samples, features)

# One standard-normal weight per feature, scaled by 10.
w = 10 * np.random.randn(features)

# Zero the weights at 90 randomly chosen positions, leaving 10 non-zero weights.
index = np.random.permutation(features)  # shuffled feature indices
w[index[:90]] = 0

# Targets computed from the features and the true weights.
y = np.dot(X, w)


# (2) Compare how well each regression recovers the true weights.
from sklearn.linear_model import LinearRegression,Ridge,Lasso
lr = LinearRegression()
# Only the weights are of interest here, so ridge is fitted without an intercept.
rr = Ridge(alpha=1, fit_intercept=False)
# alpha is the constant that multiplies the L1 penalty term: larger values
# shrink the coefficients harder and drive more of them to exactly zero.
# It is NOT the fraction of useful features and is not limited to [0, 1].
lasso = Lasso(alpha=0.8)

# Fit all three models on the same data.
lr.fit(X, y)
rr.fit(X, y)
lasso.fit(X, y)


# Plot the true weights and each model's estimated coefficients on a 2x2 grid.
panels = (
    ('real', w),
    ('lr', lr.coef_),
    ('rr', rr.coef_),
    ('lasso', lasso.coef_),
)
for position, (panel_title, weights) in enumerate(panels, start=1):
    axes = plt.subplot(2, 2, position)
    axes.plot(weights)
    axes.set_title(panel_title)

plt.show()