Data Preprocessing, Dimensionality Reduction, Feature Extraction, and Clustering
# Generate sample data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
# 40 points, 2 centers, standard deviation 2
X, y = make_blobs(n_samples=40, centers=2, random_state=50, cluster_std=2)
print(X.shape)  # 40 rows, 2 features (x and y)
# Scatter plot
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.cool)
plt.show()
# Range of each of the two features
print("x: [{:0.2f}, {:0.2f}]".format( X[:,0].min(), X[:,0].max()) )
print("y: [{:0.2f}, {:0.2f}]".format( X[:,1].min(), X[:,1].max()) )
(40, 2)
x: [-8.22, 6.52] y: [-9.69, 0.14]
from sklearn.preprocessing import StandardScaler
# Preprocess
X_1 = StandardScaler().fit_transform(X)
# Scatter plot
plt.scatter(X_1[:,0], X_1[:,1], c=y, cmap=plt.cm.cool)
plt.title("StandardScaler")
plt.show()
# Range of each of the two features
print("x: [{:0.2f}, {:0.2f}]".format( X_1[:,0].min(), X_1[:,0].max()) )
print("y: [{:0.2f}, {:0.2f}]".format( X_1[:,1].min(), X_1[:,1].max()) )
# The relative positions of the points are unchanged, but the extremes of x and y now lie close to 0.
# After Z-score standardization, each feature has mean 0 and standard deviation 1
# (the shape of the distribution itself is not changed).
x: [-2.11, 2.92] y: [-2.71, 1.76]
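As a quick sanity check, the standardized values can be reproduced by hand from the z-score formula; this is a minimal sketch reusing the X and X_1 arrays defined above.
# Manual z-score: subtract each column's mean, divide by each column's standard deviation
import numpy as np
X_manual = (X - X.mean(axis=0)) / X.std(axis=0)
print(np.allclose(X_manual, X_1))  # expected: True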
help(StandardScaler)
# From the docstring: the standard score of a sample x is calculated as z = (x - u) / s,
# where u is the mean of the training samples (or zero if with_mean=False)
# and s is the standard deviation of the training samples (or one if with_std=False).
from sklearn.preprocessing import MinMaxScaler
# Preprocess
X_2 = MinMaxScaler().fit_transform(X)
# Scatter plot
plt.scatter(X_2[:,0], X_2[:,1], c=y, cmap=plt.cm.cool)
plt.title("MinMaxScaler")
plt.show()
# Range of each of the two features
print("x: [{:0.2f}, {:0.2f}]".format( X_2[:,0].min(), X_2[:,0].max()) )
print("y: [{:0.2f}, {:0.2f}]".format( X_2[:,1].min(), X_2[:,1].max()) )
# x = (x - min) / (max - min), so every value falls in [0, 1]
x: [0.00, 1.00] y: [0.00, 1.00]
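The same result can be reproduced by hand from the min-max formula above; a minimal sketch reusing X and X_2.
# Manual min-max scaling: (x - min) / (max - min), computed per column
X_manual = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
print(np.allclose(X_manual, X_2))  # expected: True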
from sklearn.preprocessing import RobustScaler
# Preprocess
X_3 = RobustScaler().fit_transform(X)
# Scatter plot
plt.scatter(X_3[:,0], X_3[:,1], c=y, cmap=plt.cm.cool)
plt.title("RobustScaler")
plt.show()
# Range of each of the two features
print("x: [{:0.2f}, {:0.2f}]".format( X_3[:,0].min(), X_3[:,0].max()) )
print("y: [{:0.2f}, {:0.2f}]".format( X_3[:,1].min(), X_3[:,1].max()) )
# Uses the median and the interquartile range instead of the mean and variance,
# so outliers have much less influence on the scaling (they are not removed, just downweighted).
x: [-1.46, 2.02] y: [-2.18, 1.33]
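For illustration, here is a rough manual equivalent of RobustScaler's defaults (center on the median, scale by the 25th-75th percentile range); a sketch reusing X and X_3.
# Manual robust scaling: (x - median) / (Q3 - Q1), computed per column
q1, median, q3 = np.percentile(X, [25, 50, 75], axis=0)
X_manual = (X - median) / (q3 - q1)
print(np.allclose(X_manual, X_3))  # expected: True (up to percentile interpolation details)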
help(RobustScaler)
Normalizer rescales each sample's feature vector so that its Euclidean norm (length) equals 1.
from sklearn.preprocessing import Normalizer
# Preprocess
X_4 = Normalizer().fit_transform(X)
# Scatter plot
plt.scatter(X_4[:,0], X_4[:,1], c=y, cmap=plt.cm.cool)
plt.title("Normalizer")
plt.show()
# Range of each of the two features
print("x: [{:0.2f}, {:0.2f}]".format( X_4[:,0].min(), X_4[:,0].max()) )
print("y: [{:0.2f}, {:0.2f}]".format( X_4[:,1].min(), X_4[:,1].max()) )
x: [-1.00, 0.88] y: [-1.00, 0.13]
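A minimal sketch of what Normalizer does under the hood: each row (sample) is divided by its own Euclidean length, so every sample lands on the unit circle.
# Manual L2 normalization per sample
X_manual = X / np.linalg.norm(X, axis=1, keepdims=True)
print(np.allclose(X_manual, X_4))                     # expected: True
print(np.allclose(np.linalg.norm(X_4, axis=1), 1.0))  # every sample has norm 1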
# There are many more preprocessors; look them up in the documentation when you need them.
import sklearn.preprocessing
dir(sklearn.preprocessing)
['Binarizer', 'FunctionTransformer', 'KBinsDiscretizer', 'KernelCenterer', 'LabelBinarizer', 'LabelEncoder', 'MaxAbsScaler', 'MinMaxScaler', 'MultiLabelBinarizer', 'Normalizer', 'OneHotEncoder', 'OrdinalEncoder', 'PolynomialFeatures', 'PowerTransformer', 'QuantileTransformer', 'RobustScaler', 'SplineTransformer', 'StandardScaler', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_csr_polynomial_expansion', '_data', '_discretization', '_encoders', '_function_transformer', '_label', '_polynomial', 'add_dummy_feature', 'binarize', 'label_binarize', 'maxabs_scale', 'minmax_scale', 'normalize', 'power_transform', 'quantile_transform', 'robust_scale', 'scale']
from sklearn.datasets import load_wine
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
wine=load_wine()
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, random_state=62)
print(X_train.shape, X_test.shape)
(133, 13) (45, 13)
# Train a neural network
mlp = MLPClassifier(hidden_layer_sizes=[100,100], max_iter=700, random_state=62)
mlp.fit(X_train, y_train)
# Score
print("training set:{:0.3f}".format( mlp.score(X_train, y_train)) )
print("testing set:{:0.3f}".format( mlp.score(X_test, y_test)) )
training set:0.992 testing set:0.933
# Preprocess the data
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train
mlp.fit(X_train_scaled, y_train)
# Score
print("training set:{:0.3f}".format( mlp.score(X_train_scaled, y_train)) )
print("testing set:{:0.3f}".format( mlp.score(X_test_scaled, y_test)) )
# Remarkable: every prediction is now correct.
# Remember the preprocessing order: fit the scaler on the training data only,
# then use it to transform both the training and the test data.
# Never fit the scaler on the test data: that leaks information from the test set
# and defeats the purpose of the train/test split.
training set:1.000 testing set:1.000
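To make that order hard to get wrong, the scaler and the classifier can be chained in a Pipeline, which always fits the scaler on the training portion only. Below is a sketch of the same experiment with 5-fold cross-validation (the cv value is just an illustrative choice).
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# The pipeline refits MinMaxScaler on each training fold before scoring the held-out fold
pipe = make_pipeline(MinMaxScaler(),
                     MLPClassifier(hidden_layer_sizes=[100,100], max_iter=700, random_state=62))
scores = cross_val_score(pipe, wine.data, wine.target, cv=5)
print("cross-validated accuracy: {:0.3f}".format(scores.mean()))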
# Load the data
from sklearn.datasets import load_wine
wine = load_wine()
# Load the preprocessor
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = wine.data
y = wine.target
X_scaled = scaler.fit_transform(X)
print(X_scaled.shape)  # 13 features
(178, 13)
# To visualize the most significant directions of variation, we use PCA.
from sklearn.decomposition import PCA
# Keep 2 principal components
pca = PCA(n_components=2)
pca.fit(X_scaled)                 # fit
X_pca = pca.transform(X_scaled)   # transform
print(X_pca.shape)  # 2 principal components (linear combinations of the original features)
(178, 2)
# Visualization
import numpy as np
import matplotlib.pyplot as plt
# Split the principal-component coordinates by class
X0 = X_pca[wine.target==0]
X1 = X_pca[wine.target==1]
X2 = X_pca[wine.target==2]
# Scatter plot
plt.scatter(X0[:,0], X0[:,1], c='b', s=60, edgecolor='k')
plt.scatter(X1[:,0], X1[:,1], c='g', s=60, edgecolor='k')
plt.scatter(X2[:,0], X2[:,1], c='r', s=60, edgecolor='k')
plt.legend(wine.target_names, loc="best")
plt.xlabel("PC_1")
plt.ylabel("PC_2")
plt.show()
# With the data reduced to 2 dimensions, the boundaries between the 3 classes are roughly visible in the plot.
Mathematically, PCA is best understood through inner products and projections; that derivation is skipped here.
This article only shows the intuitive plots.
print(pca.components_.shape)
pca.components_
# Each principal component is a weighted sum of the original features, with these coefficients as the weights.
# A positive coefficient means the feature is positively correlated with that PC; a negative one means negative correlation.
(2, 13)
array([[ 0.1443294 , -0.24518758, -0.00205106, -0.23932041, 0.14199204, 0.39466085, 0.4229343 , -0.2985331 , 0.31342949, -0.0886167 , 0.29671456, 0.37616741, 0.28675223], [-0.48365155, -0.22493093, -0.31606881, 0.0105905 , -0.299634 , -0.06503951, 0.00335981, -0.02877949, -0.03930172, -0.52999567, 0.27923515, 0.16449619, -0.36490283]])
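That claim can be checked directly: PCA centers the data with pca.mean_ and then projects it onto the rows of pca.components_, so the transformed coordinates are a single matrix product. A minimal sketch reusing X_scaled and X_pca from above.
# Reproduce pca.transform by hand: center, then project onto the component vectors
X_manual = (X_scaled - pca.mean_) @ pca.components_.T
print(np.allclose(X_manual, X_pca))  # expected: True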
# Heat map of the principal components
# plt.figure(figsize=(2,1))
plt.matshow(pca.components_, cmap="plasma")
# y axis: the principal components
plt.yticks([0,1], ["PC_1", "PC_2"])
plt.colorbar()
# x axis: the original features
plt.xticks( range(len(wine.feature_names)), wine.feature_names, rotation=60, ha="left" )
plt.show()
# Fraction of the total variance explained by each principal component
pca.explained_variance_ratio_
array([0.36198848, 0.1920749 ])
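So the first two components together explain roughly 55% of the total variance. A common way to choose n_components is to look at the cumulative ratio over all components; a short sketch:
# Cumulative explained variance over all 13 components
pca_full = PCA().fit(X_scaled)
print(np.round(np.cumsum(pca_full.explained_variance_ratio_), 3))
# Alternatively, n_components can be a fraction, e.g. PCA(n_components=0.9)
# keeps just enough components to explain 90% of the variance.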
help(PCA)
Next we use the LFW (Labeled Faces in the Wild) face-recognition dataset.
# Import the dataset fetching utility
from sklearn.datasets import fetch_lfw_people
# Load the face dataset
faces=fetch_lfw_people(min_faces_per_person=20, resize=0.8)
image_shape = faces.images[0].shape
image_shape
(100, 75)
# Show some of the photos
import numpy as np
import matplotlib.pyplot as plt
fig, axes = plt.subplots(3, 4, figsize=(12,9),
                         subplot_kw={"xticks":(), "yticks":()})
for target, image, ax in zip(faces.target, faces.images, axes.ravel()):
    ax.imshow(image, cmap=plt.cm.gray)
    ax.set_title(faces.target_names[target])
plt.show()
print(faces.target.min(), faces.target.max() )  # 62 people in total
0 61
# Split the data
import time
start = time.time()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(faces.data/255, faces.target, random_state=0)
# Import the neural network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=[100,100], random_state=0, max_iter=400)
mlp.fit(X_train, y_train)
print("time:{:0.2f}s".format(time.time()-start) )
# Score
print("training set:{:0.3f}".format( mlp.score(X_train, y_train)) )
print("testing set:{:0.3f}".format( mlp.score(X_test, y_test)) )
# Takes about 60 seconds.
# The test accuracy is only a bit above one half; still, telling apart 62 people after about a minute of training is respectable.
time:60.08s training set:0.986 testing set:0.583
Whitening the data can improve a model's accuracy.
Whitening reduces redundancy: it removes the correlation between neighboring pixels and gives every feature the same variance.
from sklearn.decomposition import PCA
# Use PCA's whitening option on the face data; keep enough components to explain 90% of the variance
pca = PCA(whiten=True, n_components=0.9, random_state=0)
pca.fit(X_train)
X_train_whiten = pca.transform(X_train)
X_test_whiten = pca.transform(X_test)
# Dimensions before and after whitening
print( X_train.shape, X_test.shape )                # 7500 columns
print( X_train_whiten.shape, X_test_whiten.shape )  # 105 columns
(2267, 7500) (756, 7500) (2267, 105) (756, 105)
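A quick sanity check on what whitening actually did: after a whitened PCA transform, each retained component of the training data has mean close to 0 and variance close to 1. A sketch on the first few components:
import numpy as np
print(np.round(X_train_whiten.mean(axis=0)[:5], 3))  # expected: close to 0
print(np.round(X_train_whiten.std(axis=0)[:5], 3))   # expected: close to 1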
# Import the neural network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=[100,100], random_state=0, max_iter=400)
mlp.fit(X_train_whiten, y_train)
# Score
print("training set:{:0.3f}".format( mlp.score(X_train_whiten, y_train)) )
print("testing set:{:0.3f}".format( mlp.score(X_test_whiten, y_test)) )
# Perfect score on the training set, but the test score actually dropped compared with the un-whitened data.
# So whitening is no guarantee of better generalization on this dataset.
training set:1.000 testing set:0.571
Matrix factorization means writing one matrix as the product of several matrices.
NMF (non-negative matrix factorization) differs from PCA in that the factors are constrained to be non-negative (so the input data must be non-negative too), and its components are neither orthogonal nor ordered by explained variance.
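A minimal toy sketch of the idea (the matrix below is made up purely for illustration): NMF factors a non-negative matrix X into two non-negative matrices W and H such that X is approximately W @ H.
import numpy as np
from sklearn.decomposition import NMF
# Tiny non-negative matrix: 4 samples x 3 features (rank 2 by construction)
X_toy = np.array([[1.0, 2.0, 3.0],
                  [2.0, 4.0, 6.0],
                  [3.0, 2.0, 1.0],
                  [6.0, 4.0, 2.0]])
model = NMF(n_components=2, init="nndsvda", max_iter=500, random_state=0)
W = model.fit_transform(X_toy)   # sample-by-component weights
H = model.components_            # component-by-feature matrix
print(np.round(W @ H, 2))        # approximately reconstructs X_toy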
# Use NMF to extract features from the LFW face data, retrain the neural network, and see whether the accuracy changes
import time
start = time.time()
# Import NMF
from sklearn.decomposition import NMF
# Transform the data with NMF: this step is slow
nmf = NMF(n_components=105, max_iter=400, random_state=62).fit(X_train)  # the default max_iter is 200
X_train_nmf = nmf.transform(X_train)
X_test_nmf = nmf.transform(X_test)
print("time:{:0.2f}s".format(time.time()-start) )
# Print the data shapes before and after NMF
print("shape before NMF: ", X_train.shape)
print("shape after NMF: ", X_train_nmf.shape)
/home/wangjl/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/_nmf.py:294: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
/home/wangjl/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/_nmf.py:1641: ConvergenceWarning: Maximum number of iterations 400 reached. Increase it to improve convergence.
time:80.86s shape before NMF: (2267, 7500) shape after NMF: (2267, 105)
# Train the neural network on the NMF-transformed data
mlp.fit(X_train_nmf, y_train)
# Score
print("training set:{:0.3f}".format( mlp.score(X_train_nmf, y_train)) )
print("testing set:{:0.3f}".format( mlp.score(X_test_nmf, y_test)) )
# The score dropped again: here NMF takes longer than PCA and performs worse.
training set:0.991 testing set:0.558
/home/wangjl/anaconda3/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (400) reached and the optimization hasn't converged yet.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
blobs = make_blobs(random_state=1, centers=1)
X_blobs = blobs[0]
# Visualization
plt.scatter(X_blobs[:,0], X_blobs[:,1], c='r', edgecolor='k')
plt.show()
# Visually the points do form a single blob with no obvious subgroups.
# Import the KMeans tool
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=7)
kmeans.fit(X_blobs)
# Visualization
x_min, x_max=X_blobs[:,0].min()-0.5, X_blobs[:,0].max()+0.5
y_min, y_max=X_blobs[:,1].min()-0.5, X_blobs[:,1].max()+0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
Z=kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z=Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation="nearest",
extent=(xx.min(), xx.max(), yy.min(), yy.max()),
cmap=plt.cm.summer,
aspect='auto', origin='lower')
plt.plot(X_blobs[:,0], X_blobs[:,1], 'r.', markersize=5)
# Mark the cluster centers with blue crosses
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:,0], centroids[:,1],
marker='x', s=150, linewidths=3, color='b', zorder=10)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks()
plt.yticks()
plt.show()
# Print the cluster labels assigned by KMeans
print("KMeans cluster labels:\n{}".format(kmeans.labels_))
# The strength of k-means is its simplicity.
# Its weakness is that it treats every direction from a cluster center as equally important,
# so it performs poorly on datasets whose clusters have complex shapes.
KMeans cluster labels: [1 1 0 2 2 2 1 1 0 2 1 2 1 0 1 2 2 1 0 0 2 0 1 1 1 1 2 1 1 1 0 0 1 1 2 0 2 0 1 0 2 1 0 0 2 2 2 1 0 1 0 1 2 0 2 2 0 2 2 1 2 0 2 1 0 2 0 0 1 2 2 1 2 2 2 1 2 1 1 0 2 0 2 2 0 1 2 1 0 0 2 1 0 0 2 2 1 2 2 1]
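That weakness is easy to see on a dataset whose clusters are not round blobs. Here is a sketch using make_moons, which generates two interleaving half-circles: k-means assigns points by distance to the two centroids and cuts straight across both moons.
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans
# Two interleaving half-moons: not separable by distance to two centroids
X_moons, y_moons = make_moons(n_samples=200, noise=0.05, random_state=0)
moon_labels = KMeans(n_clusters=2, random_state=0).fit_predict(X_moons)
plt.scatter(X_moons[:,0], X_moons[:,1], c=moon_labels, cmap=plt.cm.cool, edgecolor='k')
plt.title("KMeans on make_moons")
plt.show()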
Agglomerative (hierarchical) clustering, in short: merge the closest points into a cluster, then keep merging the closest remaining clusters until everything ends up in a single cluster.
from scipy.cluster.hierarchy import dendrogram, ward
# Visualize the merging process as a tree diagram (dendrogram)
linkage = ward(X_blobs)
dendrogram(linkage)
ax = plt.gca()  # grab the current axes (not strictly needed for this plot)
plt.xlabel("Sample")
plt.ylabel("Cluster distance")
plt.show()
# This is essentially the same as hierarchical clustering in R.
# Like k-means, it also struggles with datasets whose clusters have complex shapes.
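scipy only draws the dendrogram here; to obtain actual cluster labels, scikit-learn's AgglomerativeClustering with the same ward linkage can be used. A minimal sketch on X_blobs:
from sklearn.cluster import AgglomerativeClustering
# Ward-linkage agglomerative clustering, cut into 3 clusters
agg = AgglomerativeClustering(n_clusters=3, linkage="ward")
agg_labels = agg.fit_predict(X_blobs)
print(agg_labels)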
DBSCAN stands for "Density-Based Spatial Clustering of Applications with Noise".
from sklearn.cluster import DBSCAN
db = DBSCAN()
# Fit and predict
clusters = db.fit_predict(X_blobs)
# Visualization
plt.scatter(X_blobs[:,0], X_blobs[:,1], c=clusters, cmap=plt.cm.cool, s=60, edgecolor='k')
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()
# Print the cluster labels
print("Cluster labels:\n{}".format(clusters))
# Why is there a -1? DBSCAN labels noise points as -1.
# The dense blob in the middle becomes one cluster; the sparse points around it belong to no cluster and are marked as noise.
Cluster labels: [-1 0 -1 0 -1 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 -1 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 -1 -1 0 0 0 0 -1 0 0 -1 0 0 -1 0 0 0 0 0 0 0 0 -1 0 0 0 -1]
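A small sketch for summarizing a DBSCAN result: points labeled -1 are noise, and the number of clusters is the number of distinct non-negative labels.
import numpy as np
# Count clusters and noise points in the DBSCAN labels above
n_noise = int(np.sum(clusters == -1))
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
print("clusters:", n_clusters, "noise points:", n_noise)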
The important DBSCAN parameters are eps (the neighborhood radius) and min_samples (the minimum number of samples in a neighborhood for a point to count as a core point):
help(DBSCAN)
eps : float, default=0.5
# Increase the eps parameter from the default 0.5 to 2
from sklearn.cluster import DBSCAN
db_1 = DBSCAN(eps=2)
# Fit and predict
clusters_1 = db_1.fit_predict(X_blobs)
# Visualization
plt.scatter(X_blobs[:,0], X_blobs[:,1], c=clusters_1, cmap=plt.cm.cool, s=60, edgecolor='k')
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.title("DBSCAN(eps=2)")
plt.show()
# With the larger eps, everything falls into a single cluster and there are no noise points.
# Increase the min_samples parameter from the default 5 to 20
from sklearn.cluster import DBSCAN
db_2 = DBSCAN(min_samples=20)
# Fit and predict
clusters_2 = db_2.fit_predict(X_blobs)
# Visualization
plt.scatter(X_blobs[:,0], X_blobs[:,1], c=clusters_2, cmap=plt.cm.cool, s=60, edgecolor='k')
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.title("DBSCAN(min_samples=20)")
plt.show()
# Increasing min_samples makes more points count as noise.
In practice, preprocessing the data with MinMaxScaler or StandardScaler first usually makes DBSCAN perform noticeably better.
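A minimal sketch of that advice on the blob data (the eps and min_samples values here are illustrative, not tuned):
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
# Scale first, then cluster; eps is now measured in units of standard deviations
X_blobs_scaled = StandardScaler().fit_transform(X_blobs)
scaled_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X_blobs_scaled)
print(scaled_labels)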