!pip3 freeze | grep -i numpy
numpy==1.21.3
import numpy as np
# Identity matrix (a special diagonal matrix)
matrix=np.eye(6)
print(matrix)
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]
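# Side note (a minimal sketch, not from the original): np.eye gives ones on the
# diagonal; for an arbitrary diagonal, np.diag builds the matrix from a 1-D array.
d = np.diag([1.0, 2.0, 3.0])
print(d)
# [[1. 0. 0.]
#  [0. 2. 0.]
#  [0. 0. 3.]]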
# Convert to a sparse matrix
from scipy import sparse
sparse_matrix=sparse.csr_matrix(matrix)
print(sparse_matrix)
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
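# Sanity check (sketch): CSR stores only the nonzero entries plus index arrays,
# and converts back to a dense array without loss.
dense_again = sparse_matrix.toarray()
print(np.array_equal(matrix, dense_again)) # True: the round-trip is lossless
print(sparse_matrix.nnz)   # 6 nonzero values stored
print(sparse_matrix.data)  # [1. 1. 1. 1. 1. 1.]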
import pandas as pd
p1=pd.DataFrame({
    "Name":["Tom","Lucy","Tim"],
    "City":["Bj", "Sh", "Sz"],
    "Age":[12,15,18],
})
print(p1)
p1
   Name City  Age
0   Tom   Bj   12
1  Lucy   Sh   15
2   Tim   Sz   18
|   | Name | City | Age |
|---|---|---|---|
| 0 | Tom | Bj | 12 |
| 1 | Lucy | Sh | 15 |
| 2 | Tim | Sz | 18 |
display(p1[p1.City !="Bj"])
|   | Name | City | Age |
|---|---|---|---|
| 1 | Lucy | Sh | 15 |
| 2 | Tim | Sz | 18 |
p1[p1.City !="Bj"].shape
(2, 3)
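# Sketch: boolean masks combine with & and | (each condition parenthesized);
# query() is an equivalent string form.
display(p1[(p1.City != "Bj") & (p1.Age >= 15)])
display(p1.query('City != "Bj" and Age >= 15'))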
import matplotlib.pyplot as plt
# Take 20 evenly spaced points from -20 to 20
x=np.linspace(-20, 20, 20)
y=x**3 +2*x**2 +6*x +5
plt.plot(x,y, marker="o")
[<matplotlib.lines.Line2D at 0x7f076be70390>]
# Training set: generate a dataset with known labels
# Import the data generator
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
data=make_blobs(n_samples=200, centers=2, random_state=8)
X,y=data
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.spring, edgecolor='k')
plt.show()
import numpy as np
# Import the KNN classifier
from sklearn.neighbors import KNeighborsClassifier
# Import the train/test split utility
from sklearn.model_selection import train_test_split
clf=KNeighborsClassifier()
clf.fit(X, y)
# Draw the mesh-grid background
x_min, x_max = X[:, 0].min() -1, X[:,0].max()+1
y_min, y_max = X[:, 1].min() -1, X[:,1].max()+1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
# Predict every point in the region and use the prediction as its color value
Z=clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z=Z.reshape(xx.shape)
# Plot the colored regions and the scatter points
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1, shading='auto') # colored mesh; the third argument supplies the color values
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.spring, edgecolor='k') # scatter plot
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: KNN")
plt.show()
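# Alternative (sketch, assumes scikit-learn >= 1.1): DecisionBoundaryDisplay
# replaces the meshgrid/pcolormesh boilerplate above with a single call.
from sklearn.inspection import DecisionBoundaryDisplay
DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", cmap=plt.cm.Pastel1)
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.spring, edgecolor='k')
plt.title("Classifier: KNN")
plt.show()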
# The classification model consists of a pink region and a gray region.
# A new data point is assigned to whichever region it falls into.
# New data point: (6.75, 4.82)
# To draw this point on the model, add the following before plt.show(): plt.scatter(6.75, 4.82, marker="*", c="red", s=200)
# Draw the mesh-grid background
x_min, x_max = X[:, 0].min() -1, X[:,0].max()+1
y_min, y_max = X[:, 1].min() -1, X[:,1].max()+1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
# Predict every point in the region and use the prediction as its color value
Z=clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z=Z.reshape(xx.shape)
# Plot the colored regions and the scatter points
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1, shading='auto') # colored mesh; the third argument supplies the color values
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.spring, edgecolor='k') # scatter plot
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: KNN")
plt.scatter(6.75, 4.82, marker="*", c="red", s=200) # draw the new data point
plt.show()
# The plot shows the new point landing in the light-colored region.
# Verify once more with the model.
clf.predict([ [ 6.75, 4.82] ]) # indeed assigned to class 1
array([1])
print(clf.predict([ [6, 10] ])) # upper point -> 0
print(clf.predict([ [6, -0.29] ])) # lower point -> 1
[0]
[1]
# Import the data generator
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Raise the centers parameter of make_blobs so there are 5 classes
# Increase n_samples to 500 samples
data2=make_blobs(n_samples=500, centers=5, random_state=8)
X2,y2=data2
# Scatter plot
plt.scatter(X2[:,0], X2[:,1], c=y2, cmap=plt.cm.spring, edgecolor='k')
plt.show()
# The plot shows that 2 of the 5 classes overlap.
# Fit a KNN model
import numpy as np
# Import the KNN classifier
from sklearn.neighbors import KNeighborsClassifier
clf=KNeighborsClassifier()
clf.fit(X2, y2)
# Plot
x_min, x_max = X2[:, 0].min() -1, X2[:,0].max()+1
y_min, y_max = X2[:, 1].min() -1, X2[:,1].max()+1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
Z=clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z=Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1, shading='auto')
plt.scatter(X2[:,0], X2[:,1], c=y2, cmap=plt.cm.spring, edgecolor='k')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier: KNN")
plt.show()
# Five regions are produced. Most points are classified correctly; a few near the overlaps and boundaries are misclassified.
# This run took much longer than the previous one.
# Print the accuracy on the training set
clf.score(X2, y2) #0.956
0.956
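# Caveat (sketch): 0.956 is accuracy on the very data the model was fit on, which
# is optimistic. A fairer estimate scores a held-out split; the exact value depends on the split.
from sklearn.model_selection import train_test_split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=8)
clf2 = KNeighborsClassifier().fit(X2_train, y2_train)
print(clf2.score(X2_test, y2_test)) # generalization estimate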
# Import the make_regression data generator
from sklearn.datasets import make_regression
# Generate a dataset with 1 feature and a noise level of 50
X,y=make_regression(n_features=1, n_informative=1, noise=50, random_state=8)
# Scatter plot
import matplotlib.pyplot as plt
plt.scatter(X,y, c="orange", edgecolor="k")
plt.show()
# Plot: x spans roughly ±3, y roughly ±250, with points scattered along a rising diagonal
# Import the KNN model for regression analysis
from sklearn.neighbors import KNeighborsRegressor
reg=KNeighborsRegressor()
# Fit the data with the KNN model
reg.fit(X,y)
# Visualize
import numpy as np
z=np.linspace(-3,3, 200).reshape(-1,1)
plt.scatter(X,y,c="orange", edgecolor="k")
plt.plot(z, reg.predict(z), c="k", linewidth=3)
plt.title("KNN Regressor")
plt.show()
# The black line is the model produced by KNN regression. Visually the fit is poor: many data points are not covered by the model.
# Score the model
reg.score(X,y)
0.7721167832505298
# How can we improve the score?
# Tune n_neighbors (default 5); here we decrease it
reg2=KNeighborsRegressor(n_neighbors=2)
reg2.fit(X,y)
# Visualize again
import numpy as np
z=np.linspace(-3,3, 200).reshape(-1,1)
plt.scatter(X,y,c="orange", edgecolor="k")
plt.plot(z, reg2.predict(z), c="k", linewidth=3)
plt.title("KNN Regressor: n_neighbors=2")
plt.show()
# The black curve now covers more points; in other words, the model has become more complex.
# Score the model again
reg2.score(X,y) # the score did improve: 0.77 -> 0.86
0.8581798802065704
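# Caveat (sketch): both scores above are computed on the training data itself, so a
# smaller n_neighbors almost always looks better there while possibly generalizing
# worse. A fairer comparison scores a held-out split; which k wins depends on the data.
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=8)
for k in (2, 5):
    r = KNeighborsRegressor(n_neighbors=k).fit(X_tr, y_tr)
    print(k, r.score(X_te, y_te)) # test-set R^2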
# Suppose we know nothing about wine. Given the measured attributes of a bottle, how do we assign it a class?
from sklearn.datasets import load_wine
wine_dataset=load_wine()
print(type(wine_dataset)) # a Bunch is a fairly complex container
#wine_dataset
<class 'sklearn.utils.Bunch'>
wine_dataset.keys()
# data: the samples; target: class labels; target_names: class names; DESCR: dataset description; feature_names: feature names
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])
wine_dataset.data.shape # 178 rows (samples), 13 columns (features)
(178, 13)
print(wine_dataset.DESCR)
# The description shows 3 classes, class_0 through class_2.
# The 13 features are alcohol, malic acid, color intensity, and so on.
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
        - Alcohol
        - Malic acid
        - Ash
        - Alcalinity of ash
        - Magnesium
        - Total phenols
        - Flavanoids
        - Nonflavanoid phenols
        - Proanthocyanins
        - Color intensity
        - Hue
        - OD280/OD315 of diluted wines
        - Proline
        - class:
            - class_0
            - class_1
            - class_2

    :Summary Statistics:

    ============================= ==== ===== ======= =====
                                   Min   Max   Mean     SD
    ============================= ==== ===== ======= =====
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Flavanoids:                   0.34  5.08    2.03  1.00
    Nonflavanoid Phenols:         0.13  0.66    0.36  0.12
    Proanthocyanins:              0.41  3.58    1.59  0.57
    Colour Intensity:              1.3  13.0     5.1   2.3
    Hue:                          0.48  1.71    0.96  0.23
    OD280/OD315 of diluted wines: 1.27  4.00    2.61  0.71
    Proline:                       278  1680     746   315
    ============================= ==== ===== ======= =====

    :Missing Attribute Values: None
    :Class Distribution: class_0 (59), class_1 (71), class_2 (48)
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML Wine recognition datasets.
https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

The data is the results of a chemical analysis of wines grown in the same
region in Italy by three different cultivators. There are thirteen different
measurements taken for different constituents found in the three types of
wine.

Original Owners:

Forina, M. et al, PARVUS -
An Extendible Package for Data Exploration, Classification and Correlation.
Institute of Pharmaceutical and Food Analysis and Technologies,
Via Brigata Salerno, 16147 Genoa, Italy.

Citation:

Lichman, M. (2013). UCI Machine Learning Repository
[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science.

.. topic:: References

    (1) S. Aeberhard, D. Coomans and O. de Vel,
    Comparison of Classifiers in High Dimensional Settings,
    Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of
    Mathematics and Statistics, James Cook University of North Queensland.
    (Also submitted to Technometrics).

    The data was used with many others for comparing various classifiers.
    The classes are separable, though only RDA has achieved 100% correct
    classification.
    (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data))
    (All results using the leave-one-out technique)

    (2) S. Aeberhard, D. Coomans and O. de Vel,
    "THE CLASSIFICATION PERFORMANCE OF RDA"
    Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of
    Mathematics and Statistics, James Cook University of North Queensland.
    (Also submitted to Journal of Chemometrics).
# Generate the training and test sets
# train_test_split shuffles and splits the data: by default 75% goes to the training set and 25% to the test set.
# By convention X holds the features and y the corresponding labels: X is a 2-D array (a matrix), y is a 1-D array (a vector).
# Import the train/test split utility
from sklearn.model_selection import train_test_split
# Split the data. random_state is the random seed: when it is omitted (None), each run produces a different split; setting it to any fixed integer (including 0) makes the split reproducible.
X_train, X_test, y_train, y_test=train_test_split(
wine_dataset["data"], wine_dataset["target"], random_state=8
)
# Check the row/column counts
print("X_train", X_train.shape)
print("X_test", X_test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)
X_train (133, 13)
X_test (45, 13)
y_train (133,)
y_test (45,)
# Import the KNN classifier
from sklearn.neighbors import KNeighborsClassifier
# Set the model's n_neighbors parameter to 1
knn=KNeighborsClassifier(n_neighbors=1) # use only the single nearest known point
# fit: fit the model to the data
knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=1)
# Evaluate the model on the training and test sets
print(knn.score(X_train, y_train)) # perfect score on the training set
knn.score(X_test, y_test) # 0.7111 on the test set
1.0
0.7111111111111111
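# Note (sketch): KNN is distance-based, and the wine features sit on very different
# scales (Proline runs into the hundreds while Hue stays below 2 in the summary
# statistics above), so large-scale features dominate the distance. Standardizing
# first usually helps; knn_scaled is an illustrative name, not from the original.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=1))
knn_scaled.fit(X_train, y_train)
print(knn_scaled.score(X_test, y_test)) # typically well above the unscaled 0.7111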
# Use the model to predict new data
import numpy as np
X_new=np.array([ [ 13.2, 2.77, 2.51, 18.5,96.6,1.04,2.55,0.57,1.47,6.2,1.05,3.33,820] ] )
print(X_new.shape) #(1, 13)
prediction=knn.predict(X_new)
print(prediction, wine_dataset["target_names"][prediction] )
(1, 13)
[2] ['class_2']
# Try every candidate k and compare the scores on the training and test sets
result=[]
for k in range(1, 40, 1):
    # Set the model's n_neighbors parameter to k
    knn=KNeighborsClassifier(n_neighbors=k)
    # Fit the model to the training data
    knn.fit(X_train, y_train)
    # Score the model on both sets
    sTrain=knn.score(X_train, y_train) # training-set score
    sTest=knn.score(X_test, y_test)    # test-set score
    result.append( [k, sTrain, sTest])
# Plot the train/test scores against k
import matplotlib.pyplot as plt
plt.plot(np.array(result)[:, 0], np.array(result)[:, 1], label="train score")
plt.plot(np.array(result)[:, 0], np.array(result)[:, 2], label="test score")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x7f0c2ea91ef0>
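# Sketch: the best k for the test set can be read straight off the recorded results.
best_k, best_train, best_test = max(result, key=lambda r: r[2])
print(best_k, best_test)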