import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Load the iris table from CSV into a DataFrame.
iris=pd.read_csv('../data/iris_data/iris.csv', index_col=0) # use the first CSV column as the row labels
# Bare expressions: they only display something in an interactive session/notebook.
iris.head()
iris.shape
# Convert a sequence of labels into a list of colors.
def str2color(strs, dict1):
    """Map every label in ``strs`` to its color through ``dict1``.

    Args:
        strs: Iterable of labels (for example a pandas Series of names).
        dict1: Mapping from label to color.

    Returns:
        list: One color per label, in the same order as ``strs``.

    Raises:
        KeyError: If a label has no entry in ``dict1``.
    """
    return [dict1[item] for item in strs]
# One plotting color per row, chosen from the row's 'Species' value.
colors=str2color(iris['Species'], {'setosa':'red', 'versicolor':'green', 'virginica':'blue'})
# Bare expressions: previews that only display in an interactive session/notebook.
colors[0:10]
iris.head()
import matplotlib.pyplot as plt
# Scatter plot of 'Sepal.Length' (x) against 'Petal.Length' (y), colored per row.
plt.scatter(iris['Sepal.Length'], iris['Petal.Length'], color=colors )
import numpy as np
def splitData(df, test_ratio):
    """Randomly split ``df`` row-wise into a training part and a test part.

    Row positions are drawn without replacement through NumPy's global random
    state, so call ``np.random.seed`` beforehand for a reproducible split.

    Args:
        df: pandas DataFrame to split.
        test_ratio: Fraction of the rows, between 0 and 1, that goes into the
            test part. The test row count is ``round(n_rows * test_ratio)``.

    Returns:
        tuple: ``(train_df, test_df)``. Training rows keep their original
        order; test rows appear in the order they were drawn.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be between 0 and 1, got {!r}".format(test_ratio)
        )
    n = df.shape[0]
    test_size = round(n * test_ratio)
    # Row positions lie in [0, n); draw test_size of them without repetition.
    test_index = np.random.choice(np.arange(n), size=test_size, replace=False)
    # Everything that was not drawn for the test part is training data.
    train_index = np.delete(np.arange(n), test_index)
    return df.iloc[train_index], df.iloc[test_index]
# Seed NumPy's global RNG before splitting so the split can be repeated.
np.random.seed(1)
# Split iris with a test ratio of 0.2.
train_set, test_set=splitData(iris, 0.2)
print(train_set.shape)
print(test_set.shape)
# Bare expressions: previews that only display in an interactive session/notebook.
test_set.head()
test_set.iloc[1,]
# Manual walkthrough for a single test row.
# a: first four columns of every training row; b: first four columns of test row 0.
# NOTE(review): presumably these four columns are the numeric measurements - confirm against the CSV.
a=train_set.iloc[:,0:4]
b=test_set.iloc[0,0:4]
# Euclidean distance from b to each training row (squared differences summed along axis 1).
dist=np.sqrt( np.sum((a-b)**2,1) )
# Put each distance next to the 'Species' value of the same training row.
df=pd.DataFrame( {'dist':dist, 'clazz':train_set['Species']} )
print(df.head())
df.tail()
# Sort by ascending distance and keep the 7 closest training rows.
df2 = df.sort_values(by='dist', ascending=True)
df3=df2.head(7)
# Bare expressions: previews that only display in an interactive session/notebook.
df3
df3['clazz']
def getTopFreq(string1):
    """Return the item that occurs most often in ``string1``.

    Args:
        string1: Iterable of hashable items (for example a pandas Series of
            class labels).

    Returns:
        The most frequent item. When several items share the highest count,
        the one that was seen first wins. ``None`` if ``string1`` is empty.
    """
    counts = {}
    for item in string1:
        counts[item] = counts.get(item, 0) + 1
    if not counts:
        return None
    # max() returns the first maximal key and dicts iterate in insertion
    # order, so ties go to the item that appeared earliest.
    return max(counts, key=counts.get)
# Most frequent label among the rows kept in df3.
rs=getTopFreq(df3['clazz'])
# Bare expression: only displays in an interactive session/notebook.
rs
def classify0(testOne, trainSet, lables, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        testOne: Feature values of the sample to classify; it is subtracted
            from every row of ``trainSet``, so it must be compatible with
            those rows (e.g. a pandas Series with matching labels or a 1-D
            array with one value per column).
        trainSet: Training features, one row per sample.
        lables: Class label of each training row.
        k: Number of nearest neighbours that take part in the vote; must be
            at least 1.

    Returns:
        The label occurring most often among the ``k`` nearest training rows,
        as chosen by ``getTopFreq``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))
    # (1) Euclidean distance from testOne to every training row.
    dist = np.sqrt(np.sum((trainSet - testOne) ** 2, axis=1))
    # (2) Rank the training rows by distance. A stable sort keeps equidistant
    #     rows in their original order, so the k-th neighbour is deterministic.
    ranked = pd.DataFrame({'dist': dist, 'clazz': lables}).sort_values(
        by='dist', ascending=True, kind='mergesort'
    )
    # (3) Majority vote among the k closest rows.
    return getTopFreq(ranked['clazz'].head(k))
# Spot checks: classify single test rows and print the label stored in column 4 for comparison.
classify0(test_set.iloc[0,0:4], train_set.iloc[:,0:4], train_set['Species'], 7)
print(test_set.iloc[1,4])
classify0(test_set.iloc[1,0:4], train_set.iloc[:,0:4], train_set['Species'], 7)
j=3
print(test_set.iloc[j,4])
classify0(test_set.iloc[j,0:4], train_set.iloc[:,0:4], train_set['Species'], 10)
# NOTE(review): presumably a rule-of-thumb k of about sqrt(number of samples) - confirm.
np.sqrt(150)
import time
# Start of the wall-clock measurement; elapsed time is computed later as time.time()-start.
start=time.time()
def testCorrectRatio_K(k, n):
    """Return the share of the first ``n`` test rows that classify0 gets right.

    Reads the module-level ``train_set`` and ``test_set``: columns 0-3 are
    passed to ``classify0`` as features, ``train_set['Species']`` as the
    training labels, and column 4 of ``test_set`` is the expected label.

    Args:
        k: Number of neighbours handed to ``classify0``.
        n: Number of leading rows of ``test_set`` to evaluate; must be
            positive (``n == 0`` raises ``ZeroDivisionError``).

    Returns:
        float: Correct predictions divided by ``n``.
    """
    # These slices do not depend on the test row, so take them only once.
    train_features = train_set.iloc[:, 0:4]
    train_labels = train_set['Species']
    correct = sum(
        1
        for i in range(n)
        if classify0(test_set.iloc[i, 0:4], train_features, train_labels, k)
        == test_set.iloc[i, 4]
    )
    return correct / n
# Accuracy of the hand-written classifier on the whole test set for k = 1..99.
rightArr = []
n = test_set.shape[0]
for k in range(1, 100):
    rightRatio = testCorrectRatio_K(k, n)
    rightArr.append(rightRatio)
# Elapsed seconds since `start`; a bare expression, so it is only displayed
# in an interactive session/notebook.
time.time() - start
# Accuracy per k (list position 0 corresponds to k=1).
plt.plot(rightArr)
i = 1
print(test_set.iloc[i, 0:4])
classify0(test_set.iloc[i, 0:4], train_set.iloc[:, 0:4], train_set['Species'], k=30)
# Custom data
classify0(np.array([4.9, 2.5, 4.5, 1.7]), train_set.iloc[:, 0:4], train_set['Species'], k=30)
# Custom data
classify0(np.array([1.9, 1.5, 1.5, 0.7]), train_set.iloc[:, 0:4], train_set['Species'], k=30)
from sklearn.neighbors import KNeighborsClassifier
import time

# Same k sweep as above, this time with scikit-learn's k-NN classifier.
start = time.time()
correctArr = []
# These slices do not depend on k, so take them once instead of per iteration.
train_features = train_set.iloc[:, 0:4]
train_labels = train_set.iloc[:, 4]
test_features = test_set.iloc[:, 0:4]
test_labels = np.array(test_set.iloc[:, 4])
n = len(test_labels)
for k in range(1, 100):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(train_features, train_labels)
    pred = neigh.predict(test_features)
    # Number of test rows whose predicted label equals the stored label.
    j = sum(1 for r1, r2 in zip(pred, test_labels) if r1 == r2)
    correctArr.append(j / n)
# Elapsed seconds since `start`; a bare expression, so it is only displayed
# in an interactive session/notebook.
time.time() - start
# Much faster than my own implementation: mine took about 10 s, this one about 0.8 s.
# Line plot of the values collected in correctArr.
plt.plot(correctArr)