python KNN demo¶

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

get data set¶

import pandas as pd
# Load the iris table from CSV.
iris=pd.read_csv('../data/iris_data/iris.csv', index_col=0) # use the first column as the row index
iris.head()

iris.shape

(150, 5)

# string to color list
def str2color(strs, dict1):
    """Map each label in ``strs`` to its color via ``dict1``.

    Args:
        strs: Iterable of labels.
        dict1: Mapping from label to color.

    Returns:
        A list with one color per label, in input order.

    Raises:
        KeyError: If a label has no entry in ``dict1``.
    """
    return [dict1[item] for item in strs]

# One color per row, chosen by the row's species.
colors=str2color(iris['Species'], {'setosa':'red', 'versicolor':'green', 'virginica':'blue'})
colors[0:10]

['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red']

iris.head()

import matplotlib.pyplot as plt
# Sepal length against petal length, one point per row, colored via `colors`.
plt.scatter(iris['Sepal.Length'], iris['Petal.Length'], color=colors )

<matplotlib.collections.PathCollection at 0x25d00b37748>

to 1 function¶

import numpy as np
def splitData(df, test_ratio):
    """Randomly partition the rows of ``df`` into a training and a test part.

    ``round(n * test_ratio)`` distinct row positions are drawn with
    ``np.random.choice`` (global NumPy RNG) for the test part; all remaining
    rows, in their original order, form the training part.

    Args:
        df: DataFrame to split.
        test_ratio: Fraction of the rows to put into the test part.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """
    row_count = df.shape[0]
    test_size = round(row_count * test_ratio)
    positions = np.arange(row_count)
    # Sample row positions from [0, row_count) without repetition.
    test_positions = np.random.choice(positions, size=test_size, replace=False)
    # Every position that was not sampled stays in the training part.
    is_train = np.ones(row_count, dtype=bool)
    is_train[test_positions] = False
    return df.iloc[positions[is_train]], df.iloc[test_positions]
# Seed the global NumPy RNG so the random split below is reproducible.
np.random.seed(1)
# Put 20% of the rows into the test set, the rest into the training set.
train_set, test_set=splitData(iris, 0.2)
print(train_set.shape)
print(test_set.shape)

(120, 5)
(30, 5)

test_set.head()

calculate dist¶

test_set.iloc[1,]

Sepal.Length           5.1
Sepal.Width            2.5
Petal.Length             3
Petal.Width            1.1
Species         versicolor
Name: 99, dtype: object

# Euclidean distance from the first test row to every training row (columns 0-3).
a=train_set.iloc[:,0:4]
b=test_set.iloc[0,0:4]
dist=np.sqrt( np.sum((a-b)**2,1) )

#train_set.iloc[:,4:5]
# Put each distance next to the species of the same training row.
df=pd.DataFrame( {'dist':dist, 'clazz':train_set['Species']} )
print(df.head())
df.tail()

       dist   clazz
1  0.883176  setosa
2  1.360147  setosa
3  1.363818  setosa
4  1.529706  setosa
5  0.916515  setosa

order dist¶

# Sort by distance, smallest first, and keep the 7 closest rows.
df2 = df.sort_values(by='dist', ascending=True)
df3=df2.head(7)
df3

count type freq¶

df3['clazz']

16    setosa
19    setosa
11    setosa
37    setosa
49    setosa
33    setosa
47    setosa
Name: clazz, dtype: object

def getTopFreq(string1):
    """Return the most frequent item of ``string1``.

    Args:
        string1: Iterable of hashable items.

    Returns:
        The item with the highest count; on a tie, the one that appears
        first in ``string1``. ``None`` if ``string1`` is empty.
    """
    counts = {}
    for item in string1:
        counts[item] = counts.get(item, 0) + 1
    if not counts:
        return None
    # max() keeps the first maximal key in insertion order, so ties go to the
    # item seen first; the maximum is found in a single pass instead of being
    # recomputed for every key.
    return max(counts, key=counts.get)
        
rs=getTopFreq(df3['clazz'])
rs

'setosa'

to 1 function¶

def classify0(testOne, trainSet, lables, k):
    """Label one sample by the most frequent label among its k nearest training rows.

    Args:
        testOne: Feature values of the sample to classify.
        trainSet: Feature values of the training samples, one row each.
        lables: Label of each training row.
        k: Number of nearest rows whose labels are counted.

    Returns:
        Whatever ``getTopFreq`` returns for the labels of the ``k`` closest rows.
    """
    # (1) Euclidean distance from the sample to every training row.
    squared_diff = (trainSet - testOne) ** 2
    distances = np.sqrt(np.sum(squared_diff, axis=1))
    # (2) Pair each distance with its label and order the rows nearest first.
    neighbours = pd.DataFrame({'dist': distances, 'clazz': lables})
    nearest = neighbours.sort_values(by='dist', ascending=True).head(k)
    # (3) The most frequent label among the k nearest rows is the answer.
    return getTopFreq(nearest['clazz'])
# test
classify0(test_set.iloc[0,0:4], train_set.iloc[:,0:4], train_set['Species'], 7)

'setosa'

print(test_set.iloc[1,4])
classify0(test_set.iloc[1,0:4], train_set.iloc[:,0:4], train_set['Species'], 7)

versicolor

'versicolor'

j=3
print(test_set.iloc[j,4])
classify0(test_set.iloc[j,0:4], train_set.iloc[:,0:4], train_set['Species'], 10)

setosa

'setosa'

get the best K¶

np.sqrt(150)

12.24744871391589

import time
start=time.time()

def testCorrectRatio_K(k,n):
    """Return the fraction of the first ``n`` test rows that ``classify0`` labels correctly.

    Reads the notebook-level ``train_set`` and ``test_set``: columns 0-3 are
    passed as features, ``train_set['Species']`` as training labels, and
    column 4 of ``test_set`` is the value each prediction is compared with.

    Args:
        k: Number of neighbours passed to ``classify0``.
        n: Number of leading test rows to evaluate.

    Returns:
        Number of correct predictions divided by ``n``.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    # These slices are the same for every test row, so build them once
    # instead of once per iteration.
    train_features = train_set.iloc[:, 0:4]
    train_labels = train_set['Species']
    hits = 0
    for i in range(n):
        pred = classify0(test_set.iloc[i, 0:4], train_features, train_labels, k)
        if pred == test_set.iloc[i, 4]:
            hits += 1
    return hits / n
#

# Accuracy on the whole test set for k = 1..99; rightArr[i] belongs to k = i + 1.
rightArr=[]
n=test_set.shape[0]
for k in range(1,100):
    rightRatio=testCorrectRatio_K(k,n)
    rightArr.append(rightRatio)
    #print(k, rightRatio)
# Seconds elapsed since `start` was recorded.
time.time()-start

12.984581470489502

# rightArr[i] holds the accuracy for k = i + 1, so plot against the real k
# values; plotting the bare list would draw every point one step to the left.
plt.plot(range(1, len(rightArr) + 1), rightArr)

[<matplotlib.lines.Line2D at 0x25d01be2ba8>]

using the best K¶

i=1
print(test_set.iloc[i,0:4])
classify0(test_set.iloc[i,0:4], train_set.iloc[:,0:4], train_set['Species'], k=30)

Sepal.Length    5.1
Sepal.Width     2.5
Petal.Length      3
Petal.Width     1.1
Name: 99, dtype: object

'versicolor'

# Custom (hand-made) sample
classify0(np.array([4.9,2.5,4.5,1.7]), train_set.iloc[:,0:4], train_set['Species'], k=30)

'versicolor'

# Custom (hand-made) sample
classify0(np.array([1.9,1.5,1.5,0.7]), train_set.iloc[:,0:4], train_set['Species'], k=30)

'setosa'

KNN using package¶

from sklearn.neighbors import KNeighborsClassifier

import time
start=time.time()

# Test-set accuracy of scikit-learn's KNeighborsClassifier for k = 1..99;
# correctArr[i] belongs to k = i + 1.
correctArr=[]
# Features and labels do not depend on k, so slice them once up front
# instead of rebuilding the label array for every test row.
train_features=train_set.iloc[:,0:4]
train_labels=train_set.iloc[:,4]
test_features=test_set.iloc[:,0:4]
test_labels=np.array(test_set.iloc[:,4])
n=len(test_labels)
for k in range(1,100):
    neigh=KNeighborsClassifier(n_neighbors=k)
    neigh.fit(train_features, train_labels)
    pred=neigh.predict(test_features)
    # Count the matching predictions with one element-wise comparison.
    j=int(np.sum(pred==test_labels))
    correctArr.append(j/n)

time.time()-start

1.000542163848877

比我写的快太多了！我写的版本耗时约 13s，而 sklearn 版本只用了约 1s。

# correctArr[i] holds the accuracy for k = i + 1, so plot against the real k
# values; plotting the bare list would draw every point one step to the left.
plt.plot(range(1, len(correctArr) + 1), correctArr)

[<matplotlib.lines.Line2D at 0x25d0211d780>]

	dist	clazz
145	5.180734	virginica
146	4.713809	virginica
148	4.553021	virginica
149	4.750789	virginica
150	4.333590	virginica

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
1	5.1	3.5	1.4	0.2	setosa
2	4.9	3.0	1.4	0.2	setosa
3	4.7	3.2	1.3	0.2	setosa
4	4.6	3.1	1.5	0.2	setosa
5	5.0	3.6	1.4	0.2	setosa

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
15	5.8	4.0	1.2	0.2	setosa
99	5.1	2.5	3.0	1.1	versicolor
76	6.6	3.0	4.4	1.4	versicolor
17	5.4	3.9	1.3	0.4	setosa
132	7.9	3.8	6.4	2.0	virginica

	dist	clazz
16	0.547723	setosa
19	0.556776	setosa
11	0.583095	setosa
37	0.591608	setosa
49	0.655744	setosa
33	0.685565	setosa
47	0.830662	setosa

python KNN demo¶

get data set¶

to 1 function¶

calculate dist¶

order dist¶

count type freq¶

to 1 function¶

get the best K¶

using the best K¶

KNN using package¶

手写体数字识别 //todo¶