Naive Bayes classifier — annotated code walkthrough (text classification with discrete features, then Gaussian NB for continuous features)
贝叶斯就是把无法计算的概率,转化为容易计算的概率,然后算出来的过程。中间需要一些假设,而且即使这些假设不成立,分类效果依旧很好。
离散型数据
连续型数据
本例是一个帖子分类问题。输入评论的句子,判断帖子是否有侮辱性。--> 再复杂点就是垃圾邮件分类。
对于单词的统计有2种策略,1是只统计有无出现,2是统计出现的次数。
从文本,构建文本所有单词列表。
然后构建每个文本中是否出现该单词,1出现,0没有出现。
计算每个单词出现的频率: w表示词向量, ci表示帖子分类
训练模型就是计算每个分类下,每个单词的频率。
测试模型,就是输入单词次数,使用计算好的各个分类下的单词频率,计算频率最高的分类
import re
def loadDataSet():
    """Return (tokenized posts, labels); label 1 = abusive, 0 = clean."""
    sentences = [
        'my dog has flea problems help please',
        'mybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid'
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 = contains abusive words, 0 = does not
    tokenized = [re.split(' ', sentence) for sentence in sentences]
    return tokenized, labels
# test: quick sanity check of the loader
loadDataSet()
def createVocabList(dataSet):
    """Return the list of unique words appearing across all documents."""
    vocab = set()
    for document in dataSet:
        vocab.update(document)  # union with this document's words
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Set-of-words model: return a 0/1 vector marking which vocab words occur.

    vocabList: list of unique words; inputSet: iterable of words of one document.
    Words missing from the vocabulary are reported but otherwise ignored.
    """
    # Build a word -> position map once instead of calling list.index per word
    # (turns O(len(inputSet) * len(vocabList)) into O(len(inputSet))).
    word_index = {word: i for i, word in enumerate(vocabList)}
    rsVect = [0] * len(vocabList)
    for word in inputSet:
        if word in word_index:
            rsVect[word_index[word]] = 1
        else:
            print('the word: %s is not in my Vocabulary!' % word)
    return rsVect
# test: build the vocabulary and a couple of set-of-words vectors
posts,tags=loadDataSet()
vocabList=createVocabList(posts)
print(vocabList)
print( setOfWords2Vec(vocabList, posts[0]) )
print( setOfWords2Vec(vocabList, posts[2]) )
# get word vector matrix: one 0/1 row per post
train_X=[]
for post in posts:
    train_X.append(setOfWords2Vec(vocabList, post))
print(train_X)
1.由词向量,我们知道一个词是否出现在某个文档中,也知道了某个文档的分类。
2.而计算 p(w|ci)=p(w0,w1,...,wn|ci) 就是统计该分类下,每个单词出现的概率。
import numpy as np
def trainNB0(train_X, train_Y):
    """Estimate naive-Bayes word frequencies with NO smoothing.

    Returns (p0Vect, p1Vect, pAbusive): per-word frequencies for class 0
    and class 1, plus the prior probability of class 1.
    """
    doc_count = len(train_X)
    vocab_size = len(train_X[0])
    # prior: fraction of documents labelled 1
    pAbusive = sum(train_Y) / float(doc_count)
    word_counts = {0: np.zeros(vocab_size), 1: np.zeros(vocab_size)}
    word_totals = {0: 0.0, 1: 0.0}
    for doc_vec, label in zip(train_X, train_Y):
        key = 1 if label == 1 else 0
        word_counts[key] += doc_vec       # per-word counts in this class
        word_totals[key] += sum(doc_vec)  # total word count in this class
    p0Vect = word_counts[0] / word_totals[0]  # counts -> frequencies
    p1Vect = word_counts[1] / word_totals[1]
    return p0Vect, p1Vect, pAbusive
# test the unsmoothed trainer
pV0,pV1,pAb=trainNB0(train_X, train_Y=tags)
print(pV0)
print(pV1)
print(pAb)
# 'cute' appears once in class 0 and zero times in class 1
不能出现零,否则相乘后都是0.
同时,x>0时,f(x)与 f(ln(x)) 的单调性相同。所以把乘法转为取log后的加法。
# 训练模型
import numpy as np
def trainNB1(train_X, train_Y):
    """Naive-Bayes trainer with Laplace smoothing and log frequencies.

    Word counts start at 1 and denominators at 2 so no frequency is zero;
    log is taken so per-word probabilities can be summed instead of
    multiplied. Returns (p0Vect, p1Vect, pAbusive).
    """
    doc_count = len(train_X)
    vocab_size = len(train_X[0])
    pAbusive = sum(train_Y) / float(doc_count)  # prior for class 1
    word_counts = {0: np.ones(vocab_size), 1: np.ones(vocab_size)}  # Laplace start
    word_totals = {0: 2.0, 1: 2.0}
    for doc_vec, label in zip(train_X, train_Y):
        key = 1 if label == 1 else 0
        word_counts[key] += doc_vec       # per-word counts in this class
        word_totals[key] += sum(doc_vec)  # total word count in this class
    p0Vect = np.log(word_counts[0] / word_totals[0])
    p1Vect = np.log(word_counts[1] / word_totals[1])
    return p0Vect, p1Vect, pAbusive
# test the smoothed, log-space trainer
pV0,pV1,pAb=trainNB1(train_X, train_Y=tags)
print(pV0)
print(pV1)
print(pAb)
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if the class-1 log posterior beats class 0, else 0.

    p0Vec/p1Vec are log word frequencies; pClass1 is the class-1 prior.
    """
    # dot the word vector with the log frequencies, then add the log prior
    score1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    score0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    print(score0, score1)
    return 1 if score1 > score0 else 0
# test: a clean sentence and an abusive one
entryPost=['love', 'my', 'dalmation']
entryVect=np.array(setOfWords2Vec(vocabList, entryPost))
print( entryVect )
classifyNB(entryVect, pV0, pV1, pAb)
entryPost=['stupid', 'garbage']
entryVect=np.array(setOfWords2Vec(vocabList, entryPost))
print( entryVect )
classifyNB(entryVect, pV0, pV1, pAb)
def bagOfWords2Vec(vocabList, inputSet):
    """Bag-of-words model: return a vector of per-word occurrence counts.

    Unlike setOfWords2Vec this counts repeated words; words missing from
    the vocabulary are silently skipped.
    """
    # word -> position map avoids an O(len(vocabList)) list.index per word
    word_index = {word: i for i, word in enumerate(vocabList)}
    rsVec = [0] * len(vocabList)
    for word in inputSet:
        if word in word_index:
            rsVec[word_index[word]] += 1
    return rsVec
# 训练模型
import numpy as np
def trainNB2(train_X, train_Y):
    """Train on bag-of-words COUNT vectors.

    The accumulation is identical to trainNB1 (`+=` works the same whether
    the rows hold 0/1 flags or occurrence counts), so delegate to it rather
    than keeping a verbatim copy of that function's body.
    Returns (p0Vect, p1Vect, pAbusive).
    """
    return trainNB1(train_X, train_Y)
# test the bag-of-words pipeline
# get word vector matrix (occurrence counts this time)
train_X2=[]
for post in posts:
    train_X2.append(bagOfWords2Vec(vocabList, post))
print(train_X2)
pV0_2,pV1_2,pAb_2=trainNB2(train_X2, train_Y=tags)
print(pV0_2)
print(pV1_2)
print(pAb_2)
entryPost=['love', 'my', 'dalmation']
entryVect=np.array(bagOfWords2Vec(vocabList, entryPost))
print( entryVect )
classifyNB(entryVect, pV0_2, pV1_2, pAb_2)
entryPost=['stupid', 'garbage']
entryVect=np.array(bagOfWords2Vec(vocabList, entryPost))
print( entryVect )
classifyNB(entryVect, pV0_2, pV1_2, pAb_2)
接着还有多分类(n>2)怎么处理?
已知数据(列:性别、身高(英尺)、体重(磅)、脚掌(英寸)):
男 6 180 12;男 5.92 190 11;男 5.58 170 12;男 5.92 165 10;女 5 100 6;女 5.5 150 8;女 5.42 130 7;女 5.75 150 9。
问题:已知某人身高 6 英尺、体重 130 磅、脚掌 8 英寸,请问该人是男是女?
原理:P(ci|w) = P(w|ci) * P(ci) / P(w)。分母 P(w) 对所有类别相同,可以忽略;先验 P(ci) 很容易计算。根据朴素贝叶斯假设,各特征条件独立:P(w|ci) = P(w0|ci) P(w1|ci) ... P(wn|ci)。
连续特征用概率密度函数计算:假设每个特征服从正态分布 X ~ N(mu, sigma^2),只需估计均值和方差两个参数。
# R
heights=c(6,5.92,5.58,5.92)
mean2=mean(heights);mean2 #5.855
var2=var(heights);var2 #0.03503333
1/sqrt(2*3.1415*0.035)*exp(-(6-5.855)^2/(2*0.035))
# JS
1/Math.sqrt(2*3.1415*0.035)*Math.exp(-((6-5.855)**2)/(2*0.035)) #1.579206773964085
# Python: normal pdf with mean 5.855, var 0.035 evaluated at x=6 (matches the R/JS results above)
1/np.sqrt(2*3.1415*0.035)*np.exp(-((6-5.855)**2)/(2*0.035))
def loadDataSet2():
    """Toy gender dataset: columns are height (ft), weight (lb), foot size (in)."""
    males = [[6, 180, 12], [5.92, 190, 11], [5.58, 170, 12], [5.92, 165, 10]]
    females = [[5, 100, 6], [5.5, 150, 8], [5.42, 130, 7], [5.75, 150, 9]]
    labels = [1] * len(males) + [0] * len(females)  # 1 = male, 0 = female
    return males + females, labels
# test: load and show the gender dataset
dataSet, tags=loadDataSet2()
print(dataSet)
print(tags)
import numpy as np
def trainBayes(dataSet, tags):
    """Fit a Gaussian naive Bayes model.

    For each class label, store [column means, column sample variances
    (ddof=1), row count] in a dict keyed by the label.
    """
    paras = {}
    uniqTag = set(tags)
    print('uniqTag=', uniqTag)
    for tag in uniqTag:
        # gather the rows belonging to this class
        rows = np.array([row for row, label in zip(dataSet, tags) if label == tag])
        # per-column statistics plus the class size (used later as the prior)
        paras[tag] = [rows.mean(axis=0), rows.var(axis=0, ddof=1), rows.shape[0]]
    return paras
# test
paras=trainBayes(dataSet, tags)
paras
# class 0 (female): column means, column variances, row count
# class 1 (male): column means, column variances, row count
P(身高=6|男) × P(体重=130|男) × P(脚掌=8|男) × P(男) ≈ 6.1984 × 10^-9;P(身高=6|女) × P(体重=130|女) × P(脚掌=8|女) × P(女) ≈ 5.3778 × 10^-4。可以看到,女性的(未归一化)后验概率比男性高出约 8.7 × 10^4 倍(与下面 a0/a1 的计算一致),所以判断该人为女性。
def predictBayes(newEntry, paras):
    """Classify one sample with Gaussian naive Bayes; return the best label.

    paras maps label -> [column means, column variances, row count]
    (as produced by trainBayes). Scores are compared in log space.
    """
    sample = np.array(newEntry)
    labels = set(paras.keys())
    # class sizes give the priors
    class_counts = {label: paras[label][-1] for label in labels}
    total_rows = sum(class_counts.values())
    best_label = None
    best_score = None
    for label in labels:
        means = paras[label][0]
        variances = paras[label][1]
        densities = []
        for col in range(len(means)):
            mu = means[col]
            var = variances[col]
            x = sample[col]
            # normal pdf; NOTE(review): 3.1415 (not np.pi) kept to match the worked example above
            densities.append(1/np.sqrt(2*3.1415*var)*np.exp(-((x-mu)**2)/(2*var)))
        prior = class_counts[label] / total_rows
        # sum of logs instead of a product of probabilities
        score = np.sum(np.log(np.array(densities))) + np.log(np.array(prior))
        print(label, score)
        # keep the label with the highest log posterior (first wins ties)
        if best_score is None or score > best_score:
            best_score = score
            best_label = label
    return best_label
# test: the query from the worked example — predicted female (0)
predictBayes([6, 130, 8],paras)
a0=np.exp(-7.527996461433017) # unnormalised log posterior for female
a1=np.exp(-18.89914470021889) # unnormalised log posterior for male
print(a0, a1, a0/a1)
# too few samples for a held-out set, so validate against the training data itself
i=-1
for item in dataSet:
    i=i+1
    pred=predictBayes(item,paras)
    print('>>>>>>>>>>',item, '; Pred=',pred, '; Actual=', tags[i], '; ', pred==tags[i])
# iris example: check the working directory so the relative CSV path below resolves
import os
os.getcwd()
import pandas as pd
def loadDataSet3():
    """Load the iris dataset from a relative CSV path (first column is the index)."""
    return pd.read_csv('../iris_data/iris.csv', index_col=0)
iris=loadDataSet3()
iris.head()  # peek at the first rows
import numpy as np
def splitData(df, test_ratio):
    """Randomly split df into (train, test) by row position.

    round(n * test_ratio) rows are sampled without replacement for the
    test set; every remaining row goes to the training set.
    """
    n = df.shape[0]
    test_size = round(n * test_ratio)
    # positions in [0, n), drawn without replacement
    index = np.random.choice(np.arange(n), size=test_size, replace=False)
    test_index = np.array(index)
    keep = np.ones(n, dtype=bool)
    keep[test_index] = False  # complement of the sampled positions
    return df.iloc[keep], df.iloc[test_index,]
np.random.seed(1)  # reproducible split
train_set, test_set=splitData(iris, 0.2)
print(train_set.shape)
print(test_set.shape)
def np2vec(npArray):
    """Split a DataFrame into (list of 4-feature rows, list of Species labels)."""
    feature_rows = []
    for row_idx in range(npArray.shape[0]):
        # first four columns are the numeric features
        feature_rows.append([npArray.iloc[row_idx, col] for col in range(4)])
    # labels come from the 'Species' column, in row order
    labels = [species for species in npArray['Species']]
    return feature_rows, labels
#test: fit Gaussian NB on the iris training split
trainX, trainY=np2vec(train_set)
testX, testY=np2vec(test_set)
paras=trainBayes(trainX, tags=trainY)
paras
predictBayes([4.9, 3.1, 1.5, 0.1],paras)
# evaluate on the held-out test split
i=-1
rsArr=[]
j=0
for item in testX:
    i=i+1
    pred=predictBayes(item,paras)
    rs=(pred==testY[i])
    rsArr.append(rs)
    if rs==True:
        j+=1
    print('>>>>>>>>>>',i,item, '; Pred=',pred, '; Actual=', testY[i], '; ', rs, '\n')
# BUG FIX: the accuracy denominator was i (the LAST index, i.e. n-1), not the
# sample count, which overstates accuracy; also scale before rounding so the
# percentage itself is rounded to 2 decimals.
print(i+1, j, round(100.0*j/(i+1), 2), '%')
完全正确!