Chapter 13: Text Data Processing
# Import the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# Fit it on an English sentence
en = ["The quick brown fox jumps over a lazy dog"]
vect.fit(en)
print("words:{}".format(len(vect.vocabulary_)))
print("words:{}".format(vect.vocabulary_))
words:8
words:{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumps': 3, 'over': 5, 'lazy': 4, 'dog': 1}
# Chinese, however, is not split into words automatically (CountVectorizer splits on whitespace)
cn = ["那只敏捷的棕色狐狸跳过了一只懒惰的狗"]
vect.fit(cn)
vect.vocabulary_
{'那只敏捷的棕色狐狸跳过了一只懒惰的狗': 0}
# For Chinese, use the jieba tokenizer: pip3 install jieba -i https://pypi.douban.com/simple/
# jieba-0.42.1
import jieba
cn1 = jieba.lcut(cn[0])
print("cn1:", cn1)
cn1: ['那', '只', '敏捷', '的', '棕色', '狐狸', '跳过', '了', '一只', '懒惰', '的', '狗']
# Re-join the tokens with spaces so CountVectorizer can split them
cn2 = [' '.join(cn1)]
print("cn2:", cn2)
vect.fit(cn2)
vect.vocabulary_
cn2: ['那 只 敏捷 的 棕色 狐狸 跳过 了 一只 懒惰 的 狗']
{'敏捷': 2, '棕色': 3, '狐狸': 4, '跳过': 5, '一只': 0, '懒惰': 1}
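Note that only 6 of the 12 tokens made it into the vocabulary: CountVectorizer's default token_pattern, r"(?u)\b\w\w+\b", keeps only tokens of two or more characters, so single-character words such as '的' and '狗' are silently dropped. A minimal sketch, in case you want to keep them as well (vect_all is just an illustrative name):
# Relax the default token_pattern so single-character tokens survive.
vect_all = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
vect_all.fit(cn2)
print(vect_all.vocabulary_)  # now also contains '那', '只', '的', '了', '狗'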
# Build the bag-of-words representation
print(cn2)
bag_of_words = vect.transform(cn2)
print("2->", bag_of_words)
print("3->", repr(bag_of_words))
# Print the dense form of the bag-of-words matrix
print(bag_of_words.toarray())
['那 只 敏捷 的 棕色 狐狸 跳过 了 一只 懒惰 的 狗']
2-> (0, 0) 1
    (0, 1) 1
    (0, 2) 1
    (0, 3) 1
    (0, 4) 1
    (0, 5) 1
3-> <1x6 sparse matrix of type '<class 'numpy.int64'>' with 6 stored elements in Compressed Sparse Row format>
[[1 1 1 1 1 1]]
## Try a more complex sentence
cn_1 = jieba.lcut("懒惰的狐狸不如敏捷的狐狸敏捷,敏捷的狐狸不如懒惰的狐狸懒惰")
cn_2 = [" ".join(cn_1)]
print(cn_2)
# Build a new bag-of-words vector using the already fitted vocabulary
new_bag = vect.transform(cn_2)
print("bag-of-words features:", repr(new_bag))
print("dense form:", new_bag.toarray())
# Index 0 ('一只') appears 0 times; index 1 ('懒惰') appears 3 times; ...; index 4 ('狐狸') appears 4 times; ...
# The bag-of-words model only records how often each word occurs, not where it occurs.
['懒惰 的 狐狸 不如 敏捷 的 狐狸 敏捷 , 敏捷 的 狐狸 不如 懒惰 的 狐狸 懒惰']
bag-of-words features: <1x6 sparse matrix of type '<class 'numpy.int64'>' with 3 stored elements in Compressed Sparse Row format>
dense form: [[0 3 3 0 4 0]]
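As a quick sanity check, a short sketch (reusing the vect and new_bag defined above) that pairs each vocabulary word with its count:
# Walk the vocabulary in index order and print each word's count.
dense = new_bag.toarray()[0]
for word, idx in sorted(vect.vocabulary_.items(), key=lambda kv: kv[1]):
    print(idx, word, dense[idx])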
Besides word frequency, word order matters too.
# Make up a sentence
import jieba
joke = jieba.lcut("道士看到和尚亲吻了尼姑的嘴唇")
# Insert spaces
joke = [' '.join(joke)]
# Convert to a vector
# Import the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(joke)
joke_feature = vect.transform(joke)
# Print the results
print("vocabulary:", vect.vocabulary_)
print("features:", joke_feature.toarray())
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.460 seconds.
Prefix dict has been built successfully.
vocabulary: {'道士': 5, '看到': 4, '和尚': 1, '亲吻': 0, '尼姑': 3, '嘴唇': 2}
features: [[1 1 1 1 1 1]]
# Reorder the words
joke2 = jieba.lcut("尼姑看到道士亲吻了和尚的嘴唇")
# Insert spaces
joke2 = [' '.join(joke2)]
# Extract features
joke_feature2 = vect.transform(joke2)
# Print the results
print("vocabulary:", vect.vocabulary_)
print("features:", joke_feature2.toarray())
# To a human these two sentences are clearly different, yet to this model they are identical. That is a real problem.
vocabulary: {'道士': 5, '看到': 4, '和尚': 1, '亲吻': 0, '尼姑': 3, '嘴唇': 2}
features: [[1 1 1 1 1 1]]
# This can be tuned with CountVectorizer's ngram_range parameter.
# ngram_range takes a tuple of integers (min_n, max_n);
# n=2 gives a bigram model, where the n-gram joins every 2 adjacent words into one feature;
# n=3 gives a trigram model, where the n-gram joins every 3 adjacent words into one feature.
vect = CountVectorizer(ngram_range=(2, 2))
# Re-extract features from the text
cv = vect.fit(joke)
joke_feature = cv.transform(joke)
print("vocabulary:", cv.vocabulary_)
print("features:", joke_feature.toarray())
vocabulary: {'道士 看到': 4, '看到 和尚': 3, '和尚 亲吻': 1, '亲吻 尼姑': 0, '尼姑 嘴唇': 2}
features: [[1 1 1 1 1]]
# After reordering the words
# Extract features
joke_feature2 = cv.transform(joke2)
# Print the results
print("vocabulary:", cv.vocabulary_)
print("features:", joke_feature2.toarray())
# The model no longer treats the two sentences as the same.
vocabulary: {'道士 看到': 4, '看到 和尚': 3, '和尚 亲吻': 1, '亲吻 尼姑': 0, '尼姑 嘴唇': 2}
features: [[0 0 0 0 0]]
tf-idf stands for term frequency-inverse document frequency.
Formula: there are many ways to compute tf-idf; the variant sklearn's TfidfTransformer uses with smooth_idf=False (as we do below) is
tf-idf(t, d) = tf(t, d) × (ln(N / df(t)) + 1)
where tf(t, d) is the count of term t in document d, N is the number of documents, and df(t) is the number of documents containing t; each row is then L2-normalized.
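As a sanity check on the formula, here is a small sketch on a toy corpus of our own (not the chapter's data), computing tf-idf by hand and comparing it with TfidfTransformer(smooth_idf=False):
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
docs = ["the fox jumps", "the dog sleeps", "the fox sleeps"]  # made-up toy corpus
counts = CountVectorizer().fit_transform(docs)             # raw term counts tf(t, d)
n = counts.shape[0]                                        # number of documents N
df = (counts > 0).sum(axis=0).A1                           # document frequency df(t)
manual = counts.toarray() * (np.log(n / df) + 1)           # tf * idf with smooth_idf=False
manual /= np.linalg.norm(manual, axis=1, keepdims=True)    # L2 row normalization (sklearn's default)
auto = TfidfTransformer(smooth_idf=False).fit_transform(counts).toarray()
print(np.allclose(manual, auto))                           # True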
# We now use an English movie-review dataset: one file of review texts and one of sentiment labels (positive or negative)
# http://ai.stanford.edu/~amaas/data/sentiment/
def readFiles2Array(filename):
    """Read a text file and return its stripped lines as a list."""
    arr = []
    with open(filename) as fr:
        for line in fr:
            arr.append(line.strip())
    return arr

reviews = readFiles2Array("data/reviews.txt")
labels = readFiles2Array("data/labels.txt")
# Look at one review and its label
print(reviews[0], "\n", labels[0])
bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as teachers . my years in the teaching profession lead me to believe that bromwell high s satire is much closer to reality than is teachers . the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn t positive
# Split the data
import numpy as np
from sklearn.model_selection import train_test_split
# To keep things fast, use only the first 2000 reviews
total_size = 2000
X_train, X_test, y_train, y_test = train_test_split(np.array(reviews)[:total_size], np.array(labels)[:total_size])
print(X_train.shape, X_test.shape)  # the full dataset would give (18750,) (6250,); the default test_size=0.25 splits 2000 into 1500/500
(1500,) (500,)
# Fit the vectorizer on the training texts
vect = CountVectorizer().fit(X_train)
# Convert the texts to vectors
X_train_vect = vect.transform(X_train)
print("number of features in training set:{}".format(len(vect.get_feature_names_out())))
print("last 10 training set features:{}".format(vect.get_feature_names_out()[-10:]))
X_train_vect.shape  # (1500, 19861): nearly 20k features
number of features in training set:19861
last 10 training set features:['zorro' 'zsigmond' 'zu' 'zubeidaa' 'zucco' 'zuckerman' 'zuzz' 'zwick' 'zz' 'zzzz']
(1500, 19861)
# Supervised learning: score the model with cross-validation
# Import the linear SVC classifier
from sklearn.svm import LinearSVC
# Import the cross-validation scorer
from sklearn.model_selection import cross_val_score
# Score the model with cross-validation (5-fold by default)
scores = cross_val_score(LinearSVC(max_iter=3000), X_train_vect, y_train)
print("mean score:{:0.3f}".format(scores.mean()))
# with 1k reviews: 0.836
# with 2k reviews: 0.858
mean score:0.849
# Generalization to the test set
X_test_vect = vect.transform(X_test)
# Fit a linear SVC on the training set
clf = LinearSVC(max_iter=3000).fit(X_train_vect, y_train)
print("train score:{:0.3f}".format(clf.score(X_train_vect, y_train)))
print("test score:{:0.3f}".format(clf.score(X_test_vect, y_test)))
# test score:
# with 1k reviews: 0.844
# with 2k reviews: 0.860
train score:1.000
test score:0.864
from sklearn.feature_extraction.text import TfidfTransformer
# Rescale the training and test sets with tf-idf
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(X_train_vect)
X_train_tfidf = tfidf.transform(X_train_vect)
X_test_tfidf = tfidf.transform(X_test_vect)
# Compare the features before and after the transformation
print("features before tf-idf:\n", X_train_vect[:6, 20:30].toarray())
print("features after tf-idf:")
import pandas as pd
display(pd.DataFrame(X_train_tfidf[:6, 20:30].toarray()))
# Before the transformation the features are raw word counts; afterwards each is a float: term frequency times inverse document frequency.
features before tf-idf:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
features after tf-idf:
|   | 0   | 1   | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   |
|---|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
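The slice printed above happens to be all zeros in both representations. To compare entries that are actually used, a quick sketch (reusing X_train_vect and X_train_tfidf from above) that picks the most frequent column among the first six documents:
import numpy as np
# Column with the largest total count across the first six documents.
col = X_train_vect[:6].toarray().sum(axis=0).argmax()
print("counts:", X_train_vect[:6, col].toarray().ravel())
print("tf-idf:", X_train_tfidf[:6, col].toarray().ravel())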
# Re-score using the tf-idf features
# Score the model with cross-validation
scores = cross_val_score(LinearSVC(max_iter=3000), X_train_tfidf, y_train)
print("mean score of CV:{:0.3f}".format(scores.mean()))
# Train the linear SVC model
clf = LinearSVC(max_iter=3000).fit(X_train_tfidf, y_train)
print("train score:{:0.3f}".format(clf.score(X_train_tfidf, y_train)))
print("test score:{:0.3f}".format(clf.score(X_test_tfidf, y_test)))
# The test score has improved
mean score of CV:0.902
train score:1.000
test score:0.928
Stop words are words that occur very frequently but carry little meaning, typically interjections, conjunctions, and prepositions.
# Import the built-in stop-word list
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("number of built-in sklearn stop words:", len(ENGLISH_STOP_WORDS))
print("first 20 and last 20:\n", list(ENGLISH_STOP_WORDS)[:20], "\n", list(ENGLISH_STOP_WORDS)[-20:])
number of built-in sklearn stop words: 318
first 20 and last 20:
['until', 'into', 'whereby', 'anywhere', 'some', 'around', 'us', 'five', 'these', 'ten', 'anyhow', 'interest', 'by', 'couldnt', 'throughout', 'nothing', 'put', 'hundred', 'thence', 'per']
['had', 'may', 'somewhere', 'co', 'therefore', 'with', 'were', 'him', 'many', 'fifteen', 'own', 'me', 'mine', 'being', 'between', 'wherever', 'three', 'done', 'move', 'fifty']
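To see what the filter actually removes, a minimal sketch on a made-up sentence (not from the review data) before we apply it to the reviews below:
from sklearn.feature_extraction.text import CountVectorizer
sample = ["this is a very good movie and i like it"]
print(CountVectorizer().fit(sample).get_feature_names_out())
print(CountVectorizer(stop_words="english").fit(sample).get_feature_names_out())
# First line: ['and' 'good' 'is' 'it' 'like' 'movie' 'this' 'very'] ('a' and 'i' are single characters, dropped by the default tokenizer)
# Second line: ['good' 'like' 'movie'], the content words that survive the stop-word filter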
# Remove stop words and see whether the model score improves
from sklearn.feature_extraction.text import TfidfVectorizer
# Enable the stop-word parameter
tfidf = TfidfVectorizer(smooth_idf=False, stop_words="english")
# Fit on the training texts
tfidf.fit(X_train)
# Convert the training set to vectors
X_train_tfidf = tfidf.transform(X_train)
# Score the model with cross-validation
scores3 = cross_val_score(LinearSVC(max_iter=3000), X_train_tfidf, y_train)
print("mean score of CV:{:0.3f}".format(scores3.mean()))
# Train the linear SVC model
X_test_tfidf = tfidf.transform(X_test)
clf = LinearSVC(max_iter=3000).fit(X_train_tfidf, y_train)
print("train score:{:0.3f}".format(clf.score(X_train_tfidf, y_train)))
print("test score:{:0.3f}".format(clf.score(X_test_tfidf, y_test)))  # a slight improvement over the raw-count model
mean score of CV:0.902
train score:1.000
test score:0.918
Natural language processing