Chapter 13: Text Data Processing
# Import the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# Fit it on an English sentence
en = ["The quick brown fox jumps over a lazy dog"]
vect.fit(en)
print("words:{}".format(len(vect.vocabulary_)))
print("words:{}".format(vect.vocabulary_))
words:8
words:{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumps': 3, 'over': 5, 'lazy': 4, 'dog': 1}
# Chinese, however, is not split into words automatically (CountVectorizer splits on whitespace)
cn = ["那只敏捷的棕色狐狸跳过了一只懒惰的狗"]
vect.fit(cn)
vect.vocabulary_
{'那只敏捷的棕色狐狸跳过了一只懒惰的狗': 0}
# For Chinese, use the jieba tokenizer: pip3 install jieba -i https://pypi.douban.com/simple/
# jieba-0.42.1
import jieba
cn1 = jieba.lcut(cn[0])
print("cn1:", cn1)
cn1: ['那', '只', '敏捷', '的', '棕色', '狐狸', '跳过', '了', '一只', '懒惰', '的', '狗']
# Re-join the tokens with spaces so CountVectorizer can split them
cn2 = [' '.join(cn1)]
print("cn2:", cn2)
vect.fit(cn2)
vect.vocabulary_
cn2: ['那 只 敏捷 的 棕色 狐狸 跳过 了 一只 懒惰 的 狗']
{'敏捷': 2, '棕色': 3, '狐狸': 4, '跳过': 5, '一只': 0, '懒惰': 1}
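Note that only 6 of the 12 tokens made it into the vocabulary: CountVectorizer's default token_pattern, r"(?u)\b\w\w+\b", keeps only tokens of two or more characters, so single-character words such as '的' and '狗' are silently dropped. A minimal sketch, in case you want to keep them as well (vect_all is just an illustrative name):
# Relax the default token_pattern so single-character tokens survive.
vect_all = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
vect_all.fit(cn2)
print(vect_all.vocabulary_)  # now also contains '那', '只', '的', '了', '狗'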
# Build the bag-of-words representation
print(cn2)
bag_of_words = vect.transform(cn2)
print("2->", bag_of_words)
print("3->", repr(bag_of_words))
# Print the dense form of the bag-of-words matrix
print(bag_of_words.toarray())
['那 只 敏捷 的 棕色 狐狸 跳过 了 一只 懒惰 的 狗']
2-> (0, 0) 1
    (0, 1) 1
    (0, 2) 1
    (0, 3) 1
    (0, 4) 1
    (0, 5) 1
3-> <1x6 sparse matrix of type '<class 'numpy.int64'>' with 6 stored elements in Compressed Sparse Row format>
[[1 1 1 1 1 1]]
## Try a more complex sentence
cn_1 = jieba.lcut("懒惰的狐狸不如敏捷的狐狸敏捷,敏捷的狐狸不如懒惰的狐狸懒惰")
cn_2 = [" ".join(cn_1)]
print(cn_2)
# Build a new bag-of-words vector using the already fitted vocabulary
new_bag = vect.transform(cn_2)
print("bag-of-words features:", repr(new_bag))
print("dense form:", new_bag.toarray())
# Index 0 ('一只') appears 0 times; index 1 ('懒惰') appears 3 times; ...; index 4 ('狐狸') appears 4 times; ...
# The bag-of-words model only records how often each word occurs, not where it occurs.
['懒惰 的 狐狸 不如 敏捷 的 狐狸 敏捷 , 敏捷 的 狐狸 不如 懒惰 的 狐狸 懒惰']
bag-of-words features: <1x6 sparse matrix of type '<class 'numpy.int64'>' with 3 stored elements in Compressed Sparse Row format>
dense form: [[0 3 3 0 4 0]]
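As a quick sanity check, a short sketch (reusing the vect and new_bag defined above) that pairs each vocabulary word with its count:
# Walk the vocabulary in index order and print each word's count.
dense = new_bag.toarray()[0]
for word, idx in sorted(vect.vocabulary_.items(), key=lambda kv: kv[1]):
    print(idx, word, dense[idx])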
Besides word frequency, word order matters too.
# Make up a sentence
import jieba
joke = jieba.lcut("道士看到和尚亲吻了尼姑的嘴唇")
# Insert spaces
joke = [' '.join(joke)]
# Convert to a vector
# Import the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(joke)
joke_feature = vect.transform(joke)
# Print the results
print("vocabulary:", vect.vocabulary_)
print("features:", joke_feature.toarray())
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.460 seconds.
Prefix dict has been built successfully.
vocabulary: {'道士': 5, '看到': 4, '和尚': 1, '亲吻': 0, '尼姑': 3, '嘴唇': 2}
features: [[1 1 1 1 1 1]]
# Reorder the words
joke2 = jieba.lcut("尼姑看到道士亲吻了和尚的嘴唇")
# Insert spaces
joke2 = [' '.join(joke2)]
# Extract features
joke_feature2 = vect.transform(joke2)
# Print the results
print("vocabulary:", vect.vocabulary_)
print("features:", joke_feature2.toarray())
# To a human these two sentences are clearly different, yet to this model they are identical. That is a real problem.
vocabulary: {'道士': 5, '看到': 4, '和尚': 1, '亲吻': 0, '尼姑': 3, '嘴唇': 2}
features: [[1 1 1 1 1 1]]
# This can be tuned with CountVectorizer's ngram_range parameter.
# ngram_range takes a tuple of integers (min_n, max_n);
# n=2 gives a bigram model, where the n-gram joins every 2 adjacent words into one feature;
# n=3 gives a trigram model, where the n-gram joins every 3 adjacent words into one feature.
vect = CountVectorizer(ngram_range=(2, 2))
# Re-extract features from the text
cv = vect.fit(joke)
joke_feature = cv.transform(joke)
print("vocabulary:", cv.vocabulary_)
print("features:", joke_feature.toarray())
vocabulary: {'道士 看到': 4, '看到 和尚': 3, '和尚 亲吻': 1, '亲吻 尼姑': 0, '尼姑 嘴唇': 2}
features: [[1 1 1 1 1]]
# After reordering the words
# Extract features
joke_feature2 = cv.transform(joke2)
# Print the results
print("vocabulary:", cv.vocabulary_)
print("features:", joke_feature2.toarray())
# The model no longer treats the two sentences as the same.
vocabulary: {'道士 看到': 4, '看到 和尚': 3, '和尚 亲吻': 1, '亲吻 尼姑': 0, '尼姑 嘴唇': 2}
features: [[0 0 0 0 0]]
tf-idf stands for term frequency-inverse document frequency.
Formula: there are many ways to compute tf-idf; the variant sklearn's TfidfTransformer uses with smooth_idf=False (as we do below) is
tf-idf(t, d) = tf(t, d) × (ln(N / df(t)) + 1)
where tf(t, d) is the count of term t in document d, N is the number of documents, and df(t) is the number of documents containing t; each row is then L2-normalized.
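As a sanity check on the formula, here is a small sketch on a toy corpus of our own (not the chapter's data), computing tf-idf by hand and comparing it with TfidfTransformer(smooth_idf=False):
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
docs = ["the fox jumps", "the dog sleeps", "the fox sleeps"]  # made-up toy corpus
counts = CountVectorizer().fit_transform(docs)             # raw term counts tf(t, d)
n = counts.shape[0]                                        # number of documents N
df = (counts > 0).sum(axis=0).A1                           # document frequency df(t)
manual = counts.toarray() * (np.log(n / df) + 1)           # tf * idf with smooth_idf=False
manual /= np.linalg.norm(manual, axis=1, keepdims=True)    # L2 row normalization (sklearn's default)
auto = TfidfTransformer(smooth_idf=False).fit_transform(counts).toarray()
print(np.allclose(manual, auto))                           # True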
# We now use an English movie-review dataset: one file of review texts and one of sentiment labels (positive or negative)
# http://ai.stanford.edu/~amaas/data/sentiment/
def readFiles2Array(filename):
    """Read a text file and return its stripped lines as a list."""
    arr = []
    with open(filename) as fr:
        for line in fr:
            arr.append(line.strip())
    return arr

reviews = readFiles2Array("data/reviews.txt")
labels = readFiles2Array("data/labels.txt")
# Look at one review and its label
print(reviews[0], "\n", labels[0])
bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as teachers . my years in the teaching profession lead me to believe that bromwell high s satire is much closer to reality than is teachers . the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn t positive
# Split the data
import numpy as np
from sklearn.model_selection import train_test_split
# To keep things fast, use only the first 2000 reviews
total_size = 2000
X_train, X_test, y_train, y_test = train_test_split(np.array(reviews)[:total_size], np.array(labels)[:total_size])
print(X_train.shape, X_test.shape)  # the full dataset would give (18750,) (6250,); the default test_size=0.25 splits 2000 into 1500/500
(1500,) (500,)
# Fit the vectorizer on the training texts
vect = CountVectorizer().fit(X_train)
# Convert the texts to vectors
X_train_vect = vect.transform(X_train)
print("number of features in training set:{}".format(len(vect.get_feature_names_out())))
print("last 10 training set features:{}".format(vect.get_feature_names_out()[-10:]))
X_train_vect.shape  # (1500, 19861): nearly 20k features
number of features in training set:19861
last 10 training set features:['zorro' 'zsigmond' 'zu' 'zubeidaa' 'zucco' 'zuckerman' 'zuzz' 'zwick' 'zz' 'zzzz']
(1500, 19861)
# Supervised learning: score the model with cross-validation
# Import the linear SVC classifier
from sklearn.svm import LinearSVC
# Import the cross-validation scorer
from sklearn.model_selection import cross_val_score
# Score the model with cross-validation (5-fold by default)
scores = cross_val_score(LinearSVC(max_iter=3000), X_train_vect, y_train)
print("mean score:{:0.3f}".format(scores.mean()))
# with 1k reviews: 0.836
# with 2k reviews: 0.858
mean score:0.849
# Generalization to the test set
X_test_vect = vect.transform(X_test)
# Fit a linear SVC on the training set
clf = LinearSVC(max_iter=3000).fit(X_train_vect, y_train)
print("train score:{:0.3f}".format(clf.score(X_train_vect, y_train)))
print("test score:{:0.3f}".format(clf.score(X_test_vect, y_test)))
# test score:
# with 1k reviews: 0.844
# with 2k reviews: 0.860
train score:1.000
test score:0.864
from sklearn.feature_extraction.text import TfidfTransformer
# Rescale the training and test sets with tf-idf
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(X_train_vect)
X_train_tfidf = tfidf.transform(X_train_vect)
X_test_tfidf = tfidf.transform(X_test_vect)
# Compare the features before and after the transformation
print("features before tf-idf:\n", X_train_vect[:6, 20:30].toarray())
print("features after tf-idf:")
import pandas as pd
display(pd.DataFrame(X_train_tfidf[:6, 20:30].toarray()))
# Before the transformation the features are raw word counts; afterwards each is a float: term frequency times inverse document frequency.
features before tf-idf:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
features after tf-idf:
|   | 0   | 1   | 2   | 3   | 4   | 5   | 6   | 7   | 8   | 9   |
|---|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
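The slice printed above happens to be all zeros in both representations. To compare entries that are actually used, a quick sketch (reusing X_train_vect and X_train_tfidf from above) that picks the most frequent column among the first six documents:
import numpy as np
# Column with the largest total count across the first six documents.
col = X_train_vect[:6].toarray().sum(axis=0).argmax()
print("counts:", X_train_vect[:6, col].toarray().ravel())
print("tf-idf:", X_train_tfidf[:6, col].toarray().ravel())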
# Re-score using the tf-idf features
# Score the model with cross-validation
scores = cross_val_score(LinearSVC(max_iter=3000), X_train_tfidf, y_train)
print("mean score of CV:{:0.3f}".format(scores.mean()))
# Train the linear SVC model
clf = LinearSVC(max_iter=3000).fit(X_train_tfidf, y_train)
print("train score:{:0.3f}".format(clf.score(X_train_tfidf, y_train)))
print("test score:{:0.3f}".format(clf.score(X_test_tfidf, y_test)))
# The test score has improved
mean score of CV:0.902
train score:1.000
test score:0.928
Stop words are words that occur very frequently but carry little meaning, typically interjections, conjunctions, and prepositions.
# Import the built-in stop-word list
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("number of built-in sklearn stop words:", len(ENGLISH_STOP_WORDS))
print("first 20 and last 20:\n", list(ENGLISH_STOP_WORDS)[:20], "\n", list(ENGLISH_STOP_WORDS)[-20:])
number of built-in sklearn stop words: 318
first 20 and last 20:
['until', 'into', 'whereby', 'anywhere', 'some', 'around', 'us', 'five', 'these', 'ten', 'anyhow', 'interest', 'by', 'couldnt', 'throughout', 'nothing', 'put', 'hundred', 'thence', 'per']
['had', 'may', 'somewhere', 'co', 'therefore', 'with', 'were', 'him', 'many', 'fifteen', 'own', 'me', 'mine', 'being', 'between', 'wherever', 'three', 'done', 'move', 'fifty']
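To see what the filter actually removes, a minimal sketch on a made-up sentence (not from the review data) before we apply it to the reviews below:
from sklearn.feature_extraction.text import CountVectorizer
sample = ["this is a very good movie and i like it"]
print(CountVectorizer().fit(sample).get_feature_names_out())
print(CountVectorizer(stop_words="english").fit(sample).get_feature_names_out())
# First line: ['and' 'good' 'is' 'it' 'like' 'movie' 'this' 'very'] ('a' and 'i' are single characters, dropped by the default tokenizer)
# Second line: ['good' 'like' 'movie'], the content words that survive the stop-word filter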
# Remove stop words and see whether the model score improves
from sklearn.feature_extraction.text import TfidfVectorizer
# Enable the stop-word parameter
tfidf = TfidfVectorizer(smooth_idf=False, stop_words="english")
# Fit on the training texts
tfidf.fit(X_train)
# Convert the training set to vectors
X_train_tfidf = tfidf.transform(X_train)
# Score the model with cross-validation
scores3 = cross_val_score(LinearSVC(max_iter=3000), X_train_tfidf, y_train)
print("mean score of CV:{:0.3f}".format(scores3.mean()))
# Train the linear SVC model
X_test_tfidf = tfidf.transform(X_test)
clf = LinearSVC(max_iter=3000).fit(X_train_tfidf, y_train)
print("train score:{:0.3f}".format(clf.score(X_train_tfidf, y_train)))
print("test score:{:0.3f}".format(clf.score(X_test_tfidf, y_test)))  # a slight improvement over the raw-count model
mean score of CV:0.902
train score:1.000
test score:0.918
Natural language processing