十二次作业

1.读取

# 1. Read the dataset
def read_dataset(file_path=r'D:SMSSpamCollection.txt'):
    """Load the tab-separated SMS corpus and preprocess every message.

    Each row is expected to hold the label in column 0 and the raw message
    text in column 1.

    Args:
        file_path: Location of the tab-separated text file. The default is
            the path that was previously hard-coded in the function body.

    Returns:
        A ``(sms_data, sms_label)`` tuple of two parallel lists: the
        preprocessed message texts and their labels.
    """
    sms_data = []
    sms_label = []
    # The context manager closes the file even if preprocessing raises;
    # newline='' is the mode the csv module documents for reader input.
    with open(file_path, encoding='utf-8', newline='') as sms:
        for line in csv.reader(sms, delimiter='\t'):
            if not line:
                # csv.reader yields [] for blank lines; indexing would fail.
                continue
            sms_label.append(line[0])  # label column
            sms_data.append(preprocess(line[1]))  # cleaned message text
    return sms_data, sms_label

2.数据预处理

# 2. Data preprocessing
def preprocess(text):
    """Normalise one message into a space-joined string of lemmas.

    Steps: tokenise, lowercase, drop English stop words and tokens shorter
    than three characters, POS-tag, then lemmatise each token.

    Args:
        text: Raw message text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Tokenise sentence by sentence, then word by word.
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # A set gives O(1) membership tests instead of scanning the list per token.
    stops = set(stopwords.words('english'))
    # Lowercase BEFORE filtering so that capitalised stop words ("The",
    # "And") are removed as well.
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if len(token) >= 3 and token not in stops]
    wnl = WordNetLemmatizer()
    tagged = nltk.pos_tag(tokens)  # list of (token, tag) pairs
    # get_wordnet_pos is defined outside this block; presumably it maps the
    # tagger's tag to a WordNet POS constant -- confirm against its definition.
    lemmas = [wnl.lemmatize(token, pos=get_wordnet_pos(tag)) for token, tag in tagged]
    return ' '.join(lemmas)

3.数据划分—训练集和测试集数据划分

from sklearn.model_selection import train_test_split

# Stratify on the full label vector being split (`target`); `y_train` does not
# exist yet when this call is evaluated.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0, stratify=target)

def split_dataset(data, label):
    """Split features and labels into training and test subsets.

    20% of the samples go to the test set, the split is stratified on
    ``label``, and ``random_state=0`` keeps it reproducible.

    Args:
        data: Sequence of samples.
        label: Sequence of labels aligned with ``data``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data, label, test_size=0.2, random_state=0, stratify=label
    )
    return x_train, x_test, y_train, y_test

4.文本特征提取

sklearn.feature_extraction.text.CountVectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=sklearn%20feature_extraction%20text%20tfidfvectorizer

sklearn.feature_extraction.text.TfidfVectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=sklearn%20feature_extraction%20text%20tfidfvectorizer#sklearn.feature_extraction.text.TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer; no arguments are passed, so library defaults apply.
tfidf2 = TfidfVectorizer()

观察邮件与向量的关系

向量还原为邮件

# 4. Text feature extraction
# Convert the text into a TF-IDF feature matrix
def tfidf_dataset(x_train,x_test):
    """Vectorise the training and test texts with one TF-IDF vectorizer.

    The vectorizer is fitted on ``x_train`` only and then reused to
    transform ``x_test``.

    Returns:
        ``(X_train, X_test, tfidf)``: the two transformed matrices and the
        fitted vectorizer.
    """
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(x_train)
    test_matrix = vectorizer.transform(x_test)
    return train_matrix, test_matrix, vectorizer
# Map a vector back to the words of the message it encodes
def revert_mail(x_train, X_train, model):
    """Print how the first training message relates to its feature vector.

    Args:
        x_train: Sequence of message texts; only ``x_train[0]`` is read.
        X_train: Sparse feature matrix (must support slicing and
            ``.toarray()``); only its first row is inspected.
        model: Object exposing ``vocabulary_``, a mapping from word to
            column index.

    Returns:
        None. All results are printed.
    """
    # Densify only the first row instead of converting the whole matrix
    # (twice) just to read row 0. Slicing with [:1] keeps the result 2-D.
    first_vector = X_train[:1].toarray()[0]
    print("第一封邮件向量表示为：", first_vector)
    nonzero_idx = np.flatnonzero(first_vector)  # positions of non-zero entries
    print("非零元素的位置:", nonzero_idx)
    print("向量的非零元素的值：", first_vector[nonzero_idx])
    # A set makes each vocabulary lookup O(1) rather than a scan of the array.
    nonzero_set = set(nonzero_idx.tolist())
    key_list = [word for word, column in model.vocabulary_.items() if column in nonzero_set]
    print("向量非零元素对应的单词：", key_list)
    print("向量化之前的邮件：", x_train[0])

5.模型选择

from sklearn.naive_bayes import GaussianNB

from sklearn.naive_bayes import MultinomialNB

说明为什么选择这个模型？

答：本次邮件数据经向量化后得到的是离散、非负的词频类特征，并不符合正态分布，因此不适合选择高斯朴素贝叶斯（GaussianNB），而应该选择适用于词频类特征的多项式朴素贝叶斯（MultinomialNB）。

# 5. Model selection
def mnb_model(x_train, x_test, y_train, y_test):
    """Fit a multinomial naive Bayes classifier and report its accuracy.

    The classifier is trained on ``(x_train, y_train)`` and used to predict
    ``x_test``. The sample count, the number of correct predictions and the
    accuracy are printed.

    Returns:
        The predictions for ``x_test``.
    """
    classifier = MultinomialNB()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    total = len(y_test)
    print("总数：", total)
    hits = predictions == y_test  # element-wise match against the true labels
    print("预测正确数：", hits.sum())
    print("预测准确率：", sum(hits) / total)
    return predictions

6.模型评价：混淆矩阵，分类报告

from sklearn.metrics import confusion_matrix

# Store the result under a different name so the imported confusion_matrix
# function is not overwritten and stays callable afterwards.
conf_matrix = confusion_matrix(y_test, y_predict)

说明混淆矩阵的含义

from sklearn.metrics import classification_report

说明准确率、精确率、召回率、F值分别代表的意义

答：①混淆矩阵 confusion-matrix:

TP（True Positive）：真实为0，预测为0（此处以类别0作为正例）

TN（True Negative）：真实为1，预测为1

FN（False Negative）：真实为0，预测为1

FP（False Positive）：真实为1，预测为0

②准确率 accuracy：代表分类器对整个样本判断正确的比重。

③精确率 precision：指被分类器判为正例的样本中，真正为正例的比重，即 TP / (TP + FP)。

④召回率 recall：指所有真实正例中，被正确预测为正例的比重，即 TP / (TP + FN)。

⑤F值：精确率和召回率的加权调和平均。

# Model evaluation: confusion matrix and classification report
def class_report(ypre_mnb, y_test):
    """Print the confusion matrix, classification report and accuracy.

    Args:
        ypre_mnb: Predicted labels.
        y_test: True labels, aligned with ``ypre_mnb``.

    Returns:
        None. All results are printed.
    """
    separator = "====================================================="
    conf_matrix = confusion_matrix(y_test, ypre_mnb)
    print(separator)
    # The newline must be an escape sequence: a raw line break inside a
    # quoted string literal is a syntax error.
    print("混淆矩阵：\n", conf_matrix)
    report = classification_report(y_test, ypre_mnb)
    print(separator)
    print("分类报告：\n", report)
    # The diagonal holds the correctly classified samples; using the trace
    # gives the same value as [0][0] + [1][1] for two classes and also works
    # for more than two.
    print("模型准确率：", np.trace(conf_matrix) / np.sum(conf_matrix))
7.比较与总结
如果用CountVectorizer进行文本特征生成，与TfidfVectorizer相比，效果如何？

相关阅读:
Windows server 2016 解决“无法完成域加入，原因是试图加入的域的SID与本计算机的SID相同。”
Windows Server 2016 辅助域控制器搭建
 Windows Server 2016 主域控制器搭建
 Net Framework 4.7.2 覆盖 Net Framework 4.5 解决办法
 SQL SERVER 2012更改默认的端口号为1772
Windows下彻底卸载删除SQL Serever2012
在Windows Server2016中安装SQL Server2016
SQL Server 创建索引
 C#控制台或应用程序中两个多个Main()方法的设置
 Icon cache rebuilding with Delphi（Delphi 清除Windows 图标缓存源代码）
原文地址：https://www.cnblogs.com/Gidupar/p/12972242.html