• 【机器学习】朴素贝叶斯文本分类案例


     1 import pandas as pd
     2 from sklearn.feature_extraction.text import CountVectorizer
     3 import jieba
     4 import numpy as np
     5 from sklearn.naive_bayes import MultinomialNB
     6 
     7 # 1、加载数据
     8 data = pd.read_csv("./data.csv", encoding="ansi")
     9 print("data:
    ", data)
    10 print("data 的列索引:
    ", data.columns)
    11 
    12 content = []
    13 
    14 # 分词
    15 for tmp in data.loc[:, "内容 "]:
    16     print(tmp)
    17     # 以精确模式对文章进行分词
    18     seg = jieba.cut(tmp, cut_all=False)
    19     # seg = jieba.cut(tmp, cut_all=True)
    20     seg_ = ",".join(seg)
    21 
    22     content.append(seg_)
    23 
    24 # print(content)
    25 
    26 data.loc[:, "内容 "] = content
    27 
    28 print(data)
    29 
    30 # 加载停止词
    31 with open("./stopwords.txt", "r", encoding="utf-8") as f:
    32     stopwords = f.readlines()
    33     # 去除掉停止词前后的空白字符
    34     st_list = [tmp.strip() for tmp in stopwords]
    35 # print(st_list)
    36 # print(len(st_list))
    37 # 停止词去重
    38 st_list = list(set(st_list))
    39 # print("去重之后的停止词长度:
    ", len(st_list))
    40 
    41 # # 2、将文本内容转化为数值类型
    42 # #  统计词数 统计词的重要性程度
    43 # # 1、实例化对象
    44 conv = CountVectorizer(stop_words=st_list)
    45 # 2、词数统计
    46 x = conv.fit_transform(data.loc[:, "内容 "])
    47 print("x: 
    ", x)
    48 
    49 # 获取统计的词语
    50 feature_names = conv.get_feature_names()
    51 res = x.toarray()
    52 print(feature_names)
    53 print("res:
    ", res)
    54 
    55 # 将目标值获取到
    56 data.loc[data.loc[:, "评价"] == "好评", "评价"] = 0
    57 data.loc[data.loc[:, "评价"] == "差评", "评价"] = 1
    58 
    59 # print(data.dtypes)
    60 # 将目标值转化为Int类型
    61 data.loc[:, "评价"] = data.loc[:, "评价"].astype(np.int64)
    62 print("data的type: 
    ", data.dtypes)
    63 
    64 # 将转化为数组之后的特征与目标值进行拼接
    65 data = np.concatenate((res, data.loc[:, "评价"].values.reshape(-1, 1)), axis=1)
    66 
    67 print("data:
    ", data)
    68 print("data:
    ", data.dtype)
    69 
    70 # 拆分成训练集与测试集
    71 train = data[[0, 1, 3, 6, 8, 9, 10, 11, 12, 4], :]
    72 test = data[[2, 5, 7], :]
    73 
    74 # 构建朴素贝叶斯算法进行分类
    75 # 1、实例化对象
    76 nb = MultinomialNB(alpha=1.0)
    77 # 2、训练数据
    78 nb.fit(train[:, :-1], train[:, -1])
    79 # 3、预测数据
    80 y_predict = nb.predict(test[:, :-1])
    81 
    82 # 获取准确率
    83 score = nb.score(test[:, :-1], test[:, -1])
    84 
    85 print("预测结果:
    ", y_predict)
    86 print("准确率:
    ", score)
  • 相关阅读:
    比较实用的断点调试技巧
    objc非主流代码技巧
    0代码ViewController
    xib的动态桥接
    ios中集合遍历方法的比较和技巧
    再见了NSLog
    Reactive Cocoa Tutorial [4] = 只取所需的Filters
    objc@interface的设计哲学与设计技巧
    ARC下dealloc过程及.cxx_destruct的探究
    Reactive Cocoa Tutorial [3] = "RACSignal的巧克力工厂“;
  • 原文地址:https://www.cnblogs.com/Tree0108/p/12116212.html
Copyright © 2020-2023  润新知