• 微博情感分析


    每个文本文件包含相对应类别的数据(0:喜悦,1:愤怒,2:厌恶,3:低落,分别对应不同类别的情感)

    1. 文本读取
    2. 用均值和方差,在高斯分布里面计算某个词的概率。
    3. 对文本特征进行提取,提取词频。
    4. 通过词频在各种词频目录里面进行匹配。
    5. 对模型的准确率的预测。

    main.py

      1 # -*- coding: utf-8 -*-
      2 import os
      3 import pandas as pd
      4 import nltk
      5 from tools import proc_text, split_train_test, get_word_list_from_data, \
      6     extract_feat_from_data, cal_acc
      7 from nltk.text import TextCollection
      8 from sklearn.naive_bayes import GaussianNB
      9 
     # Directory that holds the raw corpus files and the generated CSV files.
     dataset_path = './dataset'

     # Raw corpus files, one per class. The first character of each file name
     # is parsed as the integer class label by read_and_save_to_csv().
     text_filenames = [
         '0_simplifyweibo.txt',
         '1_simplifyweibo.txt',
         '2_simplifyweibo.txt',
         '3_simplifyweibo.txt',
     ]

     # CSV file holding the raw (label, text) pairs.
     output_text_filename = 'raw_weibo_text.csv'

     # CSV file holding the cleaned text data.
     output_cln_text_filename = 'clean_weibo_text.csv'

     # Processing and cleaning the text takes a long time, so it is controlled
     # by this switch:
     #   True  - first run: process and clean the raw text, then save it.
     #   False - the cleaned text was already saved by an earlier run.
     is_first_run = True
     24 
     25 
     def read_and_save_to_csv():
         """Merge the raw per-class text files into one labelled CSV file.

         Every file listed in ``text_filenames`` is read from ``dataset_path``.
         The first character of the file name is used as the integer label of
         every line in that file. The combined table is written to
         ``output_text_filename`` inside ``dataset_path`` with the columns
         ``label`` and ``text`` and without the row index.
         """
         per_file_frames = []
         for filename in text_filenames:
             # The file name starts with the class label, i.e. 0, 1, 2 or 3.
             class_label = int(filename[0])

             # Read the text file, one sample per line.
             with open(os.path.join(dataset_path, filename), 'r', encoding='utf-8') as src:
                 samples = src.read().splitlines()

             # Pair every sample with the label of the file it came from.
             label_column = pd.Series([class_label] * len(samples))
             text_column = pd.Series(samples)
             per_file_frames.append(pd.concat([label_column, text_column], axis=1))

         merged = pd.concat(per_file_frames, axis=0)
         merged.columns = ['label', 'text']

         # Save as a CSV file.
         csv_path = os.path.join(dataset_path, output_text_filename)
         merged.to_csv(csv_path, index=None, encoding='utf-8')
     57 
     58 
     def run_main():
         """Run the pipeline: clean, split, extract features, train and evaluate."""
         raw_csv_path = os.path.join(dataset_path, output_text_filename)
         clean_csv_path = os.path.join(dataset_path, output_cln_text_filename)

         # 1. Read, process, clean and prepare the data.
         if is_first_run:
             # First run: the raw text still has to be processed and cleaned.
             print('处理清洗文本数据中...', end=' ')

             # Dump the raw text together with its labels into a CSV file.
             read_and_save_to_csv()

             # Load that CSV file back as the working data set.
             raw_df = pd.read_csv(raw_csv_path, encoding='utf-8')

             # Clean and segment every text entry.
             raw_df['text'] = raw_df['text'].apply(proc_text)

             # Drop the rows whose text became an empty string.
             raw_df = raw_df[raw_df['text'] != '']

             # Persist the cleaned text so later runs can skip this step.
             raw_df.to_csv(clean_csv_path, index=None, encoding='utf-8')
             print('完成,并保存结果。')

         # 2. Split into training and test sets.
         print('加载处理好的文本数据')
         cleaned_df = pd.read_csv(clean_csv_path, encoding='utf-8')
         train_df, test_df = split_train_test(cleaned_df)
         # Show the number of samples per class in each set.
         print('训练集中各类的数据个数:', train_df.groupby('label').size())
         print('测试集中各类的数据个数:', test_df.groupby('label').size())

         # 3. Feature extraction.
         # Number of most frequent words used as features.
         top_k = 200

         # Count word frequencies over the training set only.
         print('统计词频...')
         train_words = get_word_list_from_data(train_df)
         word_freq_dist = nltk.FreqDist(train_words)
         top_word_counts = word_freq_dist.most_common(top_k)
         print('出现最多的{}个词是:'.format(top_k))
         for token, n_occurrences in top_word_counts:
             print('{}: {}次'.format(token, n_occurrences))
         print()

         # The collection is built from the training texts and reused for the
         # test set below.
         corpus = TextCollection(train_df['text'].values.tolist())
         print('训练样本提取特征...', end=' ')
         train_X, train_y = extract_feat_from_data(train_df, corpus, top_word_counts)
         print('完成')
         print()

         print('测试样本提取特征...', end=' ')
         test_X, test_y = extract_feat_from_data(test_df, corpus, top_word_counts)
         print('完成')

         # 4. Train the Naive Bayes model.
         print('训练模型...', end=' ')
         classifier = GaussianNB()
         classifier.fit(train_X, train_y)
         print('完成')
         print()

         # 5. Predict on the test set.
         print('测试模型...', end=' ')
         predicted = classifier.predict(test_X)
         print('完成')

         # Report the accuracy.
         print('准确率:', cal_acc(test_y, predicted))


     # Run the pipeline only when this file is executed as a script.
     if __name__ == '__main__':
         run_main()

    tools.py

      1 # -*- coding: utf-8 -*-
      2 import re
      3 import jieba.posseg as pseg
      4 import pandas as pd
      5 import math
      6 import numpy as np
      7 
      # Load the common stop-word list, one entry per line. The file is opened in
      # a ``with`` block so the handle is closed as soon as the list is built.
      with open('./中文停用词库.txt', 'r', encoding='utf-8') as _stopword_file:
          stopwords1 = [line.rstrip() for line in _stopword_file]
      # Optional extra stop-word lists; uncomment these lines to merge them in.
      # stopwords2 = [line.rstrip() for line in open('./哈工大停用词表.txt', 'r', encoding='utf-8')]
      # stopwords3 = [line.rstrip() for line in open('./四川大学机器智能实验室停用词库.txt', 'r', encoding='utf-8')]
      # stopwords = stopwords1 + stopwords2 + stopwords3
      stopwords = stopwords1
     14 
     15 
     def proc_text(raw_line):
         """Clean one line of raw text and return its segmented words.

         The line is processed in three steps:

         1. every character outside the range U+4E00..U+9FD5 is removed;
         2. the remaining text is segmented with ``pseg.cut``;
         3. words found in the module-level ``stopwords`` list are dropped.

         :param raw_line: one line of raw text (``str``).
         :return: the kept words joined by single spaces; an empty string when
             no word is left.
         """
         # 1. Remove the non-Chinese characters. The pattern is a raw string with
         #    explicit ``\u`` escapes; without the backslashes the class would
         #    match the literal characters "u", "4", "E", ... instead of the
         #    intended code-point range.
         filter_pattern = re.compile(r'[^\u4E00-\u9FD5]+')
         chinese_only = filter_pattern.sub('', raw_line)

         # 2. Segment with part-of-speech tagging; each item unpacks into a
         #    (word, flag) pair.
         # 3. Drop the stop words. The flag is currently unused; it could also be
         #    used to filter by part of speech, e.g. keep only ``flag == 'v'``.
         meaningful_words = [
             word
             for word, _flag in pseg.cut(chinese_only)
             if word not in stopwords
         ]

         return ' '.join(meaningful_words)
     37 
     38 
     def split_train_test(text_df, size=0.8, labels=(0, 1, 2, 3)):
         """Split a labelled data set into training and test sets, class by class.

         Each class is split separately so that it keeps the same train/test
         ratio. For simplicity the first ``size`` share of a class goes to the
         training set and the rest to the test set; no shuffling is done.

         :param text_df: DataFrame with at least a ``label`` column.
         :param size: share of every class that goes to the training set.
         :param labels: label values to include; rows with any other label are
             left out of both sets.
         :return: ``(train_text_df, test_text_df)``. Both have a fresh 0-based
             index. ``reset_index`` is called without ``drop=True`` at both
             levels, so the previous index values are kept as extra columns.
         """
         train_parts = []
         test_parts = []

         for label in labels:
             # Rows of this class, re-indexed from 0 so positional slicing is
             # straightforward.
             label_df = text_df[text_df['label'] == label].reset_index()

             # Number of leading rows of this class that go to the training set.
             split_line_no = math.floor(label_df.shape[0] * size)
             train_parts.append(label_df.iloc[:split_line_no, :])
             test_parts.append(label_df.iloc[split_line_no:, :])

         # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
         # way to stack the per-class parts. pd.concat rejects an empty list, so
         # fall back to an empty frame when no labels were requested.
         train_text_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
         test_text_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

         return train_text_df.reset_index(), test_text_df.reset_index()
     71 
     72 
     def get_word_list_from_data(text_df):
         """Collect the words of every row's ``text`` field into one flat list.

         Each ``text`` value is split on single spaces and the pieces are kept
         in row order.
         """
         return [
             token
             for _, row in text_df.iterrows()
             for token in row['text'].split(' ')
         ]
     81 
     82 
     def extract_feat_from_data(text_df, text_collection, common_words_freqs):
         """Build the TF-IDF feature matrix and the label vector for a data set.

         Only TF-IDF is used as an example here; word counts or other text
         features could be added as extra features.

         :param text_df: DataFrame with a ``text`` column (space-joined words)
             and a ``label`` column.
         :param text_collection: object providing ``tf_idf(word, text)``.
         :param common_words_freqs: sequence of ``(word, count)`` pairs; one
             feature column is produced per word, in this order.
         :return: ``(X, y)`` where ``X`` has shape ``(n_samples, n_words)`` and
             ``y`` holds the label of every sample.
         """
         n_sample = text_df.shape[0]
         common_words = [word for word, _ in common_words_freqs]

         # Initialise the outputs.
         X = np.zeros([n_sample, len(common_words)])
         y = np.zeros(n_sample)

         print('提取特征...')
         # Rows are addressed by position (enumerate), not by index label, so the
         # function also works when text_df does not have a 0..n-1 index.
         for row_no, (text, label) in enumerate(zip(text_df['text'], text_df['label'])):
             if (row_no + 1) % 5000 == 0:
                 print('已完成{}个样本的特征提取'.format(row_no + 1))

             # ``word in text`` is a substring test because ``text`` is a plain
             # string. TF-IDF is only computed for the words that occur; every
             # other feature stays 0.
             X[row_no, :] = [
                 text_collection.tf_idf(word, text) if word in text else 0
                 for word in common_words
             ]
             y[row_no] = int(label)

         return X, y
    120 
    121 
    def cal_acc(true_labels, pred_labels):
        """Compute the accuracy of the predictions.

        :param true_labels: sequence of ground-truth labels.
        :param pred_labels: sequence of predicted labels, same length.
        :return: share of positions where both labels are equal; ``0.0`` when
            there are no samples.
        :raises ValueError: if the two sequences differ in length.
        """
        n_total = len(true_labels)
        if len(pred_labels) != n_total:
            raise ValueError(
                'true_labels and pred_labels must have the same length '
                '({} != {})'.format(n_total, len(pred_labels))
            )
        # No samples: report 0.0 rather than dividing by zero.
        if n_total == 0:
            return 0.0

        n_correct = sum(1 for truth, pred in zip(true_labels, pred_labels) if truth == pred)
        return n_correct / n_total
  • 相关阅读:
    gym101350 c h m
    Gym
    poj 1511 Invitation Cards(最短路中等题)
    POJ 1062 昂贵的聘礼(最短路中等题)
    POJ 1125 Stockbroker Grapevine(最短路基础题)
    【Linux】buffer cache free 理解
    python 绘图 工具
    【Linux】时间跟时区的校正
    python conda、pip区别,python 下 faiss 安装
    celery-demo
  • 原文地址:https://www.cnblogs.com/chengchengaqin/p/9655283.html
Copyright © 2020-2023  润新知