• 贝叶斯分类


      1 # -*- coding: utf-8 -*-
      2 
      3 import sys
      4 import os
      5 import numpy as np
      6 import pickle
      7 from sklearn import metrics
      8 
      9 #导入数据集
     10 def loadDataSet():
     11     postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
     12                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
     13                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him', 'my'],
     14                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
     15                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
     16                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
     17     classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not,分类
     18     return postingList, classVec
     19 
     20 
     21 # 读取文件
     22 def readfile(path):
     23     fp = open(path, "rb")
     24     content = fp.read()
     25     fp.close()
     26     return content
     27 
# NOTE(review): disabled accuracy-reporting helper, kept inside a module-level
# string so it never executes. It depends on sklearn.metrics (imported at the
# top of this file); remove the triple quotes to re-enable it.
"""

#计算分类精度:
def metrics_result(actual,predict):
    print('精度:{0:.3f}'.format(metrics.precision_score(actual,predict)))
    print ('召回:{0:0.3f}'.format(metrics.recall_score(actual,predict)))
    print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual,predict)))

"""
     37 
     38 # 读取bunch对象
     39 def readbunchobj(path):
     40     file_obj = open(path, "rb")
     41     bunch = pickle.load(file_obj)
     42     file_obj.close()
     43     return bunch
     44 
     45 
     46 # 写入bunch对象
     47 def writebunchobj(path, bunchobj):
     48     file_obj = open(path, "wb")
     49     pickle.dump(bunchobj, file_obj)
     50     file_obj.close()
     51 
     52 
     53 class NBayes(object):
     54     def __init__(self):
     55         self.vocabulary = []  # 词典
     56         self.idf = 0  # 词典的idf权值向量
     57         self.tf = 0  # 训练集的权值矩阵
     58         self.tdm = 0  # P(x|yi)
     59         self.Pcates = {}  # P(yi)--是个类别字典,这个集合就是p(yi)的值的集合
     60         self.labels = []  # 对应每个文本的分类,是个外部导入的列表
     61         self.doclength = 0  # 训练集文本数
     62         self.vocablen = 0  # 词典词长
     63         self.testset = 0  # 测试集
     64 
     65     #    加载训练集并生成词典,以及tf, idf值
     66     def train_set(self, trainset, classVec):
     67         self.cate_prob(classVec)  # 计算每个分类在数据集中的概率:P(yi)
     68         self.doclength = len(trainset)
     69         tempset = set()
     70         [tempset.add(word) for doc in trainset for word in doc]  # 生成词典
     71         self.vocabulary = list(tempset)
     72         self.vocablen = len(self.vocabulary)
     73         self.calc_wordfreq(trainset)
     74         # self.calc_tfidf(trainset)  # 生成tf-idf权值
     75         self.build_tdm()  # 按分类累计向量空间的每维值:P(x|yi)
     76 
     77     # 生成 tf-idf
     78     def calc_tfidf(self, trainset):
     79         self.idf = np.zeros([1, self.vocablen])
     80         self.tf = np.zeros([self.doclength, self.vocablen])
     81         for indx in range(self.doclength):
     82             for word in trainset[indx]:
     83                 self.tf[indx, self.vocabulary.index(word)] += 1
     84             # 消除不同句长导致的偏差
     85             self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
     86             for signleword in set(trainset[indx]):
     87                 self.idf[0, self.vocabulary.index(signleword)] += 1
     88         self.idf = np.log(float(self.doclength) / self.idf)
     89         self.tf = np.multiply(self.tf, self.idf)  # 矩阵与向量的点乘
     90 
     91     # 生成普通的词频向量
     92     def calc_wordfreq(self, trainset):
     93         self.idf = np.zeros([1, self.vocablen])  # 1*词典数
     94         self.tf = np.zeros([self.doclength, self.vocablen])  # 训练集文件数*词典数
     95         for indx in range(self.doclength):  # 遍历所有的文本
     96             for word in trainset[indx]:  # 遍历文本中的每个词
     97                 self.tf[indx, self.vocabulary.index(word)] += 1  # 找到文本的词在字典中的位置+1
     98             for signleword in set(trainset[indx]):
     99                 self.idf[0, self.vocabulary.index(signleword)] += 1
    100 
    101     # 计算每个分类在数据集中的概率:P(yi)
    102     def cate_prob(self, classVec):
    103         self.labels = classVec#让分类作为相对应的标签
    104         labeltemps = set(self.labels)  # 获取全部分类,返回的是一个集合,其值为{0,1}
    105         #print('分类的结果:',labeltemps)
    106         for labeltemp in labeltemps:
    107             # 统计列表中重复的值:self.labels.count(labeltemp)
    108             self.Pcates[labeltemp] = float(self.labels.count(labeltemp)) / float(len(self.labels))#求分类列表中重复的值,就是0和1在所有当中所占的比例
    109 
    110     # 按分类累计向量空间的每维值:P(x|yi)
    111     def build_tdm(self):
    112         self.tdm = np.zeros([len(self.Pcates), self.vocablen])  # 类别行*词典列
    113         sumlist = np.zeros([len(self.Pcates), 1])  # 统计每个分类的总值
    114         for indx in range(self.doclength):
    115             self.tdm[self.labels[indx]] += self.tf[indx]  # 将同一类别的词向量空间值加总
    116             sumlist[self.labels[indx]] = np.sum(self.tdm[self.labels[indx]])  # 统计每个分类的总值--是个标量
    117         self.tdm = self.tdm / sumlist  # P(x|yi)
    118 
    119     # 测试集映射到当前词典
    120     def map2vocab(self, testdata):
    121         self.testset = np.zeros([1, self.vocablen])
    122         for word in testdata:
    123             self.testset[0, self.vocabulary.index(word)] += 1
    124 
    125     # 输出分类类别
    126     def predict(self, testset):
    127         if np.shape(testset)[1] != self.vocablen:
    128             print("输入错误")
    129             exit(0)
    130         predvalue = 0
    131         predclass = ""
    132         for tdm_vect, keyclass in zip(self.tdm, self.Pcates):
    133             # P(x|yi)P(yi)
    134             temp = np.sum(testset * tdm_vect * self.Pcates[keyclass])
    135             if temp > predvalue:
    136                 predvalue = temp
    137                 predclass = keyclass
    138         return predclass

     算法的改进:

     # 生成 tf-idf
     78     def calc_tfidf(self, trainset):
     79         self.idf = np.zeros([1, self.vocablen])
     80         self.tf = np.zeros([self.doclength, self.vocablen])
     81         for indx in range(self.doclength):
     82             for word in trainset[indx]:
     83                 self.tf[indx, self.vocabulary.index(word)] += 1
     84             # 消除不同句长导致的偏差
     85             self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
     86             for signleword in set(trainset[indx]):
     87                 self.idf[0, self.vocabulary.index(signleword)] += 1
     88         self.idf = np.log(float(self.doclength) / self.idf)
     89         self.tf = np.multiply(self.tf, self.idf)  # 矩阵与向量的点乘
  • 相关阅读:
    迭代合并排序算法
    appendChild和insertBefore的区别
    使用定时器处理数组
    正则表达式 删除string首尾的空白
    图片滚动
    数组合并法(IE7性能优化)
    赋值取值+arguments
    条件预加载(conditional advanceloading)
    Just a Note~
    腾讯马拉松复赛第一场
  • 原文地址:https://www.cnblogs.com/caicaihong/p/5768714.html
Copyright © 2020-2023  润新知