• 95行代码实现最大熵模型训练


    关于最大熵模型的介绍请看:http://www.cnblogs.com/hexinuaa/p/3353479.html

    以下是GIS训练算法的python实现,代码不到100行。


    from collections import defaultdict

    import math


    class MaxEnt(object):
        """Maximum-entropy classifier trained with Generalized Iterative Scaling.

        A feature is a ``(label, token)`` pair that was observed in the
        training data.  Typical use is ``load_data(path)``, then ``train()``,
        then ``predict(text)``.

        ``train()`` replaces the occurrence counts held in ``self.feats`` with
        feature ids, so load all data first and train once per instance.
        """

        def __init__(self):
            # (label, token) -> occurrence count while loading; _initparams()
            # later rewrites the values to feature ids.
            self.feats = defaultdict(int)
            # One entry per training line: [label, token, token, ...].
            self.trainset = []
            # Every distinct label seen in the training data.
            self.labels = set()

        def load_data(self, file):
            """Read whitespace-separated training records from ``file``.

            The first column of a line is the label and the remaining columns
            are feature tokens.  Lines with fewer than two columns are skipped.

            Args:
                file: path of the training file, passed to ``open``.
            """
            # The context manager closes the handle even if parsing fails.
            with open(file) as handle:
                for line in handle:
                    fields = line.strip().split()
                    if len(fields) < 2:
                        continue
                    label = fields[0]
                    self.labels.add(label)
                    # A token repeated on one line is counted once for that
                    # line.  NOTE(review): Ep(), probwgt() and M walk the raw
                    # token list, so repeated tokens are weighted differently
                    # there -- confirm whether duplicates can occur.
                    for token in set(fields[1:]):
                        self.feats[(label, token)] += 1
                    self.trainset.append(fields)

        def _initparams(self):
            """Prepare the GIS constant, empirical expectations and weights.

            Raises:
                ValueError: if no training record has been loaded.
            """
            if not self.trainset:
                raise ValueError('no training data loaded; call load_data() first')
            self.size = len(self.trainset)
            # GIS constant M: the largest number of tokens in any record.
            self.M = max(len(record) - 1 for record in self.trainset)
            self.ep_ = [0.0] * len(self.feats)
            for idx, feat in enumerate(self.feats):
                # Empirical expectation of the feature: count / N.
                self.ep_[idx] = float(self.feats[feat]) / float(self.size)
                # From here on self.feats maps each feature to its id.
                self.feats[feat] = idx
            # One weight per feature, all starting at zero.
            self.w = [0.0] * len(self.feats)
            # Independent copy, so updating w never silently changes lastw.
            self.lastw = self.w[:]

        def probwgt(self, features, label):
            """Return the unnormalised score exp(sum of weights) for ``label``.

            Only ``(label, token)`` pairs known from training contribute.
            """
            total = 0.0
            for token in features:
                key = (label, token)
                if key in self.feats:
                    total += self.w[self.feats[key]]
            return math.exp(total)

        def Ep(self):
            """Return the feature expectations under the current model.

            Each training record is given probability 1/N, and p(y|x) comes
            from ``calprob``.  The result is indexed by feature id.
            """
            expectation = [0.0] * len(self.feats)
            for record in self.trainset:
                features = record[1:]
                # p(y|x) for every label, given this record's tokens.
                prob = self.calprob(features)
                for token in features:
                    for p, label in prob:
                        # Pairs never seen in training are not features.
                        if (label, token) in self.feats:
                            idx = self.feats[(label, token)]
                            # sum over x of p(x) * p(y|x) * f(x, y), p(x) = 1/N
                            expectation[idx] += p * (1.0 / self.size)
            return expectation

        def _convergence(self, lastw, w):
            """Return True when every weight moved by less than 0.01."""
            return all(abs(old - new) < 0.01 for old, new in zip(lastw, w))

        def train(self, max_iter=1000):
            """Fit the weights with GIS, stopping early on convergence.

            Prints the iteration number and the weight vector each round.

            Args:
                max_iter: upper bound on the number of GIS iterations.

            Raises:
                ValueError: if no training record has been loaded.
            """
            self._initparams()
            for iteration in range(max_iter):
                # Single-argument print(...) emits the same text on Python 2 and 3.
                print('iter %d ...' % (iteration + 1))
                # Feature expectations under the current weights.
                self.ep = self.Ep()
                self.lastw = self.w[:]
                for idx, (empirical, model) in enumerate(zip(self.ep_, self.ep)):
                    # GIS update: w_i += (1/M) * log(E_empirical / E_model)
                    self.w[idx] += 1.0 / self.M * math.log(empirical / model)
                print(self.w)
                if self._convergence(self.lastw, self.w):
                    break

        def calprob(self, features):
            """Return ``[(p(label | features), label), ...]`` for all labels."""
            wgts = [(self.probwgt(features, label), label) for label in self.labels]
            # Normalising constant Z over all labels.
            Z = sum(weight for weight, _ in wgts)
            return [(weight / Z, label) for weight, label in wgts]

        def predict(self, input):
            """Classify a whitespace-separated string of feature tokens.

            Returns:
                A list of ``(probability, label)`` tuples sorted in descending
                order, so the most probable label comes first.
            """
            features = input.strip().split()
            prob = self.calprob(features)
            prob.sort(reverse=True)
            return prob


    执行:

    Prepare the training data (the first column is the label, the remaining columns are features):

    Outdoor Sunny Happy

    Outdoor Sunny Happy Dry

    Outdoor Sunny Happy Humid

    Outdoor Sunny Sad Dry

    Outdoor Sunny Sad Humid

    Outdoor Cloudy Happy Humid

    Outdoor Cloudy Happy Humid

    Outdoor Cloudy Sad Humid

    Outdoor Cloudy Sad Humid

    Indoor Rainy Happy Humid

    Indoor Rainy Happy Dry

    Indoor Rainy Sad Dry

    Indoor Rainy Sad Humid

    Indoor Cloudy Sad Humid

    Indoor Cloudy Sad Humid


    Open IPython and run the following commands:

    In [11]: import maxent


    In [12]: model = maxent.MaxEnt()


    In [13]: model.load_data('data/gameLocation.dat')


    In [14]: model.train()

    In [11]: import maxent


    In [12]: model = maxent.MaxEnt()


    In [13]: model.load_data('data/gameLocation.dat')


    In [14]: model.train()

    iter 1 ...

    iter 2 ...

    iter 3 ...

    iter 4 ...

    iter 5 ...

    iter 6 ...

    iter 7 ...

    iter 8 ...

    iter 9 ...

    iter 10 ...

    iter 11 ...

    iter 12 ...

    iter 13 ...

    iter 14 ...

    iter 15 ...

    iter 16 ...

    iter 17 ...

    iter 18 ...

    iter 19 ...

    iter 20 ...

    iter 21 ...

    iter 22 ...

    iter 23 ...

    iter 24 ...

    iter 25 ...

    iter 26 ...

    iter 27 ...

    iter 28 ...

    iter 29 ...

    iter 30 ...

    iter 31 ...

    iter 32 ...

    iter 33 ...

    iter 34 ...

    iter 35 ...

    iter 36 ...

    iter 37 ...

    iter 38 ...

    iter 39 ...

    iter 40 ...

    iter 41 ...

    iter 42 ...

    iter 43 ...

    iter 44 ...

    iter 45 ...

    iter 46 ...

    iter 47 ...

    iter 48 ...

    iter 49 ...

    iter 50 ...

    iter 51 ...

    iter 52 ...

    iter 53 ...

    iter 54 ...

    iter 55 ...

    iter 56 ...

    iter 57 ...

    iter 58 ...

    iter 59 ...

    iter 60 ...

    iter 61 ...

    iter 62 ...

    iter 63 ...

    iter 64 ...

    iter 65 ...

    iter 66 ...

    iter 67 ...

    iter 68 ...

    iter 69 ...

    iter 70 ...

    iter 71 ...

    iter 72 ...

    iter 73 ...

    iter 74 ...

    iter 75 ...

    iter 76 ...

    iter 77 ...

    iter 78 ...

    iter 79 ...

    iter 80 ...

    iter 81 ...

    iter 82 ...

    iter 83 ...

    iter 84 ...

    iter 85 ...

    iter 86 ...

    iter 87 ...

    iter 88 ...

    iter 89 ...

    iter 90 ...

    iter 91 ...

    iter 92 ...

    iter 93 ...

    iter 94 ...

    iter 95 ...

    iter 96 ...

    iter 97 ...

    iter 98 ...

    iter 99 ...

    iter 100 ...

    iter 101 ...

    iter 102 ...

    iter 103 ...

    iter 104 ...

    iter 105 ...

    iter 106 ...

    iter 107 ...

    iter 108 ...

    iter 109 ...

    iter 110 ...

    iter 111 ...

    iter 112 ...

    iter 113 ...

    iter 114 ...

    iter 115 ...

    iter 116 ...

    iter 117 ...

    iter 118 ...

    iter 119 ...

    iter 120 ...

    iter 121 ...

    iter 122 ...

    iter 123 ...

    iter 124 ...

    iter 125 ...

    iter 126 ...

    iter 127 ...

    iter 128 ...

    iter 129 ...

    iter 130 ...

    iter 131 ...

    iter 132 ...

    iter 133 ...

    iter 134 ...

    iter 135 ...

    iter 136 ...

    iter 137 ...

    iter 138 ...

    iter 139 ...

    iter 140 ...

    iter 141 ...

    iter 142 ...

    iter 143 ...

    iter 144 ...


    In [16]: model.predict('Sunny')

    Out[16]: [(0.9763203118841158, 'Outdoor'), (0.02367968811588421, 'Indoor')]

    In [18]: model.predict('Cloudy')

    Out[18]: [(0.7136730549489295, 'Outdoor'), (0.28632694505107054, 'Indoor')]


  • 相关阅读:
    Pig Latin-freecodecamp算法题目
    Search and Replace -freecodecamp算法题目
    Where art thou-freecodecamp算法题目
    Roman Numeral Converter-freecodecamp算法题目
    Diff Two Arrays-freecodecamp算法题目
    Asp.Net前台调用后台变量
    ASP.NET获取前端页面的Html标签的值
    echart 设置图例图标形状
    解决tableexport导出到excel中有关中文乱码的问题
    C# Async与Await用法
  • 原文地址:https://www.cnblogs.com/gcczhongduan/p/4331996.html
Copyright © 2020-2023  润新知