文本分段的两个指标

 1 def pk(ref, hyp, k=None, boundary='1'):
 2     """
 3     Compute the Pk metric for a pair of segmentations A segmentation
 4     is any sequence over a vocabulary of two items (e.g. "0", "1"),
 5     where the specified boundary value is used to mark the edge of a
 6     segmentation.
 7 
 8     >>> '%.2f' % pk('0100'*100, '1'*400, 2)
 9     '0.50'
10     >>> '%.2f' % pk('0100'*100, '0'*400, 2)
11     '0.50'
12     >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
13     '0.00'
14 
15     :param ref: the reference segmentation
16     :type ref: str or list
17     :param hyp: the segmentation to evaluate
18     :type hyp: str or list
19     :param k: window size, if None, set to half of the average reference segment length
20     :type boundary: str or int or bool
21     :param boundary: boundary value
22     :type boundary: str or int or bool
23     :rtype: float
24     """
25 
26     if k is None:
27         k = int(round(len(ref) / (ref.count(boundary) * 2.)))
28 
29     err = 0
30     for i in range(len(ref)-k +1):
31         r = ref[i:i+k].count(boundary) > 0
32         h = hyp[i:i+k].count(boundary) > 0
33         if r != h:
34            err += 1
35     return err / (len(ref)-k +1.)

WindowDiff

 1 def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
 2     """
 3     Compute the windowdiff score for a pair of segmentations.  A
 4     segmentation is any sequence over a vocabulary of two items
 5     (e.g. "0", "1"), where the specified boundary value is used to
 6     mark the edge of a segmentation.
 7 
 8         >>> s1 = "000100000010"
 9         >>> s2 = "000010000100"
10         >>> s3 = "100000010000"
11         >>> '%.2f' % windowdiff(s1, s1, 3)
12         '0.00'
13         >>> '%.2f' % windowdiff(s1, s2, 3)
14         '0.30'
15         >>> '%.2f' % windowdiff(s2, s3, 3)
16         '0.80'
17 
18     :param seg1: a segmentation
19     :type seg1: str or list
20     :param seg2: a segmentation
21     :type seg2: str or list
22     :param k: window width
23     :type k: int
24     :param boundary: boundary value
25     :type boundary: str or int or bool
26     :param weighted: use the weighted variant of windowdiff
27     :type weighted: boolean
28     :rtype: float
29     """
30 
31     if len(seg1) != len(seg2):
32         raise ValueError("Segmentations have unequal length")
33     if k > len(seg1):
34         raise ValueError("Window width k should be smaller or equal than segmentation lengths")
35     wd = 0
36     for i in range(len(seg1) - k + 1):
37         ndiff = abs(seg1[i:i + k].count(boundary) - seg2[i:i + k].count(boundary))
38         if weighted:
39             wd += ndiff
40         else:
41             wd += min(1, ndiff)
42     return wd / (len(seg1) - k + 1.)

这两个指标光看文献的话，还真是有点玄学！还好，找到了 NLTK 中对应的实现，极其简单明了！

相关阅读:
iOS Touch ID 身份认证
 iOS 真机测试错误“The application could not be verified”
iOS 容器控制器 (Container View Controller)
cocos creator按钮点击按钮弹起效果设置方法
 coco creator编辑动画坑之拖图片
 cocos动画没有cc.Sprite.spriteFrame属性
 cocos发布遇到的问题
 Android数据库GreenDao配置版本问题
 Android写入文件电脑看不到
 如何保证WebBrowser一直在页面内跳转
原文地址：https://www.cnblogs.com/crackpotisback/p/7624519.html