关于中文分词,网上的资料很多,大家可以自行了解,今天这里只关注代码怎么写。
中文分词主要可以归纳为“规则分词”、“统计分词”以及“规则+统计”三个主要派别。今天主要了解“规则分词”中常见的正向、逆向和双向最大化匹配,这三种方法都是基于现有词典实现的,所以需要先准备一个中文词典,一行一个词(本文代码读取的词典每行以空格分隔,第一列为词,第三列为词性)。
一.正向最大化匹配
描述:
1.找到词典中最长的词,记下长度L
2.从“左向右”取长度为L的字符串,查找词典进行匹配,若匹配成功,则将这个词切分出来;若匹配失败,则将这个字符串的最后一个字符去掉,将剩下的串作为新的匹配串继续匹配。如此重复下去,直到切完。
二.逆向最大化匹配
描述:
1.找到词典中最长的词,记下长度L
2.从“右向左”取长度为L的字符串,查找词典进行匹配,若匹配成功,则将这个词切分出来;若匹配失败,则将这个字符串的最前面一个字符去掉,将剩下的串作为新的匹配串继续匹配。如此重复下去,直到切完。
三.双向最大化匹配
描述:
1.将正向和逆向的切分结果进行比较,优先取切分词数最少的作为结果;词数相同时,取单字较少的结果;若仍相同,则取逆向的结果。
四.代码采用python
1.load 词典
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Default location of the dictionary file.
dictPath = '../resource/dict.txt'


def loadDict(path=None):
    """Load the segmentation dictionary from a text file.

    Each non-blank line is split on whitespace. The first field is the word
    and the third field is stored as its value (the rest of this project
    treats that value as the word's part of speech). The second field is
    ignored here -- presumably a frequency count; confirm against the
    dictionary file.

    Args:
        path: Dictionary file to read. Defaults to the module-level
            ``dictPath`` when omitted.

    Returns:
        A ``(dictionary, maximum)`` tuple where ``dictionary`` maps each word
        to its third field (``None`` when the line has fewer than three
        fields) and ``maximum`` is the length, in characters, of the longest
        word (0 for an empty file).
    """
    if path is None:
        path = dictPath
    print('load dict...')
    dictionary = {}
    maximum = 0
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line.
                continue
            word = fields[0]
            dictionary[word] = fields[2] if len(fields) > 2 else None
            # Measure the word itself, not the whole line: the extra columns
            # would otherwise inflate the matching window used by the
            # segmenters.
            maximum = max(maximum, len(word))
    return dictionary, maximum
2.核心方法
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
from word_segmentation.regulation.MaximumMatchMethod import MM
from word_segmentation.regulation.BiDirectctionMatchMethod import BDMM
from word_segmentation.util.LoadDict import loadDict


class RegulationMatch(object):
    """Facade over the dictionary-based segmenters.

    The dictionary and the maximum word length are loaded once, when the
    instance is created, and reused for every ``cut`` call.
    """

    def __init__(self):
        self.dictionary, self.maximum = loadDict()

    def cut(self, text, method):
        """Segment ``text`` with the segmenter selected by ``method``.

        Args:
            text: The string to segment.
            method: ``'RMM'`` (reverse), ``'MM'`` (forward) or ``'BDMM'``
                (bidirectional).

        Returns:
            Whatever the selected segmenter's ``cut`` returns, or ``None``
            when ``method`` is none of the names above.
        """
        segmenters = (
            ('RMM', RMM),    # reverse maximum matching
            ('MM', MM),      # forward maximum matching
            ('BDMM', BDMM),  # bidirectional maximum matching
        )
        for name, segmenter in segmenters:
            if method == name:
                return segmenter.cut(text, self.dictionary, self.maximum)
        return None
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""A word together with its part of speech."""


class Word(object):
    """Immutable-by-convention pair of a token and its part of speech."""

    def __init__(self, token, property):
        """Store the token text and its part-of-speech tag.

        Args:
            token: The word text.
            property: The part-of-speech value associated with the token.
        """
        self.__token = token
        self.__property = property

    def getToken(self):
        """Return the word text."""
        return self.__token

    def getProperty(self):
        """Return the part-of-speech value."""
        return self.__property

    def __repr__(self):
        """Return a readable form so printed results show their content."""
        return 'Word(token=%r, property=%r)' % (self.__token, self.__property)
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from word_segmentation.regulation.Word import Word


class MM(object):
    """Forward maximum matching (Maximum Match Method)."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` left to right, always taking the longest match.

        At each position the longest candidate (at most ``maximum``
        characters, and never past the end of ``text``) is tried first and
        shortened from the right until it is found in ``dictionary``.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its part-of-speech value.
            maximum: Length of the longest word in ``dictionary``.

        Returns:
            A list of ``Word`` objects in text order. Characters that start
            no dictionary word are skipped and do not appear in the result.
        """
        result = []
        total = len(text)
        start = 0
        while start < total:
            longest = min(maximum, total - start)
            for size in range(longest, 0, -1):
                piece = text[start:start + size]
                if piece in dictionary:
                    result.append(Word(piece, dictionary.get(piece)))
                    start += size
                    break
            else:
                # Nothing matched here: step over this character. The scan
                # position itself must advance, otherwise every later
                # attempt restarts on the same unmatched character and the
                # rest of the text is never segmented.
                start += 1
        return result
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from word_segmentation.regulation.Word import Word


class RMM(object):
    """Reverse maximum matching (Reverse Maximum Match Method)."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` right to left, always taking the longest match.

        Candidates end at the current right boundary; the longest one (at
        most ``maximum`` characters) is tried first and shortened from the
        left until it is found in ``dictionary``.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its part-of-speech value.
            maximum: Length of the longest word in ``dictionary``.

        Returns:
            A list of ``Word`` objects in text order. Characters that end no
            dictionary word are skipped and do not appear in the result.
        """
        found = []
        end = len(text)  # exclusive right boundary of the unsegmented prefix
        while end > 0:
            for size in range(min(maximum, end), 0, -1):
                candidate = text[end - size:end]
                if candidate in dictionary:
                    found.append(Word(candidate, dictionary.get(candidate)))
                    end -= size
                    break
            else:
                # No word ends here: drop the last character and retry.
                end -= 1
        # Words were collected from the end of the text backwards.
        found.reverse()
        return found
# -*- coding:utf-8 -*-

from word_segmentation.regulation.MaximumMatchMethod import MM
from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM

# Selection rules between the forward (MM) and reverse (RMM) results:
#   1. Different number of words: take the result with fewer words.
#   2. Same number of words:
#      a. identical segmentations: either one (the forward result is used);
#      b. different segmentations: take the one with fewer single-character
#         words;
#      c. different segmentations, same single-character count: take the
#         reverse result.


class BDMM(object):
    """Bidirectional maximum matching."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` in both directions and return the better result.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its part-of-speech value.
            maximum: Length of the longest word in ``dictionary``.

        Returns:
            The list returned by ``MM.cut`` or by ``RMM.cut``, chosen by the
            rules listed at the top of this module.
        """
        mmResult = MM.cut(text, dictionary, maximum)
        rmmResult = RMM.cut(text, dictionary, maximum)
        mmSegment = [word.getToken() for word in mmResult]
        rmmSegment = [word.getToken() for word in rmmResult]

        if len(mmSegment) != len(rmmSegment):
            return mmResult if len(mmSegment) < len(rmmSegment) else rmmResult

        # Compare the token sequences themselves. A one-directional
        # membership test can report two different segmentations as equal
        # (e.g. when a token repeats) and wrongly skip the tie-break below.
        if mmSegment == rmmSegment:
            return mmResult

        mmSingleWords = sum(1 for token in mmSegment if len(token) == 1)
        rmmSingleWords = sum(1 for token in rmmSegment if len(token) == 1)
        # Ties go to the reverse result.
        return mmResult if mmSingleWords < rmmSingleWords else rmmResult
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
import word_segmentation.regulation.MaximumMatchMethod
import word_segmentation.regulation.BiDirectctionMatchMethod
from word_segmentation.regulation.RegulationMatchMthod import RegulationMatch


def test():
    """Placeholder; does nothing."""
    pass


if __name__ == '__main__':
    # Demo: segment one sentence with each method and print the tokens.
    text = '各国有各国的困难…'
    print('分词:')
    print('各国有各国的困难…')
    regulation = RegulationMatch()

    # (output format, method name), in the order the results are printed.
    runs = (
        ('正向匹配: %s', 'MM'),
        ('逆向匹配: %s', 'RMM'),
        ('双向匹配: %s', 'BDMM'),
    )
    # Run every segmenter first, then extract tokens, then print.
    results = [regulation.cut(text, method) for _, method in runs]
    segments = [[word.getToken() for word in words] for words in results]
    for (template, _), tokens in zip(runs, segments):
        print(template % tokens)