文章参照
链接:https://www.cnblogs.com/pinard/p/6945257.html
# coding=utf-8
# Character-level hidden Markov model tagger decoded with the Viterbi
# algorithm; the demo at the bottom uses b/i/o tags for word segmentation.
import re
from collections import Counter

import numpy as np


class Hmm(object):
    """Hidden Markov model estimated by counting over a tagged corpus.

    The corpus file holds one ``<observation> <tag>`` pair per line, with
    blank lines separating sentences.

    Attributes set by ``clean_data`` (called from ``__init__``):
        sents: list of sentences, each a list of ``[observation, tag]`` lists.
        Q: sorted list of distinct tags (hidden states).
        V: sorted list of distinct observations.

    Attributes set by ``train``:
        pi: initial state probabilities, shape ``(len(Q),)``.
        A: state transition matrix, ``A[i, j] = P(Q[j] at t+1 | Q[i] at t)``.
        B: emission matrix, ``B[i, k] = P(V[k] | Q[i])``.
    """

    def __init__(self, train_path):
        """Remember the corpus path and load the corpus immediately."""
        self.train_path = train_path
        self.clean_data()

    def clean_data(self):
        """Read the corpus file and build ``sents``, ``Q`` and ``V``.

        Blank (or whitespace-only) lines end the current sentence, so a
        trailing newline or several consecutive blank lines are harmless.

        Raises:
            ValueError: if a non-blank line does not consist of exactly two
                whitespace-separated fields.
        """
        with open(self.train_path, encoding='utf-8') as f:
            text = f.read()

        sents = []
        current = []
        for line_no, line in enumerate(text.split("\n"), 1):
            fields = line.split()
            if not fields:
                # Blank line: sentence boundary (ignored when nothing is pending).
                if current:
                    sents.append(current)
                    current = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    "line {} of {!r} is not '<observation> <tag>': {!r}".format(
                        line_no, self.train_path, line))
            current.append(fields)
        if current:
            sents.append(current)

        self.sents = sents
        # Hidden state set.
        self.Q = sorted({word[1] for sent in sents for word in sent})
        # Observation set.
        self.V = sorted({word[0] for sent in sents for word in sent})

    def train(self):
        """Estimate ``pi``, ``A`` and ``B`` by relative frequency.

        ``pi`` and ``A`` are rounded to four decimals; ``B`` is not rounded.
        A state that is never followed by another tag gets an all-zero row
        in ``A`` instead of triggering a division by zero.

        Raises:
            ValueError: if the corpus contains no sentences.
        """
        if not self.sents:
            raise ValueError("training corpus {!r} is empty".format(self.train_path))

        # 1. Initial hidden-state probabilities pi.
        first_counts = Counter(sent[0][1] for sent in self.sents)
        n_sents = len(self.sents)
        self.pi = np.array([round(first_counts[q] / n_sents, 4) for q in self.Q])

        # 2. Hidden-state transition matrix A. Bigrams are counted as tuples
        #    (not concatenated strings) so multi-character tags stay unambiguous,
        #    and they never span a sentence boundary.
        pair_counts = Counter()
        from_counts = Counter()
        for sent in self.sents:
            tags = [word[1] for word in sent]
            for prev_tag, next_tag in zip(tags, tags[1:]):
                pair_counts[prev_tag, next_tag] += 1
                from_counts[prev_tag] += 1
        self.A = np.array(
            [[round(pair_counts[q1, q2] / from_counts[q1], 4) if from_counts[q1] else 0.0
              for q2 in self.Q]
             for q1 in self.Q])

        # 3. Emission matrix B. Every tag in Q occurs at least once, so the
        #    denominator is never zero.
        emit_counts = Counter((word[0], word[1]) for sent in self.sents for word in sent)
        tag_counts = Counter(word[1] for sent in self.sents for word in sent)
        self.B = np.array(
            [[emit_counts[v, q] / tag_counts[q] for v in self.V] for q in self.Q])

    def predict(self, sent):
        """Return the most probable tag sequence for ``sent`` (Viterbi).

        Args:
            sent: sequence of observations (e.g. a string of characters),
                each of which must occur in ``V``.

        Returns:
            The decoded tags joined into one string; ``''`` for empty input.

        Raises:
            ValueError: if an observation is not in the training vocabulary.
        """
        if len(sent) == 0:
            return ''

        vocab_id = {v: k for k, v in enumerate(self.V)}
        try:
            obs = [vocab_id[word] for word in sent]
        except KeyError as err:
            raise ValueError(
                "unknown observation {!r}: not in the training vocabulary".format(
                    err.args[0])) from None

        n_steps, n_states = len(obs), len(self.A)
        delta = np.zeros((n_steps, n_states))           # best path score per state
        psi = np.zeros((n_steps, n_states), dtype=int)  # best predecessor per state

        # 1. Initialise the two Viterbi variables at t = 0.
        delta[0] = self.pi * self.B[:, obs[0]]

        # 2. Recurse over the remaining time steps.
        for t in range(1, n_steps):
            # scores[j, i] = delta[t - 1, i] * A[i, j]
            scores = delta[t - 1] * self.A.T
            psi[t] = scores.argmax(axis=1)
            delta[t] = scores.max(axis=1) * self.B[:, obs[t]]
            # Rescale so long inputs do not underflow to all zeros; a positive
            # per-step factor does not change any argmax.
            peak = delta[t].max()
            if peak > 0:
                delta[t] /= peak

        # 3. Most probable final state.
        path = [int(delta[-1].argmax())]

        # 4. Backtrack through the stored predecessors.
        for t in range(n_steps - 1, 0, -1):
            path.append(int(psi[t, path[-1]]))
        path.reverse()

        return ''.join(self.Q[state] for state in path)


if __name__ == '__main__':
    text = '维特比算法是一个分词方法'
    train_path = 'test.txt'
    hmm = Hmm(train_path)
    hmm.train()
    label = hmm.predict(text)
    # A word is a "b" followed by one or more "i", or a single "o"; tags that
    # fit neither pattern are skipped by finditer.
    print([text[word.start():word.end()] for word in re.finditer(r'bi+|o', label)])
test.txt(训练语料,每行格式为“字 标签”,以空格分隔):
维 b
特 i
比 i
算 b
法 i
也 o
是 o
寻 b
找 i
序 b
列 i
最 b
短 i
路 b
径 i
的 o
一 b
个 i
通 b
用 i
方 b
法 i
同 b
时 i
维 b
特 i
比 i
算 b
法 i
仅 b
仅 i
局 b
限 i
于 o
求 o
序 b
列 i
最 b
短 i
路 b
径 i
如 b
果 i
大 b
家 i
看 b
过 i
之 b
前 i
写 o
的 o
文 b
本 i
挖 b
掘 i
的 o
分 b
词 i
原 b
理 i
中 o
的 o
维 b
特 i
比 i
算 b
法 i