• Spell-correction system


    Spell correction

    # Load the vocabulary: vocab.txt contains one word per line
    vocab = set([line.rstrip() for line in open('/content/drive/My Drive/data/vocab_data/vocab.txt')])
    

    Generate all candidate words

    def generate_candidates(word):
      """
      word: the given (misspelled) input
      Returns all valid candidates found in the vocabulary.
      """
      # Generate words at edit distance 1 via three operations:
      # 1. insert  2. delete  3. replace
      # e.g. for "appl":
      #   replace: bppl, cppl, abpl, ...
      #   insert:  bappl, cappl, abppl, acppl, ...
      #   delete:  ppl, apl, app, ...
      # Assume the 26 lowercase letters
      letters = 'abcdefghijklmnopqrstuvwxyz'
    
      splits = [(word[:i], word[i:]) for i in range(len(word)+1)]
    
      # insert operation
      inserts = [L+c+R for L, R in splits for c in letters]
      # delete operation
      deletes = [L+R[1:] for L, R in splits if R]
      # replace operation
      replaces = [L+c+R[1:] for L, R in splits if R for c in letters]
        
      # all generated candidate words
      candidates = set(inserts + deletes + replaces)
    
      # filter out words that do not exist in the vocabulary
      return [w for w in candidates if w in vocab]
    
    generate_candidates("apple")
    

    Load the corpus

    import nltk
    nltk.download('reuters')
    nltk.download('punkt')
    from nltk.corpus import reuters
    categories = reuters.categories()
    corpus = reuters.sents(categories=categories)
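
    As a quick sanity check (an addition, not in the original post): reuters.sents returns tokenized sentences, each a list of word tokens.

    print(len(corpus))     # number of sentences in the Reuters corpus
    print(corpus[0][:10])  # first ten tokens of the first sentence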
    

    Build the language model: bigram

    term_count = {}
    bigram_count = {}
    for doc in corpus:
      doc = ['<s>'] + doc  # prepend a sentence-start symbol
      # bigram: tokens [i, i+1]
      for i in range(0, len(doc)-1):
        term = doc[i]
        bigram = doc[i:i+2]
    
        if term in term_count:
          term_count[term] += 1
        else:
          term_count[term] = 1
    
        # join with a space so e.g. ("a","nd") and ("an","d") don't collide
        bigram = ' '.join(bigram)
        if bigram in bigram_count:
          bigram_count[bigram] += 1
        else:
          bigram_count[bigram] = 1
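
    The main loop below turns these counts into an add-1 (Laplace) smoothed bigram probability. As a minimal sketch, the same computation can be factored into a helper; bigram_log_prob is an addition, not in the original post:

    import numpy as np

    def bigram_log_prob(prev_word, word, V):
      """log P(word | prev_word) with add-1 smoothing; V is the vocabulary size."""
      bigram = prev_word + ' ' + word  # same space-joined key as above
      if prev_word in term_count and bigram in bigram_count:
        return np.log((bigram_count[bigram] + 1.0) / (term_count[prev_word] + V))
      return np.log(1.0 / V)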
    

    The probability that the user makes a given typo (channel probability)

    channel_prob = {}
    
    # spell-errors.txt: each line is "correct: mistake1, mistake2, ..."
    for line in open('/content/drive/My Drive/data/vocab_data/spell-errors.txt'):
      items = line.split(":")
      correct = items[0].strip()
      mistakes = [item.strip() for item in items[1].strip().split(",")]
      channel_prob[correct] = {}
      for mis in mistakes:
        # assume each observed mistake for a word is equally likely
        channel_prob[correct][mis] = 1.0/len(mistakes)
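
    Each line of spell-errors.txt maps a correct word to the typos observed for it. The example below assumes a typical entry; the exact contents depend on your copy of the file:

    # Assumed example line in spell-errors.txt:
    #   raining: rainning, raning
    # After the loop above, the observed typos share the probability mass equally:
    print(channel_prob.get('raining', {}))  # e.g. {'rainning': 0.5, 'raning': 0.5}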
    
    import numpy as np
    
    V = len(term_count.keys())
    
    file = open('/content/drive/My Drive/data/vocab_data/testdata.txt')
    for line in file:
      items = line.rstrip().split('\t')
      sentence = items[2].split()
      # e.g. ["I", "loke", "palying"]
      for word in sentence:
        if word not in vocab:
          # word needs to be replaced with the correct spelling
          # Step 1: generate all valid candidates
          candidates = generate_candidates(word=word)
    
          if len(candidates) < 1:
            continue  # skipping is not recommended (it is not really correct)
            # TODO: relax the conditions to generate more candidates,
            # e.g. edit distance 2 (see the sketch after generate_candidates)
    
          probs = []
          # for each candidate, compute its score
          # score = p(correct) * p(mistake|correct)
          #       = log p(correct) + log p(mistake|correct)
          # return the candidate with the highest score
          for candi in candidates:
            prob = 0.0
            # a. channel probability
            if candi in channel_prob and word in channel_prob[candi]:
              prob += np.log(channel_prob[candi][word])
            else:
              prob += np.log(0.0001)
    
            # b. language-model probability; the sentence starts with <s>
            tokens = ['<s>'] + sentence
            idx = tokens.index(word)
            prev_word = tokens[idx-1]
            bigram = prev_word + ' ' + candi
    
            if prev_word in term_count and bigram in bigram_count:
              # add-1 smoothed bigram probability
              prob += np.log((bigram_count[bigram] + 1.0) /
                             (term_count[prev_word] + V))
            else:
              prob += np.log(1.0/V)
    
            probs.append(prob)
    
          max_idx = probs.index(max(probs))
          print(word, candidates[max_idx])
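
    For reuse outside the loop, the per-word scoring can be wrapped into a function. This is a sketch assuming the vocab, counts, channel_prob, and V built above; correct_word is a hypothetical helper, not in the original post:

    def correct_word(word, prev_word):
      """Return the highest-scoring candidate for `word` given the previous
      token (use '<s>' at the start of a sentence); fall back to `word`."""
      candidates = generate_candidates(word)
      if not candidates:
        return word
      def score(candi):
        # channel probability, with a small default for unseen pairs
        s = np.log(channel_prob.get(candi, {}).get(word, 0.0001))
        # add-1 smoothed bigram language-model probability
        bigram = prev_word + ' ' + candi
        if prev_word in term_count and bigram in bigram_count:
          return s + np.log((bigram_count[bigram] + 1.0) / (term_count[prev_word] + V))
        return s + np.log(1.0 / V)
      return max(candidates, key=score)

    correct_word('loke', 'I')  # might return 'like', depending on the data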
    
    

    To be continued...
