• 分享一个还不错的英文(文献)分句代码——基于Stack overflow版本优化


    废话不多说直接上代码

    import re
    
    # Regex fragments for the sentence splitter.  Raw strings are used so
    # backslash sequences such as \s reach the regex engine verbatim — the
    # original non-raw "\s" is an invalid escape sequence (SyntaxWarning on
    # modern Python).  Pattern values are unchanged.
    caps = r"([A-Z])"                         # a single capital letter
    prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"        # honorifics whose dot is not a stop
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"          # name suffixes whose dot is not a stop
    starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"   # e.g. U.S. or U.S.A.
    websites = r"[.](com|net|org|io|gov)"     # dots inside domain names
    digits = r"([0-9])"                       # for decimal points between digits
    pics = r"(FIG|FIGS|fig|figs|sub|TM|sup)"  # figure/markup tokens from patent/paper text
    
    def split_into_sentences2(text):
        """Split English (academic/patent) text into sentences.
    
        Dots that do not end a sentence (honorifics, acronyms, initials,
        decimal points, domain names, figure references, "e.g."/"i.e.", ...)
        are temporarily rewritten to the marker ``<prd>``; real sentence
        enders become ``<stop>``.  The text is then split on ``<stop>`` and
        the markers restored.
    
        Parameters
        ----------
        text : str
            Raw input text.
    
        Returns
        -------
        list[str]
            Stripped, non-empty sentences in their original order.
        """
        # Patterns are local raw strings: the function is self-contained and
        # the non-raw "\s" escape of the original (a SyntaxWarning on modern
        # Python) is avoided.
        caps = r"([A-Z])"
        prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
        starters = (r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s"
                    r"|We\s|But\s|However\s|That\s|This\s|Wherever)")
        acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = r"[.](com|net|org|io|gov)"
        digits = r"([0-9])"
        pics = r"(FIG|FIGS|fig|figs|sub|TM|sup)"
    
        # Pad with spaces so boundary patterns (" X." etc.) also match at the
        # very start/end of the input.
        text = " " + text + "  "
        text = re.sub(prefixes, r"\1<prd>", text)
        text = re.sub(websites, r"<prd>\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        # A lone initial: single capital letter followed by a period.
        text = re.sub(r"\s" + caps + "[.] ", r" \1<prd> ", text)
        # An acronym that really does end a sentence before a typical starter.
        text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
        # Dotted acronyms: A.B.C. then A.B.
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]",
                      r"\1<prd>\2<prd>\3<prd>", text)
        text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
        text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
        text = re.sub(" " + caps + "[.]", r" \1<prd>", text)
        # Decimal point between digits (e.g. 3.14).
        text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
        # text = re.sub(digits + "[.]", r"\1<prd>", text)  # disabled: too aggressive for this corpus
        text = re.sub(pics + "[.]", r"\1<prd>", text)
        # Move terminators outside a closing quote so the quote stays with
        # its sentence.
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        if "e.g." in text:
            text = text.replace("e.g.", "e<prd>g<prd>")
        if "i.e." in text:
            text = text.replace("i.e.", "i<prd>e<prd>")
        if "Fig. " in text:
            text = text.replace("Fig. ", "Fig<prd> ")
        if "No. " in text:
            text = text.replace("No. ", "No<prd> ")
        if "NO. " in text:
            text = text.replace("NO. ", "No<prd> ")
        if "CO." in text:
            text = text.replace("CO.", "Co<prd>")
        # Remaining terminators are real sentence boundaries.
        text = text.replace(". ", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        # text = text.replace(";", ";<stop>")  # disabled for this corpus
        # text = text.replace(",", ",<stop>")  # disabled for this corpus
        text = text.replace("<prd>", ".")
        # Fix: discard empty fragments — the trailing padding made the
        # original always return a spurious "" as the last element.
        return [s for s in (p.strip() for p in text.split("<stop>")) if s]
    

    部分代码进行了注释,是因为基于本工作文献文本的处理需要,也可按照自己的工作需要进行修改和微调,目前效果还不错的一个版本了,比NLTK的sent_tokenize效果要好很多。
    感觉sent_tokenize就是一个按照 . 进行句子分割的工具,实测效果不好,所以在检索到的版本基础上加以修改,先共享并记录一下。


    Life is beautiful. And the meaning of life is to be recognized. Keep Going, All.

  • 相关阅读:
    小程序---云开发----云函数
    小程序的基本概念-生命周期(组件 wxml)
    小程序的基本概念
    vue登录功能和将商品添加至购物车实现
    vue脚手架创建项目
    node.js评论列表和添加购物车数据库表创建
    学习脚手架--组件之间跳转与参数(组件之间参数)
    node.js 需要注意知识点
    如何查询小程序官方手册
    vue ui九宫格、底部导航、新闻列表、跨域访问
  • 原文地址:https://www.cnblogs.com/ache/p/16194425.html
Copyright © 2020-2023  润新知