Without further ado, here is the code:
import re

# Regexes for places where a "." should NOT end a sentence.
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
digits = r"([0-9])"
pics = r"(FIG|FIGS|fig|figs|sub|TM|sup)"

def split_into_sentences2(text):
    # text = open(item, 'r').read()
    # text = text.replace('\n', '')
    text = " " + text + " "
    # Protect periods that do not end a sentence with a <prd> placeholder.
    text = re.sub(prefixes, "\\1<prd>", text)                  # Mr. / Dr. ...
    text = re.sub(websites, "<prd>\\1", text)                  # .com / .org ...
    if "Ph.D." in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", " \\1<prd> ", text)   # single initials such as "A. "
    # An acronym followed by a sentence starter really is a boundary.
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)  # U.S.A.
    text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)                         # U.S.
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)   # Inc. / Ltd. ...
    text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)  # decimals such as 3.5
    # text = re.sub(digits + "[.]", "\\1<prd>", text)
    text = re.sub(pics + "[.]", "\\1<prd>", text)              # FIG. / fig. / TM. ...
    # Move sentence-ending punctuation outside closing quotes.
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    if "e.g." in text: text = text.replace("e.g.", "e<prd>g<prd>")
    if "i.e." in text: text = text.replace("i.e.", "i<prd>e<prd>")
    if "Fig. " in text: text = text.replace("Fig. ", "Fig<prd> ")
    if "No. " in text: text = text.replace("No. ", "No<prd> ")
    if "NO. " in text: text = text.replace("NO. ", "No<prd> ")
    if "CO." in text: text = text.replace("CO.", "Co<prd>")
    # Every remaining ". ", "?" and "!" is treated as a real boundary.
    text = text.replace(". ", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    # text = text.replace(";", ";<stop>")
    # text = text.replace(",", ",<stop>")
    text = text.replace("<prd>", ".")  # restore the protected periods
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences if s.strip()]  # drop the empty piece left after a trailing <stop>
    return sentences
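A quick usage sketch; the sample sentence below is my own, chosen just to exercise the prefix, decimal, figure, and acronym rules:

text = "Dr. Smith measured 3.5 mm, as shown in Fig. 2. The U.S.A. sample was larger."
for s in split_into_sentences2(text):
    print(s)

# Should print something like:
# Dr. Smith measured 3.5 mm, as shown in Fig. 2.
# The U.S.A. sample was larger.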
Some lines are commented out because of the processing needs of the literature text in this project; feel free to modify and fine-tune them for your own work. This is the best-performing version I have so far, and on my data it does much better than NLTK's sent_tokenize. sent_tokenize felt like little more than splitting on ".", and its real-world results were poor, so I modified a version found through searching and am sharing and recording it here for now.
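For a side-by-side comparison, here is a minimal sketch running the same sample through NLTK (sent_tokenize is actually backed by the pre-trained Punkt model rather than a bare "." split, but on abbreviation-heavy literature text it can still break in unexpected places; the sample string is my own):

# Assumes nltk is installed (pip install nltk).
import nltk
nltk.download("punkt")  # one-time model download; newer NLTK versions may want "punkt_tab"
from nltk.tokenize import sent_tokenize

sample = "Dr. Smith measured 3.5 mm, as shown in Fig. 2. The U.S.A. sample was larger."
print(sent_tokenize(sample))          # NLTK's segmentation
print(split_into_sentences2(sample))  # the function above, for comparison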
Life is beautiful. And the meaning of life is to be recognized. Keep Going, All.