Without further ado, here is the code:
import re

# Regexes for places where a "." should NOT end a sentence.
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
digits = r"([0-9])"
pics = r"(FIG|FIGS|fig|figs|sub|TM|sup)"

def split_into_sentences2(text):
    # text = open(item, 'r').read()
    # text = text.replace('\n', '')
    text = " " + text + " "
    # Protect periods that do not end a sentence with a <prd> placeholder.
    text = re.sub(prefixes, "\\1<prd>", text)                  # Mr. / Dr. ...
    text = re.sub(websites, "<prd>\\1", text)                  # .com / .org ...
    if "Ph.D." in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", " \\1<prd> ", text)   # single initials such as "A. "
    # An acronym followed by a sentence starter really is a boundary.
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)  # U.S.A.
    text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)                         # U.S.
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)   # Inc. / Ltd. ...
    text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)  # decimals such as 3.5
    # text = re.sub(digits + "[.]", "\\1<prd>", text)
    text = re.sub(pics + "[.]", "\\1<prd>", text)              # FIG. / fig. / TM. ...
    # Move sentence-ending punctuation outside closing quotes.
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    if "e.g." in text: text = text.replace("e.g.", "e<prd>g<prd>")
    if "i.e." in text: text = text.replace("i.e.", "i<prd>e<prd>")
    if "Fig. " in text: text = text.replace("Fig. ", "Fig<prd> ")
    if "No. " in text: text = text.replace("No. ", "No<prd> ")
    if "NO. " in text: text = text.replace("NO. ", "No<prd> ")
    if "CO." in text: text = text.replace("CO.", "Co<prd>")
    # Every remaining ". ", "?" and "!" is treated as a real boundary.
    text = text.replace(". ", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    # text = text.replace(";", ";<stop>")
    # text = text.replace(",", ",<stop>")
    text = text.replace("<prd>", ".")  # restore the protected periods
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences if s.strip()]  # drop the empty piece left after a trailing <stop>
    return sentences
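A quick usage sketch; the sample sentence below is my own, chosen just to exercise the prefix, decimal, figure, and acronym rules:

text = "Dr. Smith measured 3.5 mm, as shown in Fig. 2. The U.S.A. sample was larger."
for s in split_into_sentences2(text):
    print(s)

# Should print something like:
# Dr. Smith measured 3.5 mm, as shown in Fig. 2.
# The U.S.A. sample was larger.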
Some lines are commented out because of the processing needs of the literature text in this project; feel free to modify and fine-tune them for your own work. This is the best-performing version I have so far, and on my data it does much better than NLTK's sent_tokenize. sent_tokenize felt like little more than splitting on ".", and its real-world results were poor, so I modified a version found through searching and am sharing and recording it here for now.
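For a side-by-side comparison, here is a minimal sketch running the same sample through NLTK (sent_tokenize is actually backed by the pre-trained Punkt model rather than a bare "." split, but on abbreviation-heavy literature text it can still break in unexpected places; the sample string is my own):

# Assumes nltk is installed (pip install nltk).
import nltk
nltk.download("punkt")  # one-time model download; newer NLTK versions may want "punkt_tab"
from nltk.tokenize import sent_tokenize

sample = "Dr. Smith measured 3.5 mm, as shown in Fig. 2. The U.S.A. sample was larger."
print(sent_tokenize(sample))          # NLTK's segmentation
print(split_into_sentences2(sample))  # the function above, for comparison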
Life is beautiful. And the meaning of life is to be recognized. Keep Going, All.