使用NLTK进行基础的NLP处理

1 import nltk
2 from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


统计词语的数量

1 text7

<Text: Wall Street Journal>

1 sent7

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

1 len(sent7)

18

1 len(text7)

1 len(set(text7))

1 list(set(text7))[:10]

['bottom',
 'Richmond',
 'tension',
 'limits',
 'Wedtech',
 'most',
 'boost',
 '143.80',
 'Dale',
 'refunded']

词频

1 dist = FreqDist(text7)
2 len(dist)

1 vocab1 = dist.keys()
2 #vocab1[:10] 
3 # In Python 3 dict.keys() returns an iterable view instead of a list
4 list(vocab1)[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

1 dist['four']

1 freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]
2 freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

标准化和词干提取

1 input1 = "List listed lists listing listings"
#把字母都小写，再进行分词处理
2 words1 = input1.lower().split(' ')
3 words1

['list', 'listed', 'lists', 'listing', 'listings']

1 porter = nltk.PorterStemmer()
2 [porter.stem(t) for t in words1]

['list', 'list', 'list', 'list', 'list']

词形还原

1 udhr = nltk.corpus.udhr.words('English-Latin1')
2 udhr[:20]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

1 [porter.stem(t) for t in udhr[:20]] # Still stemming (not lemmatization); compare with the WordNet lemmatizer output below

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

1 WNlemma = nltk.WordNetLemmatizer()
2 [WNlemma.lemmatize(t) for t in udhr[:20]]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

分词和分句

1 #根据空格分词
2 text11 = "Children shouldn't drink a sugary drink before bed."
3 text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

1 #nltk分词
2 nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

1 #nltk分句
2 text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
3 sentences = nltk.sent_tokenize(text12)
4 len(sentences)

4

1 sentences

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

使用NLTK进行文本高级处理
词性标注（POS tagging）

1 nltk.help.upenn_tagset('MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would

1 text13 = nltk.word_tokenize(text11)
2 nltk.pos_tag(text13)

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

1 text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
2 nltk.pos_tag(text14)

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN')]

 1 # 解析语法结构
 2 text15 = nltk.word_tokenize("Alice loves Bob")
 3 grammar = nltk.CFG.fromstring("""
 4 S -> NP VP
 5 VP -> V NP
 6 NP -> 'Alice' | 'Bob'
 7 V -> 'loves'
 8 """)
 9 
10 parser = nltk.ChartParser(grammar)
11 trees = parser.parse_all(text15)
12 for tree in trees:
13     print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))

1 #对句子分词，并从文件加载语法规则
2 text16 = nltk.word_tokenize("I saw the man with a telescope")
3 grammar1 = nltk.data.load('mygrammar.cfg')
4 grammar1

<Grammar with 13 productions>

1 #生成语法树
2 parser = nltk.ChartParser(grammar1)
3 trees = parser.parse_all(text16)
4 for tree in trees:
5     print(tree)

(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))

1 from nltk.corpus import treebank
2 text17 = treebank.parsed_sents('wsj_0001.mrg')[0]
3 print(text17)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))

词性标注与句法歧义

1 text18 = nltk.word_tokenize("The old man the boat")
2 nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

1 text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")
2 nltk.pos_tag(text19)

[('Colorless', 'NNP'),
 ('green', 'JJ'),
 ('ideas', 'NNS'),
 ('sleep', 'VBP'),
 ('furiously', 'RB')]

相关阅读:
我的WCF之旅（1）：创建一个简单的WCF程序
 与众不同 windows phone (15) Media（媒体）之后台播放音频
 与众不同 windows phone (14) Media（媒体）之音频播放器, 视频播放器, 与 Windows Phone 的音乐和视频中心集成
 与众不同 windows phone (10) Push Notification（推送通知）之推送 Tile 通知, 推送自定义信息
 与众不同 windows phone (17) Graphic and Animation（画图和动画）
与众不同 windows phone (5) Chooser（选择器）
与众不同 windows phone (26) Contacts and Calendar（联系人和日历）
与众不同 windows phone (7) Local Database（本地数据库）
与众不同 windows phone (19) Device（设备）之陀螺仪传感器, Motion API
与众不同 windows phone (16) Media（媒体）之编辑图片, 保存图片到相册, 与图片的上下文菜单“应用程序...”和“共享...”关联, 与 Windows Phone 的图片中心集成
原文地址：https://www.cnblogs.com/zhengzhe/p/8573075.html