1 import nltk
2 from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
<Text: Wall Street Journal>
1 dist = FreqDist(text7)  # frequency distribution over the tokens of text7 (Wall Street Journal)
2 len(dist)  # number of distinct tokens in the distribution
1 vocab1 = dist.keys()  # view of the distinct tokens counted in dist
2 # vocab1[:10] does not work on Python 3: a keys view cannot be sliced.
3 # In Python 3, dict.keys() returns an iterable view instead of a list,
4 list(vocab1)[:10]  # so copy it into a list before taking the first 10 tokens
['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']
1 freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]  # tokens longer than 5 characters that occur more than 100 times
2 freqwords
1 input1 = "List listed lists listing listings"
2 words1 = input1.lower().split(' ')  # normalize case, then split on single spaces
3 words1
['list', 'listed', 'lists', 'listing', 'listings']
1 porter = nltk.PorterStemmer()
2 [porter.stem(t) for t in words1]  # stem each word; the recorded output shows every form reduced to 'list'
['list', 'list', 'list', 'list', 'list']
1 udhr = nltk.corpus.udhr.words('English-Latin1')  # words of the 'English-Latin1' file in NLTK's udhr corpus
2 udhr[:20]  # first 20 words
1 [porter.stem(t) for t in udhr[:20]] # Porter stemming of the same 20 words (this is stemming, not lemmatization)
1 WNlemma = nltk.WordNetLemmatizer()
2 [WNlemma.lemmatize(t) for t in udhr[:20]]  # lemmatize the same first 20 udhr words with WordNet
1 # Tokenize by splitting on spaces
2 text11 = "Children shouldn't drink a sugary drink before bed."
3 text11.split(' ')  # naive split: the recorded output keeps "shouldn't" whole and leaves the period attached to 'bed.'
['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']
1 # Tokenize into words with NLTK
2 nltk.word_tokenize(text11)
1 # Split into sentences with NLTK
2 text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
3 sentences = nltk.sent_tokenize(text12)
4 len(sentences)  # number of sentences found
4
1 sentences
['This is the first sentence.',
'A gallon of milk in the U.S. costs $2.99.',
'Is this the third sentence?',
'Yes, it is!']
1 nltk.help.upenn_tagset('MD')  # print the Penn Treebank tagset entry for the tag 'MD'
MD: modal auxiliary
can cannot could couldn't dare may might must need ought shall should
shouldn't will would
1 text13 = nltk.word_tokenize(text11)
2 nltk.pos_tag(text13)  # pair each token with a part-of-speech tag
[('Children', 'NNP'),
('should', 'MD'),
("n't", 'RB'),
('drink', 'VB'),
('a', 'DT'),
('sugary', 'JJ'),
('drink', 'NN'),
('before', 'IN'),
('bed', 'NN'),
('.', '.')]
1 text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")  # structurally ambiguous sentence
2 nltk.pos_tag(text14)  # the recorded output gives exactly one tag per token
[('Visiting', 'VBG'),
('aunts', 'NNS'),
('can', 'MD'),
('be', 'VB'),
('a', 'DT'),
('nuisance', 'NN')]
1 # Parse the sentence structure with a small hand-written context-free grammar
2 text15 = nltk.word_tokenize("Alice loves Bob")
3 grammar = nltk.CFG.fromstring("""
4 S -> NP VP
5 VP -> V NP
6 NP -> 'Alice' | 'Bob'
7 V -> 'loves'
8 """)
10 parser = nltk.ChartParser(grammar)  # chart parser driven by the grammar defined above
11 trees = parser.parse_all(text15)  # all parses the grammar admits for the tokens
12 for tree in trees:
13 print(tree)  # NOTE(review): loop-body indentation appears to have been lost in this listing
(S (NP Alice) (VP (V loves) (NP Bob)))
1 # Load the grammar from a file
2 text16 = nltk.word_tokenize("I saw the man with a telescope")
3 grammar1 = nltk.data.load('mygrammar.cfg')  # 'mygrammar.cfg' is not shown here; the recorded repr reports 13 productions
4 grammar1
<Grammar with 13 productions>
1 # Generate the parse trees
2 parser = nltk.ChartParser(grammar1)
3 trees = parser.parse_all(text16)  # all parses the loaded grammar admits for the tokens
4 for tree in trees:
5 print(tree)  # NOTE(review): loop-body indentation appears to have been lost in this listing
(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))
1 from nltk.corpus import treebank
2 text17 = treebank.parsed_sents('wsj_0001.mrg')[0]  # first parsed sentence of the file 'wsj_0001.mrg'
3 print(text17)
(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
1 text18 = nltk.word_tokenize("The old man the boat")  # garden-path sentence: 'man' is the verb in the grammatical reading
2 nltk.pos_tag(text18)  # the recorded output tags 'old' as JJ and 'man' as NN
[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]
1 text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")  # grammatical but nonsensical sentence
2 nltk.pos_tag(text19)  # tagging still assigns a tag to every token (see recorded output)
[('Colorless', 'NNP'),
('green', 'JJ'),
('ideas', 'NNS'),
('sleep', 'VBP'),
('furiously', 'RB')]