Natural Language Processing with Python
Chapter 6.1
由于 nltk.FreqDist 的排序问题,提取电影评论特征词的代码略有改动:改为显式按词频降序(词频相同时按字母序)排序后,取前 2000 个词作为特征词。
import random

import nltk
from nltk.corpus import movie_reviews as mr


def document_features(document, words_features):
    """Build word-presence features for a single document.

    Args:
        document: Iterable of word tokens that make up the document.
        words_features: Iterable of vocabulary words to look for.

    Returns:
        A dict with one ``'has(<word>)'`` key per entry of
        ``words_features``; the value is ``True`` when that word occurs in
        ``document`` and ``False`` otherwise.
    """
    # Build the set once so every membership test below is a hash lookup.
    document_words = set(document)
    return {'has(%s)' % word: (word in document_words)
            for word in words_features}


def test_doc_classification(num_features=2000, test_size=100, seed=None):
    """Train a Naive Bayes classifier on the movie review corpus and report it.

    Prints the accuracy on the held-out documents, then the five most
    informative features of the trained classifier.

    Args:
        num_features: How many of the most frequent (lower-cased) corpus
            words are used as features.
        test_size: Number of documents, taken from the front of the document
            list, that are held out for evaluation; the rest are used for
            training.
        seed: When not ``None``, the document list is shuffled with a
            ``random.Random(seed)`` generator before splitting. With the
            default ``None`` the documents keep corpus order, i.e. they stay
            grouped by category, so the held-out slice is taken from the
            front of that grouping rather than from a mix of categories.
    """
    documents = [(list(mr.words(fileid)), category)
                 for category in mr.categories()
                 for fileid in mr.fileids(categories=category)]
    if seed is not None:
        random.Random(seed).shuffle(documents)

    all_words_dist = nltk.FreqDist(w.lower() for w in mr.words())
    # Sort explicitly instead of relying on FreqDist's own ordering:
    # highest count first, ties broken alphabetically for a stable result.
    words_freq = sorted(all_words_dist.items(),
                        key=lambda item: (-item[1], item[0]))[:num_features]
    words_features = [word for word, _ in words_freq]

    featuresets = [(document_features(doc, words_features), category)
                   for doc, category in documents]

    train_set, test_set = featuresets[test_size:], featuresets[:test_size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Call form of print: identical output on Python 2, valid on Python 3.
    print(nltk.classify.accuracy(classifier, test_set))

    classifier.show_most_informative_features(5)
结果如下,accuracy为0.86:
0.86
Most Informative Features
has(outstanding) = True pos : neg = 10.4 : 1.0
has(seagal) = True neg : pos = 8.7 : 1.0
has(mulan) = True pos : neg = 8.1 : 1.0
has(wonderfully) = True pos : neg = 6.3 : 1.0
has(damon) = True pos : neg = 5.7 : 1.0