1 #coding=utf-8
2 from numpy import *
3
# Parse a raw document into tokens.
def textParse(bigString):
    """Split raw text into lowercase tokens longer than 2 characters.

    BUGFIX: the original pattern r'W*' split on the literal letter 'W';
    r'\\W+' splits on runs of non-word characters as intended.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    # drop short tokens (<= 2 chars) which carry little signal
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
# Build the vocabulary list containing every unique token.
def createVocabList(dataSet):
    """Return a list of every distinct token appearing in any document of dataSet."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document (iterable of tokens) into a 0/1 set-of-words vector over vocabList.

    Each slot is 1 if the corresponding vocabulary word occurs in inputSet,
    else 0.  Unknown words are reported and ignored.
    """
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            # BUGFIX: Python 2 print statement -> Python 3 print() call
            print('word ', word, 'not in dict')
    return retVocabList
# Alternative model: bag-of-words (multinomial) instead of set-of-words.
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a vector of per-token occurrence counts over vocabList."""
    counts = [0] * len(vocabList)
    # map each word to its first position once, instead of repeated O(n) .index()
    positions = {}
    for i, w in enumerate(vocabList):
        positions.setdefault(w, i)
    for token in inputSet:
        if token in positions:
            counts[positions[token]] += 1
    return counts
def trainNB0(trainMatrix, trainCatergory):
    """Train a two-class naive Bayes model.

    trainMatrix    : 2-D array, one word-count (or 0/1) vector per document
    trainCatergory : array of 0/1 class labels, one per document
    Returns (p0Vect, p1Vect, pAbusive): per-word log-probabilities for each
    class and the prior probability of class 1.
    """
    numDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCatergory) / float(numDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so no
    # conditional probability is ever exactly zero
    wordCounts = [ones(numWords), ones(numWords)]
    totals = [2.0, 2.0]
    for docVec, label in zip(trainMatrix, trainCatergory):
        wordCounts[label] += docVec
        totals[label] += sum(docVec)
    # take logs so that products of many small probabilities don't underflow
    p0Vect = log(wordCounts[0] / totals[0])
    p1Vect = log(wordCounts[1] / totals[1])
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: return 1 if the class-1 log-posterior wins, else 0.

    The dot product of the word vector with the per-word log-probabilities,
    plus the log prior, is the (unnormalised) log posterior for each class.
    """
    logP1 = sum(vec2Classify * p1Vec) + log(pClass1)
    logP0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if logP1 > logP0 else 0
def stopWords():
    """Load the stop-word list, one word per line, from 'stopwords.txt' in the cwd.

    BUGFIX: the file handle is now closed via a context manager, and the
    newline is stripped with rstrip('\\n') — the original eachLine[:-1]
    chopped a real character off the last line when it lacked a newline.
    """
    with open('stopwords.txt') as f:
        return [line.rstrip('\n') for line in f]
def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary tokens as (token, count) pairs.

    Counts each vocabulary word's occurrences in fullText and sorts by
    descending frequency.
    BUGFIX: dict.iteritems() no longer exists in Python 3 -> dict.items().
    """
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]
def localWords(rss1, rss0):
    """Train a naive Bayes classifier on two RSS feeds and print the test error rate.

    Entries from rss1 are labelled class 1, entries from rss0 class 0.
    The 30 most frequent words and the stop-word list are removed from the
    vocabulary, 20 random documents are held out for testing, and the
    classifier is trained on the rest.
    Returns (vocabList, p0V, p1V) as produced by trainNB0.
    """
    import feedparser  # third-party; imported locally, only needed here
    feed1 = feedparser.parse(rss1)
    feed0 = feedparser.parse(rss0)
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    # remove the 30 most frequent words -- they carry little class signal
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # BUGFIX: stopWords() returns plain strings, not (word, count) pairs;
    # the original compared pairW[0] -- the word's FIRST CHARACTER -- so stop
    # words were never removed and single letters were stripped instead.
    for stopWord in stopWords():
        if stopWord in vocabList:
            vocabList.remove(stopWord)
    # hold out 20 random docs as the test set
    # BUGFIX: range() is lazy in Python 3, so materialize before del
    trainingSet = list(range(2 * minLen)); testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def getTopWords(ny, sf):
    """Train on the two feeds and print the most class-indicative tokens per class.

    BUGFIX: Python 2 print statements converted to print() calls; the unused
    local 'import operator' was removed.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # keep only tokens whose per-class log-probability clears the -6.0 threshold
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
def main():
    """Demo entry point: classify Craigslist NY vs SF personals feeds."""
    localWords('http://newyork.craigslist.org/stp/index.rss','http://sfbay.craigslist.org/stp/index.rss')

if __name__ == '__main__':
    main()