    from numpy import *
    import time
    # Wall-clock timestamp captured when the module starts executing.
    # NOTE(review): nothing in the visible part of the file reads this value.
    starttime = time.time()
    def loadDataSet():
        """Return a small hard-coded corpus of tokenised posts with labels.

        Returns:
            (postingList, classVec): ``postingList`` is a list of six token
            lists and ``classVec`` holds one integer label per post, in the
            same order.

        NOTE(review): presumably label 1 flags an abusive post and 0 a normal
        one -- confirm against the intended use of the data.
        """
        # NOTE(review): the 4th and 6th posts were unterminated in the source
        # (closing brackets missing, which made the whole file unparsable).
        # Their final tokens, 'garbage' and 'stupid', are restored from the
        # commonly published version of this toy dataset -- please confirm.
        postingList = [
            ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
            ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
            ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
            ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
            ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
            ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
        ]
        classVec = [0, 1, 0, 1, 0, 1]
        return postingList, classVec
    def createVocabList(dataSet):
        """Collect the distinct tokens that occur in any document of dataSet.

        ``dataSet`` is an iterable of token sequences. The result is a list
        with one entry per unique token, in arbitrary (set iteration) order.
        """
        unique_tokens = set()
        for tokens in dataSet:
            # union() builds a new set each round, just like the ``|`` operator
            unique_tokens = unique_tokens.union(set(tokens))
        return list(unique_tokens)
    def setOfWords2Vec(vocabList, inputSet):
        """Encode a document as a smoothed set-of-words vector over vocabList.

        Uses the same convention as ``testx2vec``: every position carries a
        +1.0 offset, so a vocabulary word that occurs in ``inputSet`` maps to
        2.0 and one that does not maps to 1.0.

        Args:
            vocabList: list of vocabulary words; fixes the vector layout.
            inputSet: iterable of (hashable) words making up the document.

        Returns:
            A list of floats, one per entry of ``vocabList``.

        Side effects:
            Prints a notice for every input word missing from ``vocabList``.
        """
        # The source had lost its ``else:`` here, so the 2.0 written for a
        # present word was immediately overwritten with 1.0 and the
        # "not in my Vocabulary" notice fired for words that *were* known.
        present = set(inputSet)
        returnVec = [2.0 if word in present else 1.0 for word in vocabList]

        known = set(vocabList)
        for word in inputSet:
            if word not in known:
                print("the word: %s is not in my Vocabulary!" % word)
        return returnVec
    def txt2trainxy(filename1, filename2):
        """Load two labelled e-mail folders and build smoothed count vectors.

        Every file under ``email/<filename1>`` and ``email/<filename2>``
        (relative to the current working directory) is read. The first line
        of each file is lower-cased, split on non-word characters, and tokens
        of length <= 2 are dropped.

        Args:
            filename1: sub-folder name; also used as the label of its files.
            filename2: sub-folder name; also used as the label of its files.

        Returns:
            A 5-tuple ``(trainxws, trainxwb, trainy, trainx, fulltext)``:
            trainxws -- per-document set-of-words vectors (presence + 1.0),
            trainxwb -- per-document bag-of-words vectors (count + 1.0),
            trainy   -- label of each document (its folder name),
            trainx   -- token list of each document,
            fulltext -- vocabulary list that fixes the vector layout
                        (arbitrary order).
        """
        import re
        from collections import Counter
        from os import listdir

        # Split on runs of non-word characters. The source pattern r'W*' had
        # lost its backslash; '+' rather than '*' avoids splitting on empty
        # matches, which newer Python versions do between every character.
        reg = re.compile(r'\W+')
        # step 1: loading data...
        print("stet 1: loading data...")
        ld1 = listdir('email/' + filename1)
        ld2 = listdir('email/' + filename2)
        # Pair each file with the folder it came from. The source had lost the
        # ``else:`` that selected the second folder, so every file was opened
        # under filename2.
        sources = [(filename1, name) for name in ld1]
        sources += [(filename2, name) for name in ld2]
        trainy = [filename1] * len(ld1) + [filename2] * len(ld2)

        trainx = []
        for folder, name in sources:
            with open('email/' + folder + '/' + name) as handle:
                # NOTE(review): only the first line is used, as in the
                # original code; readline() also tolerates an empty file.
                first_line = handle.readline()
            trainx.append([tok for tok in reg.split(first_line.lower())
                           if len(tok) > 2])

        fulltext = list(set(tok for tokens in trainx for tok in tokens))

        trainxws = []  # set of words
        trainxwb = []  # bag of words
        for tokens in trainx:
            counts = Counter(tokens)  # missing words count as 0
            trainxws.append([min(counts[word], 1) + 1.0 for word in fulltext])
            trainxwb.append([counts[word] + 1.0 for word in fulltext])
        return trainxws, trainxwb, trainy, trainx, fulltext
    def testx2vec(testx, fulltext):
        # set of words
        testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #
        # bag of words 
        testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #
        for word in testx:
            if word not in fulltext:
                print "the word: %s is not in my fulltext!" % word
        return testxws, testxwb
    def bayes(testx, trainx, trainy, fulltext, testxwb=None):
        """Classify one document vector with a naive-Bayes model.

        The model is re-estimated from ``trainx``/``trainy`` on every call,
        and diagnostic output is printed along the way.

        Args:
            testx: numeric vector of the document to classify, aligned with
                the columns of ``trainx`` (raw tokens are not accepted).
            trainx: training vectors, one equally long row per document.
            trainy: class label of each training row.
            fulltext: vocabulary list; only used to print the most probable
                words of each class.
            testxwb: optional second vector (bag of words) for the same
                document. When given, a prediction for it is printed too.
                The original code read an undefined name ``testxwb`` here and
                therefore always raised NameError.

        Returns:
            The label predicted for ``testx``.

        Raises:
            ValueError: if there are no training samples.
        """
        # Local import keeps this function independent of the file-level
        # ``from numpy import *`` (which also shadows the builtin ``sum``).
        import numpy as np

        if len(trainy) == 0:
            raise ValueError("bayes() needs at least one training sample")

        print("---Getting Prob...")
        # Distinct labels in order of first appearance, so the class order
        # (and argmax tie-breaking) is deterministic.
        classes = []
        for label in trainy:
            if label not in classes:
                classes.append(label)
        total = len(trainy)
        class_rows = [[row for row in range(total) if trainy[row] == label]
                      for label in classes]
        # log prior of each class
        logproby = np.log(
            np.array([len(rows) for rows in class_rows]) / float(total))
        train = np.asarray(trainx, dtype=float)
        # per-class column sums, and their grand total (+2.0) as denominator
        numbxv = [train[rows].sum(axis=0) for rows in class_rows]
        numbx = [counts.sum() + 2.0 for counts in numbxv]
        probx = np.array([counts / denom
                          for counts, denom in zip(numbxv, numbx)])
        logprobx = np.log(probx)

        print("---Printing Prob...")
        # argsort() orders small to big, hence the negation for "top 5"
        ranking = (-probx).argsort()[:, :5]
        for label, top in zip(classes, ranking):
            print([fulltext[i] for i in top])
            print(label)
        print(class_rows)
        print(numbxv)
        print(logprobx.tolist())

        # step 4: showing the result...
        print("---Showing the result...")
        # set of words
        test_vec = np.asarray(testx, dtype=float)
        sumlogpxws = (logprobx * test_vec).sum(axis=1)
        sumlogpxyws = sumlogpxws + logproby
        print(sumlogpxws)
        print((probx * test_vec).sum(axis=1))
        bestyws = classes[int(sumlogpxyws.argmax())]
        print("---From set of words:  %s" % (bestyws,))

        # bag of words (only when the caller supplies that vector)
        if testxwb is not None:
            bag_vec = np.asarray(testxwb, dtype=float)
            sumlogpxywb = (logprobx * bag_vec).sum(axis=1) + logproby
            bestywb = classes[int(sumlogpxywb.argmax())]
            print("---From bag of words:  %s" % (bestywb,))
        return bestyws
    def main():
        """Run the spam/ham experiment end to end.

        Loads the ``spam`` and ``ham`` folders through ``txt2trainxy``, holds
        out a random subset of the documents, classifies each held-out
        document with ``bayes`` trained on the rest, prints the error rate,
        and finally classifies one hard-coded sample message.
        """
        import random

        # step 1: loading data...
        trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam', 'ham')
        print(fulltext)
        # step 2: training...
        print("step 2: training...")
        # step 3: testing...
        print("step 3: testing...")
        print("---Preparing testdata...")
        l = len(trainy)
        # Hold out 20 documents as before, but never more than l - 1:
        # random.sample raises when asked for more items than exist, and at
        # least one document has to remain for training.
        holdout = min(20, max(l - 1, 0))
        testid = random.sample(range(l), holdout)
        held_out = set(testid)  # O(1) membership for the split below
        testxxx = [trainxws[i] for i in testid]
        testyyy = [trainy[i] for i in testid]
        testtrainxws = [trainxws[i] for i in range(l) if i not in held_out]
        testtrainy = [trainy[i] for i in range(l) if i not in held_out]
        print("---Testing now...")
        errorcount = 0
        p = len(testid)
        for vector, expected in zip(testxxx, testyyy):
            if bayes(vector, testtrainxws, testtrainy, fulltext) != expected:
                errorcount += 1
        print(errorcount)
        print(p)
        if p:
            print("---Errorrate is:  %s" % (errorcount / float(p),))
        else:
            # nothing was held out, so there is no rate to report
            print("---Errorrate is:  n/a")
        # step 4: showing the result
        print("step 4: using...")
        testx = ['love', 'my', 'dalmation']
        print("the testx is:  %s" % (testx,))
        print("---Changing testx into vector...")
        testxws, testxwb = testx2vec(testx, fulltext)
        bayes(testxws, testtrainxws, testtrainy, fulltext)
    # Demo on the built-in toy corpus: build the vocabulary, vectorise every
    # post, then classify one sample entry.
    trainx, trainy = loadDataSet()
    fulltext = createVocabList(trainx)
    print(fulltext)
    print(setOfWords2Vec(fulltext, trainx[0]))
    trainxws = []
    for t in trainx:
        trainxws.append(setOfWords2Vec(fulltext, t))
    testEntry1 = ['love', 'my', 'dalmation']
    testEntry2 = ['stupid', 'garbage']
    # bayes() multiplies its first argument element-wise with the per-word
    # log-probabilities, so it needs a numeric vector aligned with fulltext,
    # not the raw token list that was passed here before.
    bayes(setOfWords2Vec(fulltext, testEntry1), trainxws, trainy, fulltext)
