• Python中文本挖掘精简版


    import xlrd
    import jieba
    import sys  
    import importlib
    import os         #python内置的包,用于进行文件目录操作,我们将会用到os.listdir函数  
    import pickle    #导入cPickle包并且取一个别名pickle #持久化类
    import random
    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from pylab import mpl  
    from sklearn.naive_bayes import MultinomialNB # 导入多项式贝叶斯算法包
    from sklearn import svm
    
    from sklearn import metrics 
    from sklearn.datasets.base import Bunch
    from sklearn.feature_extraction.text import TfidfVectorizer
    importlib.reload(sys)
    
    
    # Convert the content and the categories into vector (list) form.
    # Segmented words of all training and test data.
    trainContentdatasave, testContentdatasave = [], []

    # Text content; filled in place by importTrainContentdata / importTestContentdata.
    trainContentdata, testContentdata = [], []

    # Category labels; filled in place by importTrainlabeldata / importTestlabeldata.
    trainlabeldata, testlabeldata = [], []
    
    def _read_first_column(path, sheet_name="Sheet1"):
        """Return the values of column 0 for every row of one worksheet.

        Args:
            path: Path of the .xls workbook to open with xlrd.
            sheet_name: Name of the worksheet to read.

        Returns:
            A list with one cell value per row, in row order (row 0 included).
        """
        sheet = xlrd.open_workbook(path).sheet_by_name(sheet_name)
        return [sheet.cell(row, 0).value for row in range(sheet.nrows)]


    # Load the text descriptions for training and testing.
    def importTrainContentdata():
        """Append column 0 of '20180716_train.xls' to trainContentdata."""
        trainContentdata.extend(_read_first_column('20180716_train.xls'))


    def importTestContentdata():
        """Append column 0 of '20180716_test.xls' to testContentdata."""
        testContentdata.extend(_read_first_column('20180716_test.xls'))


    # Load the category labels for training and testing.
    def importTrainlabeldata():
        """Append column 0 of '20180716_train_label.xls' to trainlabeldata."""
        trainlabeldata.extend(_read_first_column('20180716_train_label.xls'))


    def importTestlabeldata():
        """Append column 0 of '20180716_test_label.xls' to testlabeldata."""
        testlabeldata.extend(_read_first_column('20180716_test_label.xls'))
    
    
    if __name__ == "__main__":
        # Fill the module-level lists with the text samples and their labels.
        importTrainContentdata()
        importTestContentdata()
        importTrainlabeldata()
        importTestlabeldata()

        # Classifier settings kept from earlier experiments (not executed here):
        #   MultinomialNB(alpha=0.052)
        #   svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
        #   LogisticRegression(C=3)

        # Learn the TF-IDF vocabulary from the training text only and reuse it
        # for the test text so both matrices share the same feature columns.
        # NOTE(review): the raw cell text is vectorised as-is; nothing in this
        # block segments it with jieba -- confirm the sheets are pre-tokenised.
        tv = TfidfVectorizer()
        train_data = tv.fit_transform(trainContentdata)
        test_data = tv.transform(testContentdata)

        # Only the `svm` module is imported at the top of the file, so the
        # classifier has to be reached as svm.SVC (a bare SVC is a NameError).
        clf = svm.SVC(C=1500)
        clf.fit(train_data, trainlabeldata)

        # Predict once and derive the accuracy from that single pass; this is
        # the same mean accuracy that clf.score(test_data, testlabeldata) reports.
        predicted = clf.predict(test_data)
        print(metrics.accuracy_score(testlabeldata, predicted))

        # Integer copies of the true labels (a) and the predicted labels (b).
        # assumes every label parses as a number (e.g. 1.0 or "1") -- TODO confirm
        a = [int(float(label)) for label in testlabeldata]
        b = [int(float(label)) for label in predicted]

        # Optional dump of the predictions, one per line (disabled):
        # with open('F:/goverment/ArticleMining/predict.txt', 'w') as f:
        #     f.writelines(str(label) + '\n' for label in b)
        #     f.write("写好了")
        # metrics_result(a, b)
  • 相关阅读:
    C++模板编译模型
    C++继承与构造函数、复制控制
    PHP判断用户是手机端?还是浏览器端访问?
    CentOS6.5搭建LNMP
    星级评分--封装成jquery插件
    扩展thinkphp5的redis类方法
    js实现星级评分之方法一
    js原型与继承
    一个基于Tp3.2(thinkphp3.2)的工会管理系统
    实验楼的php比赛题,网页数据提取。
  • 原文地址:https://www.cnblogs.com/caiyishuai/p/13270962.html
Copyright © 2020-2023  润新知