• 朴素贝叶斯分类器基本代码 && n折交叉优化 2


    这个代码基于上一个代码

    不同的是:读取了txt文件,改变了min_df与max_df的参数

    import re
    import pandas as pd
    import warnings
    import numpy as np
    from sklearn.metrics import roc_auc_score
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB as MNB #多项分布朴素贝叶斯公式
    from sklearn.naive_bayes import BernoulliNB as BNB
    from sklearn.model_selection  import cross_val_score
    warnings.filterwarnings("ignore")
    def proces(col2):
        """Tokenize a raw text string into lowercase alphabetic words.

        Every character that is not an ASCII letter is replaced by a space;
        the result is lower-cased and split on whitespace.

        Args:
            col2: The raw text to tokenize.

        Returns:
            A list of lowercase words (empty when the text has no letters).
        """
        letters_only = re.sub("[^a-zA-Z]", " ", col2)
        return letters_only.lower().split()
    # Load the labelled training set. The file has no header row, so the two
    # columns are named 0 (class label) and 1 (raw text).
    # The line terminator is written as the escape sequence '\n'; a literal
    # line break inside the quotes is an unterminated string (syntax error).
    train=pd.read_table('sentimentLabel.txt', lineterminator='\n', header=None, names=[0, 1])
    print(train.head(5))
    # Textual class names are mapped to the integer targets used for training.
    class_mapping={'Negative':0, 'Positive':1}
    train_labers=train[0].map(class_mapping)
    train_texts=train[1]

    # Load the test set with the same layout and the same label mapping.
    test=pd.read_table('test.txt', lineterminator='\n', header=None, names=[0, 1])
    test_labers=test[0].map(class_mapping)
    test_texts=test[1]
    
    # Clean every raw text with proces() and re-join its tokens with single
    # spaces, giving one normalised string per document.
    train_data=[' '.join(proces(text)) for text in train_texts]
    test_data=[' '.join(proces(text)) for text in test_texts]
    # Combined corpus: all training documents first, then all test documents.
    data_all=train_data+test_data
    # TF-IDF features over word unigrams and bigrams with English stop words
    # removed. max_df is given as an integer (60), not a proportion.
    # The on/off switches are passed as real booleans: recent scikit-learn
    # releases validate parameter types and reject ints such as 1 here.
    count_vec = TfidfVectorizer(min_df=1,
                                max_df=60,
                                analyzer='word',
                                ngram_range=(1, 2),
                                use_idf=True,
                                smooth_idf=True,
                                sublinear_tf=True,
                                stop_words='english'
    )
    # Remember where the training documents end inside the combined corpus.
    length=len(train_data)
    # NOTE(review): the vocabulary and IDF weights are learned from train and
    # test text together, so test documents influence the features -- confirm
    # this is intended.
    data_all=count_vec.fit_transform(data_all)
    # Split the feature matrix back into its training and test rows.
    train_data=data_all[:length]
    test_data=data_all[length:]
    
    
    # Multinomial naive Bayes with default hyper-parameters (BNB is imported
    # as the Bernoulli alternative).
    model=MNB()
    # Fit on the full training matrix so the model can be used for
    # predictions on test_data.
    model.fit(train_data,train_labers)
    # A bare `MNB(alpha=1.0, class_prior=False, fit_prior=True)` expression
    # used to follow the fit; it built a second estimator that was thrown
    # away immediately, so it has been removed.

    # Number of cross-validation folds.
    # NOTE(review): presumably chosen by scanning candidate fold counts for
    # the best mean ROC AUC, like the disabled search loop that used to sit
    # here -- confirm before reusing this value on other data.
    n_folds=297
    cv_scores=cross_val_score(model, train_data, train_labers, cv=n_folds, scoring='roc_auc')
    print("roc_auc", np.mean(cv_scores))
    化繁为简 大巧不工
  • 相关阅读:
    python版本切换及添加路径
    python下载及安装步骤
    day01计算机基础-python解释器-pycharm介绍
    查看pycharm有效期
    对于Makefile的基本使用
    Shell命令整理
    一起来免费听无损高品质音乐吧!
    记录 解决ubuntu16.04 ‘E: 无法获得锁 /var/lib/dpkg/lock-frontend
    Ubuntu16手动安装OpenStack——keystone篇
    python3中的RE(正则表达式)
  • 原文地址:https://www.cnblogs.com/mpeter/p/11172284.html
Copyright © 2020-2023  润新知