• 数据挖掘初次接触!学习代码


    import pandas as pd
    import numpy as np
    from time import time

    # --- Load and inspect the training table ---------------------------------
    data = pd.read_csv("dankuan.csv", sep=';', index_col=0)  # semicolon-separated; first column is the index
    data.head()      # peek at the first rows (default 5)
    data.describe()  # quick summary statistics
    data.g4_term_type.value_counts()

    # Missing-value handling: per-column fill values first, then a catch-all
    # fill of 6 for anything still missing.
    data2 = data.fillna(value={'star_class': 1300, 'main_offer_level': 0,
                               'mon_flow_last2': 0, 'mon_flow_last1': 0,
                               'thrmon_flow_trend': 0}).fillna(6)
    data2.g4_term_type.value_counts(dropna=False)  # value counts, NaN included
    data2.isnull().any()  # confirm no column still contains missing values

    # pd.get_dummies would be a simpler alternative to the encoder below

    from sklearn import preprocessing

    # One-hot encode the two categorical columns. The encoder fitted here is
    # reused (transform only) on the scoring data at the bottom of the script.
    enc = preprocessing.OneHotEncoder()
    a1 = data2[['g4_term_type', 'thrmon_flow_trend']]
    enc.fit(a1)

    # FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin bool is the correct dtype argument.
    data2_onehot = pd.DataFrame(enc.transform(a1).toarray().astype(bool))

    # NOTE(review): n_values_ / feature_indices_ were removed in scikit-learn
    # 0.22 (use enc.categories_ on modern versions) -- confirm the installed
    # sklearn version before relying on this print.
    print(enc.n_values_, enc.feature_indices_, data2_onehot.shape)

    # Name the dummy columns (8 = total category count across both fields).
    data2_onehot.columns = ['data2_onehot1', 'data2_onehot2', 'data2_onehot3',
                            'data2_onehot4', 'data2_onehot5', 'data2_onehot6',
                            'data2_onehot7', 'data2_onehot8']
    data3 = pd.concat([data2, data2_onehot.set_index(data2.index)], axis=1)

    data3.head()

    # Feature list: every column except the target label.
    names = data3.columns
    names = names.drop(['is_zf_flag'])  # drop the label from the feature set
    data3[names].head()
    names

    X = data3[names]
    y = data3.is_zf_flag

    from sklearn.model_selection import train_test_split, GridSearchCV

    # 70/30 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # --- Model 1: random forest tuned with a grid search ---------------------
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=200, max_features=0.7)
    model = GridSearchCV(rf,
                         param_grid={'max_depth': [1, 5, 10],
                                     'min_samples_leaf': [2, 5, 10, 20, 50, 100]},
                         cv=3)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # --- Model 2: plain random forest ----------------------------------------
    clf = RandomForestClassifier(n_jobs=1)  # n_jobs: number of workers to use
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)  # NOTE: overwrites the grid-search predictions above
    y_pred.shape

    # Per-feature importances (percent, rounded) and the strongest feature.
    print(list(zip(names, np.round(clf.feature_importances_ * 100, 2))))
    print(names[clf.feature_importances_.argmax()],
          np.round(clf.feature_importances_ * 100, 2).max())

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    # accuracy / recall / precision / F1 -- all computed on clf's predictions,
    # since y_pred was overwritten above.
    print(accuracy_score(y_test, y_pred), recall_score(y_test, y_pred))
    print(precision_score(y_test, y_pred), f1_score(y_test, y_pred))

    from sklearn.model_selection import cross_val_score
    # Cross-validation guards against overfitting by an overly complex model.
    re_yc = cross_val_score(model, X_train, y_train, cv=10)
    re = cross_val_score(clf, X_train, y_train, cv=15)

    re_yc.max()

    re_yc.min()

    re_yc.max() - re_yc.min()

    # --- Score a new table with the trained models ----------------------------
    data_yc = pd.read_csv("dankuan_yc.csv", sep=';', index_col=0)
    data_yc2 = data_yc.fillna(value={'star_class': 1300, 'main_offer_level': 0,
                                     'mon_flow_last2': 0, 'mon_flow_last1': 0,
                                     'thrmon_flow_trend': 0}).fillna(6)
    a2 = data_yc2[['g4_term_type', 'thrmon_flow_trend']]
    # FIX: do NOT refit the encoder on the scoring data -- refitting can change
    # the category-to-column mapping (or the number of columns) and silently
    # corrupt the features fed to the trained models. Reuse the training-time
    # encoder and only transform here.
    data_yc_onehot = pd.DataFrame(enc.transform(a2).toarray().astype(bool))
    # FIX: the original assigned these column names to data2_onehot (the
    # training frame) -- a copy-paste typo; they belong on the scoring frame.
    data_yc_onehot.columns = ['data2_onehot1', 'data2_onehot2', 'data2_onehot3',
                              'data2_onehot4', 'data2_onehot5', 'data2_onehot6',
                              'data2_onehot7', 'data2_onehot8']
    data3_yc = pd.concat([data_yc2, data_yc_onehot.set_index(data_yc2.index)], axis=1)
    # NOTE(review): the models were trained on the columns in `names` (i.e.
    # without 'is_zf_flag'); this assumes the scoring file carries exactly
    # those feature columns -- confirm against the data.
    names_yc = data3_yc.columns
    xxx = data3_yc[names_yc]

    # Grid-search model predictions.
    ans = model.predict(xxx)
    pd.DataFrame(ans).set_index(xxx.index).to_csv("dkjg.csv", header=False)

    # FIX: write the plain random forest's predictions to a separate file so
    # they no longer silently overwrite the grid-search results above.
    ans = clf.predict(xxx)
    pd.DataFrame(ans).set_index(xxx.index).to_csv("dkjg_clf.csv", header=False)
    

      

  • 相关阅读:
    baselines库中cmd_util.py模块对atari游戏的包装为什么要分成两部分并在中间加入flatten操作呢?
    baselines库中atari_wrappers.py中的环境包装器的顺序问题
    baselines中环境包装器EpisodicLifeEnv的分析
    【转载】 为何大厂选择减人而不是降薪?
    神经网络初始化:xavier,kaiming、ortho正交初始化在CNN网络中的使用
    嵌入式开发之linuxweb 服务器Nginx 详解
    iotop
    blktrace 编译与使用
    文件系统预读
    玩儿观赏鱼教程一:鱼缸分类
  • 原文地址:https://www.cnblogs.com/CQ-LQJ/p/7834476.html
Copyright © 2020-2023  润新知