核心步骤:
①将nsrxx.csv和zzsfp.csv两个表进行了合并和缩减,最终保留下了6个字段存入business.csv
字段含义:
id:企业id
xf_count:作为销方次数
gf_count:作为购方次数
del_count:销方购方次数之差
zfcs:作废次数
problem:是否为问题企业
②导入相关扩展包
from sklearn.model_selection import train_test_split # 划分数据集 from sklearn.feature_extraction import DictVectorizer #字典特征值提取 from sklearn.tree import DecisionTreeClassifier # 决策树 from sklearn.tree import export_graphviz # 决策树可视化 import pandas as pd
③获取数据
business=pd.read_csv("./business.csv")
④筛选特征值和目标值
x=business[["xf_count","gf_count","del_count","zfcs"]] #特征值 y=business["problem"] #目标值
⑤将特征值转化为字典格式
x=x.to_dict(orient="records")
⑥划分数据集
x_train,x_test,y_train,y_test=train_test_split(x,y)
⑦字典特征抽取
transfer=DictVectorizer() x_train=transfer.fit_transform(x_train) x_test=transfer.transform(x_test)
⑧决策树预估器(estimator)
estimator = DecisionTreeClassifier(criterion="entropy") # criterion默认为'gini'系数,也可选择信息增益熵'entropy' estimator.fit(x_train, y_train) # 调用fit()方法进行训练,()内为训练集的特征值与目标值
⑨模型评估
-
直接对比真实值和预测值
y_predict = estimator.predict(x_test) # 传入测试集特征值,预测所给测试集的目标值 print("y_predict: ", y_predict)
-
计算有问题的企业数
flag=0; for i in y_train: if(i==1): flag=flag+1 for i in y_test: if(i==1): flag=flag+1 print("有问题的企业数为:",flag)
-
对比真实值和预测值:
print("直接对比真实值和预测值: ", y_test == y_predict)
-
计算准确率
score = estimator.score(x_test, y_test) # 传入测试集的特征值和目标值 print("准确率为: ", score)
⑩决策树可视化
export_graphviz(estimator, out_file="tree_business.dot", feature_names=transfer.get_feature_names())
核心代码:
def tree_business(): # 1.获取数据 business=pd.read_csv("./business.csv") # 2.筛选特征值和目标值 x=business[["xf_count","gf_count","del_count","zfcs"]] #特征值 y=business["problem"] #目标值 # 3.数据处理(缺失值处理,特征值——>字典类型) #转换为字典 x=x.to_dict(orient="records") # 4.划分数据集 x_train,x_test,y_train,y_test=train_test_split(x,y) # 5.字典特征抽取 transfer=DictVectorizer() x_train=transfer.fit_transform(x_train) x_test=transfer.transform(x_test) # 6.决策树预估器(estimator) estimator = DecisionTreeClassifier(criterion="entropy") # criterion默认为'gini'系数,也可选择信息增益熵'entropy' estimator.fit(x_train, y_train) # 调用fit()方法进行训练,()内为训练集的特征值与目标值 # 7.模型评估 # 方法一:直接对比真实值和预测值 y_predict = estimator.predict(x_test) # 传入测试集特征值,预测所给测试集的目标值 print("y_predict: ", y_predict) #计算有问题的企业数: flag=0; for i in y_train: if(i==1): flag=flag+1 for i in y_test: if(i==1): flag=flag+1 print("有问题的企业数为:",flag) print("直接对比真实值和预测值: ", y_test == y_predict) # 方法二:计算准确率 score = estimator.score(x_test, y_test) # 传入测试集的特征值和目标值 print("准确率为: ", score) # 8.决策树可视化 export_graphviz(estimator, out_file="tree_business.dot", feature_names=transfer.get_feature_names()) return None
运行结果:
决策树可视化:
(因图规模过大无法展示完整)