• 机器学习技法笔记:Homework #7 Decision Tree&Random Forest相关习题


    原文地址:https://www.jianshu.com/p/7ff6fd6fc99f

    问题描述

    图1 13-15
    图2 16-20

    程序实现

    13-15

    # coding:utf-8
    
    # decision_tree.py
    
    import numpy as np
    
    
    def ReadData(dataFile):
        """Load a whitespace-separated numeric text file into a 2-D array.

        Every non-blank line becomes one row; each whitespace-separated field
        is converted with ``float``.

        Args:
            dataFile: path of the text file to read.

        Returns:
            ``np.ndarray`` built from the list of parsed rows.

        Raises:
            ValueError: if a field cannot be converted to ``float``.
        """
        data_list = []
        with open(dataFile, 'r') as f:
            # Iterate the file lazily instead of materialising all lines first.
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank (or trailing empty) line would add an empty row
                    # and make the resulting array ragged, so skip it.
                    continue
                data_list.append([float(field) for field in fields])
        return np.array(data_list)
    
    
    def sign(n):
        """Return 1 when ``n >= 0`` holds and -1 otherwise (zero maps to 1)."""
        return 1 if n >= 0 else -1
    
    
    def GetSortedArray(dataArray,i):
        """Return a copy of ``dataArray`` with rows ordered ascending by column ``i``.

        Rows whose values in column ``i`` are equal keep their original
        relative order.
        """
        rows = dataArray.tolist()
        # list.sort is stable, exactly like sorted(), so ties are left in place.
        rows.sort(key=lambda row: row[i])
        return np.array(rows)
    
    
    def GetSplitData(pred,dataArray):
        """Partition the rows of ``dataArray`` by their predicted branch.

        Args:
            pred: 1-D array with one entry per row of ``dataArray``; entries
                are expected to be -1 or 1.
            dataArray: array whose first axis indexes the samples.

        Returns:
            ``(falseData, trueData)``: the rows whose prediction is -1 and the
            rows whose prediction is 1, each in original order. A side with no
            rows keeps the trailing dimensions of ``dataArray`` (for example
            shape ``(0, num_columns)``), so column indexing such as
            ``falseData[:, -1]`` still works on it.

        Raises:
            AssertionError: if ``pred`` and ``dataArray`` differ in length.
        """
        assert pred.shape[0]==dataArray.shape[0],"wrong shape of prediction!"
        falseMask = pred == -1
        trueMask = pred == 1
        # Entries that are neither -1 nor 1 are reported once each and dropped.
        num_invalid = pred.shape[0] - int(falseMask.sum()) - int(trueMask.sum())
        for _ in range(num_invalid):
            print("wrong prediction!")
        # Boolean indexing copies the selected rows and, unlike np.array([]),
        # preserves the column dimension when nothing is selected.
        return dataArray[falseMask], dataArray[trueMask]
    
    
    def GetWeightedImpurity(pred,dataY):
        """Return the size-weighted Gini impurity of the split described by ``pred``.

        ``pred`` assigns each sample to the -1 branch or the +1 branch. For
        each branch the Gini index of the labels in that branch is computed
        (0 for an empty branch); the two indices are then averaged with the
        branch sizes as weights.

        Raises:
            AssertionError: if some entry of ``pred`` is neither -1 nor 1.
        """
        total = dataY.shape[0]
        negCount = (pred == -1).sum()
        posCount = (pred == 1).sum()
        assert negCount + posCount == total, "wrong prediction!"

        def branchGini(size, agreeValue):
            # pred + dataY equals agreeValue (-2 or 2) exactly where a sample
            # of this branch has the same label as the branch sign.
            if size == 0:
                return 0
            agreeing = ((pred + dataY) == agreeValue).sum()
            disagreeing = size - agreeing
            return 1 - (agreeing ** 2 + disagreeing ** 2) / size ** 2

        negGini = branchGini(negCount, -2)
        posGini = branchGini(posCount, 2)
        return (negCount * negGini + posCount * posGini) / total
    
    
    def decision_stump(dataArray):
        """Find the one-feature threshold split with the lowest weighted Gini impurity.

        The last column of ``dataArray`` holds the labels; every other column
        is a feature. For each feature the candidate thresholds are the
        midpoints between consecutive distinct values, and each candidate is
        tried with direction ``s`` in ``(-1.0, 1.0)``. A sample is sent to
        branch ``s * sign(x[d] - theta)``, where a zero difference counts as +1.

        Candidates are scanned by feature, then direction, then ascending
        threshold, and an earlier candidate wins ties.

        The "split" ``theta = -inf`` that sends every sample to the same branch
        is only returned when no real threshold exists, i.e. every feature is
        constant. Otherwise a tie in impurity would select it and leave one
        branch empty.

        Args:
            dataArray: 2-D array, features in the leading columns and labels
                in the last one.

        Returns:
            ``(s, d, theta, pred)``: direction, feature index, threshold and
            the per-sample branch assignment (array of -1.0 / 1.0). If the
            array has no rows or no feature columns, ``s``, ``d`` and
            ``theta`` are ``np.inf`` and ``pred`` is all zeros.
        """
        num_data=dataArray.shape[0]
        num_dim=dataArray.shape[1]-1
        dataY=dataArray[:,-1]
        min_e=np.inf
        min_s=np.inf
        min_d=np.inf
        min_theta=np.inf
        min_pred=np.zeros((num_data,))
        found=False
        for d in range(num_dim):
            column=dataArray[:,d]
            # Sorted distinct values; midpoints of neighbours are the thresholds.
            values=np.unique(column)
            thetas=(values[:-1]+values[1:])/2
            for s in [-1.0,1.0]:
                for theta in thetas:
                    # Vectorised form of s*sign(x-theta) over all samples.
                    pred=s*np.where(column-theta>=0,1.0,-1.0)
                    now_e=GetWeightedImpurity(pred,dataY)
                    # Strict comparison keeps the earliest of equally good splits.
                    if(now_e<min_e):
                        found=True
                        min_e=now_e
                        min_s=s
                        min_d=d
                        min_theta=theta
                        min_pred=pred
        if (not found) and num_data>0 and num_dim>0:
            # No feature can separate anything: fall back to the degenerate
            # split that assigns every sample to the -1 branch.
            min_s=-1.0
            min_d=0
            min_theta=-np.inf
            min_pred=min_s*np.ones((num_data,))
        return min_s,min_d,min_theta,min_pred
    
    
    # Split parameters [s, d, theta] of every decision node built so far, keyed
    # by node id. It is shared by all trees, so entries are overwritten whenever
    # a later tree reuses a node id; plotTree reads it to recognise decision nodes.
    paraDict={}


    class _TreeDict(dict):
        """Tree dictionary that also carries the split parameters of its own nodes.

        It behaves like the plain ``{id: ...}`` dictionary it is built from; the
        extra ``paraDict`` attribute maps node id -> ``[s, d, theta]`` for this
        tree only, so predictions do not depend on trees trained later.
        """

        def __init__(self, tree, params):
            super().__init__(tree)
            self.paraDict = params


    def _grow_tree(node_id, dataArray, prune, params):
        """Recursively build the plain-dict tree rooted at ``node_id``, filling ``params``."""
        dataX=dataArray[:,:-1]
        dataY=dataArray[:,-1]
        if(dataY.min()==dataY.max()): # all labels identical
            return {node_id:dataY[0]}
        if((dataX==dataX[0,:]).all()): # all feature vectors identical: cannot split
            return {node_id:sign(np.sum(dataY))}
        s,d,theta,pred=decision_stump(dataArray)
        falseArray,trueArray=GetSplitData(pred,dataArray)
        if falseArray.shape[0]==0 or trueArray.shape[0]==0:
            # The stump failed to separate the data; recursing would make no
            # progress (and an empty side cannot be processed), so emit a
            # majority-vote leaf instead.
            return {node_id:sign(np.sum(dataY))}
        params[node_id]=[s,d,theta]
        if prune:
            # One-level tree: both children become majority-vote leaves.
            return {node_id:{-1:{node_id*2:sign(falseArray[:,-1].sum())},
                             1:{node_id*2+1:sign(trueArray[:,-1].sum())}}}
        falseTree=_grow_tree(node_id*2,falseArray,False,params)
        trueTree=_grow_tree(node_id*2+1,trueArray,False,params)
        return {node_id:{-1:falseTree,1:trueTree}}


    def decision_tree(id,dataArray,prune=False):
        """Grow a decision tree on ``dataArray`` (labels in the last column).

        The result is a nested dictionary: a leaf is ``{node_id: label}`` and a
        decision node is ``{node_id: {-1: subtree, 1: subtree}}``, where the
        children of node ``k`` have ids ``2*k`` and ``2*k+1``. A node becomes a
        leaf when all its labels are equal, when all its feature vectors are
        equal, or when the stump cannot separate its samples.

        Args:
            id: id given to the root node (``predict`` expects 1).
            dataArray: 2-D array, features followed by the label column.
            prune: when true, only the root split is made and its two children
                are leaves labelled by the sign of the label sum on each side.

        Returns:
            A ``dict`` subclass holding the tree. Its ``paraDict`` attribute
            maps each decision-node id to ``[s, d, theta]``; the same entries
            are also written to the module-level ``paraDict``.
        """
        params={}
        tree=_grow_tree(id,dataArray,prune,params)
        # Keep the shared registry up to date for code that still reads it.
        paraDict.update(params)
        return _TreeDict(tree,params)


    def GetZeroOneError(pred,dataY):
        """Return the fraction of positions where ``pred`` differs from ``dataY``."""
        return (pred!=dataY).sum()/dataY.shape[0]


    def predict(treeDict,dataX):
        """Predict a label for every row of ``dataX`` with the given tree.

        Traversal starts at node id 1. At a decision node with parameters
        ``[s, d, theta]`` the sample follows branch ``s * sign(x[d] - theta)``,
        a zero difference counting as +1.

        Args:
            treeDict: tree as returned by ``decision_tree``. If it carries a
                ``paraDict`` attribute, those split parameters are used;
                otherwise the module-level ``paraDict`` is consulted.
            dataX: 2-D array of feature rows (no label column).

        Returns:
            1-D float array with one predicted label per row.
        """
        params=getattr(treeDict,"paraDict",None)
        if params is None:
            params=paraDict
        num_data=dataX.shape[0]
        pred=np.zeros((num_data,))
        for n in range(num_data):
            x=dataX[n,:]
            node_id=1
            node=treeDict
            while True:
                node=node[node_id]
                if not isinstance(node,dict):
                    # Reached a leaf: node is the label itself.
                    break
                s,d,theta=params[node_id]
                # Same convention as sign(): a zero difference maps to +1.
                branch=s*(1 if x[d]-theta>=0 else -1)
                node=node[branch]
                node_id=list(node.keys())[0]
            pred[n]=node
        return pred
    
    
    def getNumLeafs(myTree):
        """Count the leaves of a tree given as ``{id: leaf}`` or ``{id: {-1: sub, 1: sub}}``."""
        rootKey = list(myTree.keys())[0]
        subtree = myTree[rootKey]
        if type(subtree).__name__ != "dict":
            # The root maps straight to a label, so it is a single leaf.
            return 1
        return getNumLeafs(subtree[-1]) + getNumLeafs(subtree[1])
    
    
    def getTreeDepth(myTree):
        """Return the number of node levels in the tree (a lone leaf has depth 1)."""
        rootKey = list(myTree.keys())[0]
        children = myTree[rootKey]
        if type(children).__name__ != "dict":
            return 1
        deepestChild = max(getTreeDepth(children[-1]), getTreeDepth(children[1]))
        return 1 + deepestChild
    
    
    import matplotlib.pyplot as plt
    
    # Box style used for decision nodes, box style used for leaves, and the
    # arrow style of the connecting lines, as passed to Axes.annotate.
    decisionNode = {"boxstyle": "round", "fc": "0.8", "pad": 0.8}
    leafNode = {"boxstyle": "circle", "fc": "0.8", "pad": 0.1}
    arrow_args = {"arrowstyle": "<-"}
    
    
    def plotNode(nodeTxt, centerPt, parentPt, nodeType):
        """Draw a boxed label at ``centerPt`` with an arrow linking it to ``parentPt``.

        Both points are given in axes-fraction coordinates; ``nodeType`` is the
        bbox style dictionary of the box. Drawing happens on ``createPlot.ax1``.
        """
        annotationOptions = dict(
            xy=parentPt, xycoords='axes fraction',
            xytext=centerPt, textcoords='axes fraction',
            va="center", ha="center",
            bbox=nodeType, arrowprops=arrow_args,
        )
        createPlot.ax1.annotate(nodeTxt, **annotationOptions)
    
    def plotMidText(centerPt, parentPt, txtString):
        """Write ``txtString`` halfway between a node and its parent, rotated by 30 degrees."""
        childX, childY = centerPt[0], centerPt[1]
        halfwayX = (parentPt[0] - childX) / 2.0 + childX
        halfwayY = (parentPt[1] - childY) / 2.0 + childY
        createPlot.ax1.text(halfwayX, halfwayY, txtString,
                            va="center", ha="center", rotation=30)
    
    def plotTree(myTree, centerPt, parentPt, nodeTxt):
        """Recursively draw ``myTree`` with its root placed at ``centerPt``.

        The node with id 1 is drawn as a plain box without an arrow. Any other
        node is drawn with an arrow to ``parentPt`` and with ``nodeTxt`` (the
        branch key) written on the edge: ids found in ``paraDict`` are drawn as
        decision nodes showing their id, all others as leaves showing their
        label. Children are shifted horizontally by ``plotTree.xDict[id]`` and
        one level (``1/plotTree.totalD``) downwards.
        """
        nodeId = list(myTree.keys())[0]
        if nodeId == 1:
            createPlot.ax1.annotate(str(1), xy=parentPt, xycoords='axes fraction',
                                    va="center", ha="center", bbox=decisionNode)
        else:
            isDecision = nodeId in paraDict
            if isDecision:
                plotNode(str(nodeId), centerPt, parentPt, decisionNode)
            else:
                plotNode(str(myTree[nodeId]), centerPt, parentPt, leafNode)
            plotMidText(centerPt, parentPt, nodeTxt)
            if not isDecision:
                # Leaves have nothing below them.
                return
        children = myTree[nodeId]
        if type(children).__name__ != "dict":
            return
        for branch in children.keys():
            # branch is -1 or 1, so it also selects the direction of the shift.
            childCenter = (centerPt[0] + branch * plotTree.xDict[nodeId],
                           centerPt[1] - 1.0 / plotTree.totalD)
            plotTree(children[branch], childCenter, centerPt, str(branch))
    
    def createPlot(inTree,savename="13.png"):
        """Draw ``inTree`` on figure 1 and save the picture to ``savename``.

        Layout data is stored as attributes of ``plotTree``: the leaf count
        (``totalW``), the depth (``totalD``) and ``xDict``, the horizontal
        offset used for the children of each node id. The root offset is
        ``4 / totalW`` and each deeper level uses the previous level's offset
        divided by 1.8.
        """
        figure = plt.figure(1, facecolor='white', figsize=(20, 10))
        figure.clf()
        createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])
        plotTree.totalW = float(getNumLeafs(inTree))
        plotTree.totalD = float(getTreeDepth(inTree))
        offsets = {1: 4 * 1.0 / plotTree.totalW}
        for level in range(2, int(plotTree.totalD) + 1):
            # All ids of one level share the offset derived from the first id
            # of the level above.
            levelOffset = offsets[2 ** (level - 2)] / 1.8
            for nodeId in range(2 ** (level - 1), 2 ** level):
                offsets[nodeId] = levelOffset
        plotTree.xDict = offsets
        rootPoint = (0.43, 1.0)
        plotTree(inTree, rootPoint, rootPoint, '')
        plt.savefig(savename)
    
    
    
    if __name__=="__main__":

        # Grow the full tree on the training set and show its structure.
        trainArray = ReadData("hw7_train.dat")
        fullTree = decision_tree(1, trainArray)
        print(fullTree)

        # 13: picture of the tree
        createPlot(fullTree)

        # 14: in-sample 0/1 error
        trainPred = predict(fullTree, trainArray[:, :-1])
        print("the Ein of the tree:", GetZeroOneError(trainPred, trainArray[:, -1]))

        # 15: out-of-sample 0/1 error
        testArray = ReadData("hw7_test.dat")
        testPred = predict(fullTree, testArray[:, :-1])
        print("the Eout of the tree:", GetZeroOneError(testPred, testArray[:, -1]))
    

    16-20

    # coding: utf-8
    
    # random_forest.py
    
    
    from  decision_tree import *
    
    
    def bagging(N,dataArray):
        """Return ``N`` rows of ``dataArray`` drawn uniformly at random with replacement."""
        numRows = dataArray.shape[0]
        # One scalar draw per sampled row, in order.
        sampledRows = [dataArray[np.random.randint(low=0, high=numRows), :]
                       for _ in range(N)]
        return np.array(sampledRows)
    
    
    def random_forest(dataArray,iterations,prune=False):
        """Train ``iterations`` bagged trees and track their in-sample errors.

        Each round draws a bootstrap sample as large as ``dataArray``, grows a
        tree on it (passing ``prune`` through to ``decision_tree``) and
        evaluates it on the full ``dataArray``. The round number is printed as
        a progress indicator.

        Returns:
            ``(g_list, ein_g_list, ein_G_list)``: the trees, the 0/1 error of
            each single tree, and the 0/1 error of the running vote of all
            trees trained so far (a tied vote counts as +1).
        """
        sampleCount = dataArray.shape[0]
        features = dataArray[:, :-1]
        labels = dataArray[:, -1]
        trees = []
        treeErrors = []
        forestErrors = []
        voteSum = np.zeros((sampleCount,))
        for roundNo in range(1, iterations + 1):
            print(roundNo)
            tree = decision_tree(1, bagging(sampleCount, dataArray), prune)
            votes = predict(tree, features)
            voteSum += votes
            trees.append(tree)
            treeErrors.append(GetZeroOneError(votes, labels))
            # Element-wise sign of the accumulated votes; zero maps to +1.
            forestVote = np.where(voteSum >= 0, 1.0, -1.0)
            forestErrors.append(GetZeroOneError(forestVote, labels))
        return trees, treeErrors, forestErrors
    
    
    def plot_line_chart(X=None,Y=None,nameX="t",nameY="Ein(gt)",saveName="16.png"):
        """Plot ``Y`` against ``X`` as a line with point markers and save it.

        Every point whose x value is a multiple of 100 is labelled with its y
        value rounded to four decimals.

        Args:
            X: x values; defaults to ``0 .. len(Y)-1`` so it always matches ``Y``.
            Y: y values; defaults to ``0 .. 2999``.
            nameX: label of the x axis.
            nameY: label of the y axis.
            saveName: path of the image file to write.
        """
        if Y is None:
            Y = np.arange(0,3000,1).tolist()
        if X is None:
            X = np.arange(0,len(Y),1).tolist()

        fig = plt.figure(figsize=(30,12))
        try:
            plt.plot(X,Y,'b')
            plt.plot(X,Y,'ro')
            plt.xlim((X[0]-1,X[-1]+1))
            for (x,y) in zip(X,Y):
                if(x%100==0):
                    plt.text(x+0.1,y,str(round(y,4)))
            plt.xlabel(nameX)
            plt.ylabel(nameY)
            plt.title(nameY+" versus "+nameX)
            plt.savefig(saveName)
        finally:
            # Release the figure; otherwise every call leaves one more open
            # figure behind.
            plt.close(fig)
        return
    
    
    def plot_bar_chart(X=None,Y=None,nameX="t",nameY="Ein(gt)",saveName="16.png"):
        """Plot ``Y`` against ``X`` as a bar chart (y axis fixed to [0, 1]) and save it.

        Every bar whose x value is a multiple of 100 is labelled with its
        height rounded to four decimals.

        Args:
            X: bar positions; defaults to ``0 .. len(Y)-1`` so it always matches ``Y``.
            Y: bar heights; defaults to ``0 .. 2999``.
            nameX: label of the x axis.
            nameY: label of the y axis.
            saveName: path of the image file to write.
        """
        if Y is None:
            Y = np.arange(0,3000,1).tolist()
        if X is None:
            X = np.arange(0,len(Y),1).tolist()

        fig = plt.figure(figsize=(30,12))
        try:
            # Positions and heights are passed positionally: the keyword of the
            # first argument is ``x`` in current Matplotlib (formerly ``left``),
            # so ``left=X`` is no longer accepted.
            plt.bar(X,Y,width=1,align="center",yerr=0.000001)
            for (c,w) in zip(X,Y):
                if(c%100==0):
                    plt.text(c,w*1.03,str(round(w,4)))
            plt.xlabel(nameX)
            plt.ylabel(nameY)
            plt.xlim(X[0]-1,X[-1]+1)
            plt.ylim(0,1)
            plt.title(nameY+" versus "+nameX)
            plt.savefig(saveName)
        finally:
            # Release the figure; otherwise every call leaves one more open
            # figure behind.
            plt.close(fig)
        return
    
    
    
    if __name__ == "__main__":

        # Number of trees per forest; also the length of every plotted curve.
        NUM_TREES = 3000

        def _forest_error_curve(g_list, evalArray):
            """Return the 0/1 error of the running forest vote after each tree.

            ``evalArray`` holds features followed by the label column. Tree
            ``t`` adds its predictions to the vote sum, and the sign of the sum
            (zero counting as +1) is scored against the labels. The tree
            number is printed as a progress indicator.
            """
            dataX = evalArray[:, :-1]
            dataY = evalArray[:, -1]
            pred_G = np.zeros((evalArray.shape[0],))
            errors = []
            for t, tree in enumerate(g_list):
                print(t+1)
                pred_G += predict(treeDict=tree, dataX=dataX)
                errors.append(GetZeroOneError(np.where(pred_G >= 0, 1.0, -1.0), dataY))
            return errors

        rounds = list(range(NUM_TREES))
        dataArray = ReadData("hw7_train.dat")
        testArray = ReadData("hw7_test.dat")

        g_list, ein_g_list, ein_G_list = random_forest(dataArray, NUM_TREES)

        # 16
        plot_bar_chart(X=rounds, Y=ein_g_list)

        # 17
        plot_line_chart(X=rounds, Y=ein_G_list, nameY="Ein(Gt)", saveName="17.png")

        # 18
        eout_G_list = _forest_error_curve(g_list, testArray)
        plot_line_chart(X=rounds, Y=eout_G_list, nameY="Eout(Gt)", saveName="18.png")

        # Same experiment with one-branch (pruned) trees.
        g_list, ein_g_list, ein_G_list = random_forest(dataArray, NUM_TREES, True)

        # 19
        plot_line_chart(X=rounds, Y=ein_G_list, nameY="Ein(Gt)", saveName="19.png")

        # 20
        eout_G_list = _forest_error_curve(g_list, testArray)
        plot_line_chart(X=rounds, Y=eout_G_list, nameY="Eout(Gt)", saveName="20.png")
    

    运行结果

    图3 13结果
    图4 14-15结果
    图5 16结果
    图6 17结果
    图7 18结果
    图8 19结果
    图9 20结果

  • 相关阅读:
    pycharm的一些操作指令和技巧
    Python开发:模块
    python字符编码
    Python开发:Python基础杂货铺
    Python之函数
    python介绍
    记录
    HDOJ3699 A hard Aoshu Problem[暴力]
    HDOJ3697 Selecting courses[贪心]
    HDOJ4054 Hexadecimal View[编码题]
  • 原文地址:https://www.cnblogs.com/cherrychenlee/p/10803514.html
Copyright © 2020-2023  润新知