• Machine Learning 8: Regression Study Notes


    Derivation of linear regression and weighted regression

     

    I. Ordinary Least Squares (OLS) Linear Regression

    Loss function:

    $J(w)=\frac{1}{n}\sum_{i=1}^{n}\left(y_i-x_i^{T}w\right)^{2}=\frac{1}{n}\lVert Y-Xw\rVert^{2}$


    where $Y$, $w$, and $x_i$ are vectors and $X$ is a matrix. To minimize this loss, take the partial derivative of $J(w)$ with respect to $w$. Expanding the loss first:

    $J(w)=\frac{1}{n}(Y-Xw)^{T}(Y-Xw)=\frac{1}{n}\left(Y^{T}Y-2w^{T}X^{T}Y+w^{T}X^{T}Xw\right)$


    The matrix derivative identities needed are:

    $\frac{\partial (BA)}{\partial A}=B^{T}$

    $\frac{\partial (A^{T}B)}{\partial A}=B$

    $\frac{\partial (A^{T}BA)}{\partial A}=2BA$
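
    The third identity holds when $B$ is symmetric, which is the case in this derivation since $B=X^{T}X$. As a quick sanity check, the following NumPy sketch (illustrative variable names, not part of the original post) compares the analytic gradient $2BA$ of $f(A)=A^{T}BA$ with a central finite-difference estimate:

    import numpy as np

    # Check d(A^T B A)/dA = 2 B A for a symmetric B via central finite differences.
    rng = np.random.default_rng(0)
    n = 4
    B = rng.standard_normal((n, n))
    B = (B + B.T) / 2                      # symmetrize B so the identity applies
    A = rng.standard_normal(n)

    f = lambda a: a @ B @ a                # scalar function f(A) = A^T B A

    eps = 1e-6
    numeric = np.array([(f(A + eps * np.eye(n)[i]) - f(A - eps * np.eye(n)[i])) / (2 * eps)
                        for i in range(n)])
    analytic = 2 * B @ A
    print(np.allclose(numeric, analytic, atol=1e-5))   # expected: True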


    Taking the derivative of $J(w)$ with respect to $w$ therefore gives:

    $\frac{\partial J(w)}{\partial w}=\frac{1}{n}\left(2X^{T}Xw-2X^{T}Y\right)=0\;\Rightarrow\;X^{T}Xw=X^{T}Y$


    When $X^{T}X$ is invertible, the solution is:

    $w=(X^{T}X)^{-1}X^{T}Y$


    Because:

    $Xw=X(X^{T}X)^{-1}X^{T}Y=\hat{Y}=\hat{H}Y$


    the hat matrix is:

    $\hat{H}=X(X^{T}X)^{-1}X^{T}$
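
    To make the closed-form solution concrete, here is a short NumPy sketch on synthetic data (variable names and data are illustrative, not from the original post). It computes $w=(X^{T}X)^{-1}X^{T}Y$ and the hat matrix, and cross-checks the result against numpy.linalg.lstsq:

    import numpy as np

    # Synthetic data: a column of ones plays the role of the intercept term.
    rng = np.random.default_rng(1)
    n = 50
    X = np.column_stack([np.ones(n), rng.uniform(0, 1, n)])
    Y = 3.0 + 1.7 * X[:, 1] + rng.normal(0, 0.1, n)

    # Closed-form OLS solution w = (X^T X)^{-1} X^T Y; solve() avoids forming an explicit inverse.
    w = np.linalg.solve(X.T @ X, X.T @ Y)

    # Hat matrix H = X (X^T X)^{-1} X^T, so that yHat = H Y.
    H = X @ np.linalg.solve(X.T @ X, X.T)
    yHat = H @ Y

    print(w)                                                      # roughly [3.0, 1.7]
    print(np.allclose(yHat, X @ w))                               # True
    print(np.allclose(w, np.linalg.lstsq(X, Y, rcond=None)[0]))   # True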

    II. Weighted Regression

    Loss function:

    $J(w)=\frac{1}{n}\sum_{i=1}^{n}\alpha_i\left(y_i-x_i^{T}w\right)^{2}=\frac{1}{n}(Y-Xw)^{T}\alpha(Y-Xw)$


    The derivation proceeds the same way. Expanding the loss:

    $J(w)=\frac{1}{n}\left(Y^{T}\alpha Y-2w^{T}X^{T}\alpha Y+w^{T}X^{T}\alpha Xw\right)$


    where $\alpha$ is the diagonal matrix of sample weights. Taking the derivative with respect to $w$ gives:

    $\frac{\partial J(w)}{\partial w}=\frac{1}{n}\left(2X^{T}\alpha Xw-2X^{T}\alpha Y\right)=0\;\Rightarrow\;X^{T}\alpha Xw=X^{T}\alpha Y$


    So we obtain:

    $w=(X^{T}\alpha X)^{-1}X^{T}\alpha Y$


    and the weighted hat matrix is:

    $\hat{H}=X(X^{T}\alpha X)^{-1}X^{T}\alpha$
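
    Analogously, a minimal sketch of the weighted solution $w=(X^{T}\alpha X)^{-1}X^{T}\alpha Y$ with a diagonal weight matrix (again on made-up data). In locally weighted regression the diagonal entries come from a kernel; the lwlr function in regression.py below uses $\alpha_{jj}=\exp\!\left(-\lVert x_j-x_{\text{query}}\rVert^{2}/(2k^{2})\right)$:

    import numpy as np

    rng = np.random.default_rng(2)
    n = 50
    X = np.column_stack([np.ones(n), rng.uniform(0, 1, n)])
    Y = 3.0 + 1.7 * X[:, 1] + rng.normal(0, 0.1, n)

    # Diagonal weight matrix alpha; arbitrary positive weights here, just for illustration.
    alpha = np.diag(rng.uniform(0.1, 1.0, n))

    # Weighted least squares: w = (X^T alpha X)^{-1} X^T alpha Y
    w = np.linalg.solve(X.T @ alpha @ X, X.T @ alpha @ Y)

    # Weighted hat matrix H = X (X^T alpha X)^{-1} X^T alpha
    H = X @ np.linalg.solve(X.T @ alpha @ X, X.T @ alpha)
    print(np.allclose(H @ Y, X @ w))   # expected: True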

    Regression from Machine Learning in Action

    Python 3.6 code

    test8.py

    # -*- coding: utf-8 -*-
    
    import sys
    sys.path.append(".")   # sys.path entries must be directories, not files; regression.py sits next to this script
    
    import regression
    from numpy import *
    import matplotlib.pyplot as plt
    
    # xArr,yArr = regression.loadDataSet('ex0.txt')
    #
    # print(xArr)
    #
    # ws = regression.standRegres(xArr, yArr)
    # print(ws)
    #
    #
    # xMat = mat(xArr)
    # yMat = mat(yArr)
    # yHat = xMat*ws
    #
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
    #
    # xCopy = xMat.copy()
    # xCopy.sort(0)
    # yHat = xCopy*ws
    # ax.plot(xCopy[:,1],yHat)
    # plt.show()
    #
    # yHat = xMat*ws
    # array = corrcoef(yHat.T, yMat)
    # print(array)
    
    
    # xArr,yArr = regression.loadDataSet('ex0.txt')
    # print(yArr[0])
    # reg1 = regression.lwlr(xArr[0], xArr, yArr, 1.0)
    # print(reg1)
    # reg001 = regression.lwlr(xArr[0], xArr, yArr, 0.001)
    # print(reg001)
    #
    # yHat = regression.lwlrTest(xArr, xArr, yArr, 0.01)
    # xMat = mat(xArr)
    # srtInd = xMat[:,1].argsort(0)
    # xSort = xMat[srtInd][:,0,:]
    #
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.plot(xSort[:,1],yHat[srtInd])
    # ax.scatter(xMat[:,1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
    # plt.show()
    #
    #
    # abX, abY = regression.loadDataSet('abalone.txt')
    # yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    # yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    # yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    #
    # error01 = regression.rssError(abY[0:99], yHat01.T)
    # print(error01)
    # error1 = regression.rssError(abY[0:99], yHat1.T)
    # print(error1)
    # error10 = regression.rssError(abY[0:99], yHat10.T)
    # print(error10)
    #
    # yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    # error01 = regression.rssError(abY[100:199], yHat01.T)
    # print(error01)
    #
    # yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    # error1 = regression.rssError(abY[100:199], yHat1.T)
    # print(error1)
    #
    # yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    # error10 = regression.rssError(abY[100:199], yHat10.T)
    # print(error10)
    #
    # ws = regression.standRegres(abX[0:99], abY[0:99])
    # yHat = mat(abX[100:199])*ws
    # error = regression.rssError(abY[100:199], yHat.T.A)
    # print(error)
    
    # abX, abY = regression.loadDataSet("abalone.txt")
    # ridgeWeights = regression.ridgeTest(abX, abY)
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.plot(ridgeWeights)
    # plt.show()
    # print(ridgeWeights)
    # print("ridgeWeights over")
    
    xArr, yArr = regression.loadDataSet('abalone.txt')
    weights = regression.stageWise(xArr, yArr, 0.005, 1000)
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(weights)
    plt.show()
    
    
    # print("least squares")
    # xMat = mat(xArr)
    # yMat = mat(yArr).T
    # xMat = regression.regularize(xMat)
    # yM = mean(yMat, 0)
    # yMat = yMat - yM
    # weights = regression.standRegres(xMat, yMat.T)
    # print(weights.T)
    
    
    
    # lgX = []; lgY = []
    # regression.setDataCollect(lgX, lgY)
    # print(lgX)
    # print("__________")
    # print(lgY)
    
    
    print("over!")

    regression.py

    '''
    Created on Jan 8, 2011
    
    @author: Peter
    '''
    from numpy import *
    
    def loadDataSet(fileName):      #general function to parse tab -delimited floats
        numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields 
        dataMat = []; labelMat = []
        fr = open(fileName)
        for line in fr.readlines():
            lineArr =[]
            curLine = line.strip().split('\t')
            for i in range(numFeat):
                lineArr.append(float(curLine[i]))
            dataMat.append(lineArr)
            labelMat.append(float(curLine[-1]))
        return dataMat,labelMat
    
    def standRegres(xArr,yArr):
        xMat = mat(xArr); yMat = mat(yArr).T
        xTx = xMat.T*xMat
        if linalg.det(xTx) == 0.0:
            print("This matrix is singular, cannot do inverse")
            return
        ws = xTx.I * (xMat.T*yMat)
        return ws
    
    def lwlr(testPoint,xArr,yArr,k=1.0):
        xMat = mat(xArr); yMat = mat(yArr).T
        m = shape(xMat)[0]
        weights = mat(eye((m)))
        for j in range(m):                      #next 2 lines create weights matrix
            diffMat = testPoint - xMat[j,:]     #
            weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
        xTx = xMat.T * (weights * xMat)
        if linalg.det(xTx) == 0.0:
            print("This matrix is singular, cannot do inverse")
            return
        ws = xTx.I * (xMat.T * (weights * yMat))
        return testPoint * ws
    
    def lwlrTest(testArr,xArr,yArr,k=1.0):  #loops over all the data points and applies lwlr to each one
        m = shape(testArr)[0]
        yHat = zeros(m)
        for i in range(m):
            yHat[i] = lwlr(testArr[i],xArr,yArr,k)
        return yHat
    
    def lwlrTestPlot(xArr,yArr,k=1.0):  #same thing as lwlrTest except it sorts X first
        yHat = zeros(shape(yArr))       #easier for plotting
        xCopy = mat(xArr)
        xCopy.sort(0)
        for i in range(shape(xArr)[0]):
            yHat[i] = lwlr(xCopy[i],xArr,yArr,k)
        return yHat,xCopy
    
    def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays
        return ((yArr-yHatArr)**2).sum()
    
    def ridgeRegres(xMat,yMat,lam=0.2):
        xTx = xMat.T*xMat
        denom = xTx + eye(shape(xMat)[1])*lam
        if linalg.det(denom) == 0.0:
            print("This matrix is singular, cannot do inverse")
            return
        ws = denom.I * (xMat.T*yMat)
        return ws
        
    def ridgeTest(xArr,yArr):
        xMat = mat(xArr); yMat=mat(yArr).T
        yMean = mean(yMat,0)
        yMat = yMat - yMean     #to eliminate X0 take mean off of Y
        #regularize X's
        xMeans = mean(xMat,0)   #calc mean then subtract it off
        xVar = var(xMat,0)      #calc variance of Xi then divide by it
        xMat = (xMat - xMeans)/xVar
        numTestPts = 30
        wMat = zeros((numTestPts,shape(xMat)[1]))
        for i in range(numTestPts):
            ws = ridgeRegres(xMat,yMat,exp(i-10))
            wMat[i,:]=ws.T
        return wMat
    
    def regularize(xMat):#regularize by columns
        inMat = xMat.copy()
        inMeans = mean(inMat,0)   #calc mean then subtract it off
        inVar = var(inMat,0)      #calc variance of Xi then divide by it
        inMat = (inMat - inMeans)/inVar
        return inMat
    
    def stageWise(xArr,yArr,eps=0.01,numIt=100):
        xMat = mat(xArr); yMat=mat(yArr).T
        yMean = mean(yMat,0)
        yMat = yMat - yMean     #can also regularize ys but will get smaller coef
        xMat = regularize(xMat)
        m,n=shape(xMat)
        returnMat = zeros((numIt,n)) #testing code remove
        ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
        for i in range(numIt):
            print(ws.T)
            lowestError = inf; 
            for j in range(n):
                for sign in [-1,1]:
                    wsTest = ws.copy()
                    wsTest[j] += eps*sign
                    yTest = xMat*wsTest
                    rssE = rssError(yMat.A,yTest.A)
                    if rssE < lowestError:
                        lowestError = rssE
                        wsMax = wsTest
            ws = wsMax.copy()
            returnMat[i,:]=ws.T
        return returnMat
    
    #def scrapePage(inFile,outFile,yr,numPce,origPrc):
    #    from BeautifulSoup import BeautifulSoup
    #    fr = open(inFile); fw=open(outFile,'a') #a is append mode writing
    #    soup = BeautifulSoup(fr.read())
    #    i=1
    #    currentRow = soup.findAll('table', r="%d" % i)
    #    while(len(currentRow)!=0):
    #        title = currentRow[0].findAll('a')[1].text
    #        lwrTitle = title.lower()
    #        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
    #            newFlag = 1.0
    #        else:
    #            newFlag = 0.0
    #        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
    #        if len(soldUnicde)==0:
    #            print("item #%d did not sell" % i)
    #        else:
    #            soldPrice = currentRow[0].findAll('td')[4]
    #            priceStr = soldPrice.text
    #            priceStr = priceStr.replace('$','') #strips out $
    #            priceStr = priceStr.replace(',','') #strips out ,
    #            if len(soldPrice)>1:
    #                priceStr = priceStr.replace('Free shipping', '') #strips out Free Shipping
    #            print("%s\t%d\t%s" % (priceStr,newFlag,title))
    #            fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr,numPce,newFlag,origPrc,priceStr))
    #        i += 1
    #        currentRow = soup.findAll('table', r="%d" % i)
    #    fw.close()
        
    from time import sleep
    import json
    import urllib.request
    def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
        sleep(10)
        myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
        searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
        pg = urllib.request.urlopen(searchURL)
        retDict = json.loads(pg.read())
        for i in range(len(retDict['items'])):
            try:
                currItem = retDict['items'][i]
                if currItem['product']['condition'] == 'new':
                    newFlag = 1
                else: newFlag = 0
                listOfInv = currItem['product']['inventories']
                for item in listOfInv:
                    sellingPrice = item['price']
                    if  sellingPrice > origPrc * 0.5:
                        print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                        retX.append([yr, numPce, newFlag, origPrc])
                        retY.append(sellingPrice)
            except: print('problem with item %d' % i)
        
    def setDataCollect(retX, retY):
        searchForSet(retX, retY, 8288, 2006, 800, 49.99)
        searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
        searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
        searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
        searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
        searchForSet(retX, retY, 10196, 2009, 3263, 249.99)
        
    def crossValidation(xArr,yArr,numVal=10):
        m = len(yArr)                           
        indexList = list(range(m))   # must be a list so random.shuffle can reorder it in place (Python 3)
        errorMat = zeros((numVal,30))#create error mat 30columns numVal rows
        for i in range(numVal):
            trainX=[]; trainY=[]
            testX = []; testY = []
            random.shuffle(indexList)
            for j in range(m):#create training set based on first 90% of values in indexList
                if j < m*0.9: 
                    trainX.append(xArr[indexList[j]])
                    trainY.append(yArr[indexList[j]])
                else:
                    testX.append(xArr[indexList[j]])
                    testY.append(yArr[indexList[j]])
            wMat = ridgeTest(trainX,trainY)    #get 30 weight vectors from ridge
            for k in range(30):#loop over all of the ridge estimates
                matTestX = mat(testX); matTrainX=mat(trainX)
                meanTrain = mean(matTrainX,0)
                varTrain = var(matTrainX,0)
                matTestX = (matTestX-meanTrain)/varTrain #regularize test with training params
                yEst = matTestX * mat(wMat[k,:]).T + mean(trainY)#test ridge results and store
                errorMat[i,k]=rssError(yEst.T.A,array(testY))
                #print errorMat[i,k]
        meanErrors = mean(errorMat,0)#calc avg performance of the different ridge weight vectors
        minMean = float(min(meanErrors))
        bestWeights = wMat[nonzero(meanErrors==minMean)]
        #can unregularize to get model
        #when we regularized we wrote Xreg = (x-meanX)/var(x)
        #we can now write in terms of x not Xreg:  x*w/var(x) - meanX/var(x) +meanY
        xMat = mat(xArr); yMat=mat(yArr).T
        meanX = mean(xMat,0); varX = var(xMat,0)
        unReg = bestWeights/varX
        print("the best model from Ridge Regression is:\n", unReg)
        print("with constant term: ",-1*sum(multiply(meanX,unReg)) + mean(yMat))

      

     
