Machine Learning Notes


    sklearn API: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection

    GaussianNB

    Deploying GaussianNB on the terrain data:

    #!/usr/bin/python
    
    """ Complete the code in ClassifyNB.py with the sklearn
        Naive Bayes classifier to classify the terrain data.
        
        The objective of this exercise is to recreate the decision 
        boundary found in the lesson video, and make a plot that
        visually shows the decision boundary """
    
    
    from prep_terrain_data import makeTerrainData
    from class_vis import prettyPicture, output_image
    from ClassifyNB import classify
    from sklearn.naive_bayes import GaussianNB
    import numpy as np
    import pylab as pl
    
    
    features_train, labels_train, features_test, labels_test = makeTerrainData()
    
    ### the training data (features_train, labels_train) have both "fast" and "slow" points mixed
    ### in together--separate them so we can give them different colors in the scatterplot,
    ### and visually identify them
    grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
    bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
    grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
    bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]
    
    # You will need to complete this function imported from the ClassifyNB script.
    # Be sure to change to that code tab to complete this quiz.
    clf = classify(features_train, labels_train)
    
    ### draw the decision boundary with the test points overlaid
    prettyPicture(clf, features_test, labels_test)
    output_image("test.png", "png", open("test.png", "rb").read())
    studentMain.py
    #!/usr/bin/python
    
    #from udacityplots import *
    import warnings
    warnings.filterwarnings("ignore")
    
    import matplotlib 
    matplotlib.use('agg')
    
    import matplotlib.pyplot as plt
    import pylab as pl
    import numpy as np
    
    #import numpy as np
    #import matplotlib.pyplot as plt
    #plt.ioff()
    
    def prettyPicture(clf, X_test, y_test):
        x_min = 0.0; x_max = 1.0
        y_min = 0.0; y_max = 1.0
    
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max] x [y_min, y_max].
        h = .01  # step size in the mesh
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
    
        plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)
    
        # Plot also the test points
        grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
        bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
        grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
        bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]
    
        plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast")
        plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow")
        plt.legend()
        plt.xlabel("bumpiness")
        plt.ylabel("grade")
    
        plt.savefig("test.png")
        
    import base64
    import json
    import subprocess
    
    def output_image(name, format, bytes):
        image_start = "BEGIN_IMAGE_f9825uweof8jw9fj4r8"
        image_end = "END_IMAGE_0238jfw08fjsiufhw8frs"
        data = {}
        data['name'] = name
        data['format'] = format
        data['bytes'] = base64.encodestring(bytes)  # Python 3: use base64.encodebytes
        print(image_start + json.dumps(data) + image_end)
    class_vis.py
    #!/usr/bin/python
    import random
    
    
    def makeTerrainData(n_points=1000):
    ###############################################################################
    ### make the toy dataset
        random.seed(42)
        grade = [random.random() for ii in range(0,n_points)]
        bumpy = [random.random() for ii in range(0,n_points)]
        error = [random.random() for ii in range(0,n_points)]
        y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)]
        for ii in range(0, len(y)):
            if grade[ii]>0.8 or bumpy[ii]>0.8:
                y[ii] = 1.0
    
    ### split into train/test sets
        X = [[gg, ss] for gg, ss in zip(grade, bumpy)]
        split = int(0.75*n_points)
        X_train = X[0:split]
        X_test  = X[split:]
        y_train = y[0:split]
        y_test  = y[split:]
    
        grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0]
        bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0]
        grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1]
        bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1]
    
    #    training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
    #            , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
    
    
        grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
        bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
        grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
        bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]
    
        test_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
                , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
    
        return X_train, y_train, X_test, y_test
    #    return training_data, test_data
    prep_terrain_data.py
    from sklearn.naive_bayes import GaussianNB
    def classify(features_train, labels_train):   
        ### import the sklearn module for GaussianNB
        ### create classifier
        ### fit the classifier on the training features and labels
        ### return the fit classifier
        
        
        ### your code goes here!
        
        clf = GaussianNB()
        clf.fit(features_train, labels_train)

        return clf
    ClassifyNB.py

    Computing GaussianNB accuracy:

    def NBAccuracy(features_train, labels_train, features_test, labels_test):
        """ compute the accuracy of your Naive Bayes classifier """
        ### import the sklearn module for GaussianNB
        from sklearn.naive_bayes import GaussianNB
    
        ### create classifier
        clf = GaussianNB()
    
        ### fit the classifier on the training features and labels
        clf.fit(features_train, labels_train)
        ### use the trained classifier to predict labels for the test features
        pred = clf.predict(features_test)
    
    
        ### calculate and return the accuracy on the test data
        ### this is slightly different than the example, 
        ### where we just print the accuracy
        ### you might need to import an sklearn module
        from sklearn.metrics import accuracy_score
        # print(accuracy_score(pred, labels_test))
        accuracy = accuracy_score(pred, labels_test)
        return accuracy
    classify.py
    from class_vis import prettyPicture
    from prep_terrain_data import makeTerrainData
    from classify import NBAccuracy
    
    import matplotlib.pyplot as plt
    import numpy as np
    import pylab as pl
    
    
    features_train, labels_train, features_test, labels_test = makeTerrainData()
    
    def submitAccuracy():
        accuracy = NBAccuracy(features_train, labels_train, features_test, labels_test)
        return accuracy
    studentCode.py

    Support Vector Machines (SVM)

    1. Reference: http://scikit-learn.org/stable/modules/svm.html

    2. SVM code:

    import sys
    from class_vis import prettyPicture
    from prep_terrain_data import makeTerrainData
    
    import matplotlib.pyplot as plt
    import copy
    import numpy as np
    import pylab as pl
    
    
    features_train, labels_train, features_test, labels_test = makeTerrainData()
    
    
    ########################## SVM #################################
    ### we handle the import statement and SVC creation for you here
    from sklearn.svm import SVC
    clf = SVC(kernel="linear")
    
    
    #### now your job is to fit the classifier
    #### using the training features/labels, and to
    #### make a set of predictions on the test data
    clf.fit(features_train,labels_train)
    
    
    #### store your predictions in a list named pred
    
    pred=clf.predict(features_test)
    
    
    
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(pred, labels_test)
    
    def submitAccuracy():
        return acc
    SVM

    Decision Trees (decision tree)

    1. References: http://scikit-learn.org/stable/modules/tree.html
       http://www.jianshu.com/p/c2916d616acc

    2. Code:

    #!/usr/bin/python
    
    """ lecture and example code for decision tree unit """
    
    import sys
    from class_vis import prettyPicture, output_image
    from prep_terrain_data import makeTerrainData
    
    import matplotlib.pyplot as plt
    import numpy as np
    import pylab as pl
    from classifyDT import classify
    
    features_train, labels_train, features_test, labels_test = makeTerrainData()
    
    ### the classify() function in classifyDT is where the magic
    ### happens--fill in this function in the file 'classifyDT.py'!
    clf = classify(features_train, labels_train)
    
    
    #### grader code, do not modify below this line
    
    prettyPicture(clf, features_test, labels_test)
    output_image("test.png", "png", open("test.png", "rb").read())
    
    studentMain.py
    def classify(features_train, labels_train):
        
        ### your code goes here--should return a trained decision tree classifier
        from sklearn import tree
        clf = tree.DecisionTreeClassifier()
        clf.fit(features_train, labels_train)
        return clf
    classifyDT.py

    Decision tree accuracy:

    import sys
    from class_vis import prettyPicture
    from prep_terrain_data import makeTerrainData
    
    import numpy as np
    import pylab as pl
    
    features_train, labels_train, features_test, labels_test = makeTerrainData()
    
    
    
    #################################################################################
    
    
    ########################## DECISION TREE #################################
    
    
    
    #### your code goes here
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(pred, labels_test)
    
    #acc = ### you fill this in!
    ### be sure to compute the accuracy on the test set
    
    
        
    def submitAccuracies():
      return {"acc":round(acc,3)}
    DT

    Decision tree parameters:

    Compare the decision tree's accuracy using min_samples_split=2 versus min_samples_split=50:

    import sys
    from class_vis import prettyPicture
    from prep_terrain_data import makeTerrainData
    
    import matplotlib.pyplot as plt
    import numpy as np
    import pylab as pl
    
    features_train, labels_train, features_test, labels_test = makeTerrainData()
    
    ########################## DECISION TREE #################################
    ### your code goes here--now create 2 decision tree classifiers,
    ### one with min_samples_split=2 and one with min_samples_split=50
    ### compute the accuracies on the testing data and store
    ### the accuracy numbers to acc_min_samples_split_2 and
    ### acc_min_samples_split_50, respectively
    
    from sklearn import tree
    clf_2=tree.DecisionTreeClassifier(min_samples_split=2)
    clf_2.fit(features_train,labels_train)
    pred_2=clf_2.predict(features_test)
    
    clf_50=tree.DecisionTreeClassifier(min_samples_split=50)
    clf_50.fit(features_train,labels_train)
    pred_50=clf_50.predict(features_test)
    
    from sklearn.metrics import accuracy_score
    acc_min_samples_split_2=accuracy_score(pred_2,labels_test)
    acc_min_samples_split_50=accuracy_score(pred_50,labels_test)
    
    def submitAccuracies():
      return {"acc_min_samples_split_2":round(acc_min_samples_split_2,3),
              "acc_min_samples_split_50":round(acc_min_samples_split_50,3)}
    min_samples_split

    Output:

    {"message": "{'acc_min_samples_split_50': 0.912, 'acc_min_samples_split_2': 0.908}"}

    Entropy, conditional entropy, and information gain:

    Reference: https://www.zhihu.com/question/22104055

    Entropy: measures the uncertainty of a random variable.

    Conditional entropy: the uncertainty that remains in a random variable once a condition (another variable) is known.

    Information gain = entropy - conditional entropy
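
    A minimal sketch of these quantities in Python, using a toy slow/slow/fast/fast split in the spirit of the lesson (the data here is made up for illustration):

    import math

    def entropy(labels):
        """H(Y) = -sum(p * log2(p)) over the class frequencies in labels."""
        n = float(len(labels))
        return -sum(labels.count(c) / n * math.log(labels.count(c) / n, 2)
                    for c in set(labels))

    parent = ["slow", "slow", "fast", "fast"]            # H(parent) = 1.0

    # Splitting on bumpiness mixes the classes: each branch has one of each.
    bumpy, smooth = ["slow", "fast"], ["slow", "fast"]
    h_cond = (len(bumpy)*entropy(bumpy) + len(smooth)*entropy(smooth)) / len(parent)
    print(entropy(parent) - h_cond)   # information gain = 0.0, a useless split

    # Splitting on grade separates the classes perfectly.
    steep, flat = ["slow", "slow"], ["fast", "fast"]
    h_cond = (len(steep)*entropy(steep) + len(flat)*entropy(flat)) / len(parent)
    print(entropy(parent) - h_cond)   # information gain = 1.0, an ideal split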

    Linear Regression (LinearRegression)

    1. Searching for "sklearn regression" turns up the relevant material.

    2. Reference: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

    3. Code:

    Age / net worth regression code

    #!/usr/bin/python
    
    import numpy
    import matplotlib
    matplotlib.use('agg')
    
    import matplotlib.pyplot as plt
    from studentRegression import studentReg
    from class_vis import prettyPicture, output_image
    
    from ages_net_worths import ageNetWorthData
    
    ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()
    
    reg = studentReg(ages_train, net_worths_train)
    
    plt.clf()
    plt.scatter(ages_train, net_worths_train, color="b", label="train data")
    plt.scatter(ages_test, net_worths_test, color="r", label="test data")
    plt.plot(ages_test, reg.predict(ages_test), color="black")
    plt.legend(loc=2)
    plt.xlabel("ages")
    plt.ylabel("net worths")
    
    plt.savefig("test.png")
    output_image("test.png", "png", open("test.png", "rb").read())
    studentMain.py
    def studentReg(ages_train, net_worths_train):
        ### import the sklearn regression module, create, and train your regression
        ### name your regression reg
        
        ### your code goes here!
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()
        reg.fit(ages_train, net_worths_train)

        return reg
    studentRegression.py

    reg.coef_: the slope (the fitted regression coefficients)

    reg.intercept_: the intercept

    reg.score(X, y): the R^2 score of the fit (1.0 is a perfect fit; it can be negative for a very poor one)
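
    A quick sketch of reading these attributes off a fitted model (the numbers below are made up, not the course's age/net-worth data):

    import numpy as np
    from sklearn.linear_model import LinearRegression

    # Points that lie exactly on y = 6.25 * x + 100.
    X = np.array([[20], [30], [40], [50]])    # ages, one feature per row
    y = np.array([225.0, 287.5, 350.0, 412.5])

    reg = LinearRegression()
    reg.fit(X, y)

    print(reg.coef_)        # slope: [ 6.25]
    print(reg.intercept_)   # intercept: 100.0
    print(reg.score(X, y))  # R^2 on this data: 1.0, a perfect fit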

    Train/test split in sklearn

    Reference: http://scikit-learn.org/stable/modules/cross_validation.html

    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=0)

    Code:

    #!/usr/bin/python
    
    """ 
    PLEASE NOTE:
    The api of train_test_split changed and moved from sklearn.cross_validation to
    sklearn.model_selection (version update from 0.17 to 0.18)
    
    The correct documentation for this quiz is here: 
    http://scikit-learn.org/0.17/modules/cross_validation.html
    """
    
    from sklearn import datasets
    from sklearn.svm import SVC
    
    iris = datasets.load_iris()
    features = iris.data
    labels = iris.target
    
    ###############################################################
    ### YOUR CODE HERE
    ###############################################################
    
    ### import the relevant code and make your train/test split
    ### name the output datasets features_train, features_test,
    ### labels_train, and labels_test
    # PLEASE NOTE: The import here changes depending on your version of sklearn
    from sklearn import cross_validation # for version 0.17
    # For version 0.18
    # from sklearn.model_selection import train_test_split
    
    ### set the random_state to 0 and the test_size to 0.4 so
    ### we can exactly check your result
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
    
    ###############################################################
    # DONT CHANGE ANYTHING HERE
    clf = SVC(kernel="linear", C=1.)
    clf.fit(features_train, labels_train)
    
    print(clf.score(features_test, labels_test))
    ##############################################################
    def submitAcc():
        return clf.score(features_test, labels_test)
    studentCode.py
    def submitAcc():
        return clf.score(features_test, labels_test)
    subFunction.py

    Result: 0.966666666667

    k-fold cross validation
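
    A minimal sketch of k-fold cross validation itself, assuming the sklearn 0.18+ model_selection API (the course material used sklearn.cross_validation): train k times, each time holding out a different fold for testing, then average the scores.

    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold   # sklearn >= 0.18
    from sklearn.svm import SVC

    iris = load_iris()
    kf = KFold(n_splits=5, shuffle=True, random_state=0)

    scores = []
    for train_idx, test_idx in kf.split(iris.data):
        clf = SVC(kernel="linear", C=1.)
        clf.fit(iris.data[train_idx], iris.target[train_idx])
        scores.append(clf.score(iris.data[test_idx], iris.target[test_idx]))

    print(sum(scores) / len(scores))   # mean accuracy over the 5 folds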

    GridSearchCV in sklearn

    GridSearchCV systematically works through multiple combinations of parameter values, cross-validating as it goes to determine which combination gives the best performance. Its benefit is that trying many combinations takes only a few extra lines of code.

    Here is an example from the sklearn documentation:

    parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    svr = svm.SVC()
    clf = grid_search.GridSearchCV(svr, parameters)
    clf.fit(iris.data, iris.target)
    

    Let's go through it line by line.

    parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

    A dictionary of the parameters and the values they can take. In this case, we are looking for the best combination of kernel (possible choices: 'linear' and 'rbf') and C (possible choices: 1 and 10).

    A "grid" of all the (kernel, C) combinations is then generated automatically:

    ('rbf', 1)      ('rbf', 10)
    ('linear', 1)   ('linear', 10)

    Each combination is used to train an SVM, and its performance is evaluated with cross-validation.

    svr = svm.SVC()
    This looks a bit like creating a classifier, just as we have been doing since the first lesson. But note that "clf" is not created until the next line; this line only says which algorithm to use. Another way to think about it: the "classifier" here is not just an algorithm but an algorithm plus its parameter values. Note that there is no need to try different kernel or C values by hand; the next line takes care of that.

    clf = grid_search.GridSearchCV(svr, parameters)
    Here is the first piece of magic: the classifier is created. We pass in the algorithm (svr) and the dictionary of parameters to try (parameters), and it generates the grid of parameter combinations to attempt.

    clf.fit(iris.data, iris.target)
    And the second piece of magic. The fit function now tries all the parameter combinations and returns a fitted classifier, automatically tuned to the optimal combination. You can then read the winning parameter values off clf.best_params_.
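
    A runnable version of the same example, sketched against sklearn 0.18+, where GridSearchCV moved from sklearn.grid_search to sklearn.model_selection:

    from sklearn import datasets, svm
    from sklearn.model_selection import GridSearchCV   # sklearn >= 0.18

    iris = datasets.load_iris()

    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = svm.SVC()
    clf = GridSearchCV(svr, parameters)
    clf.fit(iris.data, iris.target)

    print(clf.best_params_)   # e.g. {'C': 1, 'kernel': 'linear'}
    print(clf.best_score_)    # mean cross-validated score of the best combination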

    Precision and Recall

    Taking Colin Powell as the example:

    true positive: Colin Powell predicted as Colin Powell (55)

    false positive: someone else predicted as Colin Powell (4+1+3+1+3 = 12)

    false negative: Colin Powell predicted as someone else (8)
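
    From these counts, precision and recall follow directly; a quick check of the arithmetic:

    tp = 55                   # Colin Powell predicted as Colin Powell
    fp = 4 + 1 + 3 + 1 + 3    # others predicted as Colin Powell: 12
    fn = 8                    # Colin Powell predicted as others

    precision = tp / float(tp + fp)   # 55/67 ≈ 0.821: how many "Powell" predictions were right
    recall    = tp / float(tp + fn)   # 55/63 ≈ 0.873: how many true Powell images we found
    print(precision, recall)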

    F1 Score

    Now that we have discussed precision and recall, the next metric to consider is the F1 score. The F1 score takes both precision and recall into account to compute a single score.

    You can think of the F1 score as a weighted average of precision and recall, where the best value is 1 and the worst is 0:

    F1 = 2 * (precision * recall) / (precision + recall)

    For more information on the F1 score and how to use it in sklearn, see http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
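
    Continuing with the Colin Powell counts above, a quick check of the formula, plus sklearn's f1_score on a tiny made-up label vector:

    precision, recall = 55 / 67.0, 55 / 63.0
    f1 = 2 * (precision * recall) / (precision + recall)
    print(round(f1, 3))   # ≈ 0.846

    # With label vectors, sklearn computes the same quantity:
    from sklearn.metrics import f1_score
    y_true = [1, 1, 0, 0]
    y_pred = [1, 0, 1, 0]
    print(f1_score(y_true, y_pred))   # 0.5 (precision = recall = 0.5 here)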

    Feature Scaling

    http://scikit-learn.org/stable/modules/preprocessing.html
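
    The link above covers sklearn's preprocessing module. A minimal sketch of one common technique, min-max scaling, which maps each feature to [0, 1] via x' = (x - x_min) / (x_max - x_min); the weight/height numbers are made up:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    # Weights in one column, heights in the other: very different ranges.
    data = np.array([[115.0, 1.5],
                     [140.0, 1.8],
                     [175.0, 2.1]])

    scaler = MinMaxScaler()
    rescaled = scaler.fit_transform(data)   # each column mapped to [0, 1]
    print(rescaled)
    # approximately:
    # [[ 0.     0.  ]
    #  [ 0.417  0.5 ]
    #  [ 1.     1.  ]]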
