• Spark decision tree classification demo


    Classification

    The following example shows how to load a LIBSVM data file, parse it into an RDD[LabeledPoint], and classify with a decision tree, using Gini impurity as the impurity measure and a maximum tree depth of 5. The test error is computed at the end to evaluate the model's accuracy.

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
    from pyspark.mllib.util import MLUtils
    
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    
    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Tuple-parameter lambdas are a syntax error in Python 3, so index the pair instead
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())
    
    # Save and load model
    model.save(sc, "myModelPath")
    sameModel = DecisionTreeModel.load(sc, "myModelPath")
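
    For context, each line of a LIBSVM file has the form "label index1:value1 index2:value2 ...", with 1-based feature indices. Below is a minimal sketch of what loadLibSVMFile does for each line; the helper name and the feature count of 10 are illustrative assumptions, not part of the MLlib API.

    from pyspark.mllib.linalg import SparseVector
    from pyspark.mllib.regression import LabeledPoint
    
    def parse_libsvm_line(line, num_features):
        # e.g. "1 3:0.5 7:1.2" -> label 1.0, sparse features {2: 0.5, 6: 1.2}
        parts = line.strip().split()
        label = float(parts[0])
        indices = []
        values = []
        for item in parts[1:]:
            idx, val = item.split(':')
            indices.append(int(idx) - 1)  # LIBSVM indices are 1-based
            values.append(float(val))
        return LabeledPoint(label, SparseVector(num_features, indices, values))
    
    print(parse_libsvm_line("1 3:0.5 7:1.2", 10))
    # -> LabeledPoint(1.0, (10,[2,6],[0.5,1.2]))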

    The standalone script below walks through the same task end to end: it loads a LIBSVM data file, parses it into an RDD of LabeledPoint, and trains a decision tree with Gini impurity and a maximum tree depth of 5, computing the test error to evaluate the model. This version additionally configures SPARK_HOME on Windows and submits the job in yarn-client mode.

    # -*- coding:utf-8 -*-
    """
    Decision tree demo
    """
    import os
    import sys
    import logging
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
    from pyspark.mllib.util import MLUtils
    
    # Path for Spark source folder
    os.environ['SPARK_HOME'] = "D:\\javaPackages\\spark-1.6.0-bin-hadoop2.6"
    # Append pyspark to the Python path
    sys.path.append("D:\\javaPackages\\spark-1.6.0-bin-hadoop2.6\\python")
    sys.path.append("D:\\javaPackages\\spark-1.6.0-bin-hadoop2.6\\python\\lib\\py4j-0.9-src.zip")
    from pyspark import SparkContext
    from pyspark import SparkConf
    
    conf = SparkConf()
    conf.set("YARN_CONF_DIR", "D:\\javaPackages\\hadoop_conf_dir\\yarn-conf")
    conf.set("spark.driver.memory", "2g")
    #conf.set("spark.executor.memory", "1g")
    #conf.set("spark.python.worker.memory", "1g")
    conf.setMaster("yarn-client")
    conf.setAppName("TestDecisionTree")
    logger = logging.getLogger('pyspark')
    sc = SparkContext(conf=conf)
    mylog = []
    
    # Load and parse the data file into a LabeledPoint RDD
    data = MLUtils.loadLibSVMFile(sc, "/home/xiatao/machine_learing/")
    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    
    # Train a decision tree model.
    # An empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    
    # Evaluate the model on test instances and compute the test error
    # (for 0/1 labels, the mean squared error equals the misclassification rate)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
    mylog.append("Test error:")
    mylog.append(testMSE)
    
    # Save the model and the log, then reload the model
    model.save(sc, "/home/xiatao/machine_learing/")
    sc.parallelize(mylog).saveAsTextFile("/home/xiatao/machine_learing/log")
    sameModel = DecisionTreeModel.load(sc, "/home/xiatao/machine_learing/")
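
    Beyond the raw test error, MLlib's evaluation utilities can report a confusion matrix and per-class metrics. A minimal sketch, assuming the predictions and testData RDDs from the script above:

    from pyspark.mllib.evaluation import MulticlassMetrics
    
    # MulticlassMetrics expects an RDD of (prediction, label) pairs
    predictionsAndLabels = predictions.zip(testData.map(lambda lp: lp.label))
    metrics = MulticlassMetrics(predictionsAndLabels)
    print(metrics.confusionMatrix())
    # Overall accuracy: metrics.precision() on Spark 1.x, metrics.accuracy on 2.x+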
     