• 聚类-----高斯混合模型


    package Spark_MLlib
    
    import org.apache.spark.ml.clustering.GaussianMixture
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.ml.linalg.{Vector, Vectors}
    
    /** Row schema for the clustering input: one `features` column holding a Spark ML vector. */
    case class GMM_Schema(features: Vector)
    /**
     * Gaussian mixture model (GMM) clustering example.
     *
     * Reads comma-separated rows with four numeric columns, fits a three-component
     * Gaussian mixture on a random 70% split, applies the model to the remaining
     * 30%, saves the transformed rows as text, and prints each component's parameters.
     */
    object 聚类__高斯混合 {
      val spark = SparkSession.builder().master("local[2]").getOrCreate()
      import spark.implicits._

      /**
       * Runs the whole pipeline: load, split, fit, transform, save, report.
       *
       * @param args command-line arguments (unused; input and output paths are hard-coded)
       */
      def main(args: Array[String]): Unit = {
        // Parse every line into a four-dimensional dense feature vector.
        val data = spark.sparkContext
          .textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt")
          .map(_.split(","))
          .map(x => GMM_Schema(Vectors.dense(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble)))
          .toDF()
        data.show()

        // No seed is passed, so the 70/30 split differs from run to run.
        val Array(trainData, testData) = data.randomSplit(Array(0.7, 0.3))

        val gmm = new GaussianMixture()
          .setK(3)
          .setProbabilityCol("Probability")
          .setPredictionCol("Prediction")
        val model = gmm.fit(trainData)

        // "Probability" holds the probability of the sample belonging to each of the
        // clusters 0, 1 and 2; "Prediction" is the cluster the sample is assigned to.
        val result = model.transform(testData)
        result.show(false)

        // Save only the selected column.
        result.select("features").rdd
          .saveAsTextFile("file:///home/soyo/桌面/spark编程测试数据/89987863.txt")
        // Save every column of the DataFrame.
        result.rdd
          .saveAsTextFile("file:///home/soyo/桌面/spark编程测试数据/89987862.txt")

        // A GMM does not expose cluster centres directly; it exposes the parameters of
        // each mixture component (a multivariate Gaussian). `weights` gives the
        // component weights, `gaussians` gives each mean vector and covariance matrix.
        for (i <- 0 until model.getK) {
          println(
            "Component %d : weight is %f\n mu vector is %s\n sigma matrix is %s"
              .format(i, model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov)
          )
        }
      }
    }

    结果:

    +-----------------+
    |         features|
    +-----------------+
    |[5.1,3.5,1.4,0.2]|
    |[4.9,3.0,1.4,0.2]|
    |[4.7,3.2,1.3,0.2]|
    |[4.6,3.1,1.5,0.2]|
    |[5.0,3.6,1.4,0.2]|
    |[5.4,3.9,1.7,0.4]|
    |[4.6,3.4,1.4,0.3]|
    |[5.0,3.4,1.5,0.2]|
    |[4.4,2.9,1.4,0.2]|
    |[4.9,3.1,1.5,0.1]|
    |[5.4,3.7,1.5,0.2]|
    |[4.8,3.4,1.6,0.2]|
    |[4.8,3.0,1.4,0.1]|
    |[4.3,3.0,1.1,0.1]|
    |[5.8,4.0,1.2,0.2]|
    |[5.7,4.4,1.5,0.4]|
    |[5.4,3.9,1.3,0.4]|
    |[5.1,3.5,1.4,0.3]|
    |[5.7,3.8,1.7,0.3]|
    |[5.1,3.8,1.5,0.3]|
    +-----------------+
    only showing top 20 rows

    +-----------------+----------+------------------------------------------------------------------+
    |features         |Prediction|Probability                                                       |
    +-----------------+----------+------------------------------------------------------------------+
    |[4.5,2.3,1.3,0.3]|1         |[1.9460993789131094E-10,0.6613517186358104,0.33864828116957957]   |
    |[4.6,3.2,1.4,0.2]|1         |[2.7145624349052503E-15,0.9999999999625855,3.74117982257895E-11]  |
    |[4.6,3.6,1.0,0.2]|1         |[8.857071769427218E-14,0.9999999995762952,4.2361636256573393E-10] |
    |[4.7,3.2,1.3,0.2]|1         |[2.8280610349168036E-16,0.9999999997787022,2.2129751536671591E-10]|
    |[4.7,3.2,1.6,0.2]|1         |[2.2246229283736778E-13,0.9999999999924508,7.326722006026576E-12] |
    |[4.8,3.0,1.4,0.3]|1         |[1.5570916910918913E-14,0.999999515511457,4.844885274590749E-7]   |
    |[4.8,3.1,1.6,0.2]|1         |[5.416790617095303E-13,0.9999999998341826,1.652755935543129E-10]  |
    |[4.8,3.4,1.9,0.2]|1         |[1.1345882300938586E-9,0.9999999988651723,2.394366594145129E-13]  |
    |[5.0,3.0,1.6,0.2]|1         |[2.174953081273265E-12,0.9999999896690439,1.0328781109739351E-8]  |
    |[5.0,3.3,1.4,0.2]|1         |[9.157665389080891E-17,0.9999999999852162,1.4783767398737057E-11] |
    |[5.0,3.4,1.6,0.4]|1         |[1.1903839950520247E-15,0.9999999903439921,9.656006771864786E-9]  |
    |[5.1,3.4,1.5,0.2]|1         |[1.0337982164910104E-16,0.9999999999990304,9.69373338566045E-13]  |
    |[5.1,3.7,1.5,0.4]|1         |[8.255687030250876E-17,0.9999999999716316,2.8368326155201214E-11] |
    |[5.1,3.8,1.9,0.4]|1         |[6.664693730316072E-14,0.9999999999870913,1.2842010001494045E-11] |
    |[5.2,3.5,1.5,0.2]|1         |[5.519983601218073E-17,0.9999999999998658,1.3428034253525153E-13] |
    |[5.2,4.1,1.5,0.1]|1         |[9.520667996704964E-15,0.9999999999999809,9.520667236660166E-15]  |
    |[5.4,3.4,1.7,0.2]|1         |[1.5037240722382337E-14,0.9999999999919748,8.01024821156934E-12]  |
    |[5.5,2.3,4.0,1.3]|2         |[0.12790930371263204,9.702982800125614E-16,0.8720906962873669]    |
    |[5.5,3.5,1.3,0.2]|1         |[7.495980013661814E-16,0.9999999999984027,1.5966275150172275E-12] |
    |[5.7,2.8,4.5,1.3]|0         |[0.9627079132449172,6.238048595532193E-16,0.03729208675508215]    |
    +-----------------+----------+------------------------------------------------------------------+
    only showing top 20 rows

    Component 0 : weight is 0.410444
     mu vector is [6.229890449633949,2.9365709066142216,5.119923101567097,1.875036196728866]
     sigma matrix is 0.20106023109900695   0.061614617783844784  0.18615301343118426  0.12419808100465818  
    0.061614617783844784  0.08755324619180453   0.07600720617502173  0.07092234972645906  
    0.18615301343118426   0.07600720617502173   0.3190627635701519   0.2005911061068125   
    0.12419808100465818   0.07092234972645906   0.2005911061068125   0.18570154951117543  
    Component 1 : weight is 0.313131
     mu vector is [5.016129011272877,3.4516129375085347,1.4354837490859222,0.24516112308927235]
     sigma matrix is 0.13554637908317346   0.11368373014150743   0.011685744418836571  0.015723208692113348  
    0.11368373014150743   0.1399168236701324    0.009136331548362716  0.018314273558132532  
    0.011685744418836571  0.009136331548362716  0.018418296421506785  0.006461987075877571  
    0.015723208692113348  0.018314273558132532  0.006461987075877571  0.014089446355933986  
    Component 2 : weight is 0.276425
     mu vector is [6.268895982220705,2.8164655748162817,4.672112358525022,1.4474090607905907]
     sigma matrix is 0.7598437470441146   0.25905873612421504  0.8900973784969908   0.2622593125673333   
    0.25905873612421504  0.18040816341127572  0.2866224809874453   0.10562720921328374  
    0.8900973784969908   0.2866224809874453   1.1853685078670806   0.35992793746391144  
    0.2622593125673333   0.10562720921328374  0.35992793746391144  0.12216925649336183  

  • 相关阅读:
    多线程博文地址 http://www.cnblogs.com/nokiaguy/archive/2008/07/13/1241817.html
    vs2010一运行就报错deven.exe assert failure 解决方法,卸载系统中.netFramework最新版本的(简体中文)
    Lambda语句中创建自定义类型时,也可指定某种特定类型,方法是在new与{}之间写上类型名称
    Win7开始菜单所在目录
    C#中Struct与Class的区别
    Linq语句:三表联查
    用exp、dmp导入导出用户到同一个实例下时,类型type会有问题
    列、约束重命名,原数据不丢失
    CDM中,创建一个或多个组合属性的唯一约束
    EF中新建表和关联表的方法
  • 原文地址:https://www.cnblogs.com/soyo/p/7814749.html
Copyright © 2020-2023  润新知