• 聚类-----KMeans


    package Spark_MLlib
    
    import org.apache.spark.ml.clustering.KMeans
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.ml.linalg.{Vector, Vectors}
    
    /**
      * K-means (K均值) example: row schema for the input DataFrame.
      *
      * Wraps a single ML vector in a column named `features`, which is the
      * column name the KMeans estimator in this file is configured to read.
      *
      * @param features the feature vector of one sample
      */
    case class features_schema(
      features: Vector
    )
    /**
      * K-means clustering demo on Spark ML.
      *
      * Reads a comma-separated text file whose first four fields are numeric,
      * turns every line into a 4-dimensional dense feature vector, fits a
      * KMeans model with K = 7, then prints the input data, the per-row cluster
      * assignments, the cluster centers and the clustering cost.
      */
    object 聚类__KMeans {
      // Local-mode session with two worker threads. Kept as an object member
      // so `spark.implicits._` (needed for `.toDF()`) can be imported below.
      val spark = SparkSession.builder().master("local[2]").getOrCreate()
      import spark.implicits._

      /**
        * Entry point: runs the whole load -> fit -> transform -> report pipeline.
        *
        * The session is stopped when the pipeline finishes or fails, so `main`
        * is intended to be invoked once per JVM.
        *
        * @param args command-line arguments (unused)
        */
      def main(args: Array[String]): Unit = {
        try {
          val data = spark.sparkContext
            .textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt")
            // Skip blank lines (e.g. a trailing newline at the end of the file);
            // otherwise "".toDouble would abort the job with NumberFormatException.
            .map(_.trim)
            .filter(_.nonEmpty)
            .map(_.split(","))
            // Only the first four fields are used; a line with fewer fields
            // still fails fast with an ArrayIndexOutOfBoundsException.
            .map { fields =>
              features_schema(
                Vectors.dense(
                  fields(0).toDouble,
                  fields(1).toDouble,
                  fields(2).toDouble,
                  fields(3).toDouble))
            }
            .toDF()
          data.show()

          val model = new KMeans()
            .setK(7)
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .fit(data)

          // Adds a "prediction" column holding the cluster index of every row.
          val results = model.transform(data)
          results.show(150)

          // All final cluster centers of the model (one vector per cluster).
          model.clusterCenters.foreach(println)

          // Within-set sum of squared errors (WSSSE): a reference value when
          // choosing K -- compare it across several K values together with the
          // needs of the use case.
          // NOTE: computeCost is deprecated since Spark 2.4 and removed in 3.0;
          // on newer versions use ClusteringEvaluator or model.summary.trainingCost.
          val cost = model.computeCost(data)
          println(cost)
        } finally {
          // Release the local Spark context even when the pipeline throws.
          spark.stop()
        }
      }
    }

    结果(注意:下面的输出只有 3 个聚类中心,且各表只显示前 20 行,应是 K=3 并使用默认 show() 时的运行结果,与上面代码中的 setK(7)、show(150) 不一致):

    +-----------------+
    |         features|
    +-----------------+
    |[5.1,3.5,1.4,0.2]|
    |[4.9,3.0,1.4,0.2]|
    |[4.7,3.2,1.3,0.2]|
    |[4.6,3.1,1.5,0.2]|
    |[5.0,3.6,1.4,0.2]|
    |[5.4,3.9,1.7,0.4]|
    |[4.6,3.4,1.4,0.3]|
    |[5.0,3.4,1.5,0.2]|
    |[4.4,2.9,1.4,0.2]|
    |[4.9,3.1,1.5,0.1]|
    |[5.4,3.7,1.5,0.2]|
    |[4.8,3.4,1.6,0.2]|
    |[4.8,3.0,1.4,0.1]|
    |[4.3,3.0,1.1,0.1]|
    |[5.8,4.0,1.2,0.2]|
    |[5.7,4.4,1.5,0.4]|
    |[5.4,3.9,1.3,0.4]|
    |[5.1,3.5,1.4,0.3]|
    |[5.7,3.8,1.7,0.3]|
    |[5.1,3.8,1.5,0.3]|
    +-----------------+
    only showing top 20 rows

    +-----------------+----------+
    |         features|prediction|
    +-----------------+----------+
    |[5.1,3.5,1.4,0.2]|         0|
    |[4.9,3.0,1.4,0.2]|         0|
    |[4.7,3.2,1.3,0.2]|         0|
    |[4.6,3.1,1.5,0.2]|         0|
    |[5.0,3.6,1.4,0.2]|         0|
    |[5.4,3.9,1.7,0.4]|         0|
    |[4.6,3.4,1.4,0.3]|         0|
    |[5.0,3.4,1.5,0.2]|         0|
    |[4.4,2.9,1.4,0.2]|         0|
    |[4.9,3.1,1.5,0.1]|         0|
    |[5.4,3.7,1.5,0.2]|         0|
    |[4.8,3.4,1.6,0.2]|         0|
    |[4.8,3.0,1.4,0.1]|         0|
    |[4.3,3.0,1.1,0.1]|         0|
    |[5.8,4.0,1.2,0.2]|         0|
    |[5.7,4.4,1.5,0.4]|         0|
    |[5.4,3.9,1.3,0.4]|         0|
    |[5.1,3.5,1.4,0.3]|         0|
    |[5.7,3.8,1.7,0.3]|         0|
    |[5.1,3.8,1.5,0.3]|         0|
    +-----------------+----------+
    only showing top 20 rows

    [5.005999999999999,3.4180000000000006,1.4640000000000002,0.2439999999999999]
    [6.8538461538461535,3.076923076923076,5.715384615384614,2.0538461538461537]
    [5.883606557377049,2.740983606557377,4.388524590163936,1.4344262295081966]
    78.94506582597859

  • 相关阅读:
    SpringBoot入门1
    git
    Linux 常用命令
    Linux虚拟机上安装redis
    用户登录(Material Design + Data-Binding + MVP架构模式)实现
    【方法总结】创建、读取、删除文件相关操作
    从源码角度入手实现RecyclerView的Item点击事件
    App启动页倒计时功能
    App内切换语言
    GreenDao与ReactiveX的完美搭配
  • 原文地址:https://www.cnblogs.com/soyo/p/7799422.html
Copyright © 2020-2023  润新知