package Spark_MLlib

import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql.SparkSession

/**
 * Small demo of `StringIndexer`: maps a string label column to a numeric
 * index column and prints both the learned labels and the transformed rows.
 */
object 特征变换_StringIndexer {

  /** Local two-thread Spark session used by the demo. */
  val spark: SparkSession = SparkSession.builder()
    .master("local[2]")
    .appName("标签和索引的转换")
    .getOrCreate()

  import spark.implicits._

  /**
   * Builds an eight-row (id, type) DataFrame, fits a `StringIndexer` on the
   * `type` column, prints the learned labels, and shows the indexed frame.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      val df = spark.createDataFrame(Seq(
        (0, "log"),
        (1, "text"),
        (2, "text"),
        (3, "soyo"),
        (4, "text"),
        (5, "log"),
        (6, "log"),
        (7, "log")
      )).toDF("id", "type")

      val indexer = new StringIndexer()
        .setInputCol("type")
        .setOutputCol("type_index")
      val model = indexer.fit(df)

      // Labels are listed in descending frequency order (most frequent first).
      model.labels.foreach(println)

      // The most frequent label receives the lowest index, so "log" maps to 0.
      val indexed = model.transform(df)
      indexed.show(false)
    } finally {
      // Release the local Spark context even if the demo fails part-way.
      spark.stop()
    }
  }
}
结果:
log
text
soyo
+---+----+----------+
|id |type|type_index|
+---+----+----------+
|0  |log |0.0       |
|1  |text|1.0       |
|2  |text|1.0       |
|3  |soyo|2.0       |
|4  |text|1.0       |
|5  |log |0.0       |
|6  |log |0.0       |
|7  |log |0.0       |
+---+----+----------+