一、One-Hot Encoding
- 性别:["male","female"]
- 地区:["Europe","US","Asia"]
- 浏览器:["Firefox","Chrome","Safari","Internet Explorer"]
二、One-Hot Encoding的处理方法
One-Hot Encoding 的作用是将类别型特征数字化：把特征的每个取值编码为一个只有一位为 1、其余位为 0 的特征向量。例如性别特征中 "male" 可编码为 [1,0]，"female" 可编码为 [0,1]。
package Spark_MLlib

import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
import org.apache.spark.sql.SparkSession

/**
 * Demonstrates one-hot encoding of a string column with Spark ML.
 *
 * The string `label` column is first mapped to a numeric `label_index`
 * column with a [[StringIndexer]] fitted on a training DataFrame. The
 * resulting index column is then expanded into a sparse one-hot vector
 * with [[OneHotEncoder]]. The same fitted indexer is applied to a second,
 * smaller DataFrame to show that the encoding stays consistent with the
 * labels seen during fitting.
 *
 * NOTE(review): this uses `OneHotEncoder` as a plain transformer
 * (`encoder.transform(...)` without `fit`), which matches the Spark 2.x API.
 * Newer Spark releases appear to require fitting the encoder first —
 * confirm against the Spark version in use before upgrading.
 */
object 特征变换_OneHotEncoder {

  // Local two-thread session shared by the whole demo.
  val spark: SparkSession =
    SparkSession.builder().master("local[2]").appName("IndexToString").getOrCreate()

  import spark.implicits._

  /**
   * Runs the demo and prints three tables: the indexed training data, the
   * one-hot encoded training data, and the one-hot encoded second data set.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    try {
      // Data the StringIndexer is fitted on: four distinct labels.
      val df = spark.createDataFrame(Seq(
        (0, "log"),
        (1, "text"),
        (2, "text"),
        (3, "soyo"),
        (4, "text"),
        (5, "log"),
        (6, "log"),
        (7, "log"),
        (8, "hadoop")
      )).toDF("id", "label")

      // Second data set: contains only labels already present in `df`.
      val df2 = spark.createDataFrame(Seq(
        (0, "log"),
        (1, "soyo"),
        (2, "soyo")
      )).toDF("id", "label")

      val indexer = new StringIndexer().setInputCol("label").setOutputCol("label_index")
      val model = indexer.fit(df)

      // Here the data being transformed is df itself.
      val indexed1 = model.transform(df)
      indexed1.show()

      // Now switch the data being transformed to df2, reusing the model fitted on df.
      val indexed = model.transform(df2)

      // setDropLast(false): the label that would otherwise be encoded as an
      // all-zero vector also gets its own binary feature.
      // The output column name "lable_vector" (sic) is kept unchanged because
      // it is part of the printed output.
      val encoder = new OneHotEncoder()
        .setInputCol("label_index")
        .setOutputCol("lable_vector")
        .setDropLast(false)

      val encodered1 = encoder.transform(indexed1)
      encodered1.show()

      // Example row: (4,[2],[1.0]) — the 4 is the number of distinct labels in
      // the data the indexer was fitted on, not the number present in df2.
      val encodered = encoder.transform(indexed)
      encodered.show()
    } finally {
      // Release the local Spark resources even if a step above fails.
      spark.stop()
    }
  }
}
结果:
+---+------+-----------+
| id| label|label_index|
+---+------+-----------+
| 0| log| 0.0|
| 1| text| 1.0|
| 2| text| 1.0|
| 3| soyo| 2.0|
| 4| text| 1.0|
| 5| log| 0.0|
| 6| log| 0.0|
| 7| log| 0.0|
| 8|hadoop| 3.0|
+---+------+-----------+
+---+------+-----------+-------------+
| id| label|label_index| lable_vector|
+---+------+-----------+-------------+
| 0| log| 0.0|(4,[0],[1.0])|
| 1| text| 1.0|(4,[1],[1.0])|
| 2| text| 1.0|(4,[1],[1.0])|
| 3| soyo| 2.0|(4,[2],[1.0])|
| 4| text| 1.0|(4,[1],[1.0])|
| 5| log| 0.0|(4,[0],[1.0])|
| 6| log| 0.0|(4,[0],[1.0])|
| 7| log| 0.0|(4,[0],[1.0])|
| 8|hadoop| 3.0|(4,[3],[1.0])|
+---+------+-----------+-------------+
+---+-----+-----------+-------------+
| id|label|label_index| lable_vector|
+---+-----+-----------+-------------+
| 0| log| 0.0|(4,[0],[1.0])|
| 1| soyo| 2.0|(4,[2],[1.0])|
| 2| soyo| 2.0|(4,[2],[1.0])|
+---+-----+-----------+-------------+