一定要对文本数据集进行预处理
1.导入包
import org.apache.spark.ml.feature.PCA import org.apache.spark.sql.Row import org.apache.spark.ml.linalg.{Vector,Vectors} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.{Pipeline,PipelineModel} import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer,HashingTF, Tokenizer} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression} import org.apache.spark.sql.functions; import spark.implicits._
2.定义class
case class Adult(features:org.apache.spark.ml.linalg.Vector,label:String)
3.读取数据,并转换成DF
/*训练模型的原始数据集*/ val df = sc.textFile("file:///usr/spark/sparkdata/adult.txt").map(_.split(",")) .map(p=>Adult(Vectors.dense(p(0).toDouble,p(2).toDouble,p(4).toDouble,p(10).toDouble ,p(11).toDouble,p(12).toDouble),p(14).toString())).toDF() /*测试数据集*/ val test = sc.textFile("file:///usr/spark/sparkdata/test.txt").map(_.split(",")) .map(p=>Adult(Vectors.dense(p(0).toDouble,p(2).toDouble,p(4).toDouble,p(10).toDouble ,p(11).toDouble,p(12).toDouble),p(14).toString())).toDF()
4.如果维度过多,用PCA主成分分析进行降维(6维变3维)
//setK()填维度,fit()填df数据 val pca = new PCA().setInputCol("features") .setOutputCol("PCAfeatures").setK(3).fit(df) //用pca模型进行转换得到新的df val trainningdata = pca.transform(df)
5.分别获取标签列,和特征列,进行索引和重命名
//获取标签列 val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(trainningdata) //获取特征列 val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(trainningdata)
6.设置逻辑斯蒂参数
val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(10)
7.设置一个convertLabel 把预测类型重新转换成字符型
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
8.构建pipeline,设置stage,并用fit()进行训练
//算法 val lrPipeline = new Pipeline().setStages(Array(labelIndexer,featureIndexer,lr,labelConvert)) //模型 val lrPipelineModel = lrPipeline.fit(tranningdata)
9.用构建好的模型进行预测
//调用模型的transform方法,对测试数据集进行预测,(先降维) val lrPredictions = lrPipelineModel.transform(test)
10.输出预测结果
lrPredictions.select("predictedLabel","label","features","probability") .collect().foreach{ case Row (predictedLabel:String,label:String,features:Vector,prob:Vector) => println(s"($label,$features)-->prob=$prob,predictedLabel=$predictedLabel") }
11.模型评估
val evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") val lrAccuacry = evaluator.evaluate(lrpridicton)