今天做的是最后一个实验“Spark 机器学习库 MLlib 编程实践”的前一部分。
以下是部分代码:
import org.apache.spark.ml.feature.PCA
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, HashingTF, Tokenizer}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
import org.apache.spark.sql.functions

// Brings the implicit encoders into scope; required by the .toDF() call below.
import spark.implicits._

/** One parsed input record: a dense numeric feature vector plus the label text as read. */
case class Adult(features: org.apache.spark.ml.linalg.Vector, label: String)
// REPL output: defined class Adult

/**
 * Loads a comma-separated text file through `sc.textFile` and converts it to a
 * DataFrame with the columns `features` (vector) and `label` (string).
 *
 * Fields 0, 2, 4, 10, 11 and 12 of every line are parsed as doubles and packed
 * into the feature vector; field 14 is used unchanged as the label.
 * Lines with fewer than 15 fields (for example a trailing blank line) are
 * skipped instead of failing the job with an index or number-format error.
 *
 * NOTE(review): the column positions presumably follow the UCI Adult layout
 * (age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week,
 * income) — confirm against the actual data files.
 * NOTE(review): fields are not trimmed, so a label keeps any whitespace that
 * follows the comma in the file — confirm this is what later stages expect.
 *
 * @param path location of the text file, as accepted by `sc.textFile`
 * @return a DataFrame of [[Adult]] rows
 */
def loadAdult(path: String): org.apache.spark.sql.DataFrame = {
  // Kept local so the RDD closures capture plain values only.
  val featureColumns = Array(0, 2, 4, 10, 11, 12)
  val labelColumn = 14

  sc.textFile(path)
    .map(_.split(","))
    .filter(fields => fields.length > labelColumn)
    .map { fields =>
      val features = Vectors.dense(featureColumns.map(i => fields(i).toDouble))
      Adult(features, fields(labelColumn))
    }
    .toDF()
}

// Training set.
val df = loadAdult("adult.data.txt")
// REPL output: df: org.apache.spark.sql.DataFrame = [features: vector, label: string]

// Test set, parsed exactly the same way as the training set.
val test = loadAdult("adult.test.txt")
// REPL output: test: org.apache.spark.sql.DataFrame = [features: vector, label: string]