• Spark text feature extraction (TF-IDF / Word2Vec / CountVectorizer)


    import org.apache.spark.ml.feature._
    import org.apache.spark.mllib.linalg._
    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SQLContext
    import org.apache.spark.{SparkConf, SparkContext}
    
    
    object test42 {
      def main(args: Array[String]): Unit = {
    
        val masterUrl = "local[2]"
        val appName = "tfidf_test"
        val sparkConf = new SparkConf().setMaster(masterUrl).setAppName(appName)
        val sc = new SparkContext(sparkConf)
        val sqlContext = new SQLContext(sc)
        sc.setLogLevel("ERROR")
    
        // Scala imports scala.collection.immutable.Vector by default, so
        // org.apache.spark.mllib.linalg.Vector must be brought into scope (here via the
        // linalg wildcard import) before MLlib's Vector type can be used.
    
        // Dense vector
        val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
        println(dv)
        // Sparse vector: 3 is the vector length; the first Array(0, 2) holds the indices and
        // the second Array(1.0, 3.0) the corresponding values, i.e. position 0 is 1.0 and position 2 is 3.0
        val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
        println(sv1)
        // Sparse vector: 3 is the vector length; each pair in the Seq is (index, value)
        val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
        println(sv2)
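        // Both sparse constructions print the same representation: (3,[0,2],[1.0,3.0])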
    
        // Labeled points: a label paired with a feature vector, as consumed by MLlib's supervised learners
        val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))

        val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
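        println(pos) // (1.0,[1.0,0.0,3.0])
        println(neg) // (0.0,(3,[0,2],[1.0,3.0]))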
    
    
        // Create a dense matrix with 2 rows and 3 columns (values are given in column-major order)
        val dm: Matrix = Matrices.dense(2, 3, Array(1.0, 0.0, 2.0, 3.0, 4.0, 5.0))
        println("========dm========")
        println(dm)
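        // Column-major storage fills the array column by column, so dm is:
        // 1.0  2.0  4.0
        // 0.0  3.0  5.0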
    
        val v0 = Vectors.dense(1.0, 0.0, 3.0)
        val v1 = Vectors.sparse(3, Array(1), Array(2.5))
        val v2 = Vectors.sparse(3, Seq((0, 1.5), (1, 1.8)))
    
        val rows = sc.parallelize(Seq(v0, v1, v2))
        println("=========rows=======")
        println(rows.collect().toBuffer)
    
        val mat: RowMatrix = new RowMatrix(rows)
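        // Quick sanity check on the distributed matrix; numRows()/numCols() are part of the RowMatrix API
        println(s"RowMatrix dimensions: ${mat.numRows()} x ${mat.numCols()}")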
    
    
        val seriesX: RDD[Double] = sc.parallelize(List(1.0, 2.0, 3.0)) // a series
        val seriesY: RDD[Double] = sc.parallelize(List(4.0, 5.0, 6.0)) // must have the same number of partitions and cardinality as seriesX
        val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
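        println(s"Pearson correlation: $correlation") // 1.0 here, since the two series are perfectly linearly related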
        val data: RDD[Vector] = rows // each Vector is a row of the matrix, not a column
        val correlMatrix: Matrix = Statistics.corr(data, "pearson")
        println("========correlMatrix========")
        println(correlMatrix)
    
    
    
        val summary: MultivariateStatisticalSummary = Statistics.colStats(rows)
        println("===================")
        println(summary.mean) // dense vector of column means
        println(summary.variance) // column-wise variances
        println(summary.numNonzeros) // number of nonzero values in each column
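        println(summary.count) // number of rows the statistics were computed over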
    
    
        /**
          * Word2Vec: learns a vector per word; transform() then represents each
          * document as the average of its word vectors.
          */
    
        val documentDF = sqlContext.createDataFrame(Seq(
          "Hi I heard about Spark".split(" "),
          "I wish Java could use case classes".split(" "),
          "Logistic regression models are neat".split(" ")
        ).map(Tuple1.apply)).toDF("text")
    
        // Learn a mapping from words to Vectors.
        val word2Vec = new Word2Vec()
          .setInputCol("text")
          .setOutputCol("result")
          .setVectorSize(3)
          .setMinCount(0)
        val model = word2Vec.fit(documentDF)
        val result = model.transform(documentDF)
        println("=======word2vec=========")
        result.show(10,false)
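        // The fitted model can also be queried for nearest neighbours in the learned
        // vector space (findSynonyms is part of the ml Word2VecModel API):
        model.findSynonyms("Spark", 2).show()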
    
    
    
        /**
          * CountVectorizer: builds a vocabulary from the corpus and encodes each
          * document as a sparse vector of term counts.
          */
    
    
    
        val df = sqlContext.createDataFrame(Seq(
          (0, Array("a", "b", "c")),
          (1, Array("a", "b", "b", "c", "a"))
        )).toDF("id", "words")
    
        // fit a CountVectorizerModel from the corpus
        val cvModel: CountVectorizerModel = new CountVectorizer()
          .setInputCol("words")
          .setOutputCol("features")
          .setVocabSize(3) // keep at most 3 terms in the vocabulary
          .setMinDF(2)     // a term must appear in at least 2 documents to be included
          .fit(df)
    
        // alternatively, define CountVectorizerModel with a-priori vocabulary
        val cvm = new CountVectorizerModel(Array("a", "b", "c"))
          .setInputCol("words")
          .setOutputCol("features")
        println("=======CountVectorizerModel=========")
        cvModel.transform(df).show(10,false)
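        // The a-priori-vocabulary model is applied the same way, shown here for comparison
        println("=======CountVectorizerModel (a-priori vocabulary)=========")
        cvm.transform(df).show(10, false)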
    
    
        /**
          * TF-IDF: HashingTF maps each document to a fixed-length term-frequency
          * vector; IDF then down-weights terms that appear in many documents.
          */
    
        val sentenceData = sqlContext.createDataFrame(Seq(
          (0, "Hi I heard about Spark"),
          (0, "I wish Java could use case classes"),
          (1, "Logistic regression models are neat")
        )).toDF("label", "sentence")
    
        val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
        val wordsData = tokenizer.transform(sentenceData)
        val hashingTF = new HashingTF()
          .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
        val featurizedData = hashingTF.transform(wordsData)
        // CountVectorizer can also be used to obtain the term-frequency vectors
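        // A minimal sketch of that alternative: unlike HashingTF, the learned vocabulary
        // lets feature indices be mapped back to words ("cvTF"/"cvFeatures" are illustrative names)
        val cvTF = new CountVectorizer()
          .setInputCol("words").setOutputCol("cvFeatures")
          .fit(wordsData)
        cvTF.transform(wordsData).show(10, false)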
    
        val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
        val idfModel = idf.fit(featurizedData)
        val rescaledData = idfModel.transform(featurizedData)
        rescaledData.show(10,false)
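        sc.stop() // release the SparkContext before exiting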
    
    
    
      }
    }

  • Original post: https://www.cnblogs.com/ShyPeanut/p/11103608.html