• 垃圾邮件分类(Scala 版本)


    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
    import org.apache.spark.mllib.feature.HashingTF
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.{SparkConf, SparkContext}
    
    /**
      * Created by DengNi on 2016/9/21.
      * 邮件分类 scala
      */
    /**
      * Spam e-mail classification with Spark MLlib (Scala).
      *
      * Trains a logistic-regression model (via SGD) on hashed term-frequency
      * features built from a spam corpus and a normal (ham) corpus, then
      * prints the predicted label for one spam-like and one ham-like message.
      *
      * Created by DengNi on 2016/9/21.
      */
    object spam_normal {

      def main(args: Array[String]) {

        // Silence Spark's INFO/WARN chatter so the two predictions are visible.
        Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

        val conf = new SparkConf().setAppName("scala").setMaster("local[*]")
        val sc = new SparkContext(conf)

        try {
          val spam = sc.textFile("spam.txt")
          // NOTE(review): "noraml.txt" looks like a typo for "normal.txt" —
          // the original run log shows the job succeeded, so the file on disk
          // may actually carry this name; confirm before renaming.
          val norm = sc.textFile("noraml.txt")

          // HashingTF maps each e-mail's words into a 10,000-dimension
          // term-frequency vector (hashing trick; no vocabulary required).
          val tf = new HashingTF(numFeatures = 10000)
          // Each e-mail is split on spaces; every word hashes to a feature slot.
          val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
          val normFeatures = norm.map(email => tf.transform(email.split(" ")))

          // Label the examples: 1.0 = spam (positive), 0.0 = normal (negative).
          val positiveExamples = spamFeatures.map(features => LabeledPoint(1, features))
          val negativeExamples = normFeatures.map(features => LabeledPoint(0, features))

          val trainingData = positiveExamples.union(negativeExamples)
          // Logistic regression iterates over the data many times — cache it.
          trainingData.cache()

          // Train a logistic-regression model with stochastic gradient descent.
          val model = new LogisticRegressionWithSGD().run(trainingData)

          // Sanity-check the model on one spam-like and one ham-like message.
          val posTest = tf.transform("fuck you love sex cheap by sending money fund".split(" "))
          val negTest = tf.transform("Hi hwo do you good to see you want to spark".split(" "))

          println(model.predict(posTest)) //should be 1
          println(model.predict(negTest)) //should be 0
        } finally {
          // Always release the local Spark context, even if training fails.
          sc.stop()
        }
      }
    }
    


    "C:\Program Files\Java\jdk1.7.0_80\bin\java" -Didea.launcher.port=7533 "-Didea.launcher.bin.path=C:\Program Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\bin" -Dfile.encoding=UTF-8 -classpath "C:\Program Files\Java\jdk1.7.0_80\jre\lib\charsets.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\deploy.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\access-bridge-64.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\dnsns.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\jaccess.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\localedata.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunec.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunjce_provider.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunmscapi.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\zipfs.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\javaws.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jce.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfr.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfxrt.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jsse.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\management-agent.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\plugin.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\resources.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\rt.jar;D:\bigdata\workspaces\recommder\out\production\recommder;F:\scala\lib\scala-actors-migration.jar;F:\scala\lib\scala-actors.jar;F:\scala\lib\scala-library.jar;F:\scala\lib\scala-reflect.jar;F:\scala\lib\scala-swing.jar;D:\bigdata\workspaces\recommder\lib\spark-assembly-1.6.0-hadoop2.6.0.jar;C:\Program Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\lib\idea_rt.jar" com.intellij.rt.execution.application.AppMain spam_normal
    Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
    16/09/21 22:16:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    16/09/21 22:16:15 INFO Slf4jLogger: Slf4jLogger started
    16/09/21 22:16:15 INFO Remoting: Starting remoting
    16/09/21 22:16:16 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.184.1:3070]
    16/09/21 22:16:18 WARN : Your hostname, root resolves to a loopback/non-reachable address: fe80:0:0:0:0:5efe:c0a8:8c01%17, but we couldn't find any external IP address!
    16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
    16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
    16/09/21 22:16:20 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
    16/09/21 22:16:20 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
    16/09/21 22:16:20 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
    16/09/21 22:16:20 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
    16/09/21 22:16:20 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
    16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
    16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
    1.0
    0.0

    16/09/21 22:16:24 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.

    Process finished with exit code 0

  • 相关阅读:
    Windows 认证小结
    Linux 提权学习小结
    ssrf与gopher与redis
    hacker101 CTF 学习记录(二)
    Hacker101 CTF 学习记录(一)
    libwebsockets支持外部eventloop变更
    ypipe, zmq的核心部件,并行读写的管道。
    std::regex与boost::regex的性能差5倍,有profile有真相。
    Spring整合WebSocket
    温故知新——Spring AOP(二)
  • 原文地址:https://www.cnblogs.com/TendToBigData/p/10501370.html
Copyright © 2020-2023  润新知