import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by DengNi on 2016/9/21.
 * Email classification (spam vs. normal) in Scala.
 */
object spam_normal {

  /**
   * Trains a logistic-regression spam classifier on two local text files and
   * prints the predicted label for two hard-coded sample messages.
   *
   * Every line of `spam.txt` is used as a positive (label 1) example and every
   * line of `noraml.txt` as a negative (label 0) example. Lines are split on a
   * single space and hashed into a 10000-dimensional term-frequency vector.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Silence Spark's INFO/WARN logging so the two predictions are easy to spot.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("scala").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Always stop the context, even when reading or training fails, so the
    // local Spark runtime is shut down explicitly instead of being left running.
    try {
      val spam = sc.textFile("spam.txt")
      // NOTE(review): the file name looks like a misspelling of "normal.txt";
      // it is a runtime path and is deliberately kept unchanged here.
      val norm = sc.textFile("noraml.txt")

      // Create a HashingTF instance that maps email text to a vector of 10000 features.
      val tf = new HashingTF(numFeatures = 10000)

      // Each email is split into words, and each word is mapped to one feature.
      val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
      val normFeatures = norm.map(email => tf.transform(email.split(" ")))

      // Build LabeledPoint datasets for the spam (1) and normal (0) emails.
      val positiveExample = spamFeatures.map(features => LabeledPoint(1, features))
      val negativeExample = normFeatures.map(features => LabeledPoint(0, features))
      val trainingData = positiveExample.union(negativeExample)

      // Logistic regression is an iterative algorithm, so cache the training set.
      trainingData.cache()

      // Run logistic regression using the SGD algorithm.
      val model = new LogisticRegressionWithSGD().run(trainingData)

      // Test with one spam-like and one normal-looking message.
      val psTest = tf.transform("fuck you love sex cheap by sending money fund".split(" "))
      val negTest = tf.transform("Hi hwo do you good to see you want to spark".split(" "))

      println(model.predict(psTest))  // should be 1
      println(model.predict(negTest)) // should be 0
    } finally {
      sc.stop()
    }
  }
}
"C:\Program Files\Java\jdk1.7.0_80\bin\java" -Didea.launcher.port=7533 "-Didea.launcher.bin.path=C:\Program Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\bin" -Dfile.encoding=UTF-8 -classpath "C:\Program Files\Java\jdk1.7.0_80\jre\lib\charsets.jar;C:\Program
Files\Java\jdk1.7.0_80\jre\lib\deploy.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\access-bridge-64.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\dnsns.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\jaccess.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\localedata.jar;C:\Program
Files\Java\jdk1.7.0_80\jre\lib\ext\sunec.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunjce_provider.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunmscapi.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\zipfs.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\javaws.jar;C:\Program
Files\Java\jdk1.7.0_80\jre\lib\jce.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfr.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfxrt.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jsse.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\management-agent.jar;C:\Program
Files\Java\jdk1.7.0_80\jre\lib\plugin.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\resources.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\rt.jar;D:\bigdata\workspaces\recommder\out\production\recommder;F:\scala\lib\scala-actors-migration.jar;F:\scala\lib\scala-actors.jar;F:\scala\lib\scala-library.jar;F:\scala\lib\scala-reflect.jar;F:\scala\lib\scala-swing.jar;D:\bigdata\workspaces\recommder\lib\spark-assembly-1.6.0-hadoop2.6.0.jar;C:\Program
Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\lib\idea_rt.jar" com.intellij.rt.execution.application.AppMain spam_normal
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
16/09/21 22:16:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
16/09/21 22:16:15 INFO Slf4jLogger: Slf4jLogger started
16/09/21 22:16:15 INFO Remoting: Starting remoting
16/09/21 22:16:16 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.184.1:3070]
16/09/21 22:16:18 WARN : Your hostname, root resolves to a loopback/non-reachable address: fe80:0:0:0:0:5efe:c0a8:8c01%17, but we couldn't find any external IP address!
16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
16/09/21 22:16:20 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
16/09/21 22:16:20 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
16/09/21 22:16:20 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
16/09/21 22:16:20 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
16/09/21 22:16:20 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
1.0
0.0
16/09/21 22:16:24 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.
Process finished with exit code 0