• Python+Spark 2.0+Hadoop Study Notes: Python Spark MLlib Support Vector Machine Binary Classification


    The support vector machine (SVM) is a widely applied machine learning model. By means of a kernel-space transformation, it can map data from one space into another in which the data's distribution becomes more clearly separable. SVMs carry pivotal weight in both industry and academia: in academia in particular, many improved methods build on the SVM, and algorithm-level refinements yield high-quality algorithms tailored to specific kinds of data, which in turn solve practical problems more effectively.
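    For orientation, the textbook soft-margin SVM solves the following optimization problem (standard notation, independent of the MLlib implementation used below):

        \min_{w,b,\xi}\ \tfrac{1}{2}\|w\|^2 + C\sum_{i=1}^{n}\xi_i
        \quad\text{s.t.}\quad y_i\,(w^\top\phi(x_i)+b)\ \ge\ 1-\xi_i,\quad \xi_i \ge 0

    Here \phi is the feature map induced by a kernel K(x_i,x_j)=\phi(x_i)^\top\phi(x_j); choosing the kernel is what performs the space transformation described above.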

    We continue with the same task as before: deciding whether a website is "evergreen" or "ephemeral". Following the same line of thought as the earlier logistic regression example, we now turn to the support vector machine.

    Step 1: Import the required libraries

    import sys
    from time import time
    import pandas as pd
    import matplotlib.pyplot as plt
    from pyspark import SparkConf, SparkContext
    from pyspark.mllib.classification import SVMWithSGD
    from pyspark.mllib.regression import LabeledPoint
    import numpy as np
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    from pyspark.mllib.feature import StandardScaler

    Step 2: Prepare the data

    def get_mapping(rdd, idx):
        # Map each distinct value of column idx to a numeric index.
        return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

    def extract_label(record):
        # The label is the last field of each record.
        return float(record[-1])

    def extract_features(field, categoriesMap, featureEnd):
        # One-hot encode the category field (column 3) ...
        categoryIdx = categoriesMap[field[3]]
        categoryFeatures = np.zeros(len(categoriesMap))
        categoryFeatures[categoryIdx] = 1
        # ... and append the numerical fields, treating "?" as 0.
        numericalFeatures = [convert_float(x) for x in field[4:featureEnd]]
        return np.concatenate((categoryFeatures, numericalFeatures))

    def convert_float(x):
        return 0 if x == "?" else float(x)

    def PrepareData(sc):
        print("Data loading...")
        rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
        header = rawDataWithHeader.first()
        rawData = rawDataWithHeader.filter(lambda x: x != header)
        rData = rawData.map(lambda x: x.replace("\"", ""))
        lines = rData.map(lambda x: x.split("\t"))
        print("The number of data: " + str(lines.count()))
        print("Before normalization:")
        categoriesMap = lines.map(lambda fields: fields[3]) \
                             .distinct().zipWithIndex().collectAsMap()
        labelRDD = lines.map(lambda r: extract_label(r))
        featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
        for i in featureRDD.first():
            print(str(i) + ",", end="")
        # Standardize every feature to zero mean and unit variance.
        stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
        ScalerFeatureRDD = stdScaler.transform(featureRDD)
        print("After normalization:")
        for i in ScalerFeatureRDD.first():
            print(str(i) + ",", end="")
        labelpoint = labelRDD.zip(ScalerFeatureRDD)
        labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
        # Random 8:1:1 split into training, validation and test sets.
        (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
        print("trainData:" + str(trainData.count()) +
              " validationData:" + str(validationData.count()) +
              " testData:" + str(testData.count()))
        return (trainData, validationData, testData, categoriesMap)

    Step 3: Use the model to make predictions

    def PredictData(sc, model, categoriesMap):
        print("Data loading...")
        rawDataWithHeader = sc.textFile(Path + "data/test.tsv")
        header = rawDataWithHeader.first()
        rawData = rawDataWithHeader.filter(lambda x: x != header)
        rData = rawData.map(lambda x: x.replace("\"", ""))
        lines = rData.map(lambda x: x.split("\t"))
        print("The number of data: " + str(lines.count()))
        # The test set has no label column, so every field after the URL is a feature.
        dataRDD = lines.map(lambda r: (r[0],
                                       extract_features(r, categoriesMap, len(r))))
        DescDict = {
            0: "ephemeral",
            1: "evergreen"
        }
        for data in dataRDD.take(10):
            predictResult = model.predict(data[1])
            print(" Web:" + str(data[0]) + " " +
                  "Predict:" + str(predictResult) + " " +
                  "Illustration:" + DescDict[predictResult] + " ")

    Step 4: Evaluate the model (the parameters to tune are numIterations, stepSize and regParam). Note that the tunable parameters of MLlib's SVM differ considerably from those of the standard SVM algorithm: SVMWithSGD implements a linear SVM trained with stochastic gradient descent, so there is no kernel function to select or tune.
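    For reference, here is a minimal, self-contained sketch of calling SVMWithSGD.train with explicit keyword arguments. The keywords iterations, step, regParam, regType and intercept come from the pyspark.mllib API; the tiny LabeledPoint dataset is made up purely for illustration:

    # A hypothetical toy example (assumes an active SparkContext `sc`).
    from pyspark.mllib.classification import SVMWithSGD
    from pyspark.mllib.regression import LabeledPoint

    # Two tiny, linearly separable classes (made-up data).
    toyData = sc.parallelize([
        LabeledPoint(0.0, [0.0, 1.0]),
        LabeledPoint(0.0, [0.5, 1.5]),
        LabeledPoint(1.0, [3.0, 0.0]),
        LabeledPoint(1.0, [3.5, 0.5]),
    ])

    # iterations/step/regParam correspond to the numIterations/stepSize/regParam
    # hyperparameters tuned below; regType selects L2 regularization.
    toyModel = SVMWithSGD.train(toyData, iterations=100, step=1.0,
                                regParam=0.01, regType="l2", intercept=True)
    print(toyModel.predict([3.2, 0.3]))  # should print 1 for this toy data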

    def evaluateModel(model, validationData):
        # Pair each prediction with its true label and compute the AUC.
        score = model.predict(validationData.map(lambda p: p.features))
        score = score.map(lambda s: float(s))
        Labels = validationData.map(lambda p: p.label)
        Labels = Labels.map(lambda l: float(l))
        scoreAndLabels = score.zip(Labels)
        metrics = BinaryClassificationMetrics(scoreAndLabels)
        AUC = metrics.areaUnderROC
        return AUC

    def trainEvaluateModel(trainData, validationData,
                           numIterations, stepSize, regParam):
        startTime = time()
        model = SVMWithSGD.train(trainData, numIterations, stepSize, regParam)
        AUC = evaluateModel(model, validationData)
        duration = time() - startTime
        print(" numIterations=" + str(numIterations) +
              " stepSize=" + str(stepSize) +
              " regParam=" + str(regParam) +
              " Time=" + str(duration) +
              " AUC=" + str(AUC))
        return (AUC, duration, numIterations, stepSize, regParam, model)

    def evalParameter(trainData, validationData, evalparm,
                      numIterationsList, stepSizeList, regParamList):
        # Train one model per parameter combination and collect its metrics.
        metrics = [trainEvaluateModel(trainData, validationData,
                                      numIterations, stepSize, regParam)
                   for numIterations in numIterationsList
                   for stepSize in stepSizeList
                   for regParam in regParamList]
        if evalparm == "numIterations":
            IndexList = numIterationsList[:]
        elif evalparm == "stepSize":
            IndexList = stepSizeList[:]
        elif evalparm == "regParam":
            IndexList = regParamList[:]
        df = pd.DataFrame(metrics, index=IndexList,
                          columns=['AUC', 'duration', 'numIterations',
                                   'stepSize', 'regParam', 'model'])
        showchart(df, evalparm, 'AUC', 'duration', 0.5, 0.7)

    def showchart(df, evalparm, barData, lineData, yMin, yMax):
        # Bar chart of AUC, with training duration overlaid on a second y-axis.
        ax = df[barData].plot(kind='bar', title=evalparm, figsize=(10, 6),
                              legend=True, fontsize=12)
        ax.set_xlabel(evalparm, fontsize=12)
        ax.set_ylim([yMin, yMax])
        ax.set_ylabel(barData, fontsize=12)
        ax2 = ax.twinx()
        ax2.plot(df[[lineData]].values, linestyle='-', marker='o',
                 linewidth=2.0, color='r')
        plt.show()

    def evalAllParameter(trainData, validationData,
                         numIterationsList, stepSizeList, regParamList):
        # Grid-search all combinations and return the model with the best AUC.
        metrics = [trainEvaluateModel(trainData, validationData,
                                      numIterations, stepSize, regParam)
                   for numIterations in numIterationsList
                   for stepSize in stepSizeList
                   for regParam in regParamList]
        Smetrics = sorted(metrics, key=lambda k: k[0], reverse=True)
        bestParameter = Smetrics[0]
        print("numIterations:" + str(bestParameter[2]) +
              " ,stepSize:" + str(bestParameter[3]) +
              " ,regParam:" + str(bestParameter[4]) +
              " ,AUC = " + str(bestParameter[0]))
        return bestParameter[5]

    def parametersEval(trainData, validationData):
        print("numIterations")
        evalParameter(trainData, validationData, "numIterations",
                      numIterationsList=[1, 3, 5, 15, 25],
                      stepSizeList=[100],
                      regParamList=[1])
        print("stepSize")
        evalParameter(trainData, validationData, "stepSize",
                      numIterationsList=[25],
                      stepSizeList=[10, 50, 100, 200],
                      regParamList=[1])
        print("regParam")
        evalParameter(trainData, validationData, "regParam",
                      numIterationsList=[25],
                      stepSizeList=[100],
                      regParamList=[0.01, 0.1, 1])

    Step 5: Spark-related settings

    def SetLogger(sc):
        # Silence Spark's verbose INFO/WARN console logging.
        logger = sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def SetPath(sc):
        # Read from the local filesystem when running locally, otherwise from HDFS.
        global Path
        if sc.master[0:5] == "local":
            Path = "file:/home/jorlinlee/pythonsparkexample/PythonProject/"
        else:
            Path = "hdfs://master:9000/user/jorlinlee/"

    def CreateSparkContext():
        sparkConf = SparkConf() \
            .setAppName("SVM") \
            .set("spark.ui.showConsoleProgress", "false")
        sc = SparkContext(conf=sparkConf)
        print("master=" + sc.master)
        SetLogger(sc)
        SetPath(sc)
        return sc

    Step 6: Run the main program

    if __name__ == "__main__":
        print("SVM")
        sc = CreateSparkContext()
        print("Preparing")
        (trainData, validationData, testData, categoriesMap) = PrepareData(sc)
        trainData.persist(); validationData.persist(); testData.persist()
        print("Evaluating")
        (AUC, duration, numIterations, stepSize, regParam, model) = \
            trainEvaluateModel(trainData, validationData, 3, 50, 1)
        if (len(sys.argv) == 2) and (sys.argv[1] == "-e"):
            parametersEval(trainData, validationData)
        elif (len(sys.argv) == 2) and (sys.argv[1] == "-a"):
            print("Best parameter")
            model = evalAllParameter(trainData, validationData,
                                     [1, 3, 5, 15, 25],
                                     [10, 50, 100, 200],
                                     [0.01, 0.1, 1])
        print("Test")
        auc = evaluateModel(model, testData)
        print("AUC:" + str(auc))
        print("Predict")
        PredictData(sc, model, categoriesMap)
        sc.stop()

    Results:

    Web:http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-sauce.html
    Predict:1 Illustration:evergreen

    Web:http://lolpics.se/18552-stun-grenade-ar
    Predict:1 Illustration:evergreen

    Web:http://www.xcelerationfitness.com/treadmills.html
    Predict:1 Illustration:evergreen

    Web:http://www.bloomberg.com/news/2012-02-06/syria-s-assad-deploys-tactics-of-father-to-crush-revolt-threatening-reign.html
    Predict:1 Illustration:evergreen

    Web:http://www.wired.com/gadgetlab/2011/12/stem-turns-lemons-and-limes-into-juicy-atomizers/
    Predict:1 Illustration:evergreen

    Web:http://www.latimes.com/health/boostershots/la-heb-fat-tax-denmark-20111013,0,2603132.story
    Predict:1 Illustration:evergreen

    Web:http://www.howlifeworks.com/a/a?AG_ID=1186&cid=7340ci
    Predict:1 Illustration:evergreen

    Web:http://romancingthestoveblog.wordpress.com/2010/01/13/sweet-potato-ravioli-with-lemon-sage-brown-butter-sauce/
    Predict:1 Illustration:evergreen

    Web:http://www.funniez.net/Funny-Pictures/turn-men-down.html
    Predict:1 Illustration:evergreen

    Web:http://youfellasleepwatchingadvd.com/
    Predict:1 Illustration:evergreen

• Original post: https://www.cnblogs.com/zhuozige/p/12627637.html