无监督学习
1.K-means
from __future__ import print_function
from pyspark.ml.clustering import KMeans
#from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import SparkSession
import pandas as pd
# K-means example: cluster the sample LIBSVM data set into two groups and
# print each row's assigned cluster followed by the learned cluster centers.

# Build (or reuse) the Spark session. The parentheses let the builder chain
# span several lines without backslash continuations.
spark = (
    SparkSession
    .builder
    .appName("KMeansExample")
    .getOrCreate()
)
try:
    # Load the sample data stored in LIBSVM format.
    dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

    # Train a K-means clustering model (k=2, fixed seed for repeatability).
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Predict, i.e. assign every row to one of the clusters.
    predictions = model.transform(dataset)

    # Silhouette-score evaluation is left disabled, together with the
    # commented-out ClusteringEvaluator import, because that class only
    # exists in newer PySpark releases (2.3+ according to the Spark API
    # docs). Re-enable both on a recent version:
    # evaluator = ClusteringEvaluator()
    # silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Print the predicted cluster index of every row.
    print("predicted Center: ")
    for row in predictions.select("prediction").collect():
        print(row.asDict())

    # Print the coordinates of the learned cluster centers.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
finally:
    # Always release the session, even if loading or training failed.
    spark.stop()
predicted Center:
{'prediction': 0}
{'prediction': 0}
{'prediction': 0}
{'prediction': 1}
{'prediction': 1}
{'prediction': 1}
Cluster Centers:
[0.1 0.1 0.1]
[9.1 9.1 9.1]
2.GMM模型
from __future__ import print_function
from pyspark.ml.clustering import GaussianMixture
from pyspark.sql import SparkSession
Gaussian Mixture Model(高斯混合模型)
应用场景不太广泛
与EM算法关系非常紧密(模型参数通过EM算法估计)
# Gaussian mixture example: fit a two-component mixture to the sample data
# and show the fitted Gaussians (mean vector and covariance matrix each).

# Build (or reuse) the Spark session. The parentheses let the builder chain
# span several lines without backslash continuations.
spark = (
    SparkSession
    .builder
    .appName("GaussianMixtureExample")
    .getOrCreate()
)
try:
    # Load the sample data stored in LIBSVM format.
    dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

    # Fit a mixture of two Gaussians with a fixed seed for repeatability.
    gmm = GaussianMixture().setK(2).setSeed(0)
    model = gmm.fit(dataset)

    # One row per component; truncate=False prints the full matrices.
    print("Gaussians shown as a DataFrame: ")
    model.gaussiansDF.show(truncate=False)
finally:
    # Always release the session, even if loading or fitting failed.
    spark.stop()
Gaussians shown as a DataFrame:
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|mean |cov |
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[9.099999999999984,9.099999999999984,9.099999999999984] |0.006666666666812185 0.006666666666812185 0.006666666666812185
0.006666666666812185 0.006666666666812185 0.006666666666812185
0.006666666666812185 0.006666666666812185 0.006666666666812185 |
|[0.10000000000001552,0.10000000000001552,0.10000000000001552]|0.006666666666806454 0.006666666666806454 0.006666666666806454
0.006666666666806454 0.006666666666806454 0.006666666666806454
0.006666666666806454 0.006666666666806454 0.006666666666806454 |
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
3.关联规则
我这里是pyspark 2.2以下版本的写法(基于RDD的pyspark.mllib接口),新版可以参考此程序之下的程序(基于DataFrame的pyspark.ml接口)
统计哪些项经常一起出现(频繁项集)及其出现的频次
from pyspark.mllib.fpm import FPGrowth
from pyspark.sql import SparkSession
# FP-growth example using the RDD-based pyspark.mllib API: mine the frequent
# itemsets of a whitespace-separated transaction file and print each one.

# Build (or reuse) the Spark session. The parentheses let the builder chain
# span several lines without backslash continuations.
spark = (
    SparkSession
    .builder
    .appName("FPGrowthExample")
    .getOrCreate()
)
try:
    # Each line of the file is one transaction; items are space-separated.
    data = spark.sparkContext.textFile("sample_fpgrowth.txt")
    transactions = data.map(lambda line: line.strip().split(' '))

    # Keep itemsets that occur in at least 20% of the transactions.
    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)

    # Bring the frequent itemsets back to the driver and print them.
    result = model.freqItemsets().collect()
    for itemset in result:
        print(itemset)
finally:
    # Always release the session, even if reading or training failed.
    spark.stop()
# FP-growth example using the DataFrame-based pyspark.ml API (newer PySpark).
#
# The estimator used below (itemsCol / minConfidence / fit) lives in
# pyspark.ml.fpm. The FPGrowth imported earlier in this file comes from
# pyspark.mllib.fpm and only offers the RDD-based FPGrowth.train(), so the
# DataFrame-based class is imported here, deliberately rebinding the name
# from this point on.
from pyspark.ml.fpm import FPGrowth

# Build (or reuse) the Spark session. The parentheses let the builder chain
# span several lines without backslash continuations.
spark = (
    SparkSession
    .builder
    .appName("FPGrowthExample")
    .getOrCreate()
)
try:
    # Three tiny transactions: (id, list of items).
    df = spark.createDataFrame([
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2]),
    ], ["id", "items"])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    model.associationRules.show()

    # transform examines the input items against all the association rules
    # and summarizes the consequents as the prediction.
    model.transform(df).show()
finally:
    # Always release the session, even if fitting failed.
    spark.stop()
4.LDA主题模型
从聊天记录中挖掘价值:扫描聊天记录,统计词的分布,学习得到模型参数,从而发现大家在聊什么——例如把语料分成五个主题,并给出每个主题分别是什么(下面的示例代码取 k=10)。
from __future__ import print_function
from pyspark.ml.clustering import LDA
from pyspark.sql import SparkSession
# LDA topic-model example: fit 10 topics on the sample corpus, report the
# likelihood and perplexity bounds, describe each topic by its top terms and
# show the per-document topic distribution.

# Build (or reuse) the Spark session. The parentheses let the builder chain
# span several lines without backslash continuations.
spark = (
    SparkSession
    .builder
    .appName("LDAExample")
    .getOrCreate()
)
try:
    # Load the data (LIBSVM format).
    dataset = spark.read.format("libsvm").load("sample_lda_libsvm_data.txt")

    # Train the LDA model.
    lda = LDA(k=10, maxIter=10)
    model = lda.fit(dataset)

    ll = model.logLikelihood(dataset)
    lp = model.logPerplexity(dataset)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    # The trailing "\n" adds a blank line after the metrics; it must be
    # written as an escape because a plain string literal cannot span lines.
    print("The upper bound on perplexity: " + str(lp) + "\n")

    # Describe each topic by its 3 highest-weighted terms.
    topics = model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Apply the model to the data set to get per-document topic distributions.
    print("transform dataset:\n")
    transformed = model.transform(dataset)
    transformed.show(truncate=False)
finally:
    # Always release the session, even if loading or training failed.
    spark.stop()
The lower bound on the log likelihood of the entire corpus: -791.5947754606647
The upper bound on perplexity: 3.0445952902333255
The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights |
+-----+-----------+---------------------------------------------------------------+
|0 |[10, 3, 6] |[0.22065574573024932, 0.1463504588523678, 0.1400488511657804] |
|1 |[5, 4, 2] |[0.1402652280692649, 0.13500139507364667, 0.13316919532590202] |
|2 |[0, 2, 6] |[0.10263494028255915, 0.09813889751755107, 0.0962151005355116] |
|3 |[1, 7, 6] |[0.10372272198626253, 0.1006596765289526, 0.10016572752415726] |
|4 |[2, 10, 5] |[0.09773071868636975, 0.09581585125837458, 0.09500562251110266]|
|5 |[1, 4, 9] |[0.10332102242377281, 0.10213327780144739, 0.09682815734808546]|
|6 |[3, 8, 0] |[0.1005995436390056, 0.10055895536125281, 0.09990129390272384] |
|7 |[4, 5, 3] |[0.09885989687960994, 0.09809507226310166, 0.09779147510749324]|
|8 |[4, 7, 10] |[0.09996396714649075, 0.09704744502890804, 0.09701502340344925]|
|9 |[0, 2, 10] |[0.10066219141705475, 0.09972472570710426, 0.09768575088202358]|
+-----+-----------+---------------------------------------------------------------+
transform dataset:
+-----+---------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features |topicDistribution |
+-----+---------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0 |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0]) |[0.005396108560597916,0.9568045704109475,0.004724938445517785,0.004724929396921979,0.004724931227921214,0.0047248996639425505,0.004724896375116698,0.004724889052076712,0.004724916255098267,0.004724920611859444] |
|1.0 |(11,[0,1,3,4,7,10],[1.0,3.0,1.0,3.0,2.0,1.0]) |[0.009009034468933447,0.9279131530430607,0.007884637986360904,0.007884676476729824,0.007884750014046558,0.007884728106304674,0.00788479677692647,0.007884763456162899,0.007884811483363282,0.007884648188111272] |
|2.0 |(11,[0,1,2,5,6,8,9],[1.0,4.0,1.0,4.0,9.0,1.0,2.0]) |[0.5752215323724078,0.3919168152517831,0.0041077198209835165,0.004107746377527203,0.00410769650929023,0.004107715753536282,0.004107731018505452,0.004107709360041446,0.004107661260805229,0.004107672275119539] |
|3.0 |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,3.0,9.0]) |[0.9670248941827648,0.003911619866330156,0.0036329447286274205,0.0036329349060151978,0.00363293024410495,0.0036329199402650994,0.003632953479521618,0.003632934246455745,0.003632940531525275,0.003632927874389646] |
|4.0 |(11,[0,1,2,3,4,6,9,10],[3.0,1.0,1.0,9.0,3.0,2.0,1.0,3.0]) |[0.9642703981721594,0.004239287119115748,0.003936289220164257,0.00393623581490737,0.003936305065274081,0.003936301184604929,0.003936300562792378,0.003936290777038453,0.003936307110584659,0.00393628497335866] |
|5.0 |(11,[0,1,3,4,5,6,7,8,9],[4.0,2.0,3.0,4.0,5.0,1.0,1.0,1.0,4.0]) |[0.3269192882753344,0.6440158554135977,0.003633092071538423,0.003633070883298121,0.0036331182556623475,0.003633104938129972,0.003633130775023706,0.0036331337585125997,0.003633114631165936,0.003633090997736694] |
|6.0 |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,2.0,9.0]) |[0.9657039704819076,0.00406831739461995,0.0037784748008205683,0.003778465544734777,0.003778457710252918,0.0037784454793549404,0.00377848401129924,0.0037784614157254973,0.0037784685837089576,0.0037784545775754698]|
|7.0 |(11,[0,1,2,3,4,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,1.0,2.0,1.0,3.0])|[0.9610163636667831,0.004625468466082317,0.004294748462391246,0.004294736869495192,0.004294806899609582,0.004294777090773744,0.004294785910562775,0.004294796434875638,0.004294768130034292,0.004294748069392068] |
|8.0 |(11,[0,1,3,4,5,6,7],[4.0,4.0,3.0,4.0,2.0,1.0,3.0]) |[0.15515361787545068,0.8104880925102035,0.004294758336335107,0.004294757405203489,0.0042947925391514695,0.004294786931391696,0.004294836057305791,0.004294800718226623,0.004294805072704067,0.004294752554027822] |
|9.0 |(11,[0,1,2,4,6,8,9,10],[2.0,8.0,2.0,3.0,2.0,2.0,7.0,2.0]) |[0.6842111534995883,0.2897345800426429,0.0032567898486064897,0.0032567752272422345,0.003256779834750705,0.003256796199702492,0.003256782809016066,0.0032567726837555446,0.0032568001002317645,0.003256769754463553] |
|10.0 |(11,[0,1,2,3,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,2.0,3.0,3.0]) |[0.9627139858227667,0.004423932746438786,0.0041077242486949935,0.004107747016187805,0.004107807888457769,0.0041077519686468205,0.0041077896765748,0.004107799654103944,0.00410773046714894,0.004107730510979313] |
|11.0 |(11,[0,1,4,5,6,7,9],[4.0,1.0,4.0,5.0,1.0,3.0,1.0]) |[0.005395081191175718,0.956805526831423,0.0047249382610471714,0.004724903307160704,0.004724927884989505,0.004724916457246126,0.004724929213448612,0.0047249268381828954,0.004724922288527873,0.00472492772679845] |
+-----+---------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
PCA降维
from __future__ import print_function
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
# PCA example: project three 5-dimensional vectors onto their first three
# principal components and show the result.

# Build (or reuse) the Spark session. The parentheses let the builder chain
# span several lines without backslash continuations.
spark = (
    SparkSession
    .builder
    .appName("PCAExample")
    .getOrCreate()
)
try:
    # Build a small fake data set: one sparse and two dense vectors, each
    # wrapped in a 1-tuple so it becomes a single-column row.
    data = [
        (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
    ]
    df = spark.createDataFrame(data, ["features"])

    # Reduce the dimensionality with PCA, keeping 3 components.
    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(df)

    result = model.transform(df).select("pcaFeatures")
    result.show(truncate=False)
finally:
    # Always release the session, even if fitting failed.
    spark.stop()
+-----------------------------------------------------------+
|pcaFeatures |
+-----------------------------------------------------------+
|[1.6485728230883807,-4.013282700516296,-5.524543751369388] |
|[-4.645104331781534,-1.1167972663619026,-5.524543751369387]|
|[-6.428880535676489,-5.337951427775355,-5.524543751369389] |
+-----------------------------------------------------------+
word2vec词嵌入
from __future__ import print_function
from pyspark.ml.feature import Word2Vec
from pyspark.sql import SparkSession
# Word2Vec example: learn 3-dimensional word embeddings from three short
# sentences, show the word vectors, then print one vector per sentence.

# Build (or reuse) the Spark session. The parentheses let the builder chain
# span several lines without backslash continuations.
spark = (
    SparkSession
    .builder
    .appName("Word2VecExample")
    .getOrCreate()
)
try:
    # Input is in bag-of-words form: each row holds one list of tokens.
    documentDF = spark.createDataFrame([
        ("Hi I heard about Spark".split(" "), ),
        ("I wish Java could use case classes".split(" "), ),
        ("Logistic regression models are neat".split(" "), ),
    ], ["text"])

    # Configure the embedding (vector size, minimum word count, columns)
    # and learn the word vectors.
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
    model = word2Vec.fit(documentDF)

    # Show every word together with its learned vector.
    model.getVectors().show()

    # transform() adds the "result" column holding one vector per document.
    result = model.transform(documentDF)
    for row in result.collect():
        text, vector = row
        # The "\n" escapes must stay escapes: a plain string literal cannot
        # span several physical lines.
        print("Text: [%s] =>\nVector: %s\n" % (", ".join(text), str(vector)))
finally:
    # Always release the session, even if training failed.
    spark.stop()
+----------+--------------------+
| word| vector|
+----------+--------------------+
| heard|[-0.1215931475162...|
| are|[-0.0518636293709...|
| neat|[0.15406239032745...|
| classes|[-0.1115801930427...|
| I|[-0.0540042743086...|
|regression|[0.12565632164478...|
| Logistic|[-0.0247527956962...|
| Spark|[0.07148043811321...|
| could|[-0.0817497298121...|
| use|[0.02122100256383...|
| Hi|[-0.1344777345657...|
| models|[0.06217052415013...|
| case|[-0.0487669110298...|
| about|[0.04373900592327...|
| Java|[0.09513066709041...|
| wish|[0.07848564535379...|
+----------+--------------------+
Text: [Hi, I, heard, about, Spark] =>
Vector: [-0.03897114247083664,-0.01978648453950882,0.02615281194448471]
Text: [I, wish, Java, could, use, case, classes] =>
Vector: [-0.014466256169336182,-0.021408329850861003,-0.013912523431437356]
Text: [Logistic, regression, models, are, neat] =>
Vector: [0.05305456221103669,0.0614190086722374,0.0421554870903492]