• Spark机器学习基础二


    无监督学习

    0.K-means

    from __future__ import print_function
    from pyspark.ml.clustering import KMeans
    # from pyspark.ml.evaluation import ClusteringEvaluator  # only in newer pyspark
    from pyspark.sql import SparkSession
    import pandas as pd

    # The fluent builder chain must be wrapped in parentheses (or use "\")
    # so that it can span multiple lines without a SyntaxError.
    spark = (SparkSession
             .builder
             .appName("KMeansExample")
             .getOrCreate())

    # Load sample data in LIBSVM format.
    dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

    # Train a K-means clustering model: k=2 clusters, fixed seed for reproducibility.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Predict (i.e. assign each point to its nearest cluster center).
    predictions = model.transform(dataset)

    # Evaluate with the Silhouette score (ClusteringEvaluator is only available
    # in newer pyspark versions, hence commented out here).
    # evaluator = ClusteringEvaluator()
    # silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Print the predicted cluster assignment for every row.
    print("predicted Center: ")
    for center in predictions.select('prediction').collect():
        print(center.asDict())

    # Print the learned cluster centers.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    spark.stop()
    
    predicted Center: 
    {'prediction': 0}
    {'prediction': 0}
    {'prediction': 0}
    {'prediction': 1}
    {'prediction': 1}
    {'prediction': 1}
    Cluster Centers: 
    [0.1 0.1 0.1]
    [9.1 9.1 9.1]
    

    1.GMM模型

    from __future__ import print_function
    from pyspark.ml.clustering import GaussianMixture
    from pyspark.sql import SparkSession
    

    Gaussian Mixture Model(高斯混合模型,GMM)
    应用场景不太广泛
    EM算法关系非常紧密

    # The builder chain needs parentheses so it can span multiple lines
    # (the original lacked line continuations and is a SyntaxError as written).
    spark = (SparkSession
             .builder
             .appName("GaussianMixtureExample")
             .getOrCreate())

    # GMM is demonstrated on the same LIBSVM-format sample data as K-means.
    dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

    # Fit a Gaussian Mixture Model with 2 components; fixed seed for reproducibility.
    gmm = GaussianMixture().setK(2).setSeed(0)
    model = gmm.fit(dataset)

    # Each row of gaussiansDF holds the mean vector and covariance matrix
    # of one fitted Gaussian component.
    print("Gaussians shown as a DataFrame: ")
    model.gaussiansDF.show(truncate=False)

    spark.stop()
    
    Gaussians shown as a DataFrame: 
    +-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |mean                                                         |cov                                                                                                                                                                                                     |
    +-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |[9.099999999999984,9.099999999999984,9.099999999999984]      |0.006666666666812185  0.006666666666812185  0.006666666666812185  
    0.006666666666812185  0.006666666666812185  0.006666666666812185  
    0.006666666666812185  0.006666666666812185  0.006666666666812185  |
    |[0.10000000000001552,0.10000000000001552,0.10000000000001552]|0.006666666666806454  0.006666666666806454  0.006666666666806454  
    0.006666666666806454  0.006666666666806454  0.006666666666806454  
    0.006666666666806454  0.006666666666806454  0.006666666666806454  |
    +-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    

    3.关联规则

    我这里是pyspark 2.2以下的版本的写法,新版可以参考此程序之下的程序

    统计一块出现的频次

    from pyspark.mllib.fpm import FPGrowth
    from pyspark.sql import SparkSession

    # Parenthesized builder chain (the original lacked line continuations).
    spark = (SparkSession
             .builder
             .appName("FPGrowthExample")
             .getOrCreate())

    # Each line of the input file is one transaction: space-separated item ids.
    data = spark.sparkContext.textFile("sample_fpgrowth.txt")
    transactions = data.map(lambda line: line.strip().split(' '))

    # Mine frequent itemsets with minimum support 0.2. This is the RDD-based
    # mllib API (pyspark < 2.2 style); see the DataFrame-based example below
    # for the newer pyspark.ml API.
    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
    result = model.freqItemsets().collect()

    for fi in result:
        print(fi)

    spark.stop()
    
    # DataFrame-based FP-Growth lives in pyspark.ml.fpm (pyspark >= 2.2). The
    # FPGrowth imported in the previous snippet is the RDD-based mllib class
    # with a different API, so the ml version must be imported explicitly here.
    from pyspark.ml.fpm import FPGrowth

    spark = (SparkSession
             .builder
             .appName("FPGrowthExample")
             .getOrCreate())

    # Each row: (transaction id, list of items in that transaction).
    df = spark.createDataFrame([
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2])
    ], ["id", "items"])

    fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
    model = fpGrowth.fit(df)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    model.associationRules.show()

    # transform examines the input items against all the association rules and
    # summarizes the consequents as the prediction column.
    model.transform(df).show()

    spark.stop()
    

    4.LDA主题模型

    从聊天记录中产生一些价值,找到在聊什么,扫描聊天记录,统计分布,最后拿到一些参数,分成五个主题,分别是什么

    from __future__ import print_function
    from pyspark.ml.clustering import LDA
    from pyspark.sql import SparkSession

    # Parenthesized builder chain: the original lines ended in trailing spaces,
    # which are NOT valid line continuations and raise a SyntaxError.
    spark = (SparkSession
             .builder
             .appName("LDAExample")
             .getOrCreate())

    # Load sample data in LIBSVM format.
    dataset = spark.read.format("libsvm").load("sample_lda_libsvm_data.txt")

    # Train an LDA topic model with 10 topics.
    lda = LDA(k=10, maxIter=10)
    model = lda.fit(dataset)

    # Likelihood / perplexity bounds computed over the training corpus.
    ll = model.logLikelihood(dataset)
    lp = model.logPerplexity(dataset)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    print("The upper bound on perplexity: " + str(lp) + "\n")

    # Show the top-3 weighted terms of each topic.
    topics = model.describeTopics(3)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # Per-document topic distribution for the training set.
    print("transform dataset:\n")
    transformed = model.transform(dataset)
    transformed.show(truncate=False)

    spark.stop()
    
    The lower bound on the log likelihood of the entire corpus: -791.5947754606647
    The upper bound on perplexity: 3.0445952902333255
    
    The topics described by their top-weighted terms:
    +-----+-----------+---------------------------------------------------------------+
    |topic|termIndices|termWeights                                                    |
    +-----+-----------+---------------------------------------------------------------+
    |0    |[10, 3, 6] |[0.22065574573024932, 0.1463504588523678, 0.1400488511657804]  |
    |1    |[5, 4, 2]  |[0.1402652280692649, 0.13500139507364667, 0.13316919532590202] |
    |2    |[0, 2, 6]  |[0.10263494028255915, 0.09813889751755107, 0.0962151005355116] |
    |3    |[1, 7, 6]  |[0.10372272198626253, 0.1006596765289526, 0.10016572752415726] |
    |4    |[2, 10, 5] |[0.09773071868636975, 0.09581585125837458, 0.09500562251110266]|
    |5    |[1, 4, 9]  |[0.10332102242377281, 0.10213327780144739, 0.09682815734808546]|
    |6    |[3, 8, 0]  |[0.1005995436390056, 0.10055895536125281, 0.09990129390272384] |
    |7    |[4, 5, 3]  |[0.09885989687960994, 0.09809507226310166, 0.09779147510749324]|
    |8    |[4, 7, 10] |[0.09996396714649075, 0.09704744502890804, 0.09701502340344925]|
    |9    |[0, 2, 10] |[0.10066219141705475, 0.09972472570710426, 0.09768575088202358]|
    +-----+-----------+---------------------------------------------------------------+
    
    transform dataset:
    
    +-----+---------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |label|features                                                       |topicDistribution                                                                                                                                                                                                   |
    +-----+---------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    |0.0  |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0])      |[0.005396108560597916,0.9568045704109475,0.004724938445517785,0.004724929396921979,0.004724931227921214,0.0047248996639425505,0.004724896375116698,0.004724889052076712,0.004724916255098267,0.004724920611859444]  |
    |1.0  |(11,[0,1,3,4,7,10],[1.0,3.0,1.0,3.0,2.0,1.0])                  |[0.009009034468933447,0.9279131530430607,0.007884637986360904,0.007884676476729824,0.007884750014046558,0.007884728106304674,0.00788479677692647,0.007884763456162899,0.007884811483363282,0.007884648188111272]    |
    |2.0  |(11,[0,1,2,5,6,8,9],[1.0,4.0,1.0,4.0,9.0,1.0,2.0])             |[0.5752215323724078,0.3919168152517831,0.0041077198209835165,0.004107746377527203,0.00410769650929023,0.004107715753536282,0.004107731018505452,0.004107709360041446,0.004107661260805229,0.004107672275119539]     |
    |3.0  |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,3.0,9.0])            |[0.9670248941827648,0.003911619866330156,0.0036329447286274205,0.0036329349060151978,0.00363293024410495,0.0036329199402650994,0.003632953479521618,0.003632934246455745,0.003632940531525275,0.003632927874389646] |
    |4.0  |(11,[0,1,2,3,4,6,9,10],[3.0,1.0,1.0,9.0,3.0,2.0,1.0,3.0])      |[0.9642703981721594,0.004239287119115748,0.003936289220164257,0.00393623581490737,0.003936305065274081,0.003936301184604929,0.003936300562792378,0.003936290777038453,0.003936307110584659,0.00393628497335866]     |
    |5.0  |(11,[0,1,3,4,5,6,7,8,9],[4.0,2.0,3.0,4.0,5.0,1.0,1.0,1.0,4.0]) |[0.3269192882753344,0.6440158554135977,0.003633092071538423,0.003633070883298121,0.0036331182556623475,0.003633104938129972,0.003633130775023706,0.0036331337585125997,0.003633114631165936,0.003633090997736694]   |
    |6.0  |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,2.0,9.0])            |[0.9657039704819076,0.00406831739461995,0.0037784748008205683,0.003778465544734777,0.003778457710252918,0.0037784454793549404,0.00377848401129924,0.0037784614157254973,0.0037784685837089576,0.0037784545775754698]|
    |7.0  |(11,[0,1,2,3,4,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,1.0,2.0,1.0,3.0])|[0.9610163636667831,0.004625468466082317,0.004294748462391246,0.004294736869495192,0.004294806899609582,0.004294777090773744,0.004294785910562775,0.004294796434875638,0.004294768130034292,0.004294748069392068]   |
    |8.0  |(11,[0,1,3,4,5,6,7],[4.0,4.0,3.0,4.0,2.0,1.0,3.0])             |[0.15515361787545068,0.8104880925102035,0.004294758336335107,0.004294757405203489,0.0042947925391514695,0.004294786931391696,0.004294836057305791,0.004294800718226623,0.004294805072704067,0.004294752554027822]   |
    |9.0  |(11,[0,1,2,4,6,8,9,10],[2.0,8.0,2.0,3.0,2.0,2.0,7.0,2.0])      |[0.6842111534995883,0.2897345800426429,0.0032567898486064897,0.0032567752272422345,0.003256779834750705,0.003256796199702492,0.003256782809016066,0.0032567726837555446,0.0032568001002317645,0.003256769754463553] |
    |10.0 |(11,[0,1,2,3,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,2.0,3.0,3.0])      |[0.9627139858227667,0.004423932746438786,0.0041077242486949935,0.004107747016187805,0.004107807888457769,0.0041077519686468205,0.0041077896765748,0.004107799654103944,0.00410773046714894,0.004107730510979313]    |
    |11.0 |(11,[0,1,4,5,6,7,9],[4.0,1.0,4.0,5.0,1.0,3.0,1.0])             |[0.005395081191175718,0.956805526831423,0.0047249382610471714,0.004724903307160704,0.004724927884989505,0.004724916457246126,0.004724929213448612,0.0047249268381828954,0.004724922288527873,0.00472492772679845]   |
    +-----+---------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    

    PCA降维

    from __future__ import print_function
    from pyspark.ml.feature import PCA
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession

    # Parenthesized builder chain (the original lacked line continuations).
    spark = (SparkSession
             .builder
             .appName("PCAExample")
             .getOrCreate())

    # Build a small fake dataset of 5-dimensional vectors (one sparse, two dense).
    data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
    df = spark.createDataFrame(data, ["features"])

    # PCA dimensionality reduction: project from 5 dims to 3 principal components.
    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(df)

    result = model.transform(df).select("pcaFeatures")
    result.show(truncate=False)

    spark.stop()
    
    +-----------------------------------------------------------+
    |pcaFeatures                                                |
    +-----------------------------------------------------------+
    |[1.6485728230883807,-4.013282700516296,-5.524543751369388] |
    |[-4.645104331781534,-1.1167972663619026,-5.524543751369387]|
    |[-6.428880535676489,-5.337951427775355,-5.524543751369389] |
    +-----------------------------------------------------------+
    

    word2vec词嵌入

    from __future__ import print_function
    from pyspark.ml.feature import Word2Vec
    from pyspark.sql import SparkSession

    # Parenthesized builder chain (the original lacked line continuations).
    spark = (SparkSession
             .builder
             .appName("Word2VecExample")
             .getOrCreate())

    # Input is bag-of-words form: each row is a list of tokens.
    documentDF = spark.createDataFrame([
        ("Hi I heard about Spark".split(" "), ),
        ("I wish Java could use case classes".split(" "), ),
        ("Logistic regression models are neat".split(" "), )
    ], ["text"])

    # Learn 3-dimensional word embeddings; minCount=0 keeps every token.
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
    model = word2Vec.fit(documentDF)

    # Show each vocabulary word together with its learned vector.
    model.getVectors().show()

    # Transform each document into a single vector in the "result" column.
    result = model.transform(documentDF)
    for row in result.collect():
        text, vector = row
        # The original string literal contained raw newlines split across
        # physical lines; use explicit "\n" escapes instead.
        print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

    spark.stop()
    
    +----------+--------------------+
    |      word|              vector|
    +----------+--------------------+
    |     heard|[-0.1215931475162...|
    |       are|[-0.0518636293709...|
    |      neat|[0.15406239032745...|
    |   classes|[-0.1115801930427...|
    |         I|[-0.0540042743086...|
    |regression|[0.12565632164478...|
    |  Logistic|[-0.0247527956962...|
    |     Spark|[0.07148043811321...|
    |     could|[-0.0817497298121...|
    |       use|[0.02122100256383...|
    |        Hi|[-0.1344777345657...|
    |    models|[0.06217052415013...|
    |      case|[-0.0487669110298...|
    |     about|[0.04373900592327...|
    |      Java|[0.09513066709041...|
    |      wish|[0.07848564535379...|
    +----------+--------------------+
    
    Text: [Hi, I, heard, about, Spark] => 
    Vector: [-0.03897114247083664,-0.01978648453950882,0.02615281194448471]
    
    Text: [I, wish, Java, could, use, case, classes] => 
    Vector: [-0.014466256169336182,-0.021408329850861003,-0.013912523431437356]
    
    Text: [Logistic, regression, models, are, neat] => 
    Vector: [0.05305456221103669,0.0614190086722374,0.0421554870903492]
    
    
    
  • 相关阅读:
    名词解释
    cwoa2011项目部署到tomcat服务器(环境搭建及项目的运行)
    上网过程与原理
    C-编译器的实现
    css
    HMTL
    FIRST DAY
    关于面试的吐槽
    我的老大去创业啦
    .Net Core下使用 RSA
  • 原文地址:https://www.cnblogs.com/chenxiangzhen/p/10710800.html
Copyright © 2020-2023  润新知