• pyspark的安装配置


    1、搭建基本spark+Hadoop的本地环境

      https://blog.csdn.net/u011513853/article/details/52865076?tdsourcetag=s_pcqq_aiomsg

    2、下载 Spark，并安装与其版本相对应的 pyspark

      https://pypi.org/project/pyspark/2.3.0/#history

    3、单词统计测试

      a、python版本

    import os
    import shutil
    
    from pyspark import SparkContext
    
    # Word-count example: read a text file, count occurrences of each
    # space-separated token, print the counts and save them with Spark.
    inputpath = './data/wc.txt'
    # NOTE: saveAsTextFile creates a *directory* with this name containing
    # part-* files, even though the name ends in ".txt".
    outputpath = './data/out.txt'

    sc = SparkContext('local', 'wordcount')
    try:
        # Read the input file as an RDD of lines.
        # (Named `lines` rather than `input` to avoid shadowing the builtin.)
        lines = sc.textFile(inputpath)
        # Split every line into words.
        words = lines.flatMap(lambda line: line.split(' '))
        # Turn each word into a (word, 1) pair and sum the counts per word.
        counts = words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

        # Print the result.
        counts.foreach(print)

        # Remove any previous output; saveAsTextFile fails if the path exists.
        # The path may be a directory (earlier Spark run) or a plain file.
        if os.path.isdir(outputpath):
            shutil.rmtree(outputpath, ignore_errors=True)
        elif os.path.exists(outputpath):
            os.remove(outputpath)

        # Write the word counts to the output location.
        counts.saveAsTextFile(outputpath)
    finally:
        # Always release the Spark context, even if a step above failed.
        sc.stop()
    

      

      b、scala版本

    package com.wcount
    
    import java.io.{File, PrintWriter}
    
    import org.apache.spark.rdd.RDD
    import org.apache.spark.{SparkConf, SparkContext}
    
    /**
      * Word-count example: reads `./data/wc.txt`, counts the occurrences of each
      * space-separated token and writes the counts to `./data/out.txt`.
      */
    object ScalaWordCount {

      /**
        * Runs the word count in local mode.
        *
        * @param args command-line arguments; pass `--keep-ui` to keep the
        *             application (and its Spark UI) alive until Enter is pressed
        */
      def main(args: Array[String]): Unit = {
        /**
          * SparkConf: holds the parameters of the Spark application.
          *   setMaster: selects the run mode:
          *
          *       local: local mode, generally used for testing
          *       standalone: the resource scheduler bundled with a Spark cluster
          *       yarn: Hadoop
          *       mesos: resource scheduling framework
          *   setAppName: sets the application name
          */
        val conf = new SparkConf().setMaster("local").setAppName("workJob")
        /**
          * SparkContext: the context of the Spark application, the entry point
          * to the cluster.
          */
        val sc = new SparkContext(conf)

        try {
          val lines: RDD[String] = sc.textFile("./data/wc.txt")
          // The println calls inside the transformations are kept on purpose:
          // they show that nothing runs until an action is triggered.
          val words: RDD[String] = lines.flatMap { line =>
            println("flatmap...........")
            line.split(" ")
          }
          val pairs: RDD[(String, Int)] = words.map { word =>
            println("map............")
            (word, 1)
          }
          val result: RDD[(String, Int)] = pairs.reduceByKey(_ + _)

          // Bring the counts back to the driver before formatting them.
          // Mutating a driver-side variable from inside RDD.foreach does not
          // work: the closure operates on a serialized copy, so the driver
          // never sees the update.
          val entries: Array[String] = result.collect().map {
            case (word, count) => word + ":" + count + " "
          }

          // Write one entry per line; close the writer even if writing fails.
          val outWriter = new PrintWriter(new File("./data/out.txt"))
          try {
            entries.foreach { entry =>
              println(entry)
              outWriter.println(entry)
            }
          } finally {
            outWriter.close()
          }

          // Optional replacement for the former busy `while (true)` loop:
          // block on stdin so the Spark UI stays reachable without spinning a
          // CPU core, and so that sc.stop() is still reached afterwards.
          if (args.contains("--keep-ui")) {
            println("Press Enter to stop the Spark application...")
            scala.io.StdIn.readLine()
          }
        } finally {
          sc.stop()
        }
      }
    }
    

      

  • 相关阅读:
    xargs 指定参数位置
    Virtual Machine Kernel Panic : Not Syncing : VFS : Unable To Mount Root FS On Unknown-Block (0,0)
    Laravel Session() 失效的问题
    dirname
    github、git、idea
    ubuntu环境变量在/etc/environment
    ubuntu的PPA
    VMware 增加硬盘ubuntu
    比特币概念
    cpu查询
  • 原文地址:https://www.cnblogs.com/wuzaipei/p/10971878.html
Copyright © 2020-2023  润新知