• scala_spark Practice 1


    /**
      *  In a Scala program, main(args: Array[String]) is the entry point for the business logic.
      *  import org.apache.spark.{SparkConf, SparkContext}
      *  val sparkConf = new SparkConf().setAppName(appName)
      *  val ssc = new StreamingContext(sparkConf, Seconds(batchNum))
      *  val sc = ssc.sparkContext  // if StreamingContext is not needed and only a SparkContext is required, create one directly: val sc = new SparkContext(sparkConf)
      *
      *  val sqlContext = new HiveContext(sc)  // HiveContext extends SQLContext; a plain one would be: val sqlContext = new SQLContext(sc)
      *  val result: DataFrame = sqlContext.sql(sql)
      *  // Since Spark 2.0, HiveContext and SQLContext can both be replaced by SparkSession: val result = SparkSession.builder().appName("test").config("key", "value").getOrCreate().sql(sql)
      *
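      *  A minimal self-contained sketch of the SparkSession path described above (assumption:
      *  Spark 2.x with spark-sql on the classpath; the app name and SQL text are illustrative):
      *
      *    import org.apache.spark.sql.{DataFrame, SparkSession}
      *
      *    val spark = SparkSession.builder()
      *      .appName("test")
      *      .enableHiveSupport()        // only needed when the SQL reads Hive tables
      *      .getOrCreate()
      *    val result: DataFrame = spark.sql("SELECT ip, name FROM some_db.some_table")
      *    val sc = spark.sparkContext   // the underlying SparkContext, if still required
      *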
      *  In real projects the query result is usually processed as JSON, e.g. to send to Kafka or for format conversion and filtering:
      *   val resultRdd = result.toJSON.rdd.map(x => {
              val json = new JSONObject(x)
              val computerIp = json.optString("ip", "")
              val rowKey = json.optString("name", "")
              ......
              val dataMap = new util.HashMap[String, String]()
              dataMap.put("computerip", computerIp)
              (rowKey, dataMap)
          })
       val hbaseRdd = resultRdd.filter(r => {
         r._1 != null && r._1.nonEmpty               // keep only rows with a non-empty rowKey
       }).map(line => {
         val put = new Put(Bytes.toBytes(line._1))   // build a Put keyed by the rowKey
         val keys = line._2.keySet().iterator()      // iterate over the keys of the dataMap
         while (keys.hasNext) {
           val k = keys.next()
           put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(k), Bytes.toBytes(line._2.get(k)))
         }
         (new ImmutableBytesWritable(), put)
       })
    
       val hadoopconf = sc.hadoopConfiguration
       val jobconf = new JobConf(hadoopconf)
       jobconf.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
       jobconf.setOutputValueClass(classOf[Result])
       jobconf.setClass("mapreduce.job.outputformat.class", classOf[TableOutputFormat[ImmutableBytesWritable]], classOf[OutputFormat[ImmutableBytesWritable, Mutation]])
       jobconf.set(TableOutputFormat.OUTPUT_TABLE, table)
    
       hbaseRdd.saveAsNewAPIHadoopDataset(jobconf)   // write the (rowKey, Put) pairs into HBase
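
       // A sketch of the imports the HBase write above relies on (assumption: hbase-client and
       // hbase-mapreduce jars matching the cluster version are on the classpath, and org.json is
       // the library providing JSONObject; adjust packages if the project uses a different one):
       import java.util
       import org.apache.hadoop.hbase.client.{Mutation, Put, Result}
       import org.apache.hadoop.hbase.io.ImmutableBytesWritable
       import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
       import org.apache.hadoop.hbase.util.Bytes
       import org.apache.hadoop.mapred.JobConf
       import org.apache.hadoop.mapreduce.OutputFormat
       import org.json.JSONObject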
      *-----------------------------------------------------------------------------------------------------------
      * class KafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable {
          lazy val producer = createProducer()
          def send(topic: String, value: String): Unit = {
            producer.send(new ProducerRecord(topic, value))
          }
        }
    
      object KafkaSink {
        def apply(config: java.util.Map[String, Object]): KafkaSink = {
          val f = () => {
            val producer = new KafkaProducer[String, String](config)
            producer
          }
          new KafkaSink(f)
        }
      }
      *val kafka = sc.broadcast(KafkaSink(Configs.kafka_props))
      *selectDatas.toJSON.rdd.foreach(x => {
          val json = new JSONObject(x)
          kafka.value.send(topic, json.toString)
      })
      *// send each record to the Kafka topic
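      *  KafkaSink wraps only a () => KafkaProducer factory, so the broadcast ships the lightweight
      *  function and each executor builds its own producer lazily on first send. A minimal sketch of
      *  the producer settings a map like Configs.kafka_props would need (assumption: kafka-clients
      *  with StringSerializer; broker addresses are illustrative):
      *
      *    import java.util
      *    val kafkaProps = new util.HashMap[String, Object]()
      *    kafkaProps.put("bootstrap.servers", "broker1:9092,broker2:9092")
      *    kafkaProps.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
      *    kafkaProps.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
      *    val kafka = sc.broadcast(KafkaSink(kafkaProps))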
      *-------------------------------------------------------------------
    * val kafkaStream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, kafka_param, topic, StorageLevel.MEMORY_AND_DISK_SER).map(_._2)
    * kafkaStream.foreachRDD(rdd => {
    *   rdd.foreach(data => {
    *     // consume each Kafka message here
    *   })
    * })
    */
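
    A fuller sketch of the consumption loop that the truncated snippet above starts (assumption:
    the receiver-based spark-streaming-kafka-0-8 API, where StringDecoder comes from kafka.serializer;
    the ZooKeeper address, group id and topic name are illustrative):

      import kafka.serializer.StringDecoder
      import org.apache.spark.storage.StorageLevel
      import org.apache.spark.streaming.kafka.KafkaUtils

      val kafka_param = Map("zookeeper.connect" -> "zk1:2181", "group.id" -> "demo-group")
      val topic = Map("demo-topic" -> 1)   // topic name -> number of receiver threads
      val kafkaStream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
        ssc, kafka_param, topic, StorageLevel.MEMORY_AND_DISK_SER).map(_._2)

      kafkaStream.foreachRDD(rdd => {
        rdd.foreach(data => {
          // each record is the message value (a JSON string in this project); parse or forward it here
          println(data)
        })
      })

      ssc.start()
      ssc.awaitTermination()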
    

      

    
    
  • Original post: https://www.cnblogs.com/shaozhiqi/p/12171904.html