• Spark: Reading and Writing HBase, Spark Streaming Operations, and Spark HBase-Related Operations


    (1-4) Original source: JasonLee's blog
    (5-6) Original source: Lu_Xiao_Yue
    (7) Original source: 修行修心

    1. Spark Streaming writing to HBase in real time (the saveAsNewAPIHadoopDataset method)

    import kafka.PropertiesScalaUtils
    import net.sf.json.JSONObject
    import org.apache.hadoop.hbase.client.{Put, Result}
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.hadoop.mapreduce.Job
    import org.apache.spark.SparkConf
    import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
    import spark.wordcount.kafkaStreams
     
    /**
      * Spark Streaming writing to HBase with the new API.
      */
    object sparkToHbase {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("Hbase Test")
        val scc = new StreamingContext(conf, Seconds(1))
        val sc = scc.sparkContext
        val tablename = "test"
        val mode = args(0).toString
        val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase",mode)
        val zk_port = PropertiesScalaUtils.loadProperties("zk_port",mode)
        val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master",mode)
        val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir",mode)
        val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent",mode)
        val topic = PropertiesScalaUtils.loadProperties("topic_combine",mode)
        val broker = PropertiesScalaUtils.loadProperties("broker",mode)
        sc.hadoopConfiguration.set("hbase.zookeeper.quorum",zk_hbase)
        sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", zk_port)
        sc.hadoopConfiguration.set("hbase.master", hbase_master)
        sc.hadoopConfiguration.set("hbase.defaults.for.version.skip", "true")
        sc.hadoopConfiguration.set("hhbase.rootdir", hbase_rootdir)
        sc.hadoopConfiguration.set("zookeeper.znode.parent", zookeeper_znode_parent)
        sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)
        val job = Job.getInstance(sc.hadoopConfiguration)
        job.setOutputKeyClass(classOf[ImmutableBytesWritable])
        job.setOutputValueClass(classOf[Result])
        job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
        val topicSet = Set(topic)
        val kafkaParams = Map[String, Object](
          "auto.offset.reset" -> "latest",   //latest;earliest
          "value.deserializer" -> classOf[StringDeserializer] //key,value的反序列化;
          , "key.deserializer" -> classOf[StringDeserializer]
          , "bootstrap.servers" -> broker
          , "group.id" -> "jason_test"
          , "enable.auto.commit" -> (true: java.lang.Boolean)
        )
        kafkaStreams = KafkaUtils.createDirectStream[String, String](
          scc,
          LocationStrategies.PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
        try {
          kafkaStreams.foreachRDD(rdd => {
            if(!rdd.isEmpty()){
              val save_rdd = rdd.map(x => {
                val json = JSONObject.fromObject(x.value())
                val put = new Put(Bytes.toBytes(json.get("rowkey").toString))
                insert_hb(json,put)
                (new ImmutableBytesWritable, put)
              })
              save_rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
            }
          })
        }catch {
          case e: Exception => println("Error while writing to HBase: " + e)
        }
        scc.start()
        scc.awaitTermination()
      }
      def insert_hb(json: JSONObject, onePut: Put): Unit = {
        val keys = json.keySet
        val iterator_redis = keys.iterator
        while (iterator_redis.hasNext) {
          val hb_col = iterator_redis.next().toString
          val col_value = json.get(hb_col).toString
          onePut.addColumn(Bytes.toBytes("f1"), Bytes.toBytes(hb_col), Bytes.toBytes(col_value))
        }
      }
    }
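
    The job above writes to the table "test" with column family "f1" and assumes the table already exists. Below is a minimal sketch (not part of the original post) of creating it with the HBase 1.x client API; the ZooKeeper quorum value is a placeholder you would replace:

    import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
    import org.apache.hadoop.hbase.client.ConnectionFactory
     
    object CreateHbaseTable {
      def main(args: Array[String]): Unit = {
        val conf = HBaseConfiguration.create()
        conf.set("hbase.zookeeper.quorum", "localhost") // placeholder: point this at your ZooKeeper quorum
        val connection = ConnectionFactory.createConnection(conf)
        val admin = connection.getAdmin
        val table = TableName.valueOf("test")
        if (!admin.tableExists(table)) {
          val desc = new HTableDescriptor(table)
          desc.addFamily(new HColumnDescriptor("f1")) // the column family used by insert_hb above
          admin.createTable(desc)
        }
        admin.close()
        connection.close()
      }
    }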
    

    2. Spark Streaming integrated with Kafka for exactly-once semantics

    Maintaining Kafka offsets manually

    To achieve exactly-once semantics, the application saves the offsets itself. They can be kept in ZooKeeper, Kafka, MySQL, HBase, or Redis, depending on your situation; here they are saved to Redis. Before creating the DStream, check whether the topic has been consumed before: if not, start from the beginning; if it has, resume from the last saved offset.

    Versions: Spark 2.2.0, Scala 2.11.8, Kafka 0.10.1, HBase 1.1.2.
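
    The helper RedisKeysListUtils.getKeysList used in the code below is project-specific and not shown in the original post. Here is a minimal sketch of what such a helper might look like, assuming offsets are stored under keys of the form "<topic>_<partition>" (the same format the code writes back after each batch) and that an open Jedis connection is passed in:

    import scala.collection.JavaConverters._
    import org.apache.kafka.common.TopicPartition
    import redis.clients.jedis.Jedis
     
    object RedisOffsetUtils {
      // Read the offsets saved for one topic from Redis.
      // Keys are assumed to be "<topic>_<partition>"; values hold the next offset to consume.
      def readOffsets(jedis: Jedis, topic: String): Map[TopicPartition, Long] = {
        jedis.keys(topic + "_*").asScala.map { key =>
          val partition = key.substring(key.lastIndexOf("_") + 1).toInt
          new TopicPartition(topic, partition) -> jedis.get(key).toLong
        }.toMap
      }
    }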

    package test
     
    import java.util
    import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
    import kafka.SparkStreamingKafka.{dbIndex, kafkaStreams}
    import org.apache.kafka.common.TopicPartition
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.HTable
    import redis.RedisPool
     
    object sparkstreaming {
      def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
        Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
        Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
        val conf = new SparkConf().setAppName("sparkstreaming")
        conf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        conf.set("spark.streaming.concurrentJobs", "10")
        conf.set("spark.streaming.kafka.maxRetries", "50")
        val scc = new StreamingContext(conf, Seconds(5))
        val topic = PropertiesScalaUtils.loadProperties("topic")
        val topicSet: Set[String] = Set(topic)
        val kafkaParams = Map[String, Object](
          "auto.offset.reset" -> "latest",
          "value.deserializer" -> classOf[StringDeserializer]
          , "key.deserializer" -> classOf[StringDeserializer]
          , "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
          , "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
          , "enable.auto.commit" -> (false: java.lang.Boolean)
        )
        val maxTotal = 200
        val maxIdle = 100
        val minIdle = 10
        val testOnBorrow = false
        val testOnReturn = false
        val maxWaitMillis = 500
        RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
        val jedis = RedisPool.getPool.getResource
        jedis.select(dbIndex)
        val keys: util.Set[String] = jedis.keys(topic + "*")
        if (keys.size() == 0) {
          kafkaStreams = KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
        } else {
          val fromOffsets: Map[TopicPartition, Long] = RedisKeysListUtils.getKeysList(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, topic)
          kafkaStreams = KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams, fromOffsets))
        }
        RedisPool.getPool.returnResource(jedis)
        kafkaStreams.foreachRDD(rdd => {
          if (!rdd.isEmpty()) {
            val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
            rdd.foreachPartition(partition => {
              val conf = HBaseConfiguration.create()
              conf.set("hbase.zookeeper.quorum", PropertiesScalaUtils.loadProperties("zk_hbase")) // ZooKeeper quorum
              conf.set("hbase.zookeeper.property.clientPort", PropertiesScalaUtils.loadProperties("zk_port"))
              conf.set("hbase.master", PropertiesScalaUtils.loadProperties("hbase_master"))
              conf.set("hbase.defaults.for.version.skip", "true")
              conf.set("hbase.rootdir", PropertiesScalaUtils.loadProperties("hbase_rootdir"))
              conf.set("zookeeper.znode.parent", PropertiesScalaUtils.loadProperties("zookeeper_znode_parent"))
              val myTable = new HTable(conf, TableName.valueOf(PropertiesScalaUtils.loadProperties("hbase_table")))
              myTable.setAutoFlush(false, false) // disable auto flush; writes are buffered until flushCommits
              myTable.setWriteBufferSize(3 * 1024 * 1024)
              partition.foreach(pair => {
                // your processing logic here: build Puts from pair and call myTable.put(...)
              })
              myTable.flushCommits()
              myTable.close()
              // save this batch's offsets to Redis only after the data has been written
              val jedis_jason = RedisPool.getPool.getResource
              jedis_jason.select(dbIndex)
              offsetRanges.foreach { offsetRange =>
                println("partition : " + offsetRange.partition + " fromOffset:  " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
                val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
                jedis_jason.set(topic_partition_key_new, offsetRange.untilOffset + "")
              }
              jedis_jason.close()
            })
          }
        })
        scc.start()
        scc.awaitTermination()
      }
    }
     
    

    3. Spark Streaming consuming multiple topics at once with exactly-once semantics

    The offsets are stored in Redis here; they could just as well be kept in ZooKeeper, Kafka, MySQL, or HBase.

    Three topics are used, each with 5 partitions.
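
    Since Kafka itself is one of the possible offset stores mentioned above, here is a minimal sketch (not the author's approach) of committing each processed batch's offset ranges back to Kafka with the 0.10 direct API instead of writing them to Redis:

    import org.apache.kafka.clients.consumer.ConsumerRecord
    import org.apache.spark.streaming.dstream.InputDStream
    import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}
     
    object KafkaOffsetCommit {
      // Commit each batch's offset ranges back to Kafka's own offset storage.
      // commitAsync is asynchronous, so call it only after the batch's output
      // has been written successfully.
      def commitToKafka(stream: InputDStream[ConsumerRecord[String, String]]): Unit = {
        stream.foreachRDD { rdd =>
          if (!rdd.isEmpty()) {
            val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
            // ... process the batch here ...
            stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
          }
        }
      }
    }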

    package spark
     
    import java.io.File
    import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
    import kafka.streamingRedisHive.{dbIndex}
    import org.apache.kafka.clients.consumer.ConsumerRecord
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.TaskContext
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.streaming.dstream.InputDStream
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka010._
    import redis.RedisPool
     
    object moreTopic {
      def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
        Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
        Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
        val warehouseLocation = new File("hdfs://cluster/hive/warehouse").getAbsolutePath
        val spark = SparkSession.builder().appName("Spark Jason").config("spark.sql.warehouse.dir",    warehouseLocation).enableHiveSupport().getOrCreate()
        spark.conf.set("spark.streaming.concurrentJobs", 10)
        spark.conf.set("spark.streaming.kafka.maxRetries", 50)
        spark.conf.set("spark.streaming.stopGracefullyOnShutdown",true)
        spark.conf.set("spark.streaming.backpressure.enabled",true)
        spark.conf.set("spark.streaming.backpressure.initialRate",5000)
        spark.conf.set("spark.streaming.kafka.maxRatePerPartition", 3000)
        @transient
        val sc = spark.sparkContext
        val scc = new StreamingContext(sc, Seconds(2))
        val kafkaParams = Map[String, Object](
          "auto.offset.reset" -> "latest",
          "value.deserializer" -> classOf[StringDeserializer]
          , "key.deserializer" -> classOf[StringDeserializer]
          , "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
          , "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
          , "enable.auto.commit" -> (false: java.lang.Boolean)
        )
        var stream: InputDStream[ConsumerRecord[String, String]] = null
        val topics = Array("jason_20180519", "jason_0606","jason_test")
        val maxTotal = 200
        val maxIdle = 100
        val minIdle = 10
        val testOnBorrow = false
        val testOnReturn = false
        val maxWaitMillis = 5000
        RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
        val jedis = RedisPool.getPool.getResource
        jedis.select(dbIndex)
        val keys = jedis.keys(topics(0) + "*")
        val keys_2 = jedis.keys(topics(1) +"*")
        val keys_3 = jedis.keys(topics(2) +"*")
        if(keys.size() == 0 && keys_2.size() == 0 && keys_3.size() == 0){
          println("第一次启动,从头开始消费数据-----------------------------------------------------------")
          stream = KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
          )
        }else{
          println("不是第一次启动,从上次的offest开始消费数据-----------------------------------------------")
          stream = KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, RedisKeysListUtils.getRedisOffest(topics,jedis)))
        }
        jedis.close()
        stream.foreachRDD(rdd=>{
          if (!rdd.isEmpty()) {
          val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
          rdd.foreachPartition(partition=>{
            val o = offsetRanges(TaskContext.get.partitionId)
            println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
            val jedis_jason = RedisPool.getPool.getResource
            jedis_jason.select(dbIndex)
            partition.foreach(pair=>{
              // your processing logic here
            })
            offsetRanges.foreach { offsetRange =>
              println("partition : " + offsetRange.partition + " fromOffset:  " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
              val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
              jedis_jason.set(topic_partition_key_new, offsetRange.untilOffset + "")
            }
            jedis_jason.close()
          })
         }
        })
        scc.start()
        scc.awaitTermination()
      }
    }
    

    4. Spark reading HBase data (via newAPIHadoopRDD)

    package hbase
     
    import org.apache.hadoop.hbase.HBaseConfiguration
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.log4j.{Level, Logger}
    import util.PropertiesScalaUtils
    import org.apache.spark.sql.SparkSession
     
    /**
      * Spark reading data from HBase.
      */
    object ReadHbase {
      def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
        val spark = SparkSession
          .builder
          .appName("read hbase")
          .master("local[4]")
          .config("spark.some.config.option", "config-value")
          .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
          .getOrCreate
        val sc = spark.sparkContext
        val mode = "local"
        val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase",mode)
        val zk_port = PropertiesScalaUtils.loadProperties("zk_port",mode)
        val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master",mode)
        val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir",mode)
        val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent",mode)
        val hbase_table = PropertiesScalaUtils.loadProperties("hbase_table",mode)
     
        val conf = HBaseConfiguration.create()
        conf.set("hbase.zookeeper.quorum", zk_hbase)
        conf.set("hbase.zookeeper.property.clientPort", zk_port)
        conf.set("hbase.master", hbase_master)
        conf.set("hbase.defaults.for.version.skip", "true")
        conf.set("hhbase.rootdir", hbase_rootdir)
        conf.set("zookeeper.znode.parent", zookeeper_znode_parent)
        conf.set(TableInputFormat.INPUT_TABLE, "cbd:prod_base")
        val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
          classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
          classOf[org.apache.hadoop.hbase.client.Result])
        hbaseRDD.sample(false, 0.1).foreachPartition(fp => {
          fp.foreach(f=>{
            val rowkey = Bytes.toString(f._2.getRow)
            val InsertTime = Bytes.toString(f._2.getValue("cf1".getBytes,"InsertTime".getBytes))
            val VipPrice = Bytes.toString(f._2.getValue("cf1".getBytes,"VipPrice".getBytes))
            println(s"Row key:$rowkey InsertTime:$InsertTime VipPrice:$VipPrice")
          })
        })
        println("元素的个数:"+hbaseRDD.count())
        sc.stop()
      }
    }
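
    Instead of sampling 10% of the rows on the Spark side, the scan handed to TableInputFormat can be narrowed before the RDD is created. Below is a minimal sketch using TableInputFormat's standard scan properties; the row-key range and quorum are placeholders, while the table and columns are taken from the code above:

    import org.apache.hadoop.hbase.HBaseConfiguration
    import org.apache.hadoop.hbase.client.Result
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat
    import org.apache.spark.sql.SparkSession
     
    object ReadHbaseRange {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("read hbase range").master("local[4]").getOrCreate
        val sc = spark.sparkContext
        val conf = HBaseConfiguration.create()
        conf.set("hbase.zookeeper.quorum", "localhost")                        // placeholder quorum
        conf.set(TableInputFormat.INPUT_TABLE, "cbd:prod_base")
        conf.set(TableInputFormat.SCAN_ROW_START, "row-000")                   // placeholder start row key (inclusive)
        conf.set(TableInputFormat.SCAN_ROW_STOP, "row-100")                    // placeholder stop row key (exclusive)
        conf.set(TableInputFormat.SCAN_COLUMNS, "cf1:InsertTime cf1:VipPrice") // only scan these columns
        val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
          classOf[ImmutableBytesWritable],
          classOf[Result])
        println("Rows in range: " + hbaseRDD.count())
        sc.stop()
      }
    }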
    

    5. Spark reading data from HBase

    import org.apache.hadoop.hbase.HBaseConfiguration
    import org.apache.hadoop.hbase.client.Result
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.spark.rdd.RDD
    import org.apache.spark.{SparkConf, SparkContext}
    
    object HbaseRdd1 {
    
      def main(args: Array[String]): Unit = {
        val conf = HBaseConfiguration.create()
        val sc = new SparkContext(new SparkConf())
        // set the table to query
        conf.set(TableInputFormat.INPUT_TABLE, "student")
        // read the HBase table as an RDD of (ImmutableBytesWritable, Result)
        val stuRDD: RDD[(ImmutableBytesWritable, Result)] = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
          classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
          classOf[org.apache.hadoop.hbase.client.Result])
    
        val count = stuRDD.count()
        println("Students RDD Count:" + count)
        stuRDD.cache()
    
        // iterate over the results and print them
        stuRDD.foreach({ case (_,result) =>
          val key = Bytes.toString(result.getRow)
          val name = Bytes.toString(result.getValue("info".getBytes,"name".getBytes))
          val gender = Bytes.toString(result.getValue("info".getBytes,"gender".getBytes))
          val age = Bytes.toString(result.getValue("info".getBytes,"age".getBytes))
          println("Row key:"+key+" Name:"+name+" Gender:"+gender+" Age:"+age)
        })
      }
    }
    

    6. Spark writing data to HBase

    import org.apache.hadoop.hbase.HBaseConfiguration  
    import org.apache.hadoop.hbase.mapreduce.TableOutputFormat  
    import org.apache.spark._  
    import org.apache.hadoop.mapreduce.Job  
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable  
    import org.apache.hadoop.hbase.client.Result  
    import org.apache.hadoop.hbase.client.Put  
    import org.apache.hadoop.hbase.util.Bytes  
    
    object SparkWriteHBase {  
    
      def main(args: Array[String]): Unit = {  
        val sparkConf = new SparkConf().setAppName("SparkWriteHBase").setMaster("local")  
        val sc = new SparkContext(sparkConf)        
        val tablename = "student"        
        sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)  
    
        val job = Job.getInstance(sc.hadoopConfiguration)
        job.setOutputKeyClass(classOf[ImmutableBytesWritable])  
        job.setOutputValueClass(classOf[Result])    
        job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])    
    
        val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26","4,Guanhua,M,27")) // build two records
        val rdd = indataRDD.map(_.split(',')).map{arr=>{
          val put = new Put(Bytes.toBytes(arr(0))) // row key
          put.addColumn(Bytes.toBytes("info"),Bytes.toBytes("name"),Bytes.toBytes(arr(1)))   // value of the info:name column
          put.addColumn(Bytes.toBytes("info"),Bytes.toBytes("gender"),Bytes.toBytes(arr(2))) // value of the info:gender column
          put.addColumn(Bytes.toBytes("info"),Bytes.toBytes("age"),Bytes.toBytes(arr(3).toInt)) // value of the info:age column (stored as a 4-byte int)
          (new ImmutableBytesWritable, put)   
        }}        
        rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())  
      }    
    }  
    

    7. Two ways for Spark to read and write HBase (RDD and DataFrame)

    7.1 Writing data with saveAsHadoopDataset

    import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
    import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat
    //import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
    import org.apache.hadoop.hbase.mapred.TableOutputFormat
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.hadoop.mapred.JobConf
    //import org.apache.hadoop.mapreduce.Job
    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.sql.SparkSession
    
    /**
      * Created by blockchain on 2018-09-09 at 3:45 PM in Beijing.
      */
    
    object SparkHBaseRDD {
      def main(args: Array[String]) {
        // keep unnecessary log output off the console
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    
        val spark = SparkSession.builder().appName("SparkHBaseRDD").getOrCreate()
        val sc = spark.sparkContext
    
        val tablename = "SparkHBase"
    
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum","localhost")  //设置zooKeeper集群地址,也可以通过将hbase-site.xml导入classpath,但是建议在程序里这样设置
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")       //设置zookeeper连接端口,默认2181
        hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)
    
        // initialize the JobConf; this TableOutputFormat is from the org.apache.hadoop.hbase.mapred package
        val jobConf = new JobConf(hbaseConf)
        jobConf.setOutputFormat(classOf[TableOutputFormat])
    
        val indataRDD = sc.makeRDD(Array("2,jack,16", "1,Lucy,15", "5,mike,17", "3,Lily,14"))
    
        val rdd = indataRDD.map(_.split(',')).map{ arr=>
          /* A Put object is one row; the row key is passed to the constructor.
           * Every value must be converted with org.apache.hadoop.hbase.util.Bytes.toBytes.
           * Put.addColumn takes three arguments: column family, column qualifier, value. */
          val put = new Put(Bytes.toBytes(arr(0)))
          put.addColumn(Bytes.toBytes("cf1"),Bytes.toBytes("name"),Bytes.toBytes(arr(1)))
          put.addColumn(Bytes.toBytes("cf1"),Bytes.toBytes("age"),Bytes.toBytes(arr(2)))
          (new ImmutableBytesWritable, put)
        }
        rdd.saveAsHadoopDataset(jobConf)
    
        spark.stop()
      }
    }
    
    

    7.2 Reading data with newAPIHadoopRDD

    import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
    import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat
    //import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
    import org.apache.hadoop.hbase.mapred.TableOutputFormat
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.hadoop.mapred.JobConf
    //import org.apache.hadoop.mapreduce.Job
    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.sql.SparkSession
    
    /**
      * Created by blockchain on 2018-09-09 at 3:45 PM in Beijing.
      */
    
    object SparkHBaseRDD {
      def main(args: Array[String]) {
        // keep unnecessary log output off the console
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    
        val spark = SparkSession.builder().appName("SparkHBaseRDD").getOrCreate()
        val sc = spark.sparkContext
    
        val tablename = "SparkHBase"
    
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum","localhost")  //设置zooKeeper集群地址,也可以通过将hbase-site.xml导入classpath,但是建议在程序里这样设置
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")       //设置zookeeper连接端口,默认2181
        hbaseConf.set(TableInputFormat.INPUT_TABLE, tablename)
        
        // create the table if it does not exist
        val admin = new HBaseAdmin(hbaseConf)
        if (!admin.isTableAvailable(tablename)) {
          val tableDesc = new HTableDescriptor(TableName.valueOf(tablename))
          admin.createTable(tableDesc)
        }
    
        // read the data into an RDD; this TableInputFormat is from the org.apache.hadoop.hbase.mapreduce package
        val hBaseRDD = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat],
          classOf[ImmutableBytesWritable],
          classOf[Result])
    
        hBaseRDD.foreach{ case (_ ,result) =>
          // row key
          val key = Bytes.toString(result.getRow)
          // get cell values by column family and qualifier
          val name = Bytes.toString(result.getValue("cf1".getBytes,"name".getBytes))
          val age = Bytes.toString(result.getValue("cf1".getBytes,"age".getBytes))
          println("Row key:" + key + "\tcf1.Name:" + name + "\tcf1.Age:" + age)
        }
        admin.close()
    
        spark.stop()
      }
    }
    
    

    7.3 Spark DataFrame reading and writing HBase through Phoenix

    Add the dependencies:

    <dependency>
       <groupId>org.apache.phoenix</groupId>
       <artifactId>phoenix-core</artifactId>
       <version>${phoenix.version}</version>
    </dependency>
    
    <dependency>
      <groupId>org.apache.phoenix</groupId>
      <artifactId>phoenix-spark</artifactId>
      <version>${phoenix.version}</version>
    </dependency>
    
    

    Code:

    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.sql.{SaveMode, SparkSession}
    
    /**
      * Created by blockchain on 2018-09-09 at 8:33 PM in Beijing.
      */
    
    object SparkHBaseDataFrame {
      def main(args: Array[String]) {
        // keep unnecessary log output off the console
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    
        val spark = SparkSession.builder().appName("SparkHBaseDataFrame").getOrCreate()
    
        val url = s"jdbc:phoenix:localhost:2181"
        val dbtable = "PHOENIXTEST"
    
        // first way for Spark to read Phoenix into a DataFrame: plain JDBC
        val rdf = spark.read
          .format("jdbc")
          .option("driver", "org.apache.phoenix.jdbc.PhoenixDriver")
          .option("url", url)
          .option("dbtable", dbtable)
          .load()
        rdf.printSchema()
    
        // second way: the phoenix-spark data source
        val df = spark.read
          .format("org.apache.phoenix.spark")
          .options(Map("table" -> dbtable, "zkUrl" -> url))
          .load()
        df.printSchema()
    
        // write the Spark DataFrame to Phoenix; the target table must already exist
        df.write
          .format("org.apache.phoenix.spark")
          .mode(SaveMode.Overwrite)
          .options(Map("table" -> "PHOENIXTESTCOPY", "zkUrl" -> url))
          .save()
    
        spark.stop()
      }
    }
    
    
  • Original source: https://www.cnblogs.com/aixing/p/13327352.html