Spark-读写HBase,SparkStreaming操作,Spark的HBase相关操作
(1-4)原文地址:JasonLee’s blog
(5-6)原文地址:Lu_Xiao_Yue
(7)原文地址:修行修心
1.sparkstreaming实时写入Hbase(saveAsNewAPIHadoopDataset方法)
import kafka.PropertiesScalaUtils
import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import spark.wordcount.kafkaStreams
/**
 * Spark Streaming job that reads JSON records from Kafka and writes each one
 * to HBase as a `Put` through the new Hadoop API (`saveAsNewAPIHadoopDataset`).
 */
object sparkToHbase {

  /**
   * Builds the streaming pipeline and blocks until the context terminates.
   *
   * @param args `args(0)` is the mode handed to `PropertiesScalaUtils.loadProperties`
   * @throws IllegalArgumentException if no mode argument is supplied
   */
  def main(args: Array[String]): Unit = {
    require(args.nonEmpty, "missing argument: mode")

    val conf = new SparkConf().setAppName("Hbase Test")
    val scc = new StreamingContext(conf, Seconds(1))
    val sc = scc.sparkContext
    val tablename = "test"
    val mode = args(0)

    val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase", mode)
    val zk_port = PropertiesScalaUtils.loadProperties("zk_port", mode)
    val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master", mode)
    val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir", mode)
    val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent", mode)
    val topic = PropertiesScalaUtils.loadProperties("topic_combine", mode)
    val broker = PropertiesScalaUtils.loadProperties("broker", mode)

    val hadoopConf = sc.hadoopConfiguration
    hadoopConf.set("hbase.zookeeper.quorum", zk_hbase)
    hadoopConf.set("hbase.zookeeper.property.clientPort", zk_port)
    hadoopConf.set("hbase.master", hbase_master)
    hadoopConf.set("hbase.defaults.for.version.skip", "true")
    // The key used to be misspelled "hhbase.rootdir", so the value was never picked up.
    hadoopConf.set("hbase.rootdir", hbase_rootdir)
    hadoopConf.set("zookeeper.znode.parent", zookeeper_znode_parent)
    hadoopConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    val job = Job.getInstance(hadoopConf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Result])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    val topicSet = Set(topic)
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest", // "latest" or "earliest"
      "value.deserializer" -> classOf[StringDeserializer], // key/value deserializers
      "key.deserializer" -> classOf[StringDeserializer],
      "bootstrap.servers" -> broker,
      "group.id" -> "jason_test",
      "enable.auto.commit" -> (true: java.lang.Boolean)
    )

    // `kafkaStreams` is the variable imported from `spark.wordcount`.
    kafkaStreams = KafkaUtils.createDirectStream[String, String](
      scc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))

    try {
      kafkaStreams.foreachRDD { rdd =>
        if (!rdd.isEmpty()) {
          val save_rdd = rdd.map { record =>
            val json = JSONObject.fromObject(record.value())
            val put = new Put(Bytes.toBytes(json.get("rowkey").toString))
            insert_hb(json, put)
            (new ImmutableBytesWritable, put)
          }
          save_rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
        }
      }
    } catch {
      case e: Exception =>
        println("报错了")
        // Report the cause instead of hiding it behind the fixed message.
        e.printStackTrace()
    }

    scc.start()
    scc.awaitTermination()
  }

  /**
   * Copies every key of `json` into column family `f1` of `onePut`, using the
   * key as the column qualifier and the value's `toString` as the cell value.
   *
   * @param json   parsed record whose keys become column qualifiers
   * @param onePut the `Put` that receives one column per key
   */
  def insert_hb(json: JSONObject, onePut: Put): Unit = {
    val family = Bytes.toBytes("f1")
    val keyIterator = json.keySet.iterator
    while (keyIterator.hasNext) {
      val hb_col = keyIterator.next().toString
      val col_value = json.get(hb_col).toString
      onePut.addColumn(family, Bytes.toBytes(hb_col), Bytes.toBytes(col_value))
    }
  }
}
2.sparkstreaming整合kafka实现exactly-once语义
手动维护kafka的offset
为了实现exactly-once的语义,我采用自己保存offset的方法。offset可以保存在zk、kafka、mysql、hbase、redis中,根据自己的情况而定,我选择把offset保存到redis中。创建DStream之前,先判断是否消费过:如果没有消费过,就按auto.offset.reset的配置开始消费(下面的示例配置为latest,如需从头消费应改为earliest);如果已经消费过了,就从上次保存的offset处开始消费。
spark版本2.2.0,scala版本2.11.8,kafka版本0.10.1,hbase版本1.1.2。
package test
import java.util
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.SparkStreamingKafka.{dbIndex, kafkaStreams}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import redis.RedisPool
/**
 * Spark Streaming job that consumes one Kafka topic, opens an HBase table per
 * partition for the user's processing logic, and keeps the consumed Kafka
 * offsets in Redis so that a restart resumes from the stored position.
 */
object sparkstreaming {

  /**
   * Builds the stream (resuming from the offsets stored in Redis when any
   * exist for the topic), registers the per-batch processing and blocks until
   * the streaming context terminates.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    // Names used below that the surrounding import list does not provide.
    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.HTable
    import org.apache.spark.TaskContext
    import org.apache.spark.streaming.kafka010.HasOffsetRanges

    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)

    val conf = new SparkConf().setAppName("sparkstreaming")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.streaming.concurrentJobs", "10")
    conf.set("spark.streaming.kafka.maxRetries", "50")
    val scc = new StreamingContext(conf, Seconds(5))

    val topic = PropertiesScalaUtils.loadProperties("topic")
    val topicSet: Set[String] = Set(topic)
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest",
      "value.deserializer" -> classOf[StringDeserializer],
      "key.deserializer" -> classOf[StringDeserializer],
      "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker"),
      "group.id" -> PropertiesScalaUtils.loadProperties("groupId"),
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val maxTotal = 200
    val maxIdle = 100
    val minIdle = 10
    val testOnBorrow = false
    val testOnReturn = false
    val maxWaitMillis = 500
    val redisHost = PropertiesScalaUtils.loadProperties("redisHost")
    val redisPort = PropertiesScalaUtils.loadProperties("redisPort").toInt
    RedisPool.makePool(redisHost, redisPort, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)

    val jedis = RedisPool.getPool.getResource
    try {
      jedis.select(dbIndex)
      val keys = jedis.keys(topic + "*")
      // `kafkaStreams` is the variable imported from `kafka.SparkStreamingKafka`.
      kafkaStreams =
        if (keys.isEmpty) {
          // No stored offsets for this topic: start according to auto.offset.reset.
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
        } else {
          // Resume from the offsets previously stored in Redis.
          val fromOffsets: Map[TopicPartition, Long] =
            RedisKeysListUtils.getKeysList(redisHost, redisPort, topic)
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams, fromOffsets))
        }
    } finally {
      // Hand the connection back even when the lookup fails.
      RedisPool.getPool.returnResource(jedis)
    }

    kafkaStreams.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd.foreachPartition { partition =>
          val hbaseConf = HBaseConfiguration.create()
          hbaseConf.set("hbase.zookeeper.quorum", PropertiesScalaUtils.loadProperties("zk_hbase")) // ZooKeeper address
          hbaseConf.set("hbase.zookeeper.property.clientPort", PropertiesScalaUtils.loadProperties("zk_port"))
          hbaseConf.set("hbase.master", PropertiesScalaUtils.loadProperties("hbase_master"))
          hbaseConf.set("hbase.defaults.for.version.skip", "true")
          // The key used to be misspelled "hhbase.rootdir".
          hbaseConf.set("hbase.rootdir", PropertiesScalaUtils.loadProperties("hbase_rootdir"))
          hbaseConf.set("zookeeper.znode.parent", PropertiesScalaUtils.loadProperties("zookeeper_znode_parent"))

          val myTable = new HTable(hbaseConf, TableName.valueOf(PropertiesScalaUtils.loadProperties("hbase_table")))
          try {
            myTable.setAutoFlush(false, false) // disable auto flush; writes are flushed explicitly below
            myTable.setWriteBufferSize(3 * 1024 * 1024)
            partition.foreach { pair =>
              // Application-specific processing goes here.
            }
            myTable.flushCommits()
          } finally {
            myTable.close()
          }

          // Store only the range this task processed, and only after its
          // output has been flushed, so a failed partition is re-read on restart.
          val offsetRange = offsetRanges(TaskContext.get.partitionId)
          println("partition : " + offsetRange.partition + " fromOffset: " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
          val jedis_jason = RedisPool.getPool.getResource
          try {
            jedis_jason.select(dbIndex)
            val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
            jedis_jason.set(topic_partition_key_new, offsetRange.untilOffset + "")
          } finally {
            RedisPool.getPool.returnResource(jedis_jason)
          }
        }
      }
    }

    scc.start()
    scc.awaitTermination()
  }
}
3.sparkstreaming同时消费多个topic的数据实现exactly-once的语义
offset存到redis里了,当然也可以保存在zk、kafka、mysql、hbase中。
用了3个topic,每个topic5个partition。
package spark
import java.io.File
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.streamingRedisHive.{dbIndex}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import redis.RedisPool
/**
 * Spark Streaming job that consumes several Kafka topics with one direct
 * stream and keeps the consumed offsets of every topic-partition in Redis.
 */
object moreTopic {

  /**
   * Creates the session and stream (resuming from Redis offsets when any of
   * the topics has stored keys), registers the per-batch processing and blocks
   * until the streaming context terminates.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)

    // Pass the HDFS URI through unchanged: java.io.File would treat it as a
    // local relative path and prefix it with the working directory.
    val warehouseLocation = "hdfs://cluster/hive/warehouse"

    // Streaming settings are supplied through the builder so that they are
    // part of the SparkConf the StreamingContext is created from; setting
    // them on `spark.conf` after the session exists is too late.
    val spark = SparkSession.builder()
      .appName("Spark Jason")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .config("spark.streaming.concurrentJobs", "10")
      .config("spark.streaming.kafka.maxRetries", "50")
      .config("spark.streaming.stopGracefullyOnShutdown", "true")
      .config("spark.streaming.backpressure.enabled", "true")
      .config("spark.streaming.backpressure.initialRate", "5000")
      .config("spark.streaming.kafka.maxRatePerPartition", "3000")
      .enableHiveSupport()
      .getOrCreate()
    val sc = spark.sparkContext
    val scc = new StreamingContext(sc, Seconds(2))

    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest",
      "value.deserializer" -> classOf[StringDeserializer],
      "key.deserializer" -> classOf[StringDeserializer],
      "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker"),
      "group.id" -> PropertiesScalaUtils.loadProperties("groupId"),
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("jason_20180519", "jason_0606", "jason_test")

    val maxTotal = 200
    val maxIdle = 100
    val minIdle = 10
    val testOnBorrow = false
    val testOnReturn = false
    val maxWaitMillis = 5000
    RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)

    val jedis = RedisPool.getPool.getResource
    val stream: InputDStream[ConsumerRecord[String, String]] =
      try {
        jedis.select(dbIndex)
        // First start means that none of the topics has an offset key in Redis.
        val noStoredOffsets = topics.forall(t => jedis.keys(t + "*").size() == 0)
        if (noStoredOffsets) {
          println("第一次启动,从头开始消费数据-----------------------------------------------------------")
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
        } else {
          println("不是第一次启动,从上次的offest开始消费数据-----------------------------------------------")
          KafkaUtils.createDirectStream[String, String](
            scc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, RedisKeysListUtils.getRedisOffest(topics, jedis)))
        }
      } finally {
        // Release the connection even when the offset lookup fails.
        jedis.close()
      }

    stream.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd.foreachPartition { partition =>
          // The offset range that belongs to the partition this task processes.
          val o = offsetRanges(TaskContext.get.partitionId)
          println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
          val jedis_jason = RedisPool.getPool.getResource
          try {
            jedis_jason.select(dbIndex)
            partition.foreach { pair =>
              // Application-specific processing goes here.
            }
            // Store only this task's own range. Writing every range from every
            // task would advance the offsets of partitions that may still fail.
            println("partition : " + o.partition + " fromOffset: " + o.fromOffset + " untilOffset: " + o.untilOffset)
            val topic_partition_key_new = o.topic + "_" + o.partition
            jedis_jason.set(topic_partition_key_new, o.untilOffset + "")
          } finally {
            jedis_jason.close()
          }
        }
      }
    }

    scc.start()
    scc.awaitTermination()
  }
}
4.spark读取hbase数据(newAPIHadoopRDD方式)
package hbase
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.log4j.{Level, Logger}
import util.PropertiesScalaUtils
import org.apache.spark.sql.SparkSession
/**
 * Reads an HBase table into an RDD with `newAPIHadoopRDD`, prints a 10% sample
 * of its rows and then prints the total row count.
 */
object ReadHbase {

  /**
   * Runs the read job against a local Spark master.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val spark = SparkSession
      .builder
      .appName("read hbase")
      .master("local[4]")
      .config("spark.some.config.option", "config-value")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate
    val sc = spark.sparkContext

    try {
      val mode = "local"
      val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase", mode)
      val zk_port = PropertiesScalaUtils.loadProperties("zk_port", mode)
      val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master", mode)
      val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir", mode)
      val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent", mode)
      // NOTE(review): loaded but not used; the table name below is hard-coded.
      val hbase_table = PropertiesScalaUtils.loadProperties("hbase_table", mode)

      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", zk_hbase)
      conf.set("hbase.zookeeper.property.clientPort", zk_port)
      conf.set("hbase.master", hbase_master)
      conf.set("hbase.defaults.for.version.skip", "true")
      // The key used to be misspelled "hhbase.rootdir".
      conf.set("hbase.rootdir", hbase_rootdir)
      conf.set("zookeeper.znode.parent", zookeeper_znode_parent)
      conf.set(TableInputFormat.INPUT_TABLE, "cbd:prod_base")

      val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])

      hbaseRDD.sample(false, 0.1).foreachPartition { rows =>
        // Bytes.toBytes always encodes as UTF-8, unlike String.getBytes.
        val family = Bytes.toBytes("cf1")
        val insertTimeCol = Bytes.toBytes("InsertTime")
        val vipPriceCol = Bytes.toBytes("VipPrice")
        rows.foreach { case (_, result) =>
          val rowkey = Bytes.toString(result.getRow)
          val insertTime = Bytes.toString(result.getValue(family, insertTimeCol))
          val vipPrice = Bytes.toString(result.getValue(family, vipPriceCol))
          println(s"Row key:$rowkey InsertTime:$insertTime VipPrice:$vipPrice")
        }
      }
      println("元素的个数:" + hbaseRDD.count())
    } finally {
      // Stop the context even when the read fails.
      sc.stop()
    }
  }
}
5.spark读取hbase中的数据
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Reads the HBase table "student" into an RDD, prints the row count and then
 * prints the `info:name`, `info:gender` and `info:age` columns of every row.
 */
object HbaseRdd1 {

  /**
   * Runs the read job.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    val sc = new SparkContext(new SparkConf())
    try {
      // Name of the table to scan.
      conf.set(TableInputFormat.INPUT_TABLE, "student")

      val stuRDD: RDD[(ImmutableBytesWritable, Result)] = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])

      // Mark the RDD for caching before the first action; otherwise the count
      // and the traversal below would each scan the table.
      stuRDD.cache()
      val count = stuRDD.count()
      println("Students RDD Count:" + count)

      // Print every row.
      stuRDD.foreach { case (_, result) =>
        // Bytes.toBytes always encodes as UTF-8, unlike String.getBytes.
        val family = Bytes.toBytes("info")
        val key = Bytes.toString(result.getRow)
        val name = Bytes.toString(result.getValue(family, Bytes.toBytes("name")))
        val gender = Bytes.toString(result.getValue(family, Bytes.toBytes("gender")))
        val age = Bytes.toString(result.getValue(family, Bytes.toBytes("age")))
        println("Row key:" + key + " Name:" + name + " Gender:" + gender + " Age:" + age)
      }
    } finally {
      // The context was never stopped before; release it on every path.
      sc.stop()
    }
  }
}
6.spark将数据写入到hbase
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.spark._
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
/**
 * Writes two sample rows into the HBase table "student" through
 * `saveAsNewAPIHadoopDataset` and the `mapreduce` TableOutputFormat.
 */
object SparkWriteHBase {

  /**
   * Runs the write job against a local Spark master.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkWriteHBase").setMaster("local")
    val sc = new SparkContext(sparkConf)
    try {
      val tablename = "student"
      sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)

      // Job.getInstance replaces the deprecated Job constructor.
      val job = Job.getInstance(sc.hadoopConfiguration)
      job.setOutputKeyClass(classOf[ImmutableBytesWritable])
      job.setOutputValueClass(classOf[Result])
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

      val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26", "4,Guanhua,M,27")) // two sample records
      val rdd = indataRDD.map(_.split(',')).map { arr =>
        val family = Bytes.toBytes("info")
        val put = new Put(Bytes.toBytes(arr(0))) // row key
        // addColumn replaces the deprecated Put.add.
        put.addColumn(family, Bytes.toBytes("name"), Bytes.toBytes(arr(1))) // info:name
        put.addColumn(family, Bytes.toBytes("gender"), Bytes.toBytes(arr(2))) // info:gender
        // Age is validated as an integer but stored as text, like the other
        // columns, so that a reader decoding cells with Bytes.toString gets "26"
        // rather than four raw bytes.
        put.addColumn(family, Bytes.toBytes("age"), Bytes.toBytes(arr(3).toInt.toString)) // info:age
        (new ImmutableBytesWritable, put)
      }
      rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
    } finally {
      // The context was never stopped before; release it on every path.
      sc.stop()
    }
  }
}
7.Spark 读写 HBase 的两种方式(RDD、DataFrame)
7.1使用 saveAsHadoopDataset 写入数据
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
//import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
//import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
 * Writes four sample rows into the HBase table "SparkHBase" with the old-API
 * `saveAsHadoopDataset`, using the `mapred` flavour of TableOutputFormat.
 *
 * Created by blockchain on 18-9-9 3:45 PM in Beijing.
 */
object SparkHBaseRDD {

  /**
   * Runs the write job.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    // Keep Spark's own log output on the terminal down to warnings.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

    val session = SparkSession.builder().appName("SparkHBaseRDD").getOrCreate()
    val context = session.sparkContext
    val targetTable = "SparkHBase"

    val hbaseSettings = HBaseConfiguration.create()
    // ZooKeeper quorum address; it could also come from an hbase-site.xml on
    // the classpath, but the author recommends setting it in code.
    hbaseSettings.set("hbase.zookeeper.quorum", "localhost")
    // ZooKeeper client port, 2181 by default.
    hbaseSettings.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseSettings.set(TableOutputFormat.OUTPUT_TABLE, targetTable)

    // The job uses TableOutputFormat from org.apache.hadoop.hbase.mapred.
    val outputJob = new JobConf(hbaseSettings)
    outputJob.setOutputFormat(classOf[TableOutputFormat])

    val samples = context.makeRDD(Array("2,jack,16", "1,Lucy,15", "5,mike,17", "3,Lily,14"))

    // One Put per input line: the row key goes to the constructor, every value
    // is converted with Bytes.toBytes, and addColumn takes
    // (column family, column qualifier, value).
    val puts = samples.map { line =>
      val fields = line.split(',')
      val row = new Put(Bytes.toBytes(fields(0)))
      row.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("name"), Bytes.toBytes(fields(1)))
      row.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("age"), Bytes.toBytes(fields(2)))
      (new ImmutableBytesWritable, row)
    }

    puts.saveAsHadoopDataset(outputJob)
    session.stop()
  }
}
7.2使用 newAPIHadoopRDD 读取数据
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
//import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
//import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
 * Reads the HBase table "SparkHBase" into an RDD with `newAPIHadoopRDD` and
 * prints the `cf1:name` and `cf1:age` columns of every row. The table is
 * created first when it does not exist yet.
 *
 * Created by blockchain on 18-9-9 3:45 PM in Beijing.
 */
object SparkHBaseRDD {

  /**
   * Runs the read job.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    // Needed to declare the column family of a newly created table.
    import org.apache.hadoop.hbase.HColumnDescriptor

    // Keep Spark's own log output on the terminal down to warnings.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    val spark = SparkSession.builder().appName("SparkHBaseRDD").getOrCreate()
    try {
      val sc = spark.sparkContext
      val tablename = "SparkHBase"

      val hbaseConf = HBaseConfiguration.create()
      // ZooKeeper quorum address; it could also come from an hbase-site.xml on
      // the classpath, but the author recommends setting it in code.
      hbaseConf.set("hbase.zookeeper.quorum", "localhost")
      // ZooKeeper client port, 2181 by default.
      hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
      hbaseConf.set(TableInputFormat.INPUT_TABLE, tablename)

      // Create the table when it does not exist.
      val admin = new HBaseAdmin(hbaseConf)
      try {
        val table = TableName.valueOf(tablename)
        if (!admin.tableExists(table)) {
          val tableDesc = new HTableDescriptor(table)
          // A table needs at least one column family; "cf1" is the family the
          // read below expects.
          tableDesc.addFamily(new HColumnDescriptor("cf1"))
          admin.createTable(tableDesc)
        }
      } finally {
        admin.close()
      }

      // Read the table as an RDD; TableInputFormat comes from
      // org.apache.hadoop.hbase.mapreduce.
      val hBaseRDD = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat],
        classOf[ImmutableBytesWritable],
        classOf[Result])

      hBaseRDD.foreach { case (_, result) =>
        // Bytes.toBytes always encodes as UTF-8, unlike String.getBytes.
        val family = Bytes.toBytes("cf1")
        // Row key.
        val key = Bytes.toString(result.getRow)
        // Cells are looked up by column family and qualifier.
        val name = Bytes.toString(result.getValue(family, Bytes.toBytes("name")))
        val age = Bytes.toString(result.getValue(family, Bytes.toBytes("age")))
        println("Row key:" + key + " cf1.Name:" + name + " cf1.Age:" + age)
      }
    } finally {
      // Stop the session even when the table check or the scan fails.
      spark.stop()
    }
  }
}
7.3Spark DataFrame 通过 Phoenix 读写 HBase
添加依赖:
<dependency>
<groupId>org.apache.phoenix</groupId>
<artifactId>phoenix-core</artifactId>
<version>${phoenix.version}</version>
</dependency>
<dependency>
<groupId>org.apache.phoenix</groupId>
<artifactId>phoenix-spark</artifactId>
<version>${phoenix.version}</version>
</dependency>
代码:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{SaveMode, SparkSession}
/**
 * Reads a Phoenix table into a DataFrame in two ways (generic JDBC source and
 * the phoenix-spark connector) and writes the second DataFrame to another
 * Phoenix table.
 *
 * Created by blockchain on 18-9-9 8:33 PM in Beijing.
 */
object SparkHBaseDataFrame {

  /**
   * Runs the read/write demonstration.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    // Keep Spark's own log output on the terminal down to warnings.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

    val session = SparkSession.builder().appName("SparkHBaseDataFrame").getOrCreate()
    val phoenixUrl = "jdbc:phoenix:localhost:2181"
    val sourceTable = "PHOENIXTEST"
    val connectorFormat = "org.apache.phoenix.spark"

    // First way of reading Phoenix into a DataFrame: the generic JDBC source.
    val jdbcOptions = Map(
      "driver" -> "org.apache.phoenix.jdbc.PhoenixDriver",
      "url" -> phoenixUrl,
      "dbtable" -> sourceTable
    )
    val viaJdbc = session.read.format("jdbc").options(jdbcOptions).load()
    viaJdbc.printSchema()

    // Second way: the phoenix-spark connector.
    val viaConnector = session.read
      .format(connectorFormat)
      .option("table", sourceTable)
      .option("zkUrl", phoenixUrl)
      .load()
    viaConnector.printSchema()

    // Writing a DataFrame to Phoenix; the target table must already exist.
    viaConnector.write
      .format(connectorFormat)
      .mode(SaveMode.Overwrite)
      .option("table", "PHOENIXTESTCOPY")
      .option("zkUrl", phoenixUrl)
      .save()

    session.stop()
  }
}