• Reading HBase Data with Spark


    1、pom.xml

    Version properties

    <properties>
        <hbase.version>2.2.2</hbase.version>
        <hadoop.version>2.10.0</hadoop.version>
        <spark.version>2.4.2</spark.version>
    </properties>

    Dependencies

    <dependencies>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.11</artifactId>
                <version>${spark.version}</version>
                <exclusions>
                    <exclusion>
                        <groupId>commons-beanutils</groupId>
                        <artifactId>commons-beanutils-core</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
    
            <!-- Spark SQL-->
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-sql_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
            <!-- Hadoop-->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
    
            <!-- HBase -->
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-client</artifactId>
                <version>${hbase.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-hadoop2-compat</artifactId>
                <version>${hbase.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-server</artifactId>
                <version>${hbase.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-mapreduce</artifactId>
                <version>${hbase.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
                <version>3.8.1</version>
            </dependency>
    
            <dependency>
                <groupId>org.scalaj</groupId>
                <artifactId>scalaj-http_2.11</artifactId>
                <version>2.2.2</version>
            </dependency>
    
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.62</version>
            </dependency>
    </dependencies>

    2、CRUD operations on HBase with Java

      (1) The operations are implemented as JUnit tests; first we need to connect to HBase.

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.*;
    import org.apache.hadoop.hbase.client.*;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.junit.Before;
    import org.junit.Test;
    
    Connection connection = null;
    Admin admin = null;
    Table table = null;
    // Example value; the original post references tableName but never shows it
    String tableName = "spark_hbase";
    
    @Before
    public void setUp() throws IOException {
        // HBaseConfiguration.create() loads hbase-default.xml / hbase-site.xml;
        // a plain new Configuration() would miss the required HBase defaults
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://localhost:8020/hbase");
        conf.set("hbase.zookeeper.quorum", "localhost:2181");
    
        try {
            connection = ConnectionFactory.createConnection(conf);
            admin = connection.getAdmin();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

      First, the HBase configuration: the Configuration needs hbase.rootdir and hbase.zookeeper.quorum. hbase.rootdir is the hbase directory under the HDFS root, and hbase.zookeeper.quorum is the ZooKeeper address. The HBase Connection is obtained from ConnectionFactory, which takes conf as a parameter.

      Then we also get the Admin from the connection; Admin is used for administrative operations such as creating and deleting tables.
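
      The setUp above opens a connection and an Admin but never releases them; a matching @After (a sketch the original omits; it also needs import org.junit.After) would be:

    @After
    public void tearDown() throws IOException {
        // Close resources in reverse order of acquisition
        if (table != null) table.close();
        if (admin != null) admin.close();
        if (connection != null) connection.close();
    }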

      

      (2) Creating an HBase table

    @Test
    public void createTable() throws Exception {
        TableName table = TableName.valueOf(tableName);
        
        // First check whether the table already exists in HBase
        if (admin.tableExists(table)) {
            System.out.println(tableName + " already exists...");
        } else {
            // Set the table name
            HTableDescriptor tableDescriptor = new HTableDescriptor(table);
            // Set the column families; they must be defined at table-creation time
            tableDescriptor.addFamily(new HColumnDescriptor("info"));
            tableDescriptor.addFamily(new HColumnDescriptor("address"));
    
            // Create the table through admin
            admin.createTable(tableDescriptor);
    
            System.out.println(tableName + " created successfully...");
        }
    }
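
      HTableDescriptor and HColumnDescriptor are deprecated in HBase 2.x. An equivalent sketch of the same table with the 2.x builder-style API:

    // Builder-style equivalent of the createTable body above
    TableDescriptor desc = TableDescriptorBuilder
            .newBuilder(TableName.valueOf(tableName))
            .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info"))
            .setColumnFamily(ColumnFamilyDescriptorBuilder.of("address"))
            .build();
    admin.createTable(desc);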

      

      (3) Inserting data into the created table

    @Test
    public void testPut() throws Exception {
        table = connection.getTable(TableName.valueOf(tableName));
    
        // Insert a single row
        // The Put is keyed by the rowkey
        Put put = new Put(Bytes.toBytes("hunter"));
        // addColumn takes the column family, qualifier and value
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes("23"));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("birthday"), Bytes.toBytes("1997-12-12"));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("company"), Bytes.toBytes("HaHa"));
    
        put.addColumn(Bytes.toBytes("address"), Bytes.toBytes("country"), Bytes.toBytes("CN"));
        put.addColumn(Bytes.toBytes("address"), Bytes.toBytes("province"), Bytes.toBytes("LN"));
        put.addColumn(Bytes.toBytes("address"), Bytes.toBytes("city"), Bytes.toBytes("DL"));
    
        // Write the row to HBase
        table.put(put);
    
        // Insert multiple rows
        // Collect several Puts in a list and write them in one call
        List<Put> puts = new ArrayList<Put>();
        
        // First Put
        Put put1 = new Put(Bytes.toBytes("jepson"));
        put1.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes("15"));
        put1.addColumn(Bytes.toBytes("info"), Bytes.toBytes("birthday"), Bytes.toBytes("xxxx-xx-xx"));
        put1.addColumn(Bytes.toBytes("info"), Bytes.toBytes("company"), Bytes.toBytes("WaWa"));
        put1.addColumn(Bytes.toBytes("address"), Bytes.toBytes("country"), Bytes.toBytes("CN"));
        put1.addColumn(Bytes.toBytes("address"), Bytes.toBytes("province"), Bytes.toBytes("BJ"));
        put1.addColumn(Bytes.toBytes("address"), Bytes.toBytes("city"), Bytes.toBytes("BJ"));
          
        // Second Put
        Put put2 = new Put(Bytes.toBytes("okey"));
        put2.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes("19"));
        put2.addColumn(Bytes.toBytes("info"), Bytes.toBytes("birthday"), Bytes.toBytes("yy-yy-yy"));
        put2.addColumn(Bytes.toBytes("info"), Bytes.toBytes("company"), Bytes.toBytes("DuoDuo"));
        put2.addColumn(Bytes.toBytes("address"), Bytes.toBytes("country"), Bytes.toBytes("CN"));
        put2.addColumn(Bytes.toBytes("address"), Bytes.toBytes("province"), Bytes.toBytes("SH"));
        put2.addColumn(Bytes.toBytes("address"), Bytes.toBytes("city"), Bytes.toBytes("SH"));
    
        // Add both to the list
        puts.add(put1);
        puts.add(put2);
    
        // Save the batch to HBase
        table.put(puts);
    }
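
      The heading of this section promises full CRUD, but the original stops at reads and writes; a minimal delete sketch, reusing the "okey" row inserted above:

    @Test
    public void testDelete() throws Exception {
        table = connection.getTable(TableName.valueOf(tableName));
    
        // Remove one column of the row; new Delete(rowkey) alone, passed
        // straight to table.delete(), would remove the entire row
        Delete delete = new Delete(Bytes.toBytes("okey"));
        delete.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"));
        table.delete(delete);
    }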

      (4) Querying data from an HBase table

    @Test
    // Fetch a single row from HBase by rowkey
    public void testGetByRowkey() throws IOException {
        // Get the table
        table = connection.getTable(TableName.valueOf(tableName));
        // Build a Get for the rowkey to look up
        Get get = new Get(Bytes.toBytes("20200107_1003629081"));
        // Optionally restrict the returned columns: (column family, qualifier)
        // get.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"));
    
        Result result = table.get(get);
        printResult(result);
    }
    
    private void printResult(Result result) {
        for (Cell cell : result.rawCells()) {
            System.out.println(Bytes.toString(result.getRow()) + "\t"
                    + Bytes.toString(CellUtil.cloneFamily(cell)) + "\t"
                    + Bytes.toString(CellUtil.cloneQualifier(cell)) + "\t"
                    + Bytes.toString(CellUtil.cloneValue(cell)) + "\t"
                    + cell.getTimestamp()
            );
        }
    }
    
    @Test
    public void testScan() throws IOException {
        table = connection.getTable(TableName.valueOf(tableName));
    
        Scan scan = new Scan();
        scan.addFamily(Bytes.toBytes("info"));
        // scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("company"));
        // Scan scan = new Scan(Bytes.toBytes("jepson"));          // rowkey >= "jepson"
        // Scan scan = new Scan(new Get(Bytes.toBytes("jepson"))); // exactly one row
        // Scan scan = new Scan(Bytes.toBytes("jepson"), Bytes.toBytes("okey")); // start inclusive, end exclusive
    
        // ResultScanner results = table.getScanner(Bytes.toBytes("info"), Bytes.toBytes("company"));
        ResultScanner results = table.getScanner(scan);
    
        for (Result result : results) {
            printResult(result);
            System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        }
        results.close();
    }
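
    3、Reading the HBase table from Spark

      The pom above already pulls in spark-core and hbase-mapreduce (which provides TableInputFormat), but the original post stops at the Java client. A minimal sketch of the Spark side, assuming the same table name and ZooKeeper address as the JUnit code above:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    
    public class SparkReadHBase {
        public static void main(String[] args) {
            // Same HBase settings as in the JUnit setUp
            Configuration conf = HBaseConfiguration.create();
            conf.set("hbase.zookeeper.quorum", "localhost:2181");
            // Tell TableInputFormat which table to scan
            conf.set(TableInputFormat.INPUT_TABLE, "spark_hbase");
    
            SparkConf sparkConf = new SparkConf()
                    .setAppName("SparkReadHBase").setMaster("local[2]");
            JavaSparkContext sc = new JavaSparkContext(sparkConf);
    
            // One (rowkey, Result) pair per HBase row
            JavaPairRDD<ImmutableBytesWritable, Result> hbaseRDD = sc.newAPIHadoopRDD(
                    conf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
    
            // Example action: count the rows read from HBase
            System.out.println("row count: " + hbaseRDD.count());
            sc.stop();
        }
    }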

        
