基本概念
来一段官网的基本解释
The Apache Hive™ data warehouse software facilitates reading, writing, and managing large datasets residing in distributed storage and queried using SQL syntax.
Built on top of Apache Hadoop™, Hive provides the following features:
- Tools to enable easy access to data via SQL, thus enabling data warehousing tasks such as extract/zzzform/load (ETL), reporting, and data analysis.
- A mechanism to impose structure on a variety of data formats
- Access to files stored either directly in Apache HDFS™ or in other data storage systems such as Apache HBase™
- Query execution via Apache Tez™, Apache Spark™, or MapReduce
- Procedural language with HPL-SQL
- Sub-second query retrieval via Hive LLAP, Apache YARN and Apache Slider.
Hive定义为基于Hadoop的分布式数据仓库,提供基于SQL的数据读写和数据管理管理功能,它的特色包括:支持数据的ETL、构建数据体系、基于HDFS/HBase等数据存储系统。Hive基于HDFS来做文件存储,需要使用Map-Reduce计算框架。
基本使用方法
#如果表xxx存在删除,为创建xxx做准备
drop table if exists xxx;
#创建表xxx,包含15个字段及其类型,并创建分区列index
CREATE TABLE xxx(yyy_attr INT, yyy_bin STRING, iss_mmm_id_cd STRING, pri_acct_no_conv INT, zzz_chnl STRING, zzz_curr_cd INT, zzz_id STRING, settle_dt INT, acpt_mmm_id_cd STRING, yyy_accptr_nm_addr STRING, mchnt_cd STRING, mchnt_tp STRING, term_id STRING, count STRING) COMMENT 'xxx details' PARTITIONED BY (index STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE;
#给index'0001'的分区导入数据
LOAD DATA INPATH '/user/kejun.he/input/acquire_1.txt' OVERWRITE INTO TABLE xxx PARTITION (index='0001');
#给index'0002'的分区导入数据
LOAD DATA LOCAL INPATH '/home/kejun.he/hive/acquire_2.txt' OVERWRITE INTO TABLE xxx PARTITION (index='0002');
#查询,并检查order by 和limit用法
select a.yyy_accptr_nm_addr from xxx a where a.index='0001' order by a.yyy_accptr_nm_addr limit 20;
#查询,并检查两个分区的查询,速度很慢
select a.yyy_accptr_nm_addr from xxx a order by a.yyy_accptr_nm_addr limit 20;
#根据查询结果建表
CREATE TABLE mmmtitue AS select distinct iss_mmm_id_cSELEd from xxx where index='0001';
#修改表名
ALTER TABLE mmmtitue RENAME TO mmmtitute;
#inner join
select B.iss_mmm_id_cd ,A.yyy_bin from xxx A JOIN mmmtitute B ON (A.iss_mmm_id_cd=B.iss_mmm_id_cd) where A.index='0002';
#left join
select count(A.yyy_bin) from xxx A LEFT OUTER JOIN mmmtitute B ON (A.iss_mmm_id_cd=B.iss_mmm_id_cd) where A.index='0002';
#right join
select count(A.yyy_bin) from xxx A RIGHT OUTER JOIN mmmtitute B ON (A.iss_mmm_id_cd=B.iss_mmm_id_cd) where A.index='0002';
#full join
select count(A.yyy_bin) from xxx A FULL OUTER JOIN mmmtitute B ON (A.iss_mmm_id_cd=B.iss_mmm_id_cd) where A.index='0002';