1. Hive: creating an internal (managed) table
CREATE TABLE emp(
    empno int,
    ename string,
    job string,
    mgr int,
    hiredate string,
    sal double,
    comm double,
    deptno int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';

LOAD DATA LOCAL INPATH '/home/hadoop/data/emp.txt' OVERWRITE INTO TABLE emp;

MANAGED_TABLE (internal table): dropping the table deletes both the data on HDFS and the metadata.
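To confirm which kind of table was created, Hive's DESCRIBE FORMATTED command shows the table type (a quick check added here, not part of the original notes):

-- A managed table reports "Table Type: MANAGED_TABLE" in the output
DESCRIBE FORMATTED emp;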
2. Hive: creating an external table (external tables are the recommended choice)
CREATE EXTERNAL TABLE emp_external(   -- adding EXTERNAL makes this an external table
    empno int,
    ename string,
    job string,
    mgr int,
    hiredate string,
    sal double,
    comm double,
    deptno int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
location '/external/emp/';

LOAD DATA LOCAL INPATH '/home/hadoop/data/emp.txt' OVERWRITE INTO TABLE emp_external;

EXTERNAL_TABLE: dropping the table deletes only the metadata; the data on HDFS is kept.
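A minimal sketch of the drop behavior described above (the dfs command runs inside the Hive CLI; the path follows the LOCATION clause):

DROP TABLE emp_external;    -- removes only the metastore entry
dfs -ls /external/emp/;     -- the data files are still on HDFS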
3. Creating a partitioned table
create external table track_info(
    ip string,
    country string,
    province string,
    city string,
    url string,
    time string,
    page string
) partitioned by (day string)   -- partitioned by: the table is partitioned on day
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
location '/project/trackinfo/';

Scheduling can be driven by crontab expressions, or by Azkaban: ETLApp ==> the downstream statistics/analysis jobs.
PySpark and scheduling systems: https://coding.imooc.com/class/chapter/249.html#Anchor
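For an external partitioned table, a partition directory written straight to HDFS (rather than via LOAD DATA) must be registered in the metastore before it is queryable. A sketch, where the partition path is an assumption derived from the LOCATION above:

-- Register an existing HDFS directory as a partition (path is assumed)
ALTER TABLE track_info ADD IF NOT EXISTS PARTITION (day='2013-07-21')
LOCATION '/project/trackinfo/day=2013-07-21';

-- List the partitions Hive currently knows about
SHOW PARTITIONS track_info;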
4. Loading data into the table
LOAD DATA INPATH 'hdfs://192.168.107.216:8020/project/input/raw/train_data.txt'
OVERWRITE INTO TABLE track_info partition(day='2013-07-21');
-- partition(day='2013-07-21') targets the partition column declared when the table was created
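Note that LOAD DATA INPATH (without LOCAL) moves the file within HDFS rather than copying it. A quick sanity check after the load (added here as an example):

-- Spot-check a few rows in the freshly loaded partition
select * from track_info where day='2013-07-21' limit 5;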
5. A few common usage examples
-- Total records for one day
select count(*) from track_info where day='2013-07-21';

-- Records per province for one day
select province, count(*) as cnt from track_info where day='2013-07-21' group by province;

-- Province statistics table
create table track_info_province_stat(
    province string,
    cnt bigint
) partitioned by (day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';

insert overwrite table track_info_province_stat partition(day='2013-07-21')
select province, count(*) as cnt from track_info where day='2013-07-21' group by province;

At this point the statistics are stored in the Hive table track_info_province_stat.
Because it is a partitioned table, later reporting jobs can query their data directly from it,
or the Hive table's data can be exported to an RDBMS (with Sqoop).
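A sketch of how a reporting query would read the pre-aggregated table (table and column names taken from the example above):

-- Top provinces for the day, served from the stat table instead of the raw logs
select province, cnt
from track_info_province_stat
where day='2013-07-21'
order by cnt desc;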