测试数据位于:/home/hadoop/luogankun/workspace/sync_data/pig
dept和emp表来源自oracle数据库自带的表
dept.txt
10 ACCOUNTING NEW YORK 20 RESEARCH DALLAS 30 SALES CHICAGO 40 OPERATIONS BOSTON
emp.txt
7369 SMITH CLERK 7902 1980-12-17 800.00 20 7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 300.00 30 7521 WARD SALESMAN 7698 1981-2-22 1250.00 500.00 30 7566 JONES MANAGER 7839 1981-4-2 2975.00 20 7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 1400.00 30 7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30 7782 CLARK MANAGER 7839 1981-6-9 2450.00 10 7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20 7839 KING PRESIDENT 1981-11-17 5000.00 10 7844 TURNER SALESMAN 7698 1981-9-8 1500.00 0.00 30 7876 ADAMS CLERK 7788 1987-5-23 1100.00 20 7900 JAMES CLERK 7698 1981-12-3 950.00 30 7902 FORD ANALYST 7566 1981-12-3 3000.00 20 7934 MILLER CLERK 7782 1982-1-23 1300.00 10
上传数据到HDFS系统中
cd /home/hadoop/luogankun/workspace/sync_data/pig hadoop fs -put dept.txt input/pig/dept.txt hadoop fs -put emp.txt input/pig/emp.txt
导入
CREATE TABLE TMP_TABLE(USER VARCHAR(32),AGE INT,IS_MALE BOOLEAN); dept= LOAD 'input/pig/dept.txt' USING PigStorage(' ') AS (deptno:int,dname:chararray,loc:chararray); emp = LOAD 'input/pig/emp.txt' USING PigStorage(' ') AS (empno:int,ename:chararray, job:chararray, mgr:int, hiredate:chararray,sal:double,comm:double,dept:int);
查看表结构
desc TMP_TABLE;
describe dept dept: {deptno: int,dname: chararray,loc: chararray} describe emp emp: {empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double,comm: double,dept: int}
查询整张表
SELECT * FROM TMP_TABLE;
DUMP dept (10,ACCOUNTING,NEW YORK) (20,RESEARCH,DALLAS) (30,SALES,CHICAGO) (40,OPERATIONS,BOSTON) DUMP emp (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)
查询前N条
SELECT * FROM TMP_TABLE LIMIT 10;
emp_table_limit = LIMIT emp 10; DUMP emp_table_limit; (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)
查询表中的某些列
SELECT USER FROM TMP_TABLE;
emp_table_empno_ename = FOREACH emp GENERATE empno, ename; describe emp_table_empno_ename emp_table_empno_ename: {empno: int,ename: chararray} DUMP emp_table_empno_ename; (7369,SMITH) (7499,ALLEN) (7521,WARD) (7566,JONES) (7654,MARTIN) (7698,BLAKE) (7782,CLARK) (7788,SCOTT) (7839,KING) (7844,TURNER) (7876,ADAMS) (7900,JAMES) (7902,FORD) (7934,MILLER)
给列取别名
SELECT USER AS USER_NAME,AGE AS USER_AGE FROM TMP_TABLE;
emp_table_column_alias = FOREACH emp GENERATE empno AS id,ename AS name; describe emp_table_column_alias emp_table_column_alias: {id: int,name: chararray} DUMP emp_table_column_alias (7369,SMITH) (7499,ALLEN) (7521,WARD) (7566,JONES) (7654,MARTIN) (7698,BLAKE) (7782,CLARK) (7788,SCOTT) (7839,KING) (7844,TURNER) (7876,ADAMS) (7900,JAMES) (7902,FORD) (7934,MILLER)
排序
SELECT * FROM TMP_TABLE ORDER BY AGE;
emp_table_order = ORDER emp BY empno ASC; DUMP emp_table_order; (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)
条件查询
SELECT * FROM TMP_TABLE WHERE AGE>20;
emp_table_where = FILTER emp by sal > 1500; DUMP emp_table_where; (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20)
内连接Inner Join
SELECT * FROM TMP_TABLE A JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_inner_join = JOIN emp BY dept,dept BY deptno; describe emp_table_inner_join emp_table_inner_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_inner_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)
左连接Left Join
SELECT * FROM TMP_TABLE A LEFT JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_left_join = JOIN emp BY dept LEFT OUTER,dept BY deptno; describe emp_table_left_join emp_table_left_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_left_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)
右连接Right Join
SELECT * FROM TMP_TABLE A RIGHT JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_right_join = JOIN emp BY dept RIGHT OUTER,dept BY deptno; describe emp_table_right_join emp_table_right_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_right_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO) (,,,,,,,,40,OPERATIONS,BOSTON)
全连接Full Join
SELECT * FROM TMP_TABLE A JOIN TMP_TABLE_2 B ON A.AGE=B.AGE
emp_table_full_join = JOIN emp BY dept FULL OUTER,dept BY deptno; describe emp_table_full_join emp_table_full_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_full_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO) (,,,,,,,,40,OPERATIONS,BOSTON)
同时对多张表交叉查询
SELECT * FROM TMP_TABLE,TMP_TABLE_2;
emp_table_cross = CROSS emp,dept; describe emp_table_cross emp_table_cross: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_cross; (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,30,SALES,CHICAGO) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,40,OPERATIONS,BOSTON) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,10,ACCOUNTING,NEW YORK) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,40,OPERATIONS,BOSTON) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,10,ACCOUNTING,NEW YORK) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,20,RESEARCH,DALLAS) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,40,OPERATIONS,BOSTON) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,10,ACCOUNTING,NEW YORK) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,30,SALES,CHICAGO) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,40,OPERATIONS,BOSTON) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,10,ACCOUNTING,NEW YORK) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,20,RESEARCH,DALLAS) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,40,OPERATIONS,BOSTON) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,10,ACCOUNTING,NEW YORK) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,20,RESEARCH,DALLAS) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,40,OPERATIONS,BOSTON) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,20,RESEARCH,DALLAS) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,30,SALES,CHICAGO) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,40,OPERATIONS,BOSTON) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,10,ACCOUNTING,NEW YORK) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,30,SALES,CHICAGO) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,40,OPERATIONS,BOSTON) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,20,RESEARCH,DALLAS) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,30,SALES,CHICAGO) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,40,OPERATIONS,BOSTON) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,10,ACCOUNTING,NEW YORK) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,20,RESEARCH,DALLAS) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,40,OPERATIONS,BOSTON) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,10,ACCOUNTING,NEW YORK) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,30,SALES,CHICAGO) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,40,OPERATIONS,BOSTON) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,10,ACCOUNTING,NEW YORK) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,20,RESEARCH,DALLAS) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,40,OPERATIONS,BOSTON) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,10,ACCOUNTING,NEW YORK) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,30,SALES,CHICAGO) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,40,OPERATIONS,BOSTON) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,20,RESEARCH,DALLAS) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,30,SALES,CHICAGO) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,40,OPERATIONS,BOSTON)
分组GROUP BY
SELECT * FROM TMP_TABLE GROUP BY IS_MALE;
emp_table_group = GROUP emp BY dept; describe emp_table_group emp_table_group: { group: int, emp: { (empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double, comm: double,dept: int) } } DUMP emp_table_group; (10,{ (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10), (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10), (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)}) (20,{ (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20), (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20), (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20), (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20), (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20)}) (30,{ (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30), (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30), (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30), (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30), (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30), (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30)})
分组并统计
SELECT IS_MALE,COUNT(*) FROM TMP_TABLE GROUP BY IS_MALE;
emp_table_group_count = GROUP emp BY dept; describe emp_table_group_count emp_table_group_count: { group: int, emp: { (empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double,comm: double,dept: int ) } } emp_table_group_count = FOREACH emp_table_group_count GENERATE group,COUNT($1); describe emp_table_group_count DUMP emp_table_group_count; (10,3) (20,5) (30,6) emp_table_group_count = FOREACH emp_table_group_count GENERATE emp.dept,COUNT($1); describe emp_table_group_count emp_table_group_count: {{(dept: int)},long} DUMP emp_table_group_count; ({(10),(10),(10)},3) ({(20),(20),(20),(20),(20)},5) ({(30),(30),(30),(30),(30),(30)},6)
去重DISTINCT
SELECT DISTINCT IS_MALE FROM TMP_TABLE;
emp_table_distinct = FOREACH emp GENERATE dept; describe emp_table_distinct emp_table_distinct: {dept: int} emp_table_distinct = DISTINCT emp_table_distinct; describe emp_table_distinct emp_table_distinct: {dept: int} DUMP emp_table_distinct; (10) (20) (30)
pig判空
select * from emp where comm is not null;
emp_table_where_null = FILTER emp by comm is not null; DUMP emp_table_where_null; (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)