大数据平台–Hive入门3
1.Hive常用的hql语句
(1)过滤
-
where
-
hive (default)> select emp from emp where sal > 3000; FAILED: SemanticException Error in parsing hive (default)> select * from emp where sal > 3000; OK emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno 7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10 Time taken: 0.405 seconds, Fetched: 1 row(s)
-
limit
-
hive (default)> select * from emp limit 1; OK emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno 7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20 Time taken: 0.083 seconds, Fetched: 1 row(s)
-
distinct
-
hive (default)> select distinct deptno from emp; deptno 10 20 30 Time taken: 36.298 seconds, Fetched: 3 row(s)
-
between and
-
hive (default)> select * from emp where sal between 2000 and 3000; OK emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno 7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20 7698 BLAKE MANAGER 7839 1981-5-1 2850.0 NULL 30 7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10 7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20 7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20 Time taken: 0.33 seconds, Fetched: 5 row(s)
-
is null & is not null
-
hive (default)> select * from emp where comm is null; OK emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno 7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20 7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20 7698 BLAKE MANAGER 7839 1981-5-1 2850.0 NULL 30 7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10 7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20 7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10 7876 ADAMS CLERK 7788 1987-5-23 1100.0 NULL 20 7900 JAMES CLERK 7698 1981-12-3 950.0 NULL 30 7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20 7934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10 Time taken: 0.182 seconds, Fetched: 10 row(s) hive (default)> select * from emp where comm is not null; OK emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno 7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 30 7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30 7654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 30 7844 TURNER SALESMAN 7698 1981-9-8 1500.0 0.0 30 Time taken: 0.086 seconds, Fetched: 4 row(s)
-
having:用在group by分组语句之后,对分组后的结果进行过滤
-
hive (default)> select deptno,avg(sal) avg_sal from emp group by deptno having avg_sal > 2000; deptno avg_sal 10 2916.6666666666665 20 2175.0 Time taken: 23.326 seconds, Fetched: 2 row(s) hive (default)> select deptno,avg from (select deptno,avg(sal) as avg from emp group by deptno) a where avg > 2000; deptno avg 10 2916.6666666666665 20 2175.0 Time taken: 23.609 seconds, Fetched: 2 row(s) #注意:where和having可以在同一条语句中使用,where先执行(分组前过滤行),having后执行(分组后过滤分组结果);子查询不宜嵌套太多层;where中对子查询的支持有限(Hive 0.13起仅支持in/not in/exists/not exists形式的子查询);在本文使用的Hive 1.2.1中,子查询不能作为select的字段。
-
in
-
hive (default)> select * from emp where deptno in (10,20); OK emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno 7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20 7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20 7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10 7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20 7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10 7876 ADAMS CLERK 7788 1987-5-23 1100.0 NULL 20 7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20 7934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10 Time taken: 0.221 seconds, Fetched: 8 row(s)
(2)聚合函数
-
count
-
hive (default)> select count(1) from emp; _c0 14 Time taken: 23.065 seconds, Fetched: 1 row(s)
-
sum,avg
-
hive (default)> select sum(sal) from emp; _c0 29025.0 hive (default)> select avg(sal) avg_sal from emp; avg_sal 2073.214285714286
-
max,min
(3)group by
hive (default)> select deptno,avg(sal) from emp group by deptno;
deptno _c1
10 2916.6666666666665
20 2175.0
30 1566.6666666666667
Time taken: 22.805 seconds, Fetched: 3 row(s)
(4)join
-
等值join(inner join)
-
hive (default)> select e.empno,e.ename,d.deptno,e.sal from emp e join dept d on e.deptno = d.deptno; e.empno e.ename d.deptno e.sal 7369 SMITH 20 800.0 7499 ALLEN 30 1600.0 7521 WARD 30 1250.0 7566 JONES 20 2975.0 7654 MARTIN 30 1250.0 7698 BLAKE 30 2850.0 7782 CLARK 10 2450.0 7788 SCOTT 20 3000.0 7839 KING 10 5000.0 7844 TURNER 30 1500.0 7876 ADAMS 20 1100.0 7900 JAMES 30 950.0 7902 FORD 20 3000.0 7934 MILLER 10 1300.0
-
左join(left join)
-
hive (default)> select e.empno,e.ename,d.deptno,e.sal from emp e left join dept d on e.deptno = d.deptno; e.empno e.ename d.deptno e.sal 7369 SMITH 20 800.0 7499 ALLEN 30 1600.0 7521 WARD 30 1250.0 7566 JONES 20 2975.0 7654 MARTIN 30 1250.0 7698 BLAKE 30 2850.0 7782 CLARK 10 2450.0 7788 SCOTT 20 3000.0 7839 KING 10 5000.0 7844 TURNER 30 1500.0 7876 ADAMS 20 1100.0 7900 JAMES 30 950.0 7902 FORD 20 3000.0 7934 MILLER 10 1300.0
-
右join(right join)
-
hive (default)> select e.empno,e.ename,d.deptno,e.sal from emp e right join dept d on e.deptno = d.deptno; e.empno e.ename d.deptno e.sal 7782 CLARK 10 2450.0 7839 KING 10 5000.0 7934 MILLER 10 1300.0 7369 SMITH 20 800.0 7566 JONES 20 2975.0 7788 SCOTT 20 3000.0 7876 ADAMS 20 1100.0 7902 FORD 20 3000.0 7499 ALLEN 30 1600.0 7521 WARD 30 1250.0 7654 MARTIN 30 1250.0 7698 BLAKE 30 2850.0 7844 TURNER 30 1500.0 7900 JAMES 30 950.0 NULL NULL 40 NULL
-
全join(full join)
-
hive (default)> select e.empno,e.ename,d.deptno,e.sal from emp e full join dept d on e.deptno = d.deptno; e.empno e.ename d.deptno e.sal 7934 MILLER 10 1300.0 7839 KING 10 5000.0 7782 CLARK 10 2450.0 7876 ADAMS 20 1100.0 7788 SCOTT 20 3000.0 7369 SMITH 20 800.0 7566 JONES 20 2975.0 7902 FORD 20 3000.0 7844 TURNER 30 1500.0 7499 ALLEN 30 1600.0 7698 BLAKE 30 2850.0 7654 MARTIN 30 1250.0 7521 WARD 30 1250.0 7900 JAMES 30 950.0 NULL NULL 40 NULL
-
LIKE
-
like "%qq%":前后都有通配符,RDBMS中无法使用索引,性能最差; like "%qq":以通配符开头,同样无法使用索引,性能差; like "qq%":前缀匹配,可以使用索引,性能会提升
2.Hive分析函数
3.Hive的窗口函数
窗口函数:为每一行数据划定一个单独的窗口(与该行相关的一组行),在窗口内计算出这一行对应列的值,不会像group by那样把多行合并成一行
4.Hive与mr的关系
排序类型的sql
(1)order by:全局排序(所有数据交给一个reducer处理),默认升序;字符串类型的列按字典序比较
5.Hive的udf函数以及beeline客户端
Hive的用户自定义函数有三种:
- UDF:用户自定义的格式转化函数(一条数据输入,就有且仅有一条数据输出)
- UDAF:用户自定义聚合函数(多条数据输入,聚合为一条数据输出)
- UDTF:用户自定义表格生成函数(一条数据输入,可能会有多条数据输出)
Hive客户端
hiveserver2
1.将hive变成一个服务对外开放,通过客户端连接
bin/hiveserver2 #启动服务
bin/beeline #启动客户端
beeline> !connect jdbc:hive2://pseudo:10000
Connecting to jdbc:hive2://pseudo:10000
Enter username for jdbc:hive2://pseudo:10000: root
Enter password for jdbc:hive2://pseudo:10000:
Connected to: Apache Hive (version 1.2.1)
Driver: Hive JDBC (version 1.2.1)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://pseudo:10000> show databases;
+----------------+--+
| database_name  |
+----------------+--+
| default        |
+----------------+--+
1 row selected (1.925 seconds)
0: jdbc:hive2://pseudo:10000>