创建分区表
1 | create external table hive1.test2(id int, name string, age int) partitioned by (province string, city string) |
手动添加分区
1 | alter table hive1.test2 add partition(province='hebei' , city='baoding'); |
插入数据到分区表
1 | insert into hive1.test2 partition (province='hebei', city='baoding') values (1, 'tom', 20); |
动态分区
创建test3分区表
create table test3(id int, name string, age int) partitioned by (province string, city string)
row format DELIMITED
FIELDS TERMINATED by '\t'
LINES TERMINATED by '\n'
STORED as TEXTFILE ;
动态分区,复制一个表数据到分区表,动态创建分区
如果两个都是动态分区需要关闭严格模式
set hive.exec.dynamic.partition.mode=nonstrict; //关闭动态分区的严格模式
insert into hive1.test2 partition (province, city) select id, name, age, province, city from table2;
使用分区动静混合
insert into hive1.test2 partition (province='henan', city)
select id, name, age, 'kaifeng' as city
from table2;
查询期间,动态创建表,并将数据写入创建表中
1
create table test3 as select id ,name from test2 where province = 'hebei' and city = 'baoding' ;
导出hive数据到本地目录(下载)
1
insert overwrite local directory '/home/ubuntu/hive' select * from test2 where province = 'hebei' ;
导出hive数据到HDFS目录
1
insert overwrite directory 'hdfs://s100:8020/user/ubuntu/xxx' select * from hive1.test2 where province = 'hebei' ;
查询数据向多个目录同时输出
from test2 t
insert overwrite local directory '/home/ubuntu/hebei'
select t.id, t.name, t.age, t.province, t.city where t.province = 'hebei'
insert overwrite local directory '/home/ubuntu/henan'
select t.id, t.name, t.age, t.province, t.city where t.province = 'henan' ;
查询
查询,投影查询,指定表的别名
1
select col1,col2,... from table t
查询,使用函数
select upper(name) from test2 ;
select lower(name) from test2 ;
四舍五入
select round(12.345); //四舍五入 -> 12
select floor(12.345); //向下取整(地板) -> 12
select ceil(12.345);  //向上取整(天花板) -> 13
select rand();        //随机数[0,1),rand(seed)的参数须为整数种子
聚合函数
select count(*) from test2 ;
select sum(age) from test2 ;
select avg(age) from test2 ;
select max(age) from test2 ;
select min(age) from test2 ;
去重
1
select count(distinct (name)) from test2 ;
表生成函数
select explode(split(name, ',')) from test2 ; //explode的参数必须是array或map类型
select explode(array('tom','tomas','tomsLee')) from test2;
ascii函数,字符串首个字母ascii值
1
select ascii('abc');
base64字符串编码,需要二进制数据
1
select base64(BINARY('http://localhost:8080/helloworld'))
binary函数,将字符串转换成需要二进制数据
1
select binary('http://localhost:8080/helloworld');
类型转换
1
select cast('120' as BIGINT) + 200; //320
字符串连接
1
select concat('120',200) ; //120200
分页查询limit
select * from test2 limit 5 ; //取前5行
嵌套子查询
1 | from (select * from test2 where province = 'hebei')e select e.id,e.name,e.age where city = 'baoding' |
何时Hive可以避免MR操作
不使用MR的job就是本地模式
- 全表扫描:没有where子句。
1
select * from test2;
- where子句作用只有分区字段,也不需要mr
1
select * from test2 where province = 'hebei';
- 设置hive.exec.mode.local.auto = true
该属性hive会尽量使用local模式查询。 - 其余的所有查询都会转换成MR
不能在where使用列别名
1
select id,name n,age from test2 where n like 't%' ; //wrong,where子句中不能使用字段别名
1 | select id,name n,age from test2 where name like 't%' ; //right |
范围运算
1
select id,name n ,age from test2 where age between 12 and 15 ;
浮点数比较的规避方案
1
select cast(0.2 as FLOAT) ;
group by 分组
select count(*), province from test2 group by province;
select count(*) as c, province from test2 group by province having c > 3 ; //having是分组后的组内过滤
#JOIN 连接
##Hive的Join操作,只支持等值连接
###创建customers和orders表,一对多关系
###创建客户表
create table customers(id int , name string , age int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;
###创建订单表
1 | create table orders (id int , orderno string , price float, cid int) |
###准备数据
customers.txt数据
| id | name | age |
| :—– | :—— | :—- |
| 1 | tom1 | 12 |
| 2 | tom2 | 13 |
| 3 | tom3 | 14 |
orders.txt数据
| id | orderno | price | cid |
| :—– | :———- | :——– | :—— |
| 1 | No001 | 121.34 | 1 |
| 2 | No002 | 121.35 | 1 |
| 3 | No003 | 121.00 | 1 |
| 4 | No004 | 22.66 | 2 |
| 5 | No005 | 300.65 | 2 |
| 6 | No006 | 800.56 | 2 |
| 1 | No007 | 1000.12 | 1 |
###加载数据到customers + orders中
1 | load data local inpath '/home/ubuntu/customers.txt' into table hive1.customers; |
##内连接 JOIN … ON
1 | select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id = b.cid |
a.id | a.name | b.id | b.orderno | b.price |
---|---|---|---|---|
1 | tom1 | 1 | No001 | 121.34 |
1 | tom1 | 2 | No002 | 121.35 |
1 | tom1 | 3 | No003 | 121.0 |
2 | tom2 | 4 | No004 | 22.66 |
2 | tom2 | 5 | No005 | 300.65 |
2 | tom2 | 6 | No006 | 800.56 |
##连接查询优化手段:查询表的大小从左到右是递增的
1 | select c.id,c.name,c.age,o.orderno,o.price from customers c JOIN orders o on c.id = o.cid where ... ; //right |
1 | select c.id,c.name,c.age,o.orderno,o.price from orders o JOIN customers c on c.id = o.cid where ... ; //wrong |
###使用查询暗示 hint
1 | select /*+STREAMTABLE (o)*/ c.id,c.name,c.age,o.orderno,o.price from orders o JOIN customers c on c.id = o.cid; |
LEFT OUTER JOIN
1 | select a.id,a.name,b.id,b.orderno,b.price from customers a LEFT OUTER JOIN orders b on a.id = b.cid; |
RIGHT OUTER JOIN
1 | select a.id,a.name,b.id,b.orderno,b.price from customers a RIGHT OUTER JOIN orders b on a.id = b.cid; |
FULL OUTER JOIN
1 | select a.id,a.name,b.id,b.orderno,b.price from customers a FULL OUTER JOIN orders b on a.id = b.cid; |
左半连接,select 和where子句不能引用到右边表字段
左表的记录在右表中一旦找到对应的记录,右侧表即停止扫描。
1 | select c.id,c.name from customers c LEFT semi JOIN orders o on c.id = o.cid; |
hive不支持右半连接操作,RIGHT SEMI JOIN 不存在
笛卡尔连接 m x n
1 | select c.id,c.name,o.orderno from customers c JOIN orders o ; |
map端连接,一张小表,通过mapper的时候,将小表完全载入内存。
暗示 MAPJOIN(c) 在0.7之前使用
1 | select /*+MAPJOIN(c)*/ c.id,c.name,o.orderno from customers c JOIN orders o ; |
转换连接
- map端优化,在右外连接和全外连接中不支持
1
set hive.auto.convert.join = true
设置小表的阈值
- 如果小于这个值,就不进行Reduce了
1
set hive.mapjoin.smalltable.filesize=25000000
order by 全排序,对所有数据通过一个reduce进行排序。
- 如果开启了hive.mapred.mode=strict,在全排序时必须结合limit(分页)使用
- 现在推荐使用hive.strict.checks.* 属性
1 | `全局排序` |
sort by 每个reduce进行排序。
- 可以说类似局部排序
`局部排序`
select * from orders sort by cid asc , price desc;
Hadoop中的 sort by 局部排序
- 自定义Key的排序方法(默认是升序)
Hadoop中的 order by 全排序 如何实现
- 自定义分区函数
- 组合Key
- 设置组分区 二次排序
distribute by 等价于自定义分区函数
- 写在sort by之前
1 | `局部排序` |
cluster by 等价于 distribute by … sort by …
分桶采样
1 | select * from orders tablesample(bucket 3 out of 10 on rand()); |
按照数据块百分比采样
- 100块,抽取10块,如果总共一块,没有采样。
1 | select * from orders tablesample(0.1 percent); |
union all 联合操作
- 字段类型和个数需要匹配
select id, name from customers union all select id, orderno from orders;
视图(View,虚表)
- 降低查询的复杂度
创建视图
1 | create view v_name as select ... |
通过视图直接查询
1 | select * from view1 ; |
使用like方式创建view
1 | create view v2 like view1 |
删除视图
1 | drop view if exists v2 ; |
索引
- hive没有key(primary key , auto_increment)
创建索引
- DEFERRED REBUILD该选项时,索引为空白状态,需要rebuild才能够初始化
`创建索引`
create index idx_customers_id ON TABLE customers (id)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD IDXPROPERTIES ('creator' = 'me')
IN TABLE customers_index COMMENT 'this is a comment' ;
重建索引
1 | alter index idx_customers_id on customers REBUILD ; |
索引文件(表)
hdfs://s100:8020/user/hive/warehouse/hive1.db/customers_index/000000_0
[内容]
1 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [0]
2 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [10]
3 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [20]
创建订单索引
1 | create index idx_orders_orderno ON TABLE orders (orderno) as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD; |
重建订单索引
1 | alter index idx_orders_orderno on orders REBUILD; |
桶表 (bucket)
- 桶表是文件
- 分区是路径,是目录,是文件逻辑隔离。有效降低查询量
- hashcode算法
- 高16位向右移动 变 低16位,高低16位做异或运算,最后对桶的数量取模
创建桶表
create table … clustered by (field_name) into n buckets ;
1 | create table orderitems (id int , itemname string , oid int) clustered by (oid) into 3 buckets ; |
能否参与评论,且看个人手段。