hive_03

创建分区表

1
2
3
4
5
6
-- External partitioned table; each (province, city) pair becomes a sub-directory.
create external table hive1.test2(id int, name string, age int)
partitioned by (province string, city string)
row format delimited          -- was: "delimted" (typo, would fail to parse)
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;           -- was: "store as" (invalid keyword)

手动添加分区

1
alter table hive1.test2 add partition(province='hebei' , city='baoding');

插入数据到分区表

1
2
3
-- Static-partition insert: the select list supplies only the data columns
-- (id, name, age); the partition values come from the PARTITION clause.
-- "select *" would also emit province/city and fail with a column-count error.
insert into hive1.test2 partition (province='hebei', city='baoding')
select id, name, age from hive1.test2;
insert overwrite table test2 partition (province='hebei', city='shijiazhuang')
select id, name, age from test1;

动态分区

创建test3分区表

1
2
3
4
5
-- Partitioned table used for the dynamic-partition examples below.
create table test3(id int, name string, age int)
partitioned by (province string, city string)   -- was: "proince" (typo)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

动态分区,复制一个表数据到分区表,动态创建分区
如果两个都是动态分区需要关闭严格模式

1
set hive.exec.dynamic.partition=true;            -- enable dynamic partitioning (default true in recent Hive)
set hive.exec.dynamic.partition.mode=nonstrict;  -- turn off strict mode so every partition column may be dynamic

1
2
3
-- Fully dynamic partition insert: the province and city values come from the
-- last two select columns, matched positionally to the PARTITION clause order.
-- Requires nonstrict mode (set above) since no partition column is static.
insert into hive1.test2 partition (province,city)
select id,name,age,'henan' as province, 'kaifeng' as city
from table2;

使用分区动静混合

1
2
3
-- Mixed static/dynamic partitioning: province is fixed in the PARTITION
-- clause, only city is dynamic. The select list must supply the data
-- columns plus the dynamic partition column ONLY — selecting a value for
-- the static province as well causes a column-count mismatch.
insert into hive1.test2 partition (province='henan', city)
select id, name, age, 'kaifeng' as city
from table2;

查询期间,动态创建表,并将数据写入创建表中

1
create table test3 as select id ,name from test2 where province = 'hebei' and city = 'baoding' ;

导出hive数据到本地目录(下载)

1
insert overwrite local directory '/home/ubuntu/hive' select * from test2 where province = 'hebei' ;

导出hive数据到HDFS目录

1
insert overwrite directory 'hdfs://s100:8020/user/ubuntu/xxx' select * from hive1.test2 where province = 'hebei' ;

查询数据向多个目录同时输出

1
2
3
4
5
-- Multi-insert: scan test2 once and write each province to its own local
-- directory. (Removed the stray "ta"/"tb" tokens after t.city, which made
-- the original statement invalid.)
from test2 t
insert overwrite local directory '/home/ubuntu/hebei'
select t.id, t.name, t.age, t.province, t.city where t.province = 'hebei'
insert overwrite local directory '/home/ubuntu/henan'
select t.id, t.name, t.age, t.province, t.city where t.province = 'henan';

查询

查询,投影查询,指定表的别名

1
select col1,col2,... from table t

查询,使用函数

1
2
select upper(name) from test2 ;
select lower(name) from test2 ;

四舍五入

1
2
3
4
select round(12.345);        //四舍五入12
select floor(12.345) //地板12
select ceil(12.345) //天花板13
select rand(12.345) //随机数

聚合函数

1
2
3
4
5
select count(*) from test2;        -- row count; only count() accepts *
select sum(age) from test2;        -- sum/avg/max/min require a column: "sum(*)" is invalid
select avg(age) from test2;
select max(age) from test2;
select min(age) from test2;

去重

1
select count(distinct (name)) from test2 ;

表生成函数

1
2
-- explode() is a table-generating function that requires an ARRAY or MAP
-- argument; calling it on a plain string column (e.g. explode(name)) is an
-- error, so only the array form is shown.
select explode(array('tom','tomas','tomsLee')) from test2;

ascii函数,字符串首个字母ascii值

1
select ascii('abc');

base64字符串编码,需要二进制数据

1
select base64(BINARY('http://localhost:8080/helloworld'))

binary函数,将字符串转换成需要二进制数据

1
select binary('http://localhost:8080/helloworld');

类型转换

1
select cast('120' as BIGINT) + 200;     //320

字符串连接

1
select concat('120',200) ;       //120200

分页查询limit

1
select * from test2 limit 10;       -- limit caps the number of rows returned (simple paging)

嵌套子查询

1
2
3
4
5
6
7
8
9
10
11
12
-- Hive allows the FROM clause to come first; the two statements below are
-- equivalent. (The original second form put SELECT before FROM while still
-- using the from-first syntax, which is invalid.)
from (select * from test2 where province = 'hebei') e
select e.id, e.name, e.age where e.city = 'baoding';

select e.id, e.name, e.age
from (select * from test2 where province = 'hebei') e
where e.city = 'baoding';

**case..when..then**
-- Bucket each person's age into a named range with CASE ... WHEN ... THEN.
select
    id,
    name,
    case
        when age <= 12                then 'young'
        when age > 12 and age <= 13   then 'middle'
        when age > 13 and age <= 15   then 'old'
        else 'too old'
    end as yearState
from test2;

何时Hive可以避免MR操作

不使用MR的job就是本地模式

  1. 全表扫描:没有where子句。
    1
    select * from test2;
  2. where子句作用只有分区字段,也不需要mr
    1
    select * from test2 where province = 'hebei';
  3. 设置hive.exec.mode.local.auto = true
    该属性hive会尽量使用local模式查询。
  4. 其余的所有查询都会转换成MR

不能在where使用列别名

1
select id,name n,age from test2 where n like 't%' ;   -- WRONG: a column alias cannot be referenced in WHERE

1
select id,name n,age from test2 where name like 't%' ;   -- right: filter on the real column name

范围运算

1
select id,name n ,age from test2 where age between 12 and 15 ;

浮点数比较的规避方案

1
select cast(0.2 as FLOAT) ;

group by 分组

1
2
select count(*) ,province from test2 group by province;  
select count(*) as c ,province from test2 group by province having c > 3 ; //having是组内过滤

#JOIN 连接
##Hive的Join操作,只支持等值连接

###创建customers和orders表,一对多关系

###创建客户表

1
2
3
4
5
create table customers(id int , name string , age int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

###创建订单表

1
2
3
4
5
create table orders (id int , orderno string , price float, cid int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

###准备数据
customers.txt数据
| id | name | age |
| :—– | :—— | :—- |
| 1 | tom1 | 12 |
| 2 | tom2 | 13 |
| 3 | tom3 | 14 |

orders.txt数据
| id | orderno | price | cid |
| :—– | :———- | :——– | :—— |
| 1 | No001 | 121.34 | 1 |
| 2 | No002 | 121.35 | 1 |
| 3 | No003 | 121.00 | 1 |
| 4 | No004 | 22.66 | 2 |
| 5 | No005 | 300.65 | 2 |
| 6 | No006 | 800.56 | 2 |
| 1 | No007 | 1000.12 | 1 |

###加载数据到customers + orders中

1
2
load data local inpath '/home/ubuntu/customers.txt' into table hive1.customers;  
load data local inpath '/home/ubuntu/orders.txt' into table hive1.orders;

##内连接 JOIN … ON

1
select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id = b.cid
a.id a.name b.id b.orderno b.price
1 tom1 1 No001 121.34
1 tom1 2 No002 121.35
1 tom1 3 No003 121.0
2 tom2 4 No004 22.66
2 tom2 5 No005 300.65
2 tom2 6 No006 800.56

##连接查询优化手段:查询表的大小从左到右是递增的

1
select c.id,c.name,c.age,o.orderno,o.price from customers c JOIN orders o on c.id = o.cid where ... ;   -- right: smaller table (customers) listed first
1
select c.id,c.name,c.age,o.orderno,o.price from orders o JOIN customers c on c.id = o.cid where ... ;   -- wrong: larger table (orders) listed first

###使用查询暗示 hint

1
select /*+STREAMTABLE (o)*/ c.id,c.name,c.age,o.orderno,o.price from orders o JOIN customers c on c.id = o.cid;

LEFT OUTER JOIN

1
select a.id,a.name,b.id,b.orderno,b.price from customers a LEFT OUTER JOIN orders b on a.id = b.cid;

RIGHT OUTER JOIN

1
select a.id,a.name,b.id,b.orderno,b.price from customers a RIGHT OUTER JOIN orders b on a.id = b.cid;

FULL OUTER JOIN

1
select a.id,a.name,b.id,b.orderno,b.price from customers a FULL OUTER JOIN orders b on a.id = b.cid;

左半连接,select 和where子句不能引用到右边表字段

左表的记录在右表中一旦找到对应的记录,右侧表即停止扫描。

1
select c.id,c.name from customers c LEFT semi JOIN orders o on c.id = o.cid;

hive不支持右半连接操作,RIGHT SEMI JOIN 不存在

笛卡尔连接 m x n

1
select c.id,c.name,o.orderno from customers c JOIN orders o ;

map端连接,一张小表,通过mapper的时候,将小表完全载入内存。

暗示 MAPJOIN(c) 在0.7之前使用

1
2
select /*+MAPJOIN(c)*/ c.id,c.name,o.orderno from customers c JOIN orders o ;
select /*+MAPJOIN(o)*/ c.id,c.name,o.orderno from customers c JOIN orders o ;

转换连接

  • map端优化,在右外连接和全外连接中不支持
    1
    set hive.auto.convert.join = true

设置小表的阈值

  • 如果小表的文件大小小于这个阈值,Hive会把连接转换为map端连接,省去Reduce阶段
    1
    set hive.mapjoin.smalltable.filesize=25000000

order by 全排序,对所有数据通过一个reduce进行排序。

  • 如果开启了hive.mapred.mode=strict,在全排序时必须结合limit(分页)使用
  • 现在推荐使用hive.strict.checks.* 属性
1
2
`全局排序`
select * from orders order by cid asc , price desc;

sort by 每个reduce进行排序。

  • 可以说类似局部排序
    1
    2
    `局部排序`
    select * from orders sort by cid asc , price desc;

Hadoop中的 sort by 局部排序

  • 自定义Key的排序方法(默认是升序)

Hadoop中的 order by 全排序 如何实现

  • 自定义分区函数
  • 组合Key
  • 设置组分区 二次排序

distribute by 等价于自定义分区函数

  • 写在sort by之前
1
2
`局部排序`
select * from orders distribute by cid sort by price desc ;

cluster by 等价于 distribute by … sort by …


分桶采样

1
select * from orders tablesample(bucket 3 out of 10 on number);

按照数据块百分比采样

  • 按数据块的百分比抽样(如下例 0.1 percent);如果表总共只有一个数据块,则不会进行采样。
1
select * from orders tablesample(0.1 percent);

union all 联合操作

  • 字段类型和个数需要匹配
    1
    -- union all keeps every row; plain "union" would deduplicate (extra sort),
    -- which is not what this section demonstrates.
    select id, name from customers union all select id, orderno from orders;

视图(View,虚表)

  • 降低查询的复杂度

创建视图

1
2
3
4
5
6
create view v_name as select ...
create view view1
as select c.id cid,c.name,c.age, o.id oid,o.orderno,o.price
from customers c
LEFT OUTER JOIN orders o
on o.cid = c.id ;

通过视图直接查询

1
2
select * from view1 ;  
select * from view1 where price > 200 ;

使用like方式创建view

1
-- NOTE(review): Hive supports CREATE TABLE ... LIKE but, per the DDL manual,
-- not CREATE VIEW ... LIKE — confirm this statement against the target Hive version.
create view v2 like view1

删除视图

1
drop view if exists v2 ;

索引

  • hive没有key(primary key , auto_increment)

创建索引

  • DEFERRED REBUILD该选项时,索引为空白状态,需要rebuild才能够初始化
    1
    2
    3
    4
    5
    `创建索引`
    create index idx_customers_id ON TABLE customers (id)
    as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
    WITH DEFERRED REBUILD IDXPROPERTIES ('creator' = 'me')
    IN TABLE customers_index COMMENT 'this is a comment' ;

重建索引

1
alter index idx_customers_id on customers REBUILD ;
  • 索引文件(表)
    hdfs://s100:8020/user/hive/warehouse/hive1.db/customers_index/000000_0
    [内容]
    1 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [0]
    2 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [10]
    3 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [20]

创建订单索引

1
2
3
4
create index idx_orders_orderno ON TABLE orders (orderno)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD IDXPROPERTIES ('creator' = 'me')
IN TABLE orders_index COMMENT 'this is a comment' ;

重建订单索引

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
alter index idx_orders_orderno on orders REBUILD;
```
- 索引地址
/user/hive/warehouse/hive1.db/orders_index/000000_0
- 内容
No001hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt0
No002hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt17
No003hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt34
No004hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt51
No005hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt67
No006hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt84
No007hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt101


`删除索引`
```sql
drop index idx_customers_id on table customers;

桶表 (bucket)

  • 桶表是文件
  • 分区是路径,是目录,是文件逻辑隔离。有效降低查询量
  • hashcode算法
    • 高16位向右移动 变 低16位,高低16位做异或运算,最后对桶的数量取模

创建桶表
create table … clustered by (field_name) into n buckets ;

1
2
3
4
5
6
7
8
-- Bucketed table: rows are hashed on oid and distributed into 3 bucket files.
-- (Removed the stray trailing "into 5 buckets ;" that conflicted with the
-- 3-bucket specification and made the original statement invalid.)
create table orderitems (id int, itemname string, oid int)
clustered by (oid)
into 3 buckets
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;