hive_03

创建分区表

1
2
3
4
5
6
-- External partitioned table; each (province, city) pair becomes a sub-directory.
create external table hive1.test2(id int, name string, age int)
partitioned by (province string, city string)
row format delimited          -- was: "delimted" (typo, would fail to parse)
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;           -- was: "store as" (invalid keyword)

手动添加分区

1
alter table hive1.test2 add partition(province='hebei' , city='baoding');

插入数据到分区表

1
2
3
-- Static-partition insert: the select list supplies only the data columns
-- (id, name, age); the partition values come from the PARTITION clause.
-- "select *" would also emit province/city and fail with a column-count error.
insert into hive1.test2 partition (province='hebei', city='baoding')
select id, name, age from hive1.test2;
insert overwrite table test2 partition (province='hebei', city='shijiazhuang')
select id, name, age from test1;

动态分区

创建test3分区表

1
2
3
4
5
-- Partitioned table used for the dynamic-partition examples below.
create table test3(id int, name string, age int)
partitioned by (province string, city string)   -- was: "proince" (typo)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

动态分区,复制一个表数据到分区表,动态创建分区
如果两个都是动态分区需要关闭严格模式

1
set hive.exec.dynamic.partition=true;            -- enable dynamic partitioning (default true in recent Hive)
set hive.exec.dynamic.partition.mode=nonstrict;  -- turn off strict mode so every partition column may be dynamic

1
2
3
-- Fully dynamic partition insert: the province and city values come from the
-- last two select columns, matched positionally to the PARTITION clause order.
-- Requires nonstrict mode (set above) since no partition column is static.
insert into hive1.test2 partition (province,city)
select id,name,age,'henan' as province, 'kaifeng' as city
from table2;

使用分区动静混合

1
2
3
-- Mixed static/dynamic partitioning: province is fixed in the PARTITION
-- clause, only city is dynamic. The select list must supply the data
-- columns plus the dynamic partition column ONLY — selecting a value for
-- the static province as well causes a column-count mismatch.
insert into hive1.test2 partition (province='henan', city)
select id, name, age, 'kaifeng' as city
from table2;

查询期间,动态创建表,并将数据写入创建表中

1
create table test3 as select id ,name from test2 where province = 'hebei' and city = 'baoding' ;

导出hive数据到本地目录(下载)

1
insert overwrite local directory '/home/ubuntu/hive' select * from test2 where province = 'hebei' ;

导出hive数据到HDFS目录

1
insert overwrite directory 'hdfs://s100:8020/user/ubuntu/xxx' select * from hive1.test2 where province = 'hebei' ;

查询数据向多个目录同时输出

1
2
3
4
5
-- Multi-insert: scan test2 once and write each province to its own local
-- directory. (Removed the stray "ta"/"tb" tokens after t.city, which made
-- the original statement invalid.)
from test2 t
insert overwrite local directory '/home/ubuntu/hebei'
select t.id, t.name, t.age, t.province, t.city where t.province = 'hebei'
insert overwrite local directory '/home/ubuntu/henan'
select t.id, t.name, t.age, t.province, t.city where t.province = 'henan';

查询

查询,投影查询,指定表的别名

1
select col1,col2,... from table t

查询,使用函数

1
2
select upper(name) from test2 ;
select lower(name) from test2 ;

四舍五入

1
2
3
4
select round(12.345);        //四舍五入12
select floor(12.345) //地板12
select ceil(12.345) //天花板13
select rand(12.345) //随机数

聚合函数

1
2
3
4
5
select count(*) from test2;        -- row count; only count() accepts *
select sum(age) from test2;        -- sum/avg/max/min require a column: "sum(*)" is invalid
select avg(age) from test2;
select max(age) from test2;
select min(age) from test2;

去重

1
select count(distinct (name)) from test2 ;

表生成函数

1
2
-- explode() is a table-generating function that requires an ARRAY or MAP
-- argument; calling it on a plain string column (e.g. explode(name)) is an
-- error, so only the array form is shown.
select explode(array('tom','tomas','tomsLee')) from test2;

ascii函数,字符串首个字母ascii值

1
select ascii('abc');

base64字符串编码,需要二进制数据

1
select base64(BINARY('http://localhost:8080/helloworld'))

binary函数,将字符串转换成需要二进制数据

1
select binary('http://localhost:8080/helloworld');

类型转换

1
select cast('120' as BIGINT) + 200;     //320

字符串连接

1
select concat('120',200) ;       //120200

分页查询limit

1
select * from test2 limit 10;       -- limit caps the number of rows returned (simple paging)

嵌套子查询

1
2
3
4
5
6
7
8
9
10
11
12
-- Hive allows the FROM clause to come first; the two statements below are
-- equivalent. (The original second form put SELECT before FROM while still
-- using the from-first syntax, which is invalid.)
from (select * from test2 where province = 'hebei') e
select e.id, e.name, e.age where e.city = 'baoding';

select e.id, e.name, e.age
from (select * from test2 where province = 'hebei') e
where e.city = 'baoding';

**case..when..then**
-- Bucket each person's age into a named range with CASE ... WHEN ... THEN.
select
    id,
    name,
    case
        when age <= 12                then 'young'
        when age > 12 and age <= 13   then 'middle'
        when age > 13 and age <= 15   then 'old'
        else 'too old'
    end as yearState
from test2;

何时Hive可以避免MR操作

不使用MR的job就是本地模式

  1. 全表扫描:没有where子句。
    1
    select * from test2;
  2. where子句作用只有分区字段,也不需要mr
    1
    select * from test2 where province = 'hebei';
  3. 设置hive.exec.mode.local.auto = true
    该属性hive会尽量使用local模式查询。
  4. 其余的所有查询都会转换成MR

不能在where使用列别名

1
select id,name n,age from test2 where n like 't%' ;   -- WRONG: a column alias cannot be referenced in WHERE

1
select id,name n,age from test2 where name like 't%' ;   -- right: filter on the real column name

范围运算

1
select id,name n ,age from test2 where age between 12 and 15 ;

浮点数比较的规避方案

1
select cast(0.2 as FLOAT) ;

group by 分组

1
2
select count(*) ,province from test2 group by province;  
select count(*) as c ,province from test2 group by province having c > 3 ; //having是组内过滤

#JOIN 连接
##Hive的Join操作,只支持等值连接

###创建customers和orders表,一对多关系

###创建客户表

1
2
3
4
5
create table customers(id int , name string , age int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

###创建订单表

1
2
3
4
5
create table orders (id int , orderno string , price float, cid int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

###准备数据
customers.txt数据
| id | name | age |
| :—– | :—— | :—- |
| 1 | tom1 | 12 |
| 2 | tom2 | 13 |
| 3 | tom3 | 14 |

orders.txt数据
| id | orderno | price | cid |
| :—– | :———- | :——– | :—— |
| 1 | No001 | 121.34 | 1 |
| 2 | No002 | 121.35 | 1 |
| 3 | No003 | 121.00 | 1 |
| 4 | No004 | 22.66 | 2 |
| 5 | No005 | 300.65 | 2 |
| 6 | No006 | 800.56 | 2 |
| 1 | No007 | 1000.12 | 1 |

###加载数据到customers + orders中

1
2
load data local inpath '/home/ubuntu/customers.txt' into table hive1.customers;  
load data local inpath '/home/ubuntu/orders.txt' into table hive1.orders;

##内连接 JOIN … ON

1
select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id = b.cid
a.id a.name b.id b.orderno b.price
1 tom1 1 No001 121.34
1 tom1 2 No002 121.35
1 tom1 3 No003 121.0
2 tom2 4 No004 22.66
2 tom2 5 No005 300.65
2 tom2 6 No006 800.56

##连接查询优化手段:查询表的大小从左到右是递增的

1
select c.id,c.name,c.age,o.orderno,o.price from customers c JOIN orders o on c.id = o.cid where ... ;   -- right: smaller table (customers) listed first
1
select c.id,c.name,c.age,o.orderno,o.price from orders o JOIN customers c on c.id = o.cid where ... ;   -- wrong: larger table (orders) listed first

###使用查询暗示 hint

1
select /*+STREAMTABLE (o)*/ c.id,c.name,c.age,o.orderno,o.price from orders o JOIN customers c on c.id = o.cid;

LEFT OUTER JOIN

1
select a.id,a.name,b.id,b.orderno,b.price from customers a LEFT OUTER JOIN orders b on a.id = b.cid;

RIGHT OUTER JOIN

1
select a.id,a.name,b.id,b.orderno,b.price from customers a RIGHT OUTER JOIN orders b on a.id = b.cid;

FULL OUTER JOIN

1
select a.id,a.name,b.id,b.orderno,b.price from customers a FULL OUTER JOIN orders b on a.id = b.cid;

左半连接,select 和where子句不能引用到右边表字段

左表的记录在右表中一旦找到对应的记录,右侧表即停止扫描。

1
select c.id,c.name from customers c LEFT semi JOIN orders o on c.id = o.cid;

hive不支持右半连接操作,RIGHT SEMI JOIN 不存在

笛卡尔连接 m x n

1
select c.id,c.name,o.orderno from customers c JOIN orders o ;

map端连接,一张小表,通过mapper的时候,将小表完全载入内存。

暗示 MAPJOIN(c) 在0.7之前使用

1
2
select /*+MAPJOIN(c)*/ c.id,c.name,o.orderno from customers c JOIN orders o ;
select /*+MAPJOIN(o)*/ c.id,c.name,o.orderno from customers c JOIN orders o ;

转换连接

  • map端优化,在右外连接和全外连接中不支持
    1
    set hive.auto.convert.join = true

设置小表的阈值

  • 如果小表的文件大小小于这个阈值,Hive会把连接转换为map端连接,省去Reduce阶段
    1
    set hive.mapjoin.smalltable.filesize=25000000

order by 全排序,对所有数据通过一个reduce进行排序。

  • 如果开启了hive.mapred.mode=strict,在全排序时必须结合limit(分页)使用
  • 现在推荐使用hive.strict.checks.* 属性
1
2
`全局排序`
select * from orders order by cid asc , price desc;

sort by 每个reduce进行排序。

  • 可以说类似局部排序
    1
    2
    `局部排序`
    select * from orders sort by cid asc , price desc;

Hadoop中的 sort by 局部排序

  • 自定义Key的排序方法(默认是升序)

Hadoop中的 order by 全排序 如何实现

  • 自定义分区函数
  • 组合Key
  • 设置组分区 二次排序

distribute by 等价于自定义分区函数

  • 写在sort by之前
1
2
`局部排序`
select * from orders distribute by cid sort by price desc ;

cluster by 等价于 distribute by … sort by …


分桶采样

1
select * from orders tablesample(bucket 3 out of 10 on number);

按照数据块百分比采样

  • 按数据块的百分比抽样(如下例 0.1 percent);如果表总共只有一个数据块,则不会进行采样。
1
select * from orders tablesample(0.1 percent);

union all 联合操作

  • 字段类型和个数需要匹配
    1
    -- union all keeps every row; plain "union" would deduplicate (extra sort),
    -- which is not what this section demonstrates.
    select id, name from customers union all select id, orderno from orders;

视图(View,虚表)

  • 降低查询的复杂度

创建视图

1
2
3
4
5
6
create view v_name as select ...
create view view1
as select c.id cid,c.name,c.age, o.id oid,o.orderno,o.price
from customers c
LEFT OUTER JOIN orders o
on o.cid = c.id ;

通过视图直接查询

1
2
select * from view1 ;  
select * from view1 where price > 200 ;

使用like方式创建view

1
-- NOTE(review): Hive supports CREATE TABLE ... LIKE but, per the DDL manual,
-- not CREATE VIEW ... LIKE — confirm this statement against the target Hive version.
create view v2 like view1

删除视图

1
drop view if exists v2 ;

索引

  • hive没有key(primary key , auto_increment)

创建索引

  • DEFERRED REBUILD该选项时,索引为空白状态,需要rebuild才能够初始化
    1
    2
    3
    4
    5
    `创建索引`
    create index idx_customers_id ON TABLE customers (id)
    as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
    WITH DEFERRED REBUILD IDXPROPERTIES ('creator' = 'me')
    IN TABLE customers_index COMMENT 'this is a comment' ;

重建索引

1
alter index idx_customers_id on customers REBUILD ;
  • 索引文件(表)
    hdfs://s100:8020/user/hive/warehouse/hive1.db/customers_index/000000_0
    [内容]
    1 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [0]
    2 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [10]
    3 hdfs://mycluster/user/hive/warehouse/hive1.db/customers/customers.txt [20]

创建订单索引

1
2
3
4
create index idx_orders_orderno ON TABLE orders (orderno)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD IDXPROPERTIES ('creator' = 'me')
IN TABLE orders_index COMMENT 'this is a comment' ;

重建订单索引

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
alter index idx_orders_orderno on orders REBUILD;
```
- 索引地址
/user/hive/warehouse/hive1.db/orders_index/000000_0
- 内容
No001hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt0
No002hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt17
No003hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt34
No004hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt51
No005hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt67
No006hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt84
No007hdfs://mycluster/user/hive/warehouse/hive1.db/orders/orders.txt101


`删除索引`
```sql
drop index idx_customers_id on table customers;

桶表 (bucket)

  • 桶表是文件
  • 分区是路径,是目录,是文件逻辑隔离。有效降低查询量
  • hashcode算法
    • 高16位向右移动 变 低16位,高低16位做异或运算,最后对桶的数量取模

创建桶表
create table … clustered by (field_name) into n buckets ;

1
2
3
4
5
6
7
8
-- Bucketed table: rows are hashed on oid and distributed into 3 bucket files.
-- (Removed the stray trailing "into 5 buckets ;" that conflicted with the
-- 3-bucket specification and made the original statement invalid.)
create table orderitems (id int, itemname string, oid int)
clustered by (oid)
into 3 buckets
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;