Handling Small Files

Method 1: map-only merge via CombineHiveInputFormat and the hive.merge settings

sql="
set mapred.max.split.size=256000000;
set mapred.min.split.size.per.node=100000000;
set mapred.min.split.size.per.rack=100000000;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.merge.smallfiles.avgsize=512000000;
set hive.merge.mapfiles = true;
set hive.exec.dynamici.partition=true;
set hive.merge.mapredfiles = true;
set hive.merge.size.per.task = 512000000;
set hive.support.quoted.identifiers=None;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.exec.max.dynamic.partitions=10000;
set mapreduce.map.memory.mb=8192;
set mapreduce.map.java.opts='-Xmx8192M' -XX:+UseG1GC;;
set mapreduce.reduce.memory.mb=8192;
set mapreduce.reduce.java.opts='-Xmx8192M';
set mapred.job.queue.name=root.;
insert overwrite table tb partition(month,source) select * from tb where month='202111' and source='anhui_mobile'
;
"
hive -e "$sql"

Method 2: force a reduce stage with DISTRIBUTE BY

hive -e "
set mapred.job.queue.name=root.;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table tb partition(month,source)
select * from tb where month='202101' and source='unicom'
distribute by month,source
;
"

Method 1 is fast because it runs map-only (no reduce stage); method 2 goes through a reduce stage introduced by DISTRIBUTE BY. The source-code details are left for further study.
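Short of reading the source, EXPLAIN makes the difference visible in the query plans: method 2's plan should contain a "Reduce Operator Tree" section for the DISTRIBUTE BY, while method 1's plan typically shows only map and merge stages. A minimal sketch:

hive -e "
explain
insert overwrite table tb partition(month,source)
select * from tb where month='202101' and source='unicom'
distribute by month,source
;
"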
