Big data cluster and Spark configuration for the test and production environments

YARN, Capacity Scheduler, HDFS, Spark

Test Cluster

The test cluster is deployed in standalone (non-HA) mode.

  • hdfs-site.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

    <configuration>
    <!-- About service -->
    <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>probd04:50090</value>
    </property>
    <property>
    <name>dfs.namenode.secondary.https-address</name>
    <value>probd04:50091</value>
    </property>
    <property>
    <name>dfs.namenode.name.dir</name>
    <value>
    file:///data/system/probd/probd-0.3.1/hadoop-2.6.3/name
    </value>
    </property>



    <!-- About datanode -->
    <property>
    <name>dfs.datanode.data.dir</name>
    <value>
    file:///data/system/probd/probd-0.3.1/hadoop-2.6.3/data/dfs/data
    </value>
    </property>



    <property>
    <name>dfs.replication</name>
    <value>3</value>
    </property>



    <!-- About web hdfs -->
    <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
    </property>


    <property>
    <name>dfs.permissions</name>
    <value>false</value>
    </property>
    </configuration>
  • yarn-site.xml

    <?xml version="1.0"?>
    <configuration>

    <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>10240</value>
    </property>

    <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle,spark_shuffle</value>
    </property>



    <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
    <name>yarn.resourcemanager.address</name>
    <value>probd02:8032</value>
    </property>
    <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>probd02:8030</value>
    </property>
    <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>probd02:8031</value>
    </property>
    <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>probd02:8033</value>
    </property>
    <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>probd02:8088</value>
    </property>

    <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>32</value>
    </property>
    <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
    <description>
    Whether physical memory limits will be enforced for containers.
    </description>
    </property>
    <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
    <description>
    Whether virtual memory limits will be enforced for containers.
    </description>
    </property>
    <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>2048</value>
    </property>
    <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>10240</value>
    </property>
    <property>
    <name>yarn.app.mapreduce.am.resource.mb</name>
    <value>2048</value>
    </property>
    <property>
    <name>yarn.app.mapreduce.am.command-opts</name>
    <value>-Xmx1024m</value>
    </property>
    <property>
    <name>spark.shuffle.service.enabled</name>
    <value>true</value>
    </property>
    <property>
    <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
    <value>org.apache.spark.network.yarn.YarnShuffleService</value>
    </property>
    <property>
    <name>spark.shuffle.service.port</name>
    <value>7337</value>
    </property>

    <!-- Whether to run a thread that checks the physical memory used by each task and kills any task that exceeds its allocation; default is true -->
    <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
    </property>

    <!-- Whether to run a thread that checks the virtual memory used by each task and kills any task that exceeds its allocation; default is true -->
    <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
    </property>
    <!-- Enable log aggregation -->
    <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
    </property>

    <!-- Retain aggregated logs for 7 days -->
    <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
    </property>
    <!--<property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
    </property>-->

    <property>
    <name>yarn.scheduler.minimum-allocation-vcores</name>
    <value>20</value>
    </property>

    <property>
    <name>yarn.scheduler.maximum-allocation-vcores</name>
    <value>32</value>
    </property>
    </configuration>
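
    To confirm that these NodeManager resource settings are what the ResourceManager actually registers, a quick check (assuming the yarn client is available on a cluster node):

    # List the NodeManagers registered with the ResourceManager;
    # the per-node memory (10240 MB) and vcores (32) are also visible in the
    # RM web UI configured above at http://probd02:8088
    yarn node -list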
  • capacity-scheduler.xml

    <configuration>
    <property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default</value>
    </property>
    <property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>100</value>
    </property>
    <property>
    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
    <value>100</value>
    </property>
    <property>
    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
    <value>1</value>
    </property>
    <property>
    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
    <value>*</value>
    </property>
    <property>
    <name>yarn.scheduler.capacity.resource-calculator</name>
    <value>org.apache.hadoop.yarn.util.resource.DominantResourceCalculator</value>
    </property>
    </configuration>
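
    Queue changes in capacity-scheduler.xml can be applied to a running ResourceManager without a restart; a minimal sketch:

    # Re-read capacity-scheduler.xml on the running ResourceManager
    yarn rmadmin -refreshQueues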
  • mapred-site.xml

    <configuration>
    <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
    </property>
    <property>
    <name>mapreduce.jobhistory.address</name>
    <value>probd03:10020</value>
    </property>
    <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>probd03:19888</value>
    </property>
    </configuration>
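
    The jobhistory addresses above assume the history server daemon is running on probd03; a minimal sketch of starting it there (assuming $HADOOP_HOME points at the hadoop-2.6.3 install):

    # On probd03: start the MapReduce JobHistory server (serves ports 10020 and 19888)
    $HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver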

Production

Hardware

  • Memory
    (screenshot of per-node memory; image not included in this export)

  • Cores

    (screenshot of per-node CPU cores; image not included in this export)

Configuration

  • yarn-site.xml

    <?xml version="1.0"?>
    <configuration>
    <!-- resourcemanager -->
    <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
    </property>
    <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>yarn-probd</value>
    </property>
    <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
    </property>
    <property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>probd02</value>
    </property>
    <property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>probd03</value>
    </property>


    <!-- About zookeeper -->
    <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>probd02:2181,probd03:2181,probd04:2181</value>
    </property>

    <!-- About nodemanager memory allocate, that is, NM memory resource -->
    <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>40960</value>
    </property>
    <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>40960</value>
    </property>
    <property>
    <name>yarn.resourcemanager.address</name>
    <value>8032</value>
    </property>
    <!-- About nodemanager CPU allocate, that is, NM CPU resource -->
    <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>40</value>
    <description>
    Number of vcores that can be allocated for containers.
    This is used by the RM scheduler when allocating resources for containers.
    This is not used to limit the number of physical cores used by YARN containers.
    </description>
    </property>
    <property>
    <name>yarn.nodemanager.resource.percentage-physical-cpu-limit</name>
    <value>100</value>
    <description>
    Percentage of CPU that can be allocated for containers.
    This setting allows users to limit the amount of CPU that YARN containers use.
    Currently functional only on Linux using cgroups.
    The default is to use 100% of CPU.
    </description>
    </property>

    <property>
    <name>yarn.log.server.url</name>
    <value>http://probd:19888/jobhistory/job/</value>
    </property>
    <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>864000</value>
    </property>
    <property>
    <name>spark.shuffle.service.enabled</name>
    <value>true</value>
    </property>
    <property>
    <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
    <value>org.apache.spark.network.yarn.YarnShuffleService</value>
    </property>
    <property>
    <name>spark.shuffle.service.port</name>
    <value>7337</value>
    </property>
    <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
    </property>

    <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
    </property>

    <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>true</value>
    </property>

    </configuration>
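
    With ResourceManager HA enabled, only one of rm1/rm2 is active at any time; a quick way to check which, as a sketch:

    # Query the HA state of the ResourceManagers declared in yarn.resourcemanager.ha.rm-ids
    yarn rmadmin -getServiceState rm1
    yarn rmadmin -getServiceState rm2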
  • hdfs-site.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <!--
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. See accompanying LICENSE file.
    -->

    <!-- Put site-specific property overrides in this file. -->

    <configuration>
    <!-- About service -->
    <property>
    <name>dfs.nameservices</name>
    <value>probd</value>
    <description>For all clusters.</description>
    </property>

    <!-- About namenode -->
    <property>
    <name>dfs.ha.namenodes.probd</name>
    <value>nn1,nn2</value>
    </property>
    <property>
    <name>dfs.namenode.rpc-address.probd.nn1</name>
    <value>probd02:8020</value>
    </property>
    <property>
    <name>dfs.namenode.rpc-address.probd.nn2</name>
    <value>probd03:8020</value>
    </property>
    <property>
    <name>dfs.namenode.http-address.probd.nn1</name>
    <value>probd02:50070</value>
    </property>
    <property>
    <name>dfs.namenode.http-address.probd.nn2</name>
    <value>probd03:50070</value>
    </property>
    <property>
    <name>dfs.namenode.name.dir</name>
    <value>
    file:///probd/probd-0.3.1/hadoop-2.6.3/name
    </value>
    </property>
    <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://probd02:8485;probd03:8485;probd04:8485/probd</value>
    </property>

    <!-- About journal node -->
    <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/probd/probd-0.3.1/hadoop-2.6.3/qjm</value>
    </property>

    <!-- About sshfence -->
    <property>
    <name>dfs.ha.fencing.methods</name>
    <value>shell(/bin/true)</value>
    </property>
    <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/root/.ssh/id_rsa</value>
    </property>

    <!-- About datanode -->
    <property>
    <name>dfs.datanode.data.dir</name>
    <value>
    file:///probd/probd-0.3.1/hadoop-2.6.3/data/dfs/data
    </value>
    </property>
    <property>
    <name>dfs.datanode.handler.count</name>
    <value>30</value>
    <description>thread count for request</description>
    </property>
    <property>
    <name>dfs.datanode.max.xcievers</name>
    <value>4096</value>
    </property>
    <property>
    <name>dfs.datanode.du.reserved</name>
    <value>10737418240</value>
    <description>10G</description>
    </property>
    <property>
    <name>dfs.datanode.directoryscan.threads</name>
    <value>3</value>
    <description>
    How many threads should the threadpool used to compile reports
    for volumes in parallel have.
    </description>
    </property>

    <!--
    <property>
    <name>dfs.datanode.ipc.address</name>
    <value>node102.probd:50020</value>
    </property>
    <property>
    <name>dfs.datanode.http.address</name>
    <value>node102.probd:50075</value>
    </property>
    -->

    <!-- About client failover -->
    <property>
    <name>dfs.client.failover.proxy.provider.probd</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>

    <!-- About failover -->
    <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
    </property>

    <!-- About replication -->
    <!--
    <property>
    <name>dfs.replication.max</name>
    <value>3</value>
    </property>
    -->
    <property>
    <name>dfs.replication</name>
    <value>3</value>
    </property>

    <!--
    <property>
    <name>dfs.datanode.ipc.address</name>
    <value>node102.probd:50020</value>
    </property>
    <property>
    <name>dfs.datanode.http.address</name>
    <value>node102.probd:50075</value>
    </property>
    -->

    <!-- About web hdfs -->
    <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
    </property>

    <!-- About dfsadmin -->
    <property>
    <name>dfs.hosts.exclude</name>
    <value>/probd/probd-0.3.1/hadoop-2.6.3/etc/hadoop/nodes-exclude</value>
    </property>

    <!-- About balance -->
    <property>
    <name>dfs.balance.bandwidthPerSec</name>
    <value>10485760</value>
    <description>
    Specifies the maximum bandwidth that each datanode can utilize
    for the balancing purpose in term of the number of bytes per second.
    The current value is 10 MB/s; the default is 1 MB/s.
    </description>
    </property>

    <!-- add: 1 namenode:time out-->
    <property>
    <name>dfs.qjournal.write-txns.timeout.ms</name>
    <value>90000</value>
    </property>
    <property>
    <name>dfs.qjournal.start-segment.timeout.ms</name>
    <value>120000</value>
    </property>
    <property>
    <name>dfs.qjournal.select-input-streams.timeout.ms</name>
    <value>120000</value>
    </property>
    <property>
    <name>ha.zookeeper.session-timeout.ms</name>
    <value>120000</value>
    </property>
    <property>
    <name>dfs.qjournal.prepare-recovery.timeout.ms</name>
    <value>600000000</value>
    </property>
    <property>
    <name>dfs.qjournal.accept-recovery.timeout.ms</name>
    <value>600000000</value>
    </property>
    <property>
    <name>dfs.qjournal.finalize-segment.timeout.ms</name>
    <value>600000000</value>
    </property>
    <property>
    <name>dfs.qjournal.get-journal-state.timeout.ms</name>
    <value>600000000</value>
    </property>
    <property>
    <name>dfs.qjournal.new-epoch.timeout.ms</name>
    <value>600000000</value>
    </property>
    <property>
    <name>dfs.image.transfer.bandwidthPerSec</name>
    <value>1048576</value>
    </property>
    <property>
    <name>dfs.namenode.handler.count</name>
    <value>30</value>
    <description>thread count for request</description>
    </property>
    <property>
    <name>dfs.datanode.socket.write.timeout</name>
    <value>10800000</value>
    </property>
    <property>
    <name>dfs.client.socket-timeout</name>
    <value>600000</value>
    </property>

    <!--
    <property>
    <name>dfs.datanode.fsdataset.volume.choosing.policy</name>
    <value>org.apache.hadoop.hdfs.server.datanode.fsdataset.AvailableSpaceVolumeChoosingPolicy</value>
    </property>
    <property>
    <name>dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold</name>
    <value>32212254720</value>
    </property>
    <property>
    <name>dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction</name>
    <value>1</value>
    </property> -->

    </configuration>
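
    A few operational commands that pair with the settings above (NameNode HA, the dfs.hosts.exclude file, and the balancer bandwidth); a sketch, assuming the client is configured against the probd nameservice:

    # Which NameNode is currently active
    hdfs haadmin -getServiceState nn1
    hdfs haadmin -getServiceState nn2

    # Apply changes to the nodes-exclude file referenced by dfs.hosts.exclude
    hdfs dfsadmin -refreshNodes

    # Raise the balancer bandwidth at runtime (bytes per second), e.g. 50 MB/s
    hdfs dfsadmin -setBalancerBandwidth 52428800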
  • capacity-scheduler.xml

    <!--
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. See accompanying LICENSE file.
    -->
    <configuration>

    <property>
    <name>yarn.scheduler.capacity.maximum-applications</name>
    <value>10000</value>
    <description>
    Maximum number of applications that can be pending and running.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.1</value>
    <description>
    Maximum percent of resources in the cluster which can be used to run
    application masters i.e. controls number of concurrent running
    applications.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.resource-calculator</name>
    <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
    <description>
    The ResourceCalculator implementation to be used to compare
    Resources in the scheduler.
    The default i.e. DefaultResourceCalculator only uses Memory while
    DominantResourceCalculator uses dominant-resource to compare
    multi-dimensional resources such as Memory, CPU etc.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default</value>
    <description>
    The queues at the this level (root is the root queue).
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>100</value>
    <description>Default queue target capacity.</description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
    <value>1</value>
    <description>
    Default queue user limit a percentage from 0.0 to 1.0.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
    <value>100</value>
    <description>
    The maximum capacity of the default queue.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.root.default.state</name>
    <value>RUNNING</value>
    <description>
    The state of the default queue. State can be one of RUNNING or STOPPED.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
    <value>*</value>
    <description>
    The ACL of who can submit jobs to the default queue.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
    <value>*</value>
    <description>
    The ACL of who can administer jobs on the default queue.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.node-locality-delay</name>
    <value>40</value>
    <description>
    Number of missed scheduling opportunities after which the CapacityScheduler
    attempts to schedule rack-local containers.
    Typically this should be set to the number of nodes in the cluster. By default it is
    set to 40, approximately the number of nodes in one rack.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.queue-mappings</name>
    <value></value>
    <description>
    A list of mappings that will be used to assign jobs to queues
    The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]*
    Typically this list will be used to map users to queues,
    for example, u:%user:%user maps all users to queues with the same name
    as the user.
    </description>
    </property>

    <property>
    <name>yarn.scheduler.capacity.queue-mappings-override.enable</name>
    <value>false</value>
    <description>
    If a queue mapping is present, will it override the value specified
    by the user? This can be used by administrators to place jobs in queues
    that are different than the one specified by the user.
    The default is false.
    </description>
    </property>

    </configuration>
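
    The queue layout the active ResourceManager actually loaded can be inspected over its REST API; a sketch, assuming the default web UI port 8088 on rm1 (probd02), since the production yarn-site.xml does not set yarn.resourcemanager.webapp.address explicitly:

    # Dump the Capacity Scheduler configuration as seen by the RM
    curl http://probd02:8088/ws/v1/cluster/scheduler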
  • mapred-site.xml

      <?xml version="1.0"?>
      <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
      <configuration>
      <property>
      <name>mapreduce.framework.name</name>
      <value>yarn</value>
      </property>
      <property>
      <name>mapred.output.compress</name>
      <value>true</value>
      <description>
      Should the job outputs be compressed?
      </description>
      </property>
      <property>
      <name>mapred.output.compression.type</name>
      <value>RECORD</value>
      <description>
      If the job outputs are to be compressed as SequenceFiles, how should
      they be compressed? Should be one of NONE, RECORD or BLOCK.
      </description>
      </property>
      <property>
      <name>mapred.output.compression.codec</name>
      <value>org.apache.hadoop.io.compress.SnappyCodec</value>
      <description>
      If the job outputs are compressed, how should they be compressed?
      </description>
      </property>
      <property>
      <name>mapred.compress.map.output</name>
      <value>true</value>
      <description>
      Should the outputs of the maps be compressed before being
      sent across the network. Uses SequenceFile compression.
      </description>
      </property>
      <property>
      <name>mapred.map.output.compression.codec</name>
      <value>org.apache.hadoop.io.compress.SnappyCodec</value>
      <description>
      If the map outputs are compressed, how should they be compressed?
      </description>
      </property>

      <!--
      <property>
      <name>mapreduce.job.ubertask.enable</name>
      <value>true</value>
      </property>
      <property>
      <name>mapreduce.job.ubertask.maxmaps</name>
      <value>9</value>
      </property>
      <property>
      <name>mapreduce.job.ubertask.maxreduces</name>
      <value>1</value>
      </property>
      -->

      <!-- About memory resource allocate -->
      <property>
      <name>mapreduce.map.memory.mb</name>
      <value>20480</value>
      <description>
      The amount of memory to request from the scheduler for each map task.
      </description>
      </property>
      <property>
      <name>mapreduce.reduce.memory.mb</name>
      <value>20480</value>
      <description>
      The amount of memory to request from the scheduler for each reduce task.
      </description>
      </property>
      <property>
      <name>mapreduce.child.java.opts</name>
      <value>-Xmx10240m</value>
      </property>


      <!--
      <property>
      <name>mapreduce.map.java.opts</name>
      <value>-Xmx800m</value>
      </property>
      <property>
      <name>mapreduce.reduce.java.opts</name>
      <value>-Xmx800m</value>
      </property>
      -->
      <!-- About memory resource allocate -->
      <property>
      <name>mapreduce.map.cpu.vcores</name>
      <value>20</value>
      <description>
      The number of virtual cores to request from the scheduler for each map task.
      </description>
      </property>
      <property>
      <name>mapreduce.reduce.cpu.vcores</name>
      <value>20</value>
      <description>
      The number of virtual cores to request from the scheduler for each reduce task.
      </description>
      </property>
      </configuration>
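
      The SnappyCodec settings above only work when the native Snappy library is loadable on every node; a quick check, as a sketch:

      # Verify that the native hadoop and snappy libraries can be loaded
      hadoop checknative -a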
  • Notes

    The application is a web application that receives requests via a web service and wraps them into analysis jobs, so the SparkContext's lifecycle is tied to the web application itself.

    For that reason the capacity scheduler has only a single queue, which is sized to occupy all cluster resources from the start.
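
    In practice this means the analysis application shows up as one long-running YARN application in the default queue for as long as the web service stays up; a quick way to confirm, as a sketch:

    # The web application should appear as a single RUNNING application in the default queue
    yarn application -list -appStates RUNNING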

Spark (same configuration for the test cluster and production)

  • spark-env.sh

    export SPARK_HOME=/probd/probd-0.3.1/spark-2.2.3-bin-hadoop-2.6
    export HADOOP_HOME=/probd/probd-0.3.1/hadoop-2.6.3
    export YARN_HOME=/probd/probd-0.3.1/hadoop-2.6.3
    export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
    export YARN_CONF_DIR=${YARN_HOME}/etc/hadoop
    export SPARK_YARN_USER_ENV="CLASSPATH=${HADOOP_HOME}/etc/hadoop"
    export LD_LIBRARY_PATH=${HADOOP_HOME}/lib/native
    export SCALA_HOME=/usr/lib/scala-2.11.8
    export JAVA_HOME=/usr/lib/java/jdk1.8.0_171
    export SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url=probd02:2181,probd03:2181,probd04:2181 -Dspark.deploy.zookeeper.dir=/spark223 -Dspark.storage.blockManagerHeartBeatMs=6000000 -Dspark.rpc.askTimeout=300 -Dspark.ui.retainedStages=1000 -Dspark.worker.timeout=300"
    SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://probd/spark223/job/history"
    YARN_CONF_DIR=${HADOOP_HOME}/etc/hadoop
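
    SPARK_HISTORY_OPTS above points the history server at an HDFS directory that must exist before the daemon starts; a minimal sketch of bringing it up, assuming fs.defaultFS is the probd nameservice:

    # Create the event-log directory once, then start the Spark history server
    hdfs dfs -mkdir -p /spark223/job/history
    $SPARK_HOME/sbin/start-history-server.sh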
  • spark-defaults.conf

    spark.driver.extraJavaOptions -XX:MetaspaceSize=2G -XX:MaxMetaspaceSize=2G
    # Not recommended: UnsafeWriter is not stable
    spark.executor.memoryOverhead 2048
    spark.driver.memory 5G
    spark.eventLog.enabled true
    spark.eventLog.dir hdfs://probd/spark223/job/history
    spark.yarn.historyServer.address http://probd02:18080
    # Not recommended: UnsafeWriter is not stable
    spark.kryoserializer.buffer.max 64m
    spark.kryoserializer.buffer 64k
  • spark-submit (the full script is kept on GitHub)

    nohup $SPARK_HOME/bin/spark-submit \
    --master yarn \
    --deploy-mode client \
    --name onlineAnalyse \
    --num-executors ${en} \
    --driver-memory 3g \
    --driver-cores 2 \
    --executor-memory 35g \
    --executor-cores ${ec} \
    --driver-java-options "-Dlog4j.configuration=file:${prop} \
    -Dexecutor.num=${en} -Dexecutor.core=${ec} \
    -XX:+PrintGCApplicationConcurrentTime -Xloggc:gc.log" \
    --conf spark.driver.port=20002 \
    --conf spark.default.parallelism=300 \
    --conf spark.driver.maxResultSize=2g \
    --conf spark.kryoserializer.buffer.max=256m \
    --conf spark.kryoserializer.buffer=128m \
    --conf spark.memory.fraction=0.8 \
    ${ANY}.jar 2>&1 &
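
    ${en}, ${ec}, ${prop} and ${ANY} are expected to be set by the surrounding script; the values below are purely illustrative (not from the original script), chosen so that a 35 GB executor plus its YARN memory overhead still fits inside the 40960 MB of yarn.nodemanager.resource.memory-mb configured for production:

    # Illustrative values only; adjust to the actual deployment
    en=6                                  # --num-executors
    ec=10                                 # --executor-cores
    prop=/probd/conf/log4j.properties     # hypothetical log4j config for the driver
    ANY=/probd/jobs/onlineAnalyse         # hypothetical jar path, without the .jar suffix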
