• 多个线程运行MR程序时hadoop出现的问题


    夜间多个任务同时并行,总有几个随机性有任务失败,查看日志: 筛选关键词 Caused by  或者  FAILED

      cat -n ads_channel.log |grep "Caused by"
      7732    Caused by: java.util.concurrent.ExecutionException: java.io.IOException: Rename cannot overwrite non empty destination directory /tmp/hadoop-hdfs/mapred/local/1576781334421
      7737    Caused by: java.io.IOException: Rename cannot overwrite non empty destination directory /tmp/hadoop-hdfs/mapred/local/1576781334421
    In order to change the average load for a reducer (in bytes):
      set hive.exec.reducers.bytes.per.reducer=<number>
    In order to limit the maximum number of reducers:
      set hive.exec.reducers.max=<number>
    In order to set a constant number of reducers:
      set mapreduce.job.reduces=<number>
    java.io.IOException: java.util.concurrent.ExecutionException: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.
        at org.apache.hadoop.mapred.LocalDistributedCacheManager.setup(LocalDistributedCacheManager.java:143)
        at org.apache.hadoop.mapred.LocalJobRunner$Job.<init>(LocalJobRunner.java:171)
        at org.apache.hadoop.mapred.LocalJobRunner.submitJob(LocalJobRunner.java:758)
        at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:244)
        at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1307)
        at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1304)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
        at org.apache.hadoop.mapreduce.Job.submit(Job.java:1304)
        at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:578)
        at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:573)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
        at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:573)
        at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:564)
        at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:436)
        at org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:142)
        at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:214)
        at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:99)
        at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:2052)
        at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1748)
        at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1501)
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1285)
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1275)
        at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:226)
        at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:175)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:389)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:324)
        at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:726)
        at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:699)
        at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:634)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
    Caused by: java.util.concurrent.ExecutionException: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.
        at java.util.concurrent.FutureTask.report(FutureTask.java:122)
        at java.util.concurrent.FutureTask.get(FutureTask.java:192)
        at org.apache.hadoop.mapred.LocalDistributedCacheManager.setup(LocalDistributedCacheManager.java:139)
        ... 38 more
    Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.
        at org.apache.hadoop.fs.FileSystem.rename(FileSystem.java:1310)
        at org.apache.hadoop.fs.DelegateToFileSystem.renameInternal(DelegateToFileSystem.java:193)
        at org.apache.hadoop.fs.AbstractFileSystem.renameInternal(AbstractFileSystem.java:744)
        at org.apache.hadoop.fs.FilterFs.renameInternal(FilterFs.java:236)
        at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:674)
        at org.apache.hadoop.fs.FileContext.rename(FileContext.java:932)
        at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:369)
        at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:60)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
    Job Submission failed with exception 'java.io.IOException(java.util.concurrent.ExecutionException: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.)'
    FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask

     扩展:

    cat -n ads_channel.log |grep "Caused by" 或者 grep ads_channel.log  -e "Caused by" 或者 grep -E "Caused by|FAILED"  ads_channel.log #两个关键词
    grep "2019-12-21" ads_channel.log | grep "Caused by"  #注意:第二个grep不能再带文件名,否则会忽略管道输入,日期过滤失效
    cat ads_channel.log | grep "Caused by" -B 10 #根据关键字查看前10行日志
    cat ads_channel.log | grep "Caused by" -A 10 #根据关键字查看后10行日志
    cat ads_channel.log | grep "Caused by" -C 10 #根据关键字查看前后10行日志
    
    
    说明:
    -A 表示关键字之后,After
    -B 表示关键字之前,Before
    -C 表示关键字前后,Context
    
    vim ads_channel.log
    :set nu  :7749 (跳转到指定行数)
    
    实时查询多个关键字的日志信息
    命令:tail -f ads_channel.log |grep -E "Caused by"

    问题原因:

        当多个线程运行MR程序时hadoop出现的问题:
            https://issues.apache.org/jira/browse/MAPREDUCE-6992
            https://issues.apache.org/jira/browse/MAPREDUCE-6441

    本地模式下会在本地磁盘创建一个以当前时间戳(毫秒)命名的目录(如 /tmp/hadoop-hdfs/mapred/local/1576781334421)。当两个MR任务在同一毫秒提交时,目录名相同,造成了并发访问冲突(rename 失败)。

    yarn的运行模式:

    1-本地模式(LocalJobRunner实现)
    mapreduce.framework.name设置为local,则不会使用YARN集群来分配资源,在本地节点执行。在本地模式运行的任务,无法发挥集群的优势。注:在web UI是查看不到本地模式运行的任务。

            对 hive 有些了解的人都会知道,hive 会将 SQL 语句最终转化成分布式执行的 mapreduce 任务计划。对于大数据集,启动 mapreduce 所花费的时间相对整体处理时间是微不足道的;数据量大且分布在不同的机器上,在不同的机器上并行处理,这正是 hive 的优势之一。然而当处理的数据量小,并且数据都聚集在一台机器上时,启动本地模式是非常有意义的:可以避免启动集群 mapreduce,将数据拉回客户端本地处理,从而省去了任务分发和结果合并所花费的时间。如此一来,对数据量比较小的操作,就可以在本地执行,这样要比提交任务到集群执行效率快很多。
    启动本地模式,需要配置如下参数: 

         hive.exec.mode.local.auto                    决定 Hive 是否应该自动地根据输入文件大小,在本地运行。    
         hive.exec.mode.local.auto.inputbytes.max     最大输入数据量,当输入数据量小于这个值的时候将会启动本地模式,默认是 128M。    
         hive.exec.mode.local.auto.tasks.max          最大输入文件个数,当输入文件个数小于这个值的时候将会启动本地模式。(默认4)

    当一个job满足如下条件才能真正使用本地模式:      

      1.job的输入数据大小必须小于参数:hive.exec.mode.local.auto.inputbytes.max(默认128MB)    
      2.job的map数必须小于参数:hive.exec.mode.local.auto.tasks.max(默认4)    
      3.job的reduce数必须为0或者1

    2-Yarn模式(YARNRunner实现)
            mapreduce.framework.name设置为yarn,当客户端配置mapreduce.framework.name为yarn时, 客户端会使用YARNRunner与服务端通信, 而YARNRunner真正的实现是通过ClientRMProtocol与RM交互, 包括提交Application, 查询状态等功能。但是根据任务的特性,分为两种方式执行任务

    3-Uber模式:

            为降低小作业延迟而设计的一种模式,所有任务,不管是Map Task,还是Reduce Task,均在同一个Container中顺序执行,这个Container其实也是MRAppMaster所在Container

    4-Non-Uber模式:

             对于运行时间较长的大作业,先为Map Task申请资源,当Map Task运行完成数目达到一定比例后再为Reduce Task申请资源。

    解决办法:

        1-在不改源代码的情况下,取消自动启动本地模式,根据集群环境,临时在运行程序时设置:

    set hive.exec.mode.local.auto = false
    2-在调度系统中设置失败重试.
    azkaban配置失败重试如下:
    type =command
    command = xxxxxx
    retries=3
    retry.backoff=60000 #毫秒数

    参考:https://blog.csdn.net/weixin_39445556/article/details/80348976

    在官网找到了这个bug,在2.7.1版本中已经修复了这个bug,对集群进行升级:

    This is a bug in Hadoop 2.6.0. It's been marked as fixed but it still happens occasionally (see: https://issues.apache.org/jira/browse/YARN-2624).

    https://stackoverflow.com/questions/30857413/hadoop-complains-about-attempting-to-overwrite-nonempty-destination-directory

    [hdfs@el-hadoop-1 logs]$ hadoop dfsadmin -report  ##查看hadoop状况:
    DEPRECATED: Use of this script to execute hdfs command is deprecated.
    Instead use the hdfs command for it.
    
    Configured Capacity: 1242537227061 (1.13 TB)
    Present Capacity: 1154802876345 (1.05 TB)
    DFS Remaining: 1125514018745 (1.02 TB)
    DFS Used: 29288857600 (27.28 GB)
    DFS Used%: 2.54%
    Under replicated blocks: 0
    Blocks with corrupt replicas: 0
    Missing blocks: 0
    Missing blocks (with replication factor 1): 0
    
    -------------------------------------------------
    Live datanodes (3):
    
    Name: 172.26.0.106:50010 (el-hadoop-1)
    Hostname: el-hadoop-1
    Rack: /default
    Decommission Status : Normal
    Configured Capacity: 414179075687 (385.73 GB)
    DFS Used: 9740627968 (9.07 GB)
    Non DFS Used: 22051710567 (20.54 GB)
    DFS Remaining: 360492523769 (335.73 GB)
    DFS Used%: 2.35%
    DFS Remaining%: 87.04%
    Configured Cache Capacity: 4294967296 (4 GB)
    Cache Used: 0 (0 B)
    Cache Remaining: 4294967296 (4 GB)
    Cache Used%: 0.00%
    Cache Remaining%: 100.00%
    Xceivers: 8
    Last contact: Sat Dec 21 11:29:07 CST 2019
    
    
    Name: 172.26.0.108:50010 (el-hadoop-2)
    Hostname: el-hadoop-2
    Rack: /default
    Decommission Status : Normal
    Configured Capacity: 414179075687 (385.73 GB)
    DFS Used: 9774043136 (9.10 GB)
    Non DFS Used: 0 (0 B)
    DFS Remaining: 382510819168 (356.24 GB)
    DFS Used%: 2.36%
    DFS Remaining%: 92.35%
    Configured Cache Capacity: 4294967296 (4 GB)
    Cache Used: 0 (0 B)
    Cache Remaining: 4294967296 (4 GB)
    Cache Used%: 0.00%
    Cache Remaining%: 100.00%
    Xceivers: 8
    Last contact: Sat Dec 21 11:29:06 CST 2019
    
    
    Name: 172.26.0.109:50010 (el-hadoop-3)
    Hostname: el-hadoop-3
    Rack: /default
    Decommission Status : Normal
    Configured Capacity: 414179075687 (385.73 GB)
    DFS Used: 9774186496 (9.10 GB)
    Non DFS Used: 0 (0 B)
    DFS Remaining: 382510675808 (356.24 GB)
    DFS Used%: 2.36%
    DFS Remaining%: 92.35%
    Configured Cache Capacity: 4294967296 (4 GB)
    Cache Used: 0 (0 B)
    Cache Remaining: 4294967296 (4 GB)
    Cache Used%: 0.00%
    Cache Remaining%: 100.00%
    Xceivers: 8
    Last contact: Sat Dec 21 11:29:08 CST 2019
  • 相关阅读:
    atom介绍
    举例介绍重构(译)
    java单双派机制理解
    AngularJS开发指南03:HTML编译器
    AngularJS开发指南02:引导程序
    AngularJS开发指南01:AngularJS简介
    1.angular之Hello World
    31天重构学习笔记(java版本)
    一个农夫的故事 分类: 其他 2015-01-24 16:44 104人阅读 评论(0) 收藏
    一个农夫的故事 分类: 其他 2015-01-24 16:44 103人阅读 评论(0) 收藏
  • 原文地址:https://www.cnblogs.com/shengyang17/p/12076353.html
Copyright © 2020-2023  润新知