Flume: Installation | Basic Usage | Troubleshooting


    I. Installation

    Go into the conf directory under the Flume installation:
    cd conf/
    cp flume-env.sh.template flume-env.sh
    vi flume-env.sh
    Add >>> JAVA_HOME=/opt/bigdata/java/jdk180
    Then configure the environment variables:
    vi /etc/profile
    #flume
    export FLUME_HOME=/opt/bigdata/flume160
    export PATH=$PATH:$FLUME_HOME/bin
    source /etc/profile
    Verify that Flume is installed correctly:
    [root@vbserver ~]# flume-ng version
    Flume 1.6.0-cdh5.14.2
    Source code repository: https://git-wip-us.apache.org/repos/asf/flume.git
    Revision: 50436774fa1c7eaf0bd9c89ac6ee845695fbb687
    Compiled by jenkins on Tue Mar 27 13:55:10 PDT 2018
    From source with checksum 30217fe2b34097676ff5eabb51f4a11d

    II. Configuration Parameters

    See: https://www.cnblogs.com/yin-fei/p/10778719.html

    Official docs: http://flume.apache.org/releases/content/1.9.0/FlumeUserGuide.html#flume-sources
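    Every config below follows the same naming pattern: properties are keyed by the agent name (a1 here), and each source/channel/sink is first declared and then configured field by field. A minimal skeleton (the component names a1/r1/c1/k1 are simply the ones this post uses):

    # <agent>.<sources|channels|sinks> declares the components
    a1.sources = r1
    a1.channels = c1
    a1.sinks = k1
    # <agent>.<component type>.<component name>.<property> configures each one
    a1.sources.r1.type = exec          # or spooldir, netcat, ...
    a1.sources.r1.channels = c1        # a source can feed one or more channels
    a1.channels.c1.type = file         # or memory
    a1.sinks.k1.type = hdfs            # or org.apache.flume.sink.kafka.KafkaSink
    a1.sinks.k1.channel = c1           # a sink reads from exactly one channel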

    III. Basic Usage

    1. Single file to HDFS

    Write a flume.properties and create /root/flume/data and /root/flume/checkpoint (both must exist before the agent starts).
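    One way to create both directories up front:

    mkdir -p /root/flume/data /root/flume/checkpoint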

    vi flume.properties

    a1.channels = c1
    a1.sources = r1
    a1.sinks = k1
    
    a1.sources.r1.type = exec
    a1.sources.r1.channels = c1
    a1.sources.r1.command = tail -F /root/abc.log
    
    a1.channels.c1.type = file
    a1.channels.c1.checkpointDir = /root/flume/checkpoint  # must already exist
    a1.channels.c1.dataDirs = /root/flume/data  # must already exist
    
    a1.sinks.k1.type = hdfs
    a1.sinks.k1.channel = c1
    a1.sinks.k1.hdfs.path = hdfs://192.168.56.111:9000/flume/events

    Run the command:

    flume-ng agent --name a1 -f /root/flumetest/flume.properties
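    The same command with a couple of standard flume-ng options added can make debugging easier: point it at the conf directory and log to the console:

    flume-ng agent --name a1 \
      --conf $FLUME_HOME/conf \
      --conf-file /root/flumetest/flume.properties \
      -Dflume.root.logger=INFO,console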

    Result
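    A quick way to inspect the output (the path comes from the sink config above):

    hdfs dfs -ls /flume/events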

    2. Experiment 1: watch a directory with spooldir and write out multiple files

    The result is 10 files, ten lines (ten events) per file — the HDFS sink's default hdfs.rollCount is 10, so it rolls a new file every 10 events.

    How do you pick up only files of a specific pattern in the directory?  a1.sources.r1.includePattern = users_[0-9]{4}.csv

    a1.channels = c1
    a1.sources = r1
    a1.sinks = k1
    
    a1.sources.r1.type = spooldir
    a1.sources.r1.channels = c1
    a1.sources.r1.spoolDir = /root/flumeData
    
    a1.channels.c1.type = file
    a1.channels.c1.checkpointDir = /root/flume/checkpoint
    a1.channels.c1.dataDirs = /root/flume/data
    
    a1.sinks.k1.type = hdfs
    a1.sinks.k1.channel = c1
    a1.sinks.k1.hdfs.path = hdfs://192.168.56.111:9000/flume/logs1
    a1.sinks.k1.hdfs.fileType=DataStream

    Result

    If the agent appears stuck, it is because the file in the spool directory was already marked .COMPLETED by a previous run, so spooldir ignores it. Rename it back and it will be ingested again:

    mv abc.log.COMPLETED abc.log

    3. Experiment 2: watch a directory with spooldir and write out a single file

    Set hdfs.rollCount high (10000 in the config below) => anything under that many events ends up in a single file.

    a1.channels = c1
    a1.sources = r1
    a1.sinks = k1
    
    
    a1.sources.r1.type = spooldir
    a1.sources.r1.channels = c1
    a1.sources.r1.spoolDir = /root/flumeData
    
    
    a1.channels.c1.type = file
    a1.channels.c1.checkpointDir = /root/flume/checkpoint
    a1.channels.c1.dataDirs = /root/flume/data
    
    
    a1.sinks.k1.type = hdfs
    a1.sinks.k1.channel = c1
    a1.sinks.k1.hdfs.path = hdfs://192.168.56.111:9000/flume/logs3
    a1.sinks.k1.hdfs.fileType=DataStream
    a1.sinks.k1.hdfs.rollCount = 10000

    Result

    4. Experiment 3: verify what the spooldir watch does: add a new log file

    hdfs dfs -cat /flume/logs3/*

    Check the HDFS result: everything from abc.log (test1 through test99)
    Check the watched directory: abc.log is marked COMPLETED

    [root@vbserver flumeData]# ls
    abc.log.COMPLETED 

    Create a new log, cde.log (containing c1-c4):

    [root@vbserver flumeData]# ls
    abc.log.COMPLETED cde.log

    Run the command again:

    flume-ng agent --name a1 -f /root/flumetest/flume.properties

    Check the HDFS result:
    (test1-test99, plus c1-c4)

    5. Improvement: merge the upload into a single HDFS file

    a1.channels = c1
    a1.sources = r1
    a1.sinks = k1
    
    a1.sources.r1.type = spooldir
    a1.sources.r1.channels = c1
    a1.sources.r1.spoolDir = /root/flumeData
    a1.sources.r1.batchSize = 10000
    
    a1.channels.c1.type = file
    a1.channels.c1.checkpointDir = /root/flume/checkpoint
    a1.channels.c1.dataDirs = /root/flume/data
    
    a1.sinks.k1.type = hdfs
    a1.sinks.k1.channel = c1
    a1.sinks.k1.hdfs.path = hdfs://192.168.56.111:9000/flume/logs4
    a1.sinks.k1.hdfs.fileType=DataStream
    a1.sinks.k1.hdfs.rollCount = 0                # 0 = do not roll based on the number of events
    a1.sinks.k1.hdfs.rollSize = 10485760          # 10 * 1024 * 1024 = 10 MB
    a1.sinks.k1.hdfs.rollInterval = 60            # roll a new file every 60 seconds
    a1.sinks.k1.hdfs.useLocalTimeStamp = true     # use the local timestamp, e.g. for naming files

    6. Output to Kafka: each line becomes one message!

    a1.channels = c1
    a1.sources = r1
    a1.sinks = k1
    
    a1.sources.r1.type = spooldir
    a1.sources.r1.channels = c1
    a1.sources.r1.spoolDir = /root/flumeData
    a1.sources.r1.batchSize = 10000
    
    a1.channels.c1.type = file
    a1.channels.c1.checkpointDir = /root/flume/checkpoint
    a1.channels.c1.dataDirs = /root/flume/data
    
    a1.sinks.k1.channel  =  c1
    a1.sinks.k1.type  =  org.apache.flume.sink.kafka.KafkaSink 
    a1.sinks.k1.kafka.topic  =  mytopic # the topic does not need to exist beforehand; it is created automatically
    a1.sinks.k1.kafka.bootstrap.servers  = 192.168.56.111:9092 
    a1.sinks.k1.kafka.flumeBatchSize  =  100 
    a1.sinks.k1.kafka.producer.acks  =  1

    After starting Kafka, verify in real time:

    kafka-console-consumer.sh --bootstrap-server 192.168.56.111:9092 --topic mytopic // consume in real time
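    A consumer started this way only shows messages produced after it starts; to replay what is already in the topic, add the standard --from-beginning flag:

    kafka-console-consumer.sh --bootstrap-server 192.168.56.111:9092 --topic mytopic --from-beginning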

    Run the command:

    flume-ng agent --name a1 -f /root/flumetest/flume.properties

    Result:

    7. Use an interceptor to strip the CSV header

    a1.channels = c1
    a1.sources = r1
    a1.sinks = k1
    
    a1.sources.r1.type = spooldir
    a1.sources.r1.channels = c1
    a1.sources.r1.spoolDir = /root/flumeData
    a1.sources.r1.batchSize = 10000
    
    a1.sources.r1.interceptors=i1
    a1.sources.r1.interceptors.i1.type=regex_filter
    a1.sources.r1.interceptors.i1.regex = user_id.*   # the header line starts with user_id
    a1.sources.r1.interceptors.i1.excludeEvents=true  # drop the events that match the regex
    a1.channels.c1.type = file
    a1.channels.c1.checkpointDir = /root/flume/checkpoint
    a1.channels.c1.dataDirs = /root/flume/data
    
    a1.sinks.k1.channel = c1
    a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
    a1.sinks.k1.kafka.topic = users # users.csv => the "users" topic
    a1.sinks.k1.kafka.bootstrap.servers = 192.168.56.111:9092
    a1.sinks.k1.kafka.flumeBatchSize = 100
    a1.sinks.k1.kafka.producer.acks = 1

    Compare the record counts before and after the transfer:

    [root@vbserver flumeData]# wc -l users.csv
    38210 users.csv
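    It can also help to peek at the header line that the regex_filter is supposed to drop (run from the same directory; it should start with user_id):

    head -1 users.csv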

    Start a real-time Kafka consumer:

    kafka-console-consumer.sh --bootstrap-server 192.168.56.111:9092 --topic users // consume in real time

    Run the command:

    flume-ng agent --name a1 -f /root/flumetest/flume.properties

    Result:

    Processed a total of 38209 messages  # header stripped successfully (38210 lines - 1 header = 38209)

    IV. Troubleshooting

    1. GC OOM

    In bin under the Flume installation directory:
    vi flume-ng
    Fix: enlarge the heap settings, e.g.  JAVA_OPTS="-Xms100m -Xmx2000m -Dcom.sun.management.jmxremote" 

    2. Message count doubles after sending to Kafka

    Cause: a line longer than the maximum line length is truncated and the remainder becomes a new event => so the Kafka message count goes up (the spooldir LINE deserializer defaults to maxLineLength = 2048).

     Fix:  a1.sources.r1.deserializer.maxLineLength=1200000 

    3. Topic xxx does not exist on ZK path xxxx

    After a Kafka topic is deleted => both Kafka's and ZooKeeper's metadata for that topic are gone.

    But Flume was never stopped; it sees the delivery as failed and starts sending again => so inspecting the Kafka topic throws this error.

    Fix: stop Flume first, then delete the Kafka topic.
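    For reference, Kafka versions of this vintage delete topics through ZooKeeper with kafka-topics.sh; the ZooKeeper address below is an assumption (same host, default port 2181):

    kafka-topics.sh --zookeeper 192.168.56.111:2181 --delete --topic mytopic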

    4. Both the upstream and the downstream are stuck, nothing moves

    Check whether every file under spoolDir is already marked COMPLETED.
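    If they are, there is nothing new to ingest; as in Experiment 1, renaming a file back makes spooldir pick it up again:

    mv abc.log.COMPLETED abc.log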
