1. Log format
#Software: Microsoft Internet Information Services 6.0
#Version: 1.0
#Date: 2014-01-03 00:00:34
#Fields: date time s-sitename s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status
2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET /skin6/film_sort.asp id=10 80 - 2001:da8:7007:f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0
2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET /news.asp - 80 - 2001:da8:7007:f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0
2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET /UploadFile/20131028231421.jpg - 80 - 2001:da8:7007:f07:ac50:d2b:f22d:5dec Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0
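Each record is whitespace-delimited, and the column order is given by the #Fields header (inside the User-Agent value IIS has already replaced spaces with '+', so a plain whitespace split is safe). A minimal Python sketch, purely illustrative and not part of the pipeline, that pairs the header names with one of the sample lines above:

# Illustrative only: pair the #Fields header with one record line from the excerpt above.
fields_header = ("date time s-sitename s-ip cs-method cs-uri-stem cs-uri-query "
                 "s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus "
                 "sc-win32-status")
record = ("2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET "
          "/skin6/film_sort.asp id=10 80 - 2001:da8:7007:f07:ac50:d2b:f22d:5dec "
          "Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0")

names = fields_header.split()
values = record.split()   # spaces in the UA string are already '+', so split() is safe
for name, value in zip(names, values):
    print(f"{name:>16} = {value}")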
2. The corresponding Hive table:
CREATE EXTERNAL TABLE IF NOT EXISTS exmovielog (
    log_date TIMESTAMP,
    s_sitename STRING,
    s_ip STRING,
    cs_method STRING,
    cs_uri_stem STRING,
    cs_uri_query STRING,
    s_port INT,
    cs_username STRING,
    c_ip STRING,
    user_agen STRING,
    sc_status INT,
    sc_substatus INT,
    sc_win32_status INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '`'
LOCATION '/movielog';
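The table folds the separate date and time columns into a single log_date TIMESTAMP and declares '`' as the field delimiter; the backtick is used because it never occurs in the log data, while the timestamp itself must keep its inner space. Rows are expected to arrive in HDFS already rewritten by the Flume interceptors of step 3. A minimal Python sketch, illustrative only, that checks such a rewritten row against the column list of the DDL:

# Illustrative only: a record as it looks after the Flume interceptors (step 3),
# split on the '`' delimiter declared in the Hive DDL, against the table columns.
columns = ["log_date", "s_sitename", "s_ip", "cs_method", "cs_uri_stem",
           "cs_uri_query", "s_port", "cs_username", "c_ip", "user_agen",
           "sc_status", "sc_substatus", "sc_win32_status"]

row = ("2014-01-03 00:00:34`W3SVC1`2001:da8:7007:102::244`GET`"
       "/skin6/film_sort.asp`id=10`80`-`2001:da8:7007:f07:ac50:d2b:f22d:5dec`"
       "Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko`200`0`0")

values = row.split("`")
assert len(values) == len(columns)   # 13 fields, one per column
for col, val in zip(columns, values):
    print(f"{col:>16} = {val}")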
3. Flume configuration file
agent1.sources = source1
agent1.channels = channel1
agent1.sinks = sink1

# Each channel's type is defined.
agent1.channels.channel1.type = file
agent1.channels.channel1.checkpointDir = /home/hadoop_admin/flumeTemp/fchannel/spool/checkpoint
agent1.channels.channel1.dataDirs = /home/hadoop_admin/flumeTemp/fchannel/spool/data
agent1.channels.channel1.capacity = 10000

# For each one of the sources, the type is defined
agent1.sources.source1.type = spooldir
agent1.sources.source1.inputCharset = GBK
agent1.sources.source1.spoolDir = /home/hadoop_admin/movielog
agent1.sources.source1.fileHeader = true
agent1.sources.source1.deletePolicy = immediate
agent1.sources.source1.batchSize = 1000
agent1.sources.source1.channels = channel1

# Drop the IIS header lines that start with '#'
agent1.sources.source1.interceptors = i1 search-replace1 search-replace2 search-replace3
agent1.sources.source1.interceptors.i1.type = regex_filter
agent1.sources.source1.interceptors.i1.regex = ^[^#].*$
# excludeEvents defaults to false, so only events matching the regex above are kept
# agent1.sources.source1.interceptors.i1.excludeEvents = true
# agent1.sources.source1.interceptors.i1.regex = ^#

# Join the date and time fields into a single timestamp token
agent1.sources.source1.interceptors.search-replace1.type = search_replace
agent1.sources.source1.interceptors.search-replace1.searchPattern = (\d\d\d\d-\d\d-\d\d)\s(\d\d:\d\d:\d\d)
agent1.sources.source1.interceptors.search-replace1.replaceString = $1T$2

# Change the field separator from whitespace to '`'
agent1.sources.source1.interceptors.search-replace2.type = search_replace
agent1.sources.source1.interceptors.search-replace2.searchPattern = \s
agent1.sources.source1.interceptors.search-replace2.replaceString = `

# Restore the space inside the timestamp so Hive can parse it as a TIMESTAMP
agent1.sources.source1.interceptors.search-replace3.type = search_replace
agent1.sources.source1.interceptors.search-replace3.searchPattern = (\d\d\d\d-\d\d-\d\d)T(\d\d:\d\d:\d\d)
agent1.sources.source1.interceptors.search-replace3.replaceString = $1 $2

# Each sink's type must be defined
agent1.sinks.sink1.type = hdfs
agent1.sinks.sink1.channel = channel1
agent1.sinks.sink1.hdfs.path = hdfs://master:9000/movielog
agent1.sinks.sink1.hdfs.writeFormat = Text
agent1.sinks.sink1.hdfs.fileType = DataStream
agent1.sinks.sink1.hdfs.rollInterval = 0
agent1.sinks.sink1.hdfs.idleTimeout = 0
agent1.sinks.sink1.hdfs.rollCount = 0
agent1.sinks.sink1.hdfs.rollSize = 67108864
agent1.sinks.sink1.hdfs.batchSize = 1000
agent1.sinks.sink1.hdfs.callTimeout = 3000
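The interceptor chain first drops the '#' header lines (regex_filter with the default excludeEvents = false keeps only matching events), then applies three search_replace steps: join date and time with a 'T', turn every remaining whitespace separator into '`', and restore the space inside the timestamp. The substitutions can be replayed locally to preview the output; the Python sketch below is only an approximation (Flume's search_replace interceptor uses Java regex, here re.sub stands in), applied to one raw line from the sample log:

import re

# Replay the three search_replace interceptors from the config on one raw log line.
line = ("2014-01-03 00:00:34 W3SVC1 2001:da8:7007:102::244 GET /news.asp - 80 - "
        "2001:da8:7007:f07:ac50:d2b:f22d:5dec "
        "Mozilla/5.0+(Windows+NT+6.1;+Trident/7.0;+rv:11.0)+like+Gecko 200 0 0")

# search-replace1: join date and time into a single token with a 'T'
line = re.sub(r"(\d\d\d\d-\d\d-\d\d)\s(\d\d:\d\d:\d\d)", r"\1T\2", line)
# search-replace2: turn every remaining whitespace separator into the '`' delimiter
line = re.sub(r"\s", "`", line)
# search-replace3: put the space back inside the timestamp for the Hive TIMESTAMP column
line = re.sub(r"(\d\d\d\d-\d\d-\d\d)T(\d\d:\d\d:\d\d)", r"\1 \2", line)

print(line)
# -> 2014-01-03 00:00:34`W3SVC1`2001:da8:7007:102::244`GET`/news.asp`-`80`-`...`200`0`0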