1.说明
在局部聚合的类中必须有一个内部类,用于存储状态值,即聚合过程中的中间结果
GroupBy+局部聚合,不需要过多的跨网络传输
GroupBy+全局聚合,需要大量的网络传输
一:局部聚合
1.需求
另一个流,基于分钟的订单金额总数,局部聚合
2.驱动类
package com.jun.trident;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import storm.trident.Stream;
import storm.trident.TridentState;
import storm.trident.TridentTopology;
import storm.trident.operation.Function;
import storm.trident.operation.TridentCollector;
import storm.trident.operation.TridentOperationContext;
import storm.trident.operation.builtin.Count;
import storm.trident.operation.builtin.Sum;
import storm.trident.testing.FixedBatchSpout;
import storm.trident.testing.MemoryMapState;
import storm.trident.tuple.TridentTuple;

import java.util.Map;

/**
 * Driver for the partition (local) aggregation demo.
 *
 * <p>Builds a Trident topology over a fixed set of sample access-log lines, then
 * branches the parsed stream twice: one branch counts orders per minute with a
 * persistent aggregate, the other sums order amounts per minute with
 * {@link LocalSum} via {@code partitionAggregate} and prints the partial sums.
 */
public class TridentDemo {

    /**
     * Builds the topology and submits it.
     *
     * @param args when empty, the topology runs in an in-process {@link LocalCluster}
     *             under the name {@code tridentDemo}; otherwise {@code args[0]} is used
     *             as the topology name for a cluster submission with two workers
     * @throws AlreadyAliveException    propagated from {@link StormSubmitter#submitTopology}
     * @throws InvalidTopologyException propagated from {@link StormSubmitter#submitTopology}
     */
    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException {
        TridentTopology tridentTopology = new TridentTopology();

        // Simulated input: raw access-log lines plus a flag column, batch size 5.
        // The embedded double quotes of each log line must be escaped as \".
        Fields field = new Fields("log", "flag");
        FixedBatchSpout spout = new FixedBatchSpout(field, 5,
                new Values("168.214.187.214 - - [1481953616092] \"GET /view.php HTTP/1.1\" 200 0 \"http://cn.bing.com/search?q=spark mllib\" \"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1\" \"-\"", "A"),
                new Values("168.187.202.202 - - [1481953537038] \"GET /IBEIfeng.gif?order_id=1063&orderTime=1481953537038&memberId=4000012340500607&productInfos=10005-2099.48-B-1|10004-1886.62-A-2|10001-961.99-A-1&orderAmt=6834.70 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2;Tident/6.0)\" \"-\"", "A"),
                new Values("61.30.167.187 - - [1481953539039] \"GET /IBEIfeng.gif?order_id=1064&orderTime=1481953539039&memberId=4000930409959999&productInfos=10007-3329.13-B-1|10009-2607.71-B-1|10002-390.62-A-1|10006-411.00-B-2&orderAmt=7149.46 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (Linux; Android 4.2.1; Galaxy Nexus Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19\" \"-\"", "A"),
                new Values("30.29.132.190 - - [1481953544042] \"GET /IBEIfeng.gif?order_id=1065&orderTime=1481953544043&memberId=1234568970080798&productInfos=10005-2099.48-B-1|10001-3242.40-C-2|10006-411.00-B-1&orderAmt=8995.28 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 7_)_3 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B511 Safari/9537.53\" \"-\"", "B"),
                new Values("222.190.187.201 - - [1481953578068] \"GET /IBEIfeng.gif?order_id=1066&orderTime=1481953578068&memberId=3488586887970809&productInfos=10005-2099.48-B-1|10001-2774.16-C-2&orderAmt=7647.80 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1\" \"-\"", "B"),
                new Values("72.202.43.53 - - [1481953579069] \"GET /IBEIfeng.gif?order_id=1067&orderTime=1481953579069&memberId=2084859896989877&productInfos=10007-3329.13-B-1|10001-961.99-A-2&orderAmt=5253.10 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (Linux; Android 4.2.1; Galaxy Nexus Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19\" \"-\"", "B")
        );
        // Cycle through the sample tuples repeatedly.
        spout.setCycle(true);

        // Shared stream processing.
        Stream stream = tridentTopology.newStream("orderAnalyse", spout)
                // Filter out invalid log lines.
                .each(new Fields("log"), new ValidLogFilter())
                // Parse the log line into order fields.
                .each(new Fields("log"), new LogParserFunction(),
                        new Fields("orderId", "orderTime", "orderAmtStr", "memberId"))
                // Projection: keep only the parsed order fields.
                .project(new Fields("orderId", "orderTime", "orderAmtStr", "memberId"))
                // Derive day / hour / minute fields from the order time.
                .each(new Fields("orderTime"), new DateTransFormerFunction(),
                        new Fields("day", "hour", "minter"));

        // Branch 1: order count per minute (grouped by "minter").
        TridentState state = stream.groupBy(new Fields("minter"))
                // Global aggregation; the state is kept in memory.
                .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("orderNumByMinter"));
        // state.newValuesStream().each(new Fields("minter","orderNumByMinter"),new PrintFilter());

        // Branch 2: order amount per minute, partition (local) aggregation.
        Stream partitionStream = stream
                .each(new Fields("orderAmtStr"), new TransforAmtToDoubleFunction(), new Fields("orderAmt"))
                .groupBy(new Fields("minter"))
                // Partition aggregation inside an aggregator chain.
                .chainedAgg()
                .partitionAggregate(new Fields("orderAmt"), new LocalSum(), new Fields("orderAmtSumOfLocal"))
                .chainEnd();
        partitionStream.each(new Fields("minter", "orderAmtSumOfLocal"), new PrintFilter());

        // Submit.
        Config config = new Config();
        if (args == null || args.length <= 0) {
            LocalCluster localCluster = new LocalCluster();
            localCluster.submitTopology("tridentDemo", config, tridentTopology.build());
        } else {
            config.setNumWorkers(2);
            StormSubmitter.submitTopology(args[0], config, tridentTopology.build());
        }
    }
}
3.金额从字符串转为double类型的方法类
package com.jun.trident;

import backtype.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.trident.operation.Function;
import storm.trident.operation.TridentCollector;
import storm.trident.operation.TridentOperationContext;
import storm.trident.tuple.TridentTuple;

import java.util.Map;

/**
 * Trident function that converts the string field {@code orderAmtStr} into a
 * {@code double} and emits it as a single new value.
 *
 * <p>Tuples whose amount is missing or not parseable are logged and dropped
 * (nothing is emitted for them).
 */
public class TransforAmtToDoubleFunction implements Function {
    private static final Logger logger = LoggerFactory.getLogger(TransforAmtToDoubleFunction.class);

    /**
     * Parses {@code orderAmtStr} from the input tuple and emits the numeric amount.
     *
     * @param tridentTuple     input tuple; must expose the field {@code orderAmtStr}
     * @param tridentCollector collector that receives the parsed amount
     */
    @Override
    public void execute(TridentTuple tridentTuple, TridentCollector tridentCollector) {
        String orderAmtStr = tridentTuple.getStringByField("orderAmtStr");
        if (orderAmtStr == null) {
            // Double.parseDouble(null) would throw NullPointerException; report it as a conversion error.
            logger.error("金额转换错误:{}", orderAmtStr);
            return;
        }

        final double orderAmt;
        try {
            orderAmt = Double.parseDouble(orderAmtStr);
        } catch (NumberFormatException e) {
            // Keep the cause in the log so the malformed value can be diagnosed.
            logger.error("金额转换错误:{}", orderAmtStr, e);
            return;
        }

        // Emit outside the try block so that collector failures are not misreported
        // as amount-conversion errors.
        tridentCollector.emit(new Values(orderAmt));
    }

    /** No per-task setup is required. */
    @Override
    public void prepare(Map map, TridentOperationContext tridentOperationContext) {
        // intentionally empty
    }

    /** No resources to release. */
    @Override
    public void cleanup() {
        // intentionally empty
    }
}
4.局部聚合的类
package com.jun.trident;

import backtype.storm.tuple.Values;
import storm.trident.operation.Aggregator;
import storm.trident.operation.TridentCollector;
import storm.trident.operation.TridentOperationContext;
import storm.trident.tuple.TridentTuple;

import java.util.Map;

/**
 * Aggregator that sums the {@code orderAmt} field of the tuples it is given and
 * emits the total as a single value when the aggregation completes.
 */
public class LocalSum implements Aggregator<LocalSum.InnerState> {

    /** Mutable accumulator holding the running total between calls. */
    public static class InnerState {
        public double amtSum = 0.0;
    }

    /**
     * Creates a fresh accumulator.
     *
     * @return a new {@link InnerState} whose total is {@code 0.0}
     */
    @Override
    public InnerState init(Object batchId, TridentCollector collector) {
        // The field initializer already sets the total to 0.0.
        return new InnerState();
    }

    /**
     * Adds the current tuple's {@code orderAmt} to the running total.
     *
     * @param accumulator state returned by {@link #init}; updated in place
     * @param tuple       tuple exposing a Double field named {@code orderAmt}
     */
    @Override
    public void aggregate(InnerState accumulator, TridentTuple tuple, TridentCollector collector) {
        accumulator.amtSum += tuple.getDoubleByField("orderAmt");
    }

    /** Emits the accumulated total as a one-element tuple. */
    @Override
    public void complete(InnerState accumulator, TridentCollector collector) {
        double total = accumulator.amtSum;
        collector.emit(new Values(total));
    }

    /** No per-task setup is required. */
    @Override
    public void prepare(Map conf, TridentOperationContext context) {
        // nothing to prepare
    }

    /** No resources to release. */
    @Override
    public void cleanup() {
        // nothing to clean up
    }
}
5.效果
二:全局聚合
1.说明
在上一篇文档的程序中已经开始使用全局聚合了,但是这里再和局部聚合放在一起重新说明一次
2.驱动类
package com.jun.trident;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import storm.trident.Stream;
import storm.trident.TridentState;
import storm.trident.TridentTopology;
import storm.trident.operation.Function;
import storm.trident.operation.TridentCollector;
import storm.trident.operation.TridentOperationContext;
import storm.trident.operation.builtin.Count;
import storm.trident.operation.builtin.Sum;
import storm.trident.testing.FixedBatchSpout;
import storm.trident.testing.MemoryMapState;
import storm.trident.tuple.TridentTuple;

import java.util.Map;

/**
 * Driver for the partition + global aggregation demo.
 *
 * <p>Builds a Trident topology over a fixed set of sample access-log lines, then
 * branches the parsed stream twice: one branch counts orders per minute with a
 * persistent aggregate; the other first sums order amounts per minute with
 * {@link LocalSum} via {@code partitionAggregate}, then combines those partial
 * sums per minute with a persistent {@link Sum} aggregate and prints the totals.
 */
public class TridentDemo {

    /**
     * Builds the topology and submits it.
     *
     * @param args when empty, the topology runs in an in-process {@link LocalCluster}
     *             under the name {@code tridentDemo}; otherwise {@code args[0]} is used
     *             as the topology name for a cluster submission with two workers
     * @throws AlreadyAliveException    propagated from {@link StormSubmitter#submitTopology}
     * @throws InvalidTopologyException propagated from {@link StormSubmitter#submitTopology}
     */
    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException {
        TridentTopology tridentTopology = new TridentTopology();

        // Simulated input: raw access-log lines plus a flag column, batch size 5.
        // The embedded double quotes of each log line must be escaped as \".
        Fields field = new Fields("log", "flag");
        FixedBatchSpout spout = new FixedBatchSpout(field, 5,
                new Values("168.214.187.214 - - [1481953616092] \"GET /view.php HTTP/1.1\" 200 0 \"http://cn.bing.com/search?q=spark mllib\" \"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1\" \"-\"", "A"),
                new Values("168.187.202.202 - - [1481953537038] \"GET /IBEIfeng.gif?order_id=1063&orderTime=1481953537038&memberId=4000012340500607&productInfos=10005-2099.48-B-1|10004-1886.62-A-2|10001-961.99-A-1&orderAmt=6834.70 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2;Tident/6.0)\" \"-\"", "A"),
                new Values("61.30.167.187 - - [1481953539039] \"GET /IBEIfeng.gif?order_id=1064&orderTime=1481953539039&memberId=4000930409959999&productInfos=10007-3329.13-B-1|10009-2607.71-B-1|10002-390.62-A-1|10006-411.00-B-2&orderAmt=7149.46 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (Linux; Android 4.2.1; Galaxy Nexus Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19\" \"-\"", "A"),
                new Values("30.29.132.190 - - [1481953544042] \"GET /IBEIfeng.gif?order_id=1065&orderTime=1481953544043&memberId=1234568970080798&productInfos=10005-2099.48-B-1|10001-3242.40-C-2|10006-411.00-B-1&orderAmt=8995.28 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 7_)_3 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B511 Safari/9537.53\" \"-\"", "B"),
                new Values("222.190.187.201 - - [1481953578068] \"GET /IBEIfeng.gif?order_id=1066&orderTime=1481953578068&memberId=3488586887970809&productInfos=10005-2099.48-B-1|10001-2774.16-C-2&orderAmt=7647.80 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1\" \"-\"", "B"),
                new Values("72.202.43.53 - - [1481953579069] \"GET /IBEIfeng.gif?order_id=1067&orderTime=1481953579069&memberId=2084859896989877&productInfos=10007-3329.13-B-1|10001-961.99-A-2&orderAmt=5253.10 HTTP/1.1\" 200 0 \"-\" \"Mozilla/5.0 (Linux; Android 4.2.1; Galaxy Nexus Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19\" \"-\"", "B")
        );
        // Cycle through the sample tuples repeatedly.
        spout.setCycle(true);

        // Shared stream processing.
        Stream stream = tridentTopology.newStream("orderAnalyse", spout)
                // Filter out invalid log lines.
                .each(new Fields("log"), new ValidLogFilter())
                // Parse the log line into order fields.
                .each(new Fields("log"), new LogParserFunction(),
                        new Fields("orderId", "orderTime", "orderAmtStr", "memberId"))
                // Projection: keep only the parsed order fields.
                .project(new Fields("orderId", "orderTime", "orderAmtStr", "memberId"))
                // Derive day / hour / minute fields from the order time.
                .each(new Fields("orderTime"), new DateTransFormerFunction(),
                        new Fields("day", "hour", "minter"));

        // Branch 1: order count per minute (grouped by "minter").
        TridentState state = stream.groupBy(new Fields("minter"))
                // Global aggregation; the state is kept in memory.
                .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("orderNumByMinter"));
        // state.newValuesStream().each(new Fields("minter","orderNumByMinter"),new PrintFilter());

        // Branch 2: order amount per minute, partition (local) aggregation first.
        Stream partitionStream = stream
                .each(new Fields("orderAmtStr"), new TransforAmtToDoubleFunction(), new Fields("orderAmt"))
                .groupBy(new Fields("minter"))
                // Partition aggregation inside an aggregator chain.
                .chainedAgg()
                .partitionAggregate(new Fields("orderAmt"), new LocalSum(), new Fields("orderAmtSumOfLocal"))
                .chainEnd();
        // partitionStream.each(new Fields("minter","orderAmtSumOfLocal"),new PrintFilter());

        // Then one global aggregation over the partial sums, again grouped by minute.
        TridentState partitionState = partitionStream.groupBy(new Fields("minter"))
                .persistentAggregate(new MemoryMapState.Factory(), new Fields("orderAmtSumOfLocal"),
                        new Sum(), new Fields("totalOrderAmt"));
        partitionState.newValuesStream().each(new Fields("minter", "totalOrderAmt"), new PrintFilter());

        // Submit.
        Config config = new Config();
        if (args == null || args.length <= 0) {
            LocalCluster localCluster = new LocalCluster();
            localCluster.submitTopology("tridentDemo", config, tridentTopology.build());
        } else {
            config.setNumWorkers(2);
            StormSubmitter.submitTopology(args[0], config, tridentTopology.build());
        }
    }
}
3.效果