场景:需要处理数据库中的大量数据时,先把数据查询出来并进行加工,再按每1000条合并为1个批次,批量插入到另外一张表。
作用:以数据流的方式进行批处理——利用多核并行提高处理效率,并把逐条插入改为批量插入以提高写入效率。
定义数据Model
/// <summary>
/// Sample data model that is pushed through the batching pipeline.
/// </summary>
public class Employee
{
    public int EmployeeID { get; set; }
    public string LastName { get; set; }
    public string FirstName { get; set; }

    // A random number generator that helps to generate Employee property values.
    // It is seeded with a constant so the generated names are reproducible between runs.
    // NOTE: System.Random instances are not thread-safe; call Random() from one thread
    // at a time.
    private static readonly Random rand = new Random(42);

    // Possible random first names.
    private static readonly string[] firstNames = { "Tom", "Mike", "Ruth", "Bob", "John" };

    // Possible random last names.
    private static readonly string[] lastNames = { "Jones", "Smith", "Johnson", "Walker" };

    /// <summary>
    /// Creates an <see cref="Employee"/> with random names and an <c>EmployeeID</c> of -1.
    /// </summary>
    /// <returns>A new employee with randomly chosen first and last names.</returns>
    public static Employee Random()
    {
        return Random(-1);
    }

    /// <summary>
    /// Creates an <see cref="Employee"/> with random names and the given identifier.
    /// </summary>
    /// <param name="employeeId">Value assigned to <see cref="EmployeeID"/>.</param>
    /// <returns>A new employee with randomly chosen first and last names.</returns>
    public static Employee Random(int employeeId)
    {
        return new Employee
        {
            EmployeeID = employeeId,
            LastName = lastNames[rand.Next() % lastNames.Length],
            FirstName = firstNames[rand.Next() % firstNames.Length]
        };
    }
}
定义数据库操作(这里简化)
/// <summary>
/// Simplified stand-in for the database layer: it only logs each batch it receives.
/// </summary>
public class EmployeeRepository
{
    /// <summary>
    /// Number of times <see cref="InsertEmployees"/> has been invoked.
    /// </summary>
    private static int _insertCount;

    /// <summary>
    /// Logs one batch of employees; the real bulk insert is still to be implemented.
    /// </summary>
    /// <param name="employees">The batch to insert.</param>
    /// <exception cref="ArgumentNullException"><paramref name="employees"/> is null.</exception>
    public static void InsertEmployees(Employee[] employees)
    {
        // Validate before touching the counter so a rejected call is not counted.
        if (employees == null)
        {
            throw new ArgumentNullException(nameof(employees));
        }

        // Interlocked keeps the call number correct when batches run in parallel.
        int callNumber = Interlocked.Increment(ref _insertCount);
        string ids = JsonConvert.SerializeObject(employees.Select(p => p.EmployeeID));

        Console.WriteLine($"线程Id:{Thread.CurrentThread.ManagedThreadId} 第{callNumber}次 总数:{employees.Length},数据:{ids}开始执行批量插入");

        // TODO: perform the actual bulk insert against the database here.
    }
}
数据流批处理封装类
/// <summary>
/// Wraps a TPL Dataflow pipeline that groups posted items into arrays of
/// <c>batchSize</c> and hands each array to a caller-supplied delegate.
/// Optionally, an idle timer flushes a partial batch when no item has been
/// posted for a configurable number of seconds.
/// </summary>
/// <typeparam name="T">Type of the items flowing through the pipeline.</typeparam>
public class BatchBlockPipeline<T>
{
    /// <summary>
    /// Block that groups incoming items into batches.
    /// </summary>
    private readonly BatchBlock<T> _batchBlock;

    /// <summary>
    /// Block that executes the caller's delegate for every batch.
    /// </summary>
    private readonly ActionBlock<T[]> _actionBlock;

    /// <summary>
    /// True when the idle-timer mode is enabled.
    /// </summary>
    private readonly bool _timeTrigger;

    /// <summary>
    /// Entry block used in idle-timer mode; it re-arms the timer for every item.
    /// Null when the idle-timer mode is disabled.
    /// </summary>
    private readonly TransformBlock<T, T> _transformBlock;

    /// <summary>
    /// Timer that flushes a partial batch after the idle period elapses.
    /// Null when the idle-timer mode is disabled.
    /// </summary>
    private readonly Timer _timer;

    /// <summary>
    /// Total number of items for which the delegate has finished running.
    /// </summary>
    private int _processedCount;

    /// <summary>
    /// Builds the pipeline.
    /// </summary>
    /// <param name="batchSize">Number of items per batch.</param>
    /// <param name="action">Delegate invoked with every batch.</param>
    /// <param name="boundedCapacity">Maximum number of buffered items; defaults to int.MaxValue (2147483647).</param>
    /// <param name="maxDegreeOfParallelism">Maximum number of batches processed concurrently; defaults to 1.</param>
    /// <param name="timeTrigger">
    /// Idle time in seconds after which a partial batch is flushed.
    /// 0 (the default) or a negative value disables the timer.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
    public BatchBlockPipeline(int batchSize, Action<T[]> action, int boundedCapacity = int.MaxValue, int maxDegreeOfParallelism = 1, int timeTrigger = 0)
    {
        if (action == null)
        {
            throw new ArgumentNullException(nameof(action));
        }

        _batchBlock = new BatchBlock<T>(batchSize, new GroupingDataflowBlockOptions { BoundedCapacity = boundedCapacity });
        _actionBlock = new ActionBlock<T[]>(batch =>
        {
            action(batch);
            // Batches may run in parallel, so the running total is updated atomically.
            Interlocked.Add(ref _processedCount, batch.Length);
        }, new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism });

        // PropagateCompletion forwards both completion and faults to the action block,
        // so no extra continuation is required.
        _batchBlock.LinkTo(_actionBlock, new DataflowLinkOptions { PropagateCompletion = true });

        if (timeTrigger > 0)
        {
            _timeTrigger = true;
            TimeSpan idleDelay = TimeSpan.FromSeconds(timeTrigger);

            // Created disarmed; every item passing through the transform block re-arms it,
            // so it only fires after the producer has been quiet for idleDelay.
            _timer = new Timer(_ => _batchBlock.TriggerBatch(), null, Timeout.InfiniteTimeSpan, Timeout.InfiniteTimeSpan);

            _transformBlock = new TransformBlock<T, T>(model =>
            {
                _timer.Change(idleDelay, Timeout.InfiniteTimeSpan);
                return model;
            }, new ExecutionDataflowBlockOptions { BoundedCapacity = boundedCapacity });
            _transformBlock.LinkTo(_batchBlock, new DataflowLinkOptions { PropagateCompletion = true });

            // Once the entry block has drained nothing re-arms the timer any more; the
            // batch block flushes its remainder itself when completion reaches it.
            _transformBlock.Completion.ContinueWith(_ => _timer.Dispose(), System.Threading.Tasks.TaskScheduler.Default);
        }
    }

    /// <summary>
    /// Task that completes when every posted item has been processed after
    /// <see cref="Close"/> was called, or faults if the pipeline faulted.
    /// </summary>
    public System.Threading.Tasks.Task Completion
    {
        get { return _actionBlock.Completion; }
    }

    /// <summary>
    /// Posts one item into the pipeline.
    /// </summary>
    /// <param name="model">The item to enqueue.</param>
    /// <returns>True if the item was accepted; false if the entry block declined it.</returns>
    public bool PostValue(T model)
    {
        return _timeTrigger ? _transformBlock.Post(model) : _batchBlock.Post(model);
    }

    /// <summary>
    /// Forces the items buffered so far to be emitted as a batch, e.g. when fewer
    /// than <c>batchSize</c> items remain.
    /// </summary>
    public void TriggerBatch()
    {
        _batchBlock.TriggerBatch();
    }

    /// <summary>
    /// Returns the total number of items processed so far.
    /// </summary>
    /// <returns>Count of items whose batch delegate has finished running.</returns>
    public int GetBatchSum()
    {
        // Read a counter instead of calling Receive() on the batch block: Receive()
        // would compete with the linked action block for a batch or block the caller.
        return Volatile.Read(ref _processedCount);
    }

    /// <summary>
    /// Signals that no more items will be posted. Buffered items, including a final
    /// partial batch, are still processed.
    /// </summary>
    public void Close()
    {
        // Complete only the entry block of the active mode; completion propagates
        // downstream through the links.
        if (_timeTrigger)
        {
            _transformBlock.Complete();
        }
        else
        {
            _batchBlock.Complete();
        }
    }
}
测试方法
/// <summary>
/// Console driver that pushes 100 generated employees through the batching pipeline.
/// </summary>
class Program
{
    /// <summary>
    /// Posts 100 employees in batches of 10 and then waits for a key press so the
    /// asynchronous batch output can be observed before the process exits.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var batchDataPipeline = new BatchBlockPipeline<Employee>(10, EmployeeRepository.InsertEmployees);

        for (int i = 0; i < 100; i++)
        {
            // Assign the id after creation; this relies only on the parameterless
            // Employee.Random() factory.
            Employee employee = Employee.Random();
            employee.EmployeeID = i;
            batchDataPipeline.PostValue(employee);
        }

        // Batches are processed on background threads; keep the process alive.
        Console.ReadKey();
    }
}
1.测试:100条数据,10个一批次插入数据库,并行数量1
//var batchDataPipeline = new BatchBlockPipeline<Employee>(10, EmployeeRepository.InsertEmployees);
2.测试:100条数据,10个一批次插入数据库,并行数量1,缓冲区容量上限(boundedCapacity)为20;缓冲区已满时 PostValue 会返回 false,该条数据不会进入管道
// var batchDataPipeline = new BatchBlockPipeline<Employee>(10, EmployeeRepository.InsertEmployees,20);
3.测试 100条数据,10个一批次插入数据库,多核多线程并行数量10
//var batchDataPipeline = new BatchBlockPipeline<Employee>(10, EmployeeRepository.InsertEmployees, maxDegreeOfParallelism: 10);
4.测试 100条数据,10个一批次插入数据库,定时触发3秒触发一次 (模拟不定期post数据,或者最后剩余的数据达不到batchsize的情况)
//var batchDataPipeline = new BatchBlockPipeline<Employee>(10, EmployeeRepository.InsertEmployees, timeTrigger: 3);
//for (int i = 0; i < 100; i++)
//{
// batchDataPipeline.PostValue(Employee.Random(i));
// if (i % 3 == 0|| i % 7 == 0 || i % 9 == 0)
// {
// Thread.Sleep(5000);
// }
//}