• map-reduce-filter in C# (a simple exploration of map-reduce)


    ES6 in JavaScript gave us map, reduce, filter, and friends.

    So does C# really not have them? Are you kidding me?

     Let's look at map first.

          static IEnumerable<TResult> Map<T,TResult>(Func<T, TResult> func,IEnumerable<T> list)
            {
                foreach(var i in list)
                {
                    yield return func(i);
                }
            }
    
    
    
            static void Main(string[] args)
            {
                var testList = new List<int> { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    
                var mapResult = Map<int, int>(x => x + 2, testList).ToList<int>();
    
                foreach(var i in mapResult)
                {
                    Console.WriteLine(i);
                }
    
                //However, LINQ has had this method for a long time: it's Select.
                var result = testList.Select(obj => obj + 2).ToList<int>();
    
                Console.ReadLine();

     Reduce: honestly, the name reduce never felt natural to me. It is plainly Aggregate, so why call it reduce at all?

            //Still, it's fairly interesting:
            static T Reduce<T,U>(Func<U,T,T> func, IEnumerable<U> list,T acc)
            {
                foreach(var i in list)
                {
                    acc = func(i, acc);
                    //the result of each step becomes the accumulator passed into the next call: aggregation
                    //fn(x,y) => fn(fn(x,y),z) => fn(fn(fn(x,y),z),zz)
                }
                return acc;
            }
    
    
            static void Main(string[] args)
            {
                var testList = new List<int> { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
                var reduceResult = Reduce<int, int>((x, y) => x + y, testList, 0);
                Console.WriteLine(reduceResult);
    
    
                //Again, C# has had this for ages: it's Aggregate.
                var linqReduce = Enumerable.Range(1, 10).Aggregate(0, (acc, x) => acc + x);

    Filter in C# hardly needs explaining: it is simply Where. For symmetry, a hand-rolled version is sketched below.
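
    A minimal hand-rolled Filter in the same style as the Map and Reduce above (the name Filter and the predicate-first parameter order are just our own convention here):

            static IEnumerable<T> Filter<T>(Func<T, bool> predicate, IEnumerable<T> list)
            {
                foreach (var i in list)
                {
                    if (predicate(i))
                    {
                        yield return i;
                    }
                }
            }

            //usage: Filter<int>(x => x % 2 == 0, testList) keeps only the even numbers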

    Summary:

    Map = Select | Enumerable.Range(1, 10).Select(x => x + 2);
    Reduce = Aggregate | Enumerable.Range(1, 10).Aggregate(0, (acc, x) => acc + x);
    Filter = Where | Enumerable.Range(1, 10).Where(x => x % 2 == 0);
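
    Chaining the three gives a LINQ pipeline that reads much like its JS counterpart; for example, keep the even numbers, add 2 to each, then sum them:

        var total = Enumerable.Range(1, 10).Where(x => x % 2 == 0).Select(x => x + 2).Aggregate(0, (acc, x) => acc + x); // 40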

    Reading this far, doesn't a certain well-known technique come to mind?

    That would be Google's MapReduce.

     See: https://ayende.com/blog/4435/map-reduce-a-visual-explanation

     Recommended read: http://www.justinshield.com/2011/06/mapreduce-in-c/

    Below is the code from our exploration:

    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.Linq;
    using System.Text;
    using System.Threading;
    using System.Threading.Tasks;
    
    namespace ConsoleApplication80
    {
    
        /// <summary>
        /// Today we look at roughly two things:
        /// LINQ => map reduce
        /// PLINQ
        /// plus parallel processing and the actor model (and, inevitably, Go's CSP);
        /// and recursion.
        /// </summary>
        class Program
        {
    
            static void PlinqInfo()
            {
                int[] arr = new int[10] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
                //Sequential computation: a linear scan for target.
                int target = 5;
                int len = arr.Count();
                int index = 0;
                for (int i = 0; i < len; i++)
                {
                    if (arr[i] == target)
                    {
                        index = i;
                    }
                }
    
                //This still solves the problem sequentially.
                //Search from both ends at once, one from the front and one from the back,
                //roughly halving the number of loop iterations.
                //The task is split and the iteration count drops, but the work inside each
                //iteration still runs one step after another.
                for (int j = 0; j < len / 2; j++)
                {
                    //fewer iterations, but the two checks inside still run serially
                    //step 1
                    if (arr[j] == target)
                    {
                        index = j;
                    }
                    //step 2
                    if (arr[len - 1 - j] == target)
                    {
                        index = len - 1 - j;
                    }
                }
    
                //A further step:
                //now each iteration really does run its two checks concurrently.
                //The matching happens in parallel, but at the end we still have to
                //gather the results, which is a reduce step.
                //Every iteration spawns two threads to do the work:
                for (int j = 0; j < len / 2; j++)
                {
                    //step 1
                    new Thread(o =>
                    {
                        int param = (int)o;
                        if (arr[param] == target)
                        {
                            index = param;
                        }
                    }).Start(j);

                    //step 2
                    new Thread(o =>
                    {
                        int param = (int)o;
                        if (arr[param] == target)
                        {
                            index = param;
                        }
                    }).Start(len - 1 - j);

                }
    
                //Approach three: split the task,
                //or rather, split the data.
                int[] arr1 = new int[5];
                int[] arr2 = new int[5];
                Array.Copy(arr, 0, arr1, 0, 5);
                Array.Copy(arr, 5, arr2, 0, 5);
                //Split the array into two halves, then process each half in parallel.
                new Thread(() =>
                {
                    //scan arr1
                    for (int k = 0; k < arr1.Length; k++)
                    {
                        if (arr1[k] == target) index = k;
                    }
                }).Start();
                new Thread(() =>
                {
                    //scan arr2 (indices are offset by the length of the first half)
                    for (int k = 0; k < arr2.Length; k++)
                    {
                        if (arr2[k] == target) index = arr1.Length + k;
                    }
                }).Start();

                //Once both halves are processed, aggregate (reduce) the results.
                //The key to the parallelism is splitting the task:
                //split the work, process concurrently, and let the pieces communicate (the actor model).
                //The actor model remains the canonical way to coordinate concurrent work.
    
            }
    
    
            /// <summary>
            /// This is probably the most naive way to do it.
            /// </summary>
            static void BadCountWord()
            {
    
            }
    
            /// <summary>
            /// The most basic way to count how many times each word appears;
            /// Count Word;
            /// </summary>
            static void CountWord()
            {
                //Single-threaded, sequential execution;
                //count how often each word appears.
                String[] text = new String[] { "hello world", "hello every one", "say hello to everyone in the world", "fuck the world" };
                //tally the frequency of every word
                Dictionary<string, int> dic = new Dictionary<string, int>();
                //walk over every sentence
                int len = text.Length;
                for (int i = 0; i < len; i++)
                {
                    string[] temp = text[i].Split(' ');//assume the data is well-formed and words are separated by spaces
                    int tempLen = temp.Length;
                    for (int j = 0; j < tempLen; j++)
                    {
                        string key = temp[j];
                        if (!dic.ContainsKey(key))
                        {
                            dic.Add(key, 1);
                        }
                        else
                        {
                            //read the current count
                            int value = dic[key];
                            value++;
                            dic[key] = value;
                        }
                    }
    
                }
    
                //finally, print the tallies
                foreach (var item in dic)
                {
                    Console.WriteLine($"{item.Key}:{item.Value}");
                }
    
            }
    
    
            /// <summary>
            /// Count the words concurrently.
            /// Each node handles one piece of the data, and it works quite well.
            /// Looking at the whole pipeline, though, for small data sets a map reduce
            /// like this is clearly not worth the overhead.
            /// </summary>
            static void ParalleCountWord()
            {
                string[] text = new string[] { "hello world", "hello every one", "say hello to everyone in the world", "fuck the world" };
                //Assume the machine has four CPUs (mine actually does).
                //First split the task, which here really means splitting the data.
                string[] task1 = text[0].Split(' ');
                string[] task2 = text[1].Split(' ');
                string[] task3 = text[2].Split(' ');
                string[] task4 = text[3].Split(' ');
    
    
                //Since everything lives in the same scope, we could also write it like this:
                //Task<Dictionary<string, int>> task1_Result = Task.Run(()=> MapNode(task1));
                //and then run the four tasks concurrently, e.g.
                //Task<Dictionary<string, int>> task1_Result = Task.Factory.StartNew(data => MapNode((string[])data), task1);


                //This is the map phase: map in parallel.
                Task<Dictionary<string, int>> task1_Result = Task.Run(() => MapNode(task1));
                Task<Dictionary<string, int>> task2_Result = Task.Run(() => MapNode(task2));
                Task<Dictionary<string, int>> task3_Result = Task.Run(() => MapNode(task3));
                Task<Dictionary<string, int>> task4_Result = Task.Run(() => MapNode(task4));
    
    
    
                //Reduce in parallel.
                //Now do the reduce, i.e. the final tally of results.
                Task.WaitAll(task1_Result, task2_Result, task3_Result, task4_Result);
                //Once all four tasks have finished, combine their results.
                //Two nodes perform the reduce here, mainly to mimic the shape of a full map reduce,
                //i.e. two rounds of aggregation.
                //Again: for small data sets this kind of map reduce is simply not worth it.

                //first reduce
    
                Task<Dictionary<string, int>> reduce1_result_1_plus_2 = Task.Run(() => ReduceInfo(task1_Result.Result, task2_Result.Result));
                Task<Dictionary<string, int>> reduce2_result_3_plus_4 = Task.Run(() => ReduceInfo(task3_Result.Result, task4_Result.Result));
    
                Task.WaitAll(reduce1_result_1_plus_2, reduce2_result_3_plus_4);
                //second (and final) reduce
    
                Task<Dictionary<string, int>> finallyResult = Task.Run(() => ReduceInfo(reduce1_result_1_plus_2.Result, reduce2_result_3_plus_4.Result));
    
                var result = finallyResult.Result;
    
                foreach (var item in result)
                {
                    Console.WriteLine($"{item.Key}:{item.Value}");
                }
    
                //Notice that the map phase and the reduce phase above run back to back; in fact the two phases can be decoupled (see the continuation sketch after this method).
    
    
            }
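
            /// <summary>
            /// A sketch of the decoupling mentioned above: instead of blocking with WaitAll,
            /// the reduce is chained as a continuation so it starts as soon as its two inputs
            /// are ready. (MapThenReduce is a hypothetical helper added purely for illustration.)
            /// </summary>
            static Task<Dictionary<string, int>> MapThenReduce(string[] left, string[] right)
            {
                //map the two chunks in parallel
                Task<Dictionary<string, int>> leftMap = Task.Run(() => MapNode(left));
                Task<Dictionary<string, int>> rightMap = Task.Run(() => MapNode(right));

                //reduce as soon as both maps complete, without blocking the caller
                return Task.WhenAll(leftMap, rightMap)
                           .ContinueWith(t => ReduceInfo(t.Result[0], t.Result[1]))
                           .Unwrap();
            }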
    
            /// <summary>
            /// A unit of work that can run on its own.
            /// Each node returns its partial result when it finishes.
            /// </summary>
            /// <param name="arr"></param>
            static Task<Dictionary<string, int>> MapNode(string[] arr)
            {
                int len = arr.Length;
                Dictionary<string, int> dic = new Dictionary<string, int>();
                for (int i = 0; i < len; i++)
                {
                    string key = arr[i];
                    if (!dic.ContainsKey(key))
                    {
                        dic.Add(key, 1);
                    }
                    else
                    {
                        dic[key] = dic[key] + 1;
                    }
                }
    
                return Task.FromResult(dic);
            }
    
            /// <summary>
            /// Now we need to combine the partial results.
            /// Do we spin up another four nodes just to tally them?
            /// In practice a group by followed by a count would do; the point here is just to mimic map reduce.
            /// </summary>
            /// <returns></returns>
            static Task<Dictionary<string, int>> ReduceInfo(params Dictionary<string, int>[] dics)
            {
    
                Dictionary<string, int> result = new Dictionary<string, int>();
                int len = dics.Length;
                for (int i = 0; i < len; i++)
                {
                    var temp = dics[i];
                    foreach (var key in temp.Keys)
                    {
                        if (!result.ContainsKey(key))
                        {
                            int value = temp[key]; //new key: add it
                            result.Add(key, value);
                        }
                        else
                        {
                            int value = temp[key];
                            result[key] = result[key] + value; //accumulate into the existing count
                        }
    
                    }
                }
                return Task.FromResult(result);
            }
    
    
            /// <summary>
            /// Next, the same thing using the PLINQ that ships with .NET.
            /// Before running anything concurrently, ask:
            /// can the task be split, i.e. can each piece run independently,
            /// and is the result independent of execution order?
            /// </summary>
            static void PLinq()
            {
                //Next, use the built-in PLINQ to run the map reduce.
                //
                string[] text = new string[] { "hello","world", "hello","every","one", "say","hello","to","everyone","in","the","world", "fuck","the","world" };
                ////First tag every word with a weight-like value of 1;
                //var result = text.Select(o =>  new { key = o,value = 1 }).ToList(); //this is the first map, after which all the usual operations are available
                ////then group by;
                //var list = result.GroupBy(o => o.key, (key, elements) => new { key = key, Count = elements.Count() }).ToList();

                ////and just like that, the counting is done.

                //foreach (var item in list)
                //{
                //    Console.WriteLine($"{item.key}:{item.Count}");
                //}

                //With big data you obviously would not map it this directly.
    
                var mapResult = text.AsParallel().Select(o => new { key = o, value = 1 }).ToList();  //map in parallel
    
    
                //How would this grouping be implemented by hand? Add each key to a dictionary and, if it already exists, bump its value (see the sketch below).
                var list = mapResult.AsParallel().GroupBy(o => o.key, (key, elements) => new { key = key, Count = elements.Count() }).ToList(); // group by key
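
                //A dictionary-based sketch of the same counting, to make the comment above concrete
                //(an illustration of the grouping logic, not how LINQ's GroupBy works internally):
                var counted = new Dictionary<string, int>();
                foreach (var pair in mapResult)
                {
                    if (!counted.ContainsKey(pair.key))
                    {
                        counted.Add(pair.key, pair.value);
                    }
                    else
                    {
                        counted[pair.key] += pair.value;
                    }
                }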
    
                //Roughly how it executes:
                //make full use of the parallelism.
                foreach (var item in list)
                {
                    Console.WriteLine($"{item.key}:{item.Count}");
                }
    
            }
    
    
            /// <summary>
            /// And with that, the parallel version is done.
            /// At the end of the day it is just a group by followed by a count; hardly worth all the fuss.
            /// </summary>
            static void PLinq2()
            {
                string[] text = new string[] { "hello", "world", "hello", "every", "one", "say", "hello", "to", "everyone", "in", "the", "world", "fuck", "the", "world" };
    
                var re = text.ToLookup(o=>o);   //but what is the real logic behind group by? How does it dedupe and tally, and hand back exactly the shape we want?
    
                foreach(var i in re)
                {
                    Console.WriteLine(i.Key+"--"+ i.Count());
                }
    
                //var result = text.ToLookup(key => key, value => 1).Select(o => new { key = o.Key, cout = o.Sum() }).ToList();
    
    
    
                //foreach(var i in result)
                //{
                //    Console.WriteLine(i.key + " "+i.cout);
                //}
    
    
    
            }
    
            static void F()
            {
                // Create a list of Packages to put into a Lookup data structure.
                List<Package> packages = new List<Package> { new Package { Company = "Coho Vineyard", Weight = 1, TrackingNumber = 89453312L },
                                                     new Package { Company = "Lucerne Publishing", Weight = 2, TrackingNumber = 89112755L },
                                                     new Package { Company = "Wingtip Toys", Weight = 3, TrackingNumber = 299456122L },
                                                     new Package { Company = "Contoso Pharmaceuticals", Weight = 4, TrackingNumber = 670053128L },
                                                     new Package { Company = "Wide World Importers", Weight = 5, TrackingNumber = 4665518773L } };
    
                // Create a Lookup to organize the packages. Use the first character of Company as the key value.
                // Select Company appended to TrackingNumber for each element value in the Lookup.
    
    
                var resultMax = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, eleMax = elements.Max(o => o.Weight) }).ToList(); //after the group by, the largest Weight in each group
    
                var resultMin = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, eleMin = elements.Min(o => o.Weight) }).ToList();//after the group by, the smallest Weight in each group
    
    
                //The above returns only the max and min values; what I actually want is the whole object (see the sketch below).

                //var resultMax = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, eleMax = element }).ToList(); //after the group by, the Package with the largest Weight
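
                //One way to keep the whole Package with the largest Weight per group
                //(a sketch: OrderByDescending + First keeps the full object rather than just the value):
                var resultMaxObj = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, heaviest = elements.OrderByDescending(p => p.Weight).First() }).ToList();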
    
    
    
                var resultList = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, eles =elements }).ToList();//after the group by, the underlying group of elements
    
                var resultSum = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, eleCount = elements.Count() }).ToList();//after the group by, the number of records in each group
    
                var resultSd = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, Sum = elements.Sum(o=>o.Weight) }).ToList();//after the group by, the sum of Weights in each group
    
    
                var result = packages.GroupBy(o => o.Company[0], (key, elements) => new { key = key, ele = elements.Select(obj => obj.Company + obj.TrackingNumber).Aggregate((acc,next)=>acc+"--"+next) }).ToList(); //after the group by, fold each group into one concatenated string
    
                foreach(var o in result)
                {
                    Console.WriteLine($"{o.key}-----{o.ele}");
                }
    
    
                //Lookup<char, string> lookup = (Lookup<char, string>)packages.ToLookup(p => Convert.ToChar(p.Company.Substring(0, 1)),
                //                                                p => p.Company + " " + p.TrackingNumber);
    
                
                //var result = lookup.Select(o=>o.);
                //Console.WriteLine(result);
    
            }
    
    
    
            static void Main(string[] args)
            {
                PLinq2();
                //F();
    
                //string[] words = new string[] { "Able", "was", "I", "ere", "I", "saw", "Elba" };
                //string s = words.Aggregate((a, n) => a + " " + n);
    
    
                //IEnumerable<string> list = new List<string>() { "I", "want", "to", "fuck", "the", "life" };
    
                ////You can think of seed as the initial value;
                ////acc starts out as seed (when one is given);
                ////for performance you would use a StringBuilder; this is just to show the usage. Sum cannot do this!
                ////the last lambda operates on the final accumulated result;
                //var result = list.Aggregate("[seed]", (acc, next) => acc + " " + next, o => o + " <- one final map applied to the finished result");
                ////the key pieces of map reduce: PLINQ, map-reduce, big-data processing
                //Console.WriteLine(result);
                //Timing comparison:
                //Console.WriteLine("Regular sequential version----------");
                //Stopwatch sw = Stopwatch.StartNew();
                //sw.Restart();
                //CountWord();
                //Console.WriteLine($"Sequential time: {sw.ElapsedMilliseconds}");
    
    
                //Console.WriteLine("map-reduce parallel version-----------");
                //sw.Restart();
                //ParalleCountWord();
                //Console.WriteLine($"map reduce parallel time: {sw.ElapsedMilliseconds}");
    
    
                //Console.WriteLine("PLINQ version---------------");
                //sw.Restart();
                //PLinq();
                //Console.WriteLine($"PLINQ parallel time: {sw.ElapsedMilliseconds}");
    
                //PLinq();
    
                Console.ReadLine();
    
            }
        }
    
        internal class Package
        {
            internal string Company;
            internal long TrackingNumber;
            internal double Weight;
        }
    }
  • Original post: https://www.cnblogs.com/mc67/p/7391007.html