• C# 使用SIMD系列方法加速批量运算


    我们现在想做一些简单的批量运算,比如累乘得积,累加求和

    /// <summary>
    /// Plain scalar implementations of the batch operations, used as the
    /// baseline that the SIMD versions are compared against.
    /// </summary>
    public class NormalCalc
    {
        /// <summary>
        /// Multiplies all elements of <paramref name="nums"/> together.
        /// </summary>
        /// <param name="nums">The values to multiply.</param>
        /// <returns>The product of all elements; 1.0 for an empty array.</returns>
        public static double Multiply(double[] nums)
        {
            double product = 1.0d;

            foreach (double value in nums)
            {
                product *= value;
            }

            return product;
        }

        /// <summary>
        /// Adds all elements of <paramref name="nums"/> together.
        /// </summary>
        /// <param name="nums">The values to sum.</param>
        /// <returns>The sum of all elements; 0.0 for an empty array.</returns>
        public static double AddTotal(double[] nums)
        {
            double sum = 0.0d;

            foreach (double value in nums)
            {
                sum += value;
            }

            return sum;
        }
    }

    这种批量运算不正是指令集的优势么,那就试试吧

    C#中可以使用Vector类来做宽位运算,我这里有avx2指令集,也就是256位,double是64位的,那就有4个,如果做int运算自然就有8个

    在这里就是4个4个放到一个Vector里一起做乘法运算,最后把Vector里的4个结果拷贝到数组中互乘,再把多余的乘完就好了,乘法嘛,用1作为种子

        /// <summary>
        /// Multiplies all elements of <paramref name="nums"/> together, processing
        /// <c>Vector&lt;double&gt;.Count</c> elements per iteration through a pinned pointer.
        /// </summary>
        /// <param name="nums">The values to multiply.</param>
        /// <returns>The product of all elements; 1.0 for an empty array.</returns>
        public unsafe static double Multiply(double[] nums)
        {
            int vectorSize = Vector<double>.Count;
            var accVector = Vector<double>.One;
            int i;

            fixed (double* p = nums)
            {
                for (i = 0; i <= nums.Length - vectorSize; i += vectorSize)
                {
                    // Array data is only guaranteed to be aligned for double, not for a
                    // whole Vector<double>, so the read must not assume vector alignment.
                    var v = Unsafe.ReadUnaligned<Vector<double>>(p + i);
                    accVector *= v;
                }
            }

            // Fold the per-lane partial products into one scalar, reading the lanes
            // directly instead of copying them to a temporary heap array first.
            double result = 1.0d;
            for (int j = 0; j < vectorSize; j++)
            {
                result *= accVector[j];
            }

            // Multiply in the tail elements that did not fill a whole vector.
            for (; i < nums.Length; i++)
            {
                result *= nums[i];
            }

            return result;
        }

    下一个问题就是,我总不见得又得每种数据类型都写一遍吧,咱有没有办法用C#的各种新特性写成泛型?咱有Span有预览特性INumber,试了下还真可以

    用new Vector构造,泛型T用INumber约束就有了T.One来表示数字1,并且能随便的做乘法运算了

        /// <summary>
        /// Multiplies all elements of <paramref name="nums"/> together, using
        /// <see cref="Vector{T}"/> when the element type supports it.
        /// </summary>
        /// <typeparam name="T">The numeric element type.</typeparam>
        /// <param name="nums">The values to multiply.</param>
        /// <returns>The product of all elements; <c>T.One</c> for an empty array.</returns>
        public static T Multiply<T>(T[] nums) where T : struct, INumber<T>
        {
            T result = T.One;
            int i = 0;

            // The INumber<T> constraint also admits types that Vector<T> cannot hold
            // (decimal, for example); touching Vector<T> for those throws
            // NotSupportedException, so they go straight to the scalar loop below.
            if (Vector<T>.IsSupported)
            {
                int vectorSize = Vector<T>.Count;
                var accVector = Vector<T>.One;

                for (; i <= nums.Length - vectorSize; i += vectorSize)
                {
                    accVector *= new Vector<T>(nums, i);
                }

                // Fold the per-lane partial products into one scalar.
                for (int j = 0; j < vectorSize; j++)
                {
                    result *= accVector[j];
                }
            }

            // Tail elements that did not fill a whole vector (or every element when
            // the vector path was skipped).
            for (; i < nums.Length; i++)
            {
                result *= nums[i];
            }

            return result;
        }

    理论上Span速度不会比指针快,new Vector不会比Unsafe.Read快,但是差不了太多,就能写成泛型方法

    来测试一下速度:

    
    // Build the input array of random values.
    // NOTE(review): the 2.723 scale factor is presumably chosen close to e so the
    // running product neither overflows nor collapses to zero - confirm.
    const int ElementCount = 100000;
    const int Rounds = 10000;
    var nums = new double[ElementCount];
    var random = new Random();
    for (int index = 0; index < nums.Length; index++)
    {
        nums[index] = random.NextDouble() * 2.723;
    }

    // Plain scalar product
    var stopwatch = Stopwatch.StartNew();
    for (int round = 0; round < Rounds; round++)
    {
        NormalCalc.Multiply(nums);
    }
    stopwatch.Stop();
    Console.WriteLine(stopwatch.ElapsedMilliseconds);

    // Vector
    stopwatch.Restart();
    for (int round = 0; round < Rounds; round++)
    {
        SIMD_Calc.Multiply(nums);
    }
    stopwatch.Stop();
    Console.WriteLine(stopwatch.ElapsedMilliseconds);

    // Vector + Span + INumber, written as a generic method
    stopwatch.Restart();
    for (int round = 0; round < Rounds; round++)
    {
        SIMD_Calc.MultiplySpan(nums);
    }
    stopwatch.Stop();
    Console.WriteLine(stopwatch.ElapsedMilliseconds);

    结果为:

    730

    185

    190

    不错不错,效果还挺满意的

    接下来来个泛型的累加

        /// <summary>
        /// Sums all elements of <paramref name="nums"/>, using <see cref="Vector{T}"/>
        /// when the element type supports it.
        /// </summary>
        /// <typeparam name="T">The numeric element type.</typeparam>
        /// <param name="nums">The values to sum.</param>
        /// <returns>The sum of all elements; <c>T.Zero</c> for an empty array.</returns>
        public static T AddTotal<T>(T[] nums) where T : struct, INumber<T>
        {
            T result = T.Zero;
            int i = 0;

            // The INumber<T> constraint also admits types that Vector<T> cannot hold
            // (decimal, for example); touching Vector<T> for those throws
            // NotSupportedException, so they go straight to the scalar loop below.
            if (Vector<T>.IsSupported)
            {
                int vectorSize = Vector<T>.Count;
                var accVector = Vector<T>.Zero;

                for (; i <= nums.Length - vectorSize; i += vectorSize)
                {
                    accVector += new Vector<T>(nums, i);
                }

                // A dot product with the all-ones vector adds the lanes together.
                result = Vector.Dot(accVector, Vector<T>.One);
            }

            // Tail elements that did not fill a whole vector (or every element when
            // the vector path was skipped).
            for (; i < nums.Length; i++)
            {
                result += nums[i];
            }

            return result;
        }

    还有一种方式不使用Vector,而是直接使用Avx2类下的方法做运算,需要加个是否支持的判断

        /// <summary>
        /// Sums all elements of <paramref name="nums"/> using AVX2 256-bit integer
        /// instructions directly.
        /// </summary>
        /// <param name="nums">The values to sum.</param>
        /// <returns>The sum of all elements; 0 for an empty array.</returns>
        /// <exception cref="NotSupportedException">AVX2 is not supported on this machine.</exception>
        public unsafe static int AddTotal_Avx2(int[] nums)
        {
            if (!Avx2.IsSupported)
            {
                throw new NotSupportedException();
            }

            // Number of 32-bit lanes in one 256-bit register.
            int lanes = Vector256<int>.Count;
            Vector256<int> sums = Vector256<int>.Zero;
            int index = 0;

            fixed (int* data = nums)
            {
                int lastBlockStart = nums.Length - lanes;
                while (index <= lastBlockStart)
                {
                    sums = Avx2.Add(sums, Avx2.LoadVector256(data + index));
                    index += lanes;
                }
            }

            // Spill the per-lane partial sums to the stack and add them up.
            int* spilled = stackalloc int[lanes];
            Avx2.Store(spilled, sums);

            int total = 0;
            for (int lane = 0; lane < lanes; lane++)
            {
                total += spilled[lane];
            }

            // Tail elements that did not fill a whole register.
            while (index < nums.Length)
            {
                total += nums[index];
                index++;
            }

            return total;
        }

    但是我们要做好回落,比如没有avx2就用sse,没有sse就用普通的

    不过System.Runtime.Intrinsics.X86下面一堆这种,再说了还有arm的,所以通用性不如Vector方法

        /// <summary>
        /// Sums all elements of <paramref name="nums"/>, picking the widest supported
        /// instruction set: AVX2 first, then SSE2, then a plain scalar loop.
        /// </summary>
        /// <param name="nums">The values to sum.</param>
        /// <returns>The sum of all elements.</returns>
        public unsafe static int AddTotal_2(int[] nums)
        {
            if (Avx2.IsSupported)
            {
                return AddTotal_Avx2(nums);
            }

            if (Sse2.IsSupported)
            {
                return AddTotal_Sse2(nums);
            }

            // Scalar fallback, written inline: the NormalCalc.AddTotal overload
            // visible in this file takes double[] and returns double, so it cannot
            // be called with an int[] or returned as an int.
            int result = 0;
            for (int i = 0; i < nums.Length; i++)
            {
                result += nums[i];
            }
            return result;
        }

    把Vector用在两个数组相加相乘上会更加简单

        /// <summary>
        /// Multiplies two arrays element by element, using <see cref="Vector{T}"/>
        /// when the element type supports it.
        /// </summary>
        /// <typeparam name="T">The numeric element type.</typeparam>
        /// <param name="numsl">The left-hand operands.</param>
        /// <param name="numsr">The right-hand operands; must be as long as <paramref name="numsl"/>.</param>
        /// <returns>A new array where element <c>i</c> is <c>numsl[i] * numsr[i]</c>.</returns>
        /// <exception cref="ArgumentException">The two arrays differ in length.</exception>
        public static T[] Multiply<T>(T[] numsl, T[] numsr) where T : struct, INumber<T>
        {
            if (numsl.Length != numsr.Length)
            {
                throw new ArgumentException("Both arrays must have the same length.", nameof(numsr));
            }

            T[] result = new T[numsl.Length];
            int i = 0;

            // The INumber<T> constraint also admits types that Vector<T> cannot hold
            // (decimal, for example); touching Vector<T> for those throws
            // NotSupportedException, so they go straight to the scalar loop below.
            if (Vector<T>.IsSupported)
            {
                int vectorSize = Vector<T>.Count;

                for (; i <= numsl.Length - vectorSize; i += vectorSize)
                {
                    var l = new Vector<T>(numsl, i);
                    var r = new Vector<T>(numsr, i);
                    (l * r).CopyTo(result, i);
                }
            }

            // Tail elements that did not fill a whole vector (or every element when
            // the vector path was skipped).
            for (; i < numsl.Length; i++)
            {
                result[i] = numsl[i] * numsr[i];
            }

            return result;
        }

    批量加1

    普通方法:

        /// <summary>
        /// Increments every element of <paramref name="nums"/> by one, in place.
        /// </summary>
        /// <param name="nums">The array to modify.</param>
        public static void AddOne(int[] nums)
        {
            int index = 0;
            while (index < nums.Length)
            {
                nums[index] += 1;
                index++;
            }
        }

    SIMD:

        /// <summary>
        /// Increments every element of <paramref name="nums"/> by one, in place, using
        /// <see cref="Vector{T}"/> when the element type supports it.
        /// </summary>
        /// <typeparam name="T">The numeric element type.</typeparam>
        /// <param name="nums">The array to modify.</param>
        public static void AddOne<T>(T[] nums) where T : struct, INumber<T>
        {
            int i = 0;

            // The INumber<T> constraint also admits types that Vector<T> cannot hold
            // (decimal, for example); touching Vector<T> for those throws
            // NotSupportedException, so they go straight to the scalar loop below.
            if (Vector<T>.IsSupported)
            {
                int vectorSize = Vector<T>.Count;
                var ones = Vector<T>.One;

                for (; i <= nums.Length - vectorSize; i += vectorSize)
                {
                    var incremented = new Vector<T>(nums, i) + ones;
                    incremented.CopyTo(nums, i);
                }
            }

            // Tail elements that did not fill a whole vector (or every element when
            // the vector path was skipped).
            for (; i < nums.Length; i++)
            {
                nums[i]++;
            }
        }

    跑分是  int类型 普通方法:390   SIMD:70

    double类型  普通方法:578  SIMD:145

    这对于我们平时普通计算的性能帮助还是有的,官方还用SIMD优化了Matrix的一些类,不过都是很小的二维矩阵,你可以根据自己的需要去设计更复杂的大矩阵运算类,如果需要更复杂的批量多维矩阵处理推荐OpenCvSharp

    代码下载:https://wwu.lanzoub.com/iglMD032ky0f

    参考链接:

    https://zhuanlan.zhihu.com/p/60171538

    https://habr.com/en/post/467689

    https://www.zhihu.com/question/266256257

  • 相关阅读:
    RabbitMQ笔记-死信队列与延时队列
    设计模式-迭代器模式
    RabbitMQ笔记-Demo(C#)
    RabbitMQ笔记-消息追踪【未完成】
    RabbitMQ笔记-安装&命令
    RabbitMQ笔记-Exchange、Queue、Message详细说明
    MySQL笔记-MVCC【没写】
    MySQL笔记-基础知识
    多线程笔记-基础知识
    在Redis中进行分页排序查询【转】
  • 原文地址:https://www.cnblogs.com/gxrsprite/p/16135450.html
Copyright © 2020-2023  润新知