C# 使用SIMD系列方法加速批量运算

我们现在想做一些简单的批量运算，比如累乘得积，累加求和

/// <summary>
/// Plain scalar implementations of the batch reductions, used as the
/// baseline that the SIMD versions are compared against.
/// </summary>
public class NormalCalc
{
    /// <summary>
    /// Multiplies all elements of <paramref name="nums"/> together, front to back.
    /// </summary>
    /// <param name="nums">Values to multiply.</param>
    /// <returns>The running product seeded with 1.0, so an empty array yields 1.0.</returns>
    public static double Multiply(double[] nums)
    {
        double product = 1.0d;

        foreach (double value in nums)
        {
            product *= value;
        }

        return product;
    }

    /// <summary>
    /// Adds all elements of <paramref name="nums"/> together, front to back.
    /// </summary>
    /// <param name="nums">Values to sum.</param>
    /// <returns>The running sum seeded with 0.0, so an empty array yields 0.0.</returns>
    public static double AddTotal(double[] nums)
    {
        double sum = 0.0d;

        foreach (double value in nums)
        {
            sum += value;
        }

        return sum;
    }
}

这种批量运算不正是指令集的优势么，那就试试吧

C#中可以使用Vector类来做宽位运算。我这里有avx2指令集，也就是256位；double是64位的，所以一个Vector里能放4个double，如果做int（32位）运算自然就能放8个

在这里就是把数组元素4个一组放到一个Vector里一起做乘法运算，最后把Vector里的4个累积结果拷贝到数组中互乘，再把剩下不足4个的尾部元素乘完就好了。乘法嘛，用1作为种子

    /// <summary>
    /// Computes the product of all elements of <paramref name="nums"/> using
    /// <see cref="Vector{T}"/> lanes for the bulk of the array and a scalar
    /// loop for the leftover tail.
    /// </summary>
    /// <param name="nums">Values to multiply.</param>
    /// <returns>
    /// The product of all elements; 1.0 for an empty array. The lane-wise
    /// multiplication order differs from a plain front-to-back loop.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="nums"/> is null.</exception>
    public unsafe static double Multiply(double[] nums)
    {
        if (nums is null)
        {
            throw new System.ArgumentNullException(nameof(nums));
        }

        int vectorSize = Vector<double>.Count;
        var accVector = Vector<double>.One;
        int i;

        fixed (double* p = nums)
        {
            // Consume the array one full vector at a time; each lane keeps its own running product.
            for (i = 0; i <= nums.Length - vectorSize; i += vectorSize)
            {
                // The array data is only element-aligned, not vector-aligned,
                // so the read must not assume vector alignment.
                var v = Unsafe.ReadUnaligned<Vector<double>>(p + i);
                accVector = Vector.Multiply(accVector, v);
            }
        }

        // Fold the per-lane products into one scalar without a temporary array.
        double result = 1.0d;
        for (int lane = 0; lane < vectorSize; lane++)
        {
            result *= accVector[lane];
        }

        // Multiply in the tail elements that did not fill a whole vector.
        for (; i < nums.Length; i++)
        {
            result *= nums[i];
        }

        return result;
    }

下一个问题就是，我总不见得又得每种数据类型都写一遍吧，咱有没有办法用C#的各种新特性写成泛型？咱有Span有预览特性INumber，试了下还真可以

用new Vector构造，泛型T用INumber约束就有了T.One来表示数字1，并且能随便的做乘法运算了

    /// <summary>
    /// Generic product of all elements of <paramref name="nums"/>: full
    /// vectors are multiplied lane-wise, then the lanes and the remaining
    /// tail elements are folded into a single value.
    /// </summary>
    /// <typeparam name="T">Numeric element type.</typeparam>
    /// <param name="nums">Values to multiply.</param>
    /// <returns>The product of all elements; <c>T.One</c> for an empty array.</returns>
    public static T Multiply<T>(T[] nums) where T : struct, INumber<T>
    {
        int lanes = Vector<T>.Count;
        int lastFullStart = nums.Length - lanes;
        Vector<T> laneProducts = Vector<T>.One;

        // Bulk part: one whole vector per step.
        int index = 0;
        while (index <= lastFullStart)
        {
            laneProducts *= new Vector<T>(nums, index);
            index += lanes;
        }

        // Collapse the per-lane products, lane 0 first.
        T product = T.One;
        for (int lane = 0; lane < lanes; lane++)
        {
            product *= laneProducts[lane];
        }

        // Tail part: elements that did not fill a whole vector.
        while (index < nums.Length)
        {
            product *= nums[index];
            index++;
        }

        return product;
    }

理论上Span速度不会比指针快，new Vector不会比Unsafe.Read快，但是差不了太多，而且这样就能写成泛型方法

来测试一下速度：


// Build the input: 100000 random doubles, each NextDouble() scaled by 2.723.
var nums = new double[100000];
var random = new Random();
for (int k = 0; k < nums.Length; k++)
{
    nums[k] = random.NextDouble() * 2.723;
}

// Baseline: plain scalar product, 10000 rounds.
var stopwatch = Stopwatch.StartNew();
for (int round = 0; round < 10000; round++)
{
    NormalCalc.Multiply(nums);
}
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);

// Vector version, 10000 rounds.
stopwatch.Restart();
for (int round = 0; round < 10000; round++)
{
    SIMD_Calc.Multiply(nums);
}
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);

// Generic version (Vector + Span + INumber), 10000 rounds.
// NOTE(review): the generic method shown in this article is named Multiply<T>;
// MultiplySpan presumably is its name in the downloadable code — confirm.
stopwatch.Restart();
for (int round = 0; round < 10000; round++)
{
    SIMD_Calc.MultiplySpan(nums);
}
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);

结果为：

730

185

190

不错不错，效果还挺满意的

接下来来个泛型的累加

    /// <summary>
    /// Generic sum of all elements of <paramref name="nums"/>: full vectors
    /// are added lane-wise, the lanes are collapsed with a dot product
    /// against an all-ones vector, and the remaining tail is added one by one.
    /// </summary>
    /// <typeparam name="T">Numeric element type.</typeparam>
    /// <param name="nums">Values to sum.</param>
    /// <returns>The sum of all elements; zero for an empty array.</returns>
    public static T AddTotal<T>(T[] nums) where T : struct, INumber<T>
    {
        int lanes = Vector<T>.Count;
        Vector<T> laneSums = Vector<T>.Zero;

        // Bulk part: accumulate one whole vector per step.
        int index = 0;
        for (int lastFullStart = nums.Length - lanes; index <= lastFullStart; index += lanes)
        {
            laneSums = Vector.Add(laneSums, new Vector<T>(nums, index));
        }

        // Dot product with all-ones adds the lanes together into one scalar.
        T total = Vector.Dot(laneSums, Vector<T>.One);

        // Tail part: elements that did not fill a whole vector.
        while (index < nums.Length)
        {
            total += nums[index];
            index++;
        }

        return total;
    }

还有一种方式不使用Vector，而是直接使用Avx2类下的方法做运算，需要加个是否支持的判断

    /// <summary>
    /// Sums an int array with explicit AVX2 intrinsics: 256-bit vectors are
    /// accumulated lane-wise, then the lanes and the leftover tail elements
    /// are added into a single int.
    /// </summary>
    /// <param name="nums">Values to sum.</param>
    /// <returns>The sum of all elements; 0 for an empty array.</returns>
    /// <exception cref="NotSupportedException">AVX2 is not supported on the current hardware.</exception>
    public unsafe static int AddTotal_Avx2(int[] nums)
    {
        // Bail out first so the main path is not nested inside the capability check.
        if (!Avx2.IsSupported)
        {
            throw new NotSupportedException();
        }

        int lanes = Vector256<int>.Count; // 256 bits / 32-bit int = 8 lanes
        Vector256<int> laneSums = Vector256<int>.Zero;
        int index = 0;

        fixed (int* data = nums)
        {
            // Bulk part: one 256-bit load and add per step.
            int lastFullStart = nums.Length - lanes;
            while (index <= lastFullStart)
            {
                laneSums = Avx2.Add(laneSums, Avx2.LoadVector256(data + index));
                index += lanes;
            }
        }

        // Spill the accumulator to the stack and add the lanes together.
        int* spill = stackalloc int[lanes];
        Avx2.Store(spill, laneSums);

        int total = 0;
        for (int lane = 0; lane < lanes; lane++)
        {
            total += spill[lane];
        }

        // Tail part: elements that did not fill a whole vector.
        while (index < nums.Length)
        {
            total += nums[index];
            index++;
        }

        return total;
    }

但是我们要做好回落，比如没有avx2就用sse，没有sse就用普通的

不过System.Runtime.Intrinsics.X86下面一堆这种，再说了还有arm的，所以通用性不如Vector方法

    /// <summary>
    /// Sums an int array with the widest instruction set available:
    /// AVX2 first, then SSE2, and a plain scalar loop when neither is supported.
    /// </summary>
    /// <param name="nums">Values to sum.</param>
    /// <returns>The sum of all elements.</returns>
    public unsafe static int AddTotal_2(int[] nums)
    {
        if (Avx2.IsSupported)
        {
            return AddTotal_Avx2(nums);
        }

        if (Sse2.IsSupported)
        {
            return AddTotal_Sse2(nums);
        }

        // Scalar fallback written inline: NormalCalc.AddTotal takes double[] and
        // returns double, so it cannot be called with an int[] here.
        int result = 0;
        for (int i = 0; i < nums.Length; i++)
        {
            result += nums[i];
        }

        return result;
    }

把Vector用在两个数组相加相乘上会更加简单

    /// <summary>
    /// Element-wise product of two equally long arrays: full vectors are
    /// multiplied and stored directly into the result, and the remaining tail
    /// elements are multiplied one by one.
    /// </summary>
    /// <typeparam name="T">Numeric element type.</typeparam>
    /// <param name="numsl">Left operands.</param>
    /// <param name="numsr">Right operands; must have the same length as <paramref name="numsl"/>.</param>
    /// <returns>A new array where element i is <c>numsl[i] * numsr[i]</c>.</returns>
    /// <exception cref="ArgumentException">The two arrays differ in length.</exception>
    public unsafe static T[] Multiply<T>(T[] numsl, T[] numsr) where T : struct,INumber<T>
    {
        int length = numsl.Length;
        if (length != numsr.Length)
        {
            throw new ArgumentException();
        }

        var products = new T[length];
        int lanes = Vector<T>.Count;

        // Bulk part: multiply one whole vector pair per step and store it in place.
        int index = 0;
        for (int lastFullStart = length - lanes; index <= lastFullStart; index += lanes)
        {
            Vector<T> left = new Vector<T>(numsl, index);
            Vector<T> right = new Vector<T>(numsr, index);
            Vector.Multiply(left, right).CopyTo(products, index);
        }

        // Tail part: elements that did not fill a whole vector.
        while (index < length)
        {
            products[index] = numsl[index] * numsr[index];
            index++;
        }

        return products;
    }

批量加1

普通方法：

    /// <summary>
    /// Increments every element of <paramref name="nums"/> by one, in place.
    /// </summary>
    /// <param name="nums">Array whose elements are incremented.</param>
    public static void AddOne(int[] nums)
    {
        int index = 0;
        while (index < nums.Length)
        {
            nums[index] += 1;
            index++;
        }
    }

SIMD：

    /// <summary>
    /// Generic in-place increment: adds one to every element of
    /// <paramref name="nums"/>, a whole vector at a time for the bulk of the
    /// array and element by element for the leftover tail.
    /// </summary>
    /// <typeparam name="T">Numeric element type.</typeparam>
    /// <param name="nums">Array whose elements are incremented.</param>
    public static void AddOne<T>(T[] nums) where T : struct, INumber<T>
    {
        int lanes = Vector<T>.Count;
        Vector<T> ones = Vector<T>.One;

        // Bulk part: load a vector, add one to each lane, write it back to the same slot.
        int index = 0;
        for (int lastFullStart = nums.Length - lanes; index <= lastFullStart; index += lanes)
        {
            Vector<T> incremented = Vector.Add(new Vector<T>(nums, index), ones);
            incremented.CopyTo(nums, index);
        }

        // Tail part: elements that did not fill a whole vector.
        while (index < nums.Length)
        {
            nums[index]++;
            index++;
        }
    }

跑分是 int类型普通方法：390 SIMD：70

double类型普通方法：578 SIMD：145

这对于我们平时普通计算的性能帮助还是有的，官方还用SIMD优化了Matrix的一些类，不过都是很小的二维矩阵，你可以根据自己的需要去设计更复杂的大矩阵运算类，如果需要更复杂的批量多维矩阵处理推荐OpenCvSharp

代码下载：https://wwu.lanzoub.com/iglMD032ky0f

参考链接：

https://zhuanlan.zhihu.com/p/60171538

https://habr.com/en/post/467689

https://www.zhihu.com/question/266256257

相关阅读:
菜鸟学IT之豆瓣爬取初体验
 菜鸟学IT之python网页爬取多页爬取
 菜鸟学IT之python网页爬取初体验
 菜鸟学IT之python词云初体验
 菜鸟学IT之python3关于列表，元组，字典，集合浅认识！
整合ssm框架
 MyBatis
服务出现服务名无效的原因及解决方法
 数据库（三）
数据库（二）
原文地址：https://www.cnblogs.com/gxrsprite/p/16135450.html