• visual studio自动向量化


    /////////////////////////////////////////////////
    /*
    SSE 和 AVX 每个都有16个寄存器(64位模式下;32位模式下各只能使用前8个,即 XMM0 ~ XMM7 / YMM0 ~ YMM7)
    SSE 有 XMM0 ~ XMM15,是128bit
    AVX 有 YMM0 ~ YMM15,是256bit
    */
    // vs2015 有自动向量化功能,使用 c/c++->command line->additional options->/Qvec-report:2 可输出向量化结果
    // 可自动向量化
    // Demonstration loop: element-wise sum of two 1024-element int arrays.
    // The notes in this file list it as a loop the compiler auto-vectorizes.
    // NOTE(review): a and b are read without ever being initialized and c is never
    // read back, so the function only serves to show what code the compiler emits.
    void loop1()
    {
    int a[1024];
    int b[1024];
    int c[1024];
    // Unit-stride loop over contiguous elements of all three arrays.
    for (int i = 0; i < 1024; i++)
    {
    c[i] = a[i] + b[i];
    }
    }

    /*
    向量化结果
    for (int i = 0; i < 1024; i++)
    {
    c[i] = a[i] + b[i];
    }

    vmovdqu ymm0, YMMWORD PTR _b$[ebp+eax]
    vpaddd ymm0, ymm0, YMMWORD PTR _a$[ebp+eax]
    vmovdqu YMMWORD PTR _c$[ebp+eax], ymm0
    vmovdqu ymm0, YMMWORD PTR _a$[ebp+eax+32]
    vpaddd ymm0, ymm0, YMMWORD PTR _b$[ebp+eax+32]
    vmovdqu YMMWORD PTR _c$[ebp+eax+32], ymm0
    add eax, 64 ; 00000040H
    cmp eax, 4096 ; 00001000H
    jl SHORT $LL4@main
    */

    // 不可自动向量化(1301)
    // 循环步长必须严格为1
    // Demonstration loop: the same element-wise sum as c[i] = a[i] + b[i], but
    // manually unrolled four times so the induction variable advances by 4.
    // The notes in this file report that the compiler refuses to vectorize it
    // (reason 1301: the loop stride must be exactly 1).
    // NOTE(review): a and b are read without being initialized; c is never read back.
    void loop2()
    {
    int a[1024];
    int b[1024];
    int c[1024];
    // Stride-4 loop; each iteration handles elements i .. i + 3 explicitly.
    for (int i = 0; i < 1024; i += 4)
    {
    c[i] = a[i] + b[i];
    c[i + 1] = a[i + 1] + b[i + 1];
    c[i + 2] = a[i + 2] + b[i + 2];
    c[i + 3] = a[i + 3] + b[i + 3];
    }
    }

    /*
    汇编结果
    c[i] = a[i] + b[i];
    mov ecx, DWORD PTR _b$[ebp+eax]
    add DWORD PTR _a$[ebp+eax], ecx
    c[i + 1] = a[i + 1] + b[i + 1];
    mov ecx, DWORD PTR _a$[ebp+eax+4]
    add DWORD PTR _b$[ebp+eax+4], ecx
    c[i + 2] = a[i + 2] + b[i + 2];
    mov ecx, DWORD PTR _a$[ebp+eax+8]
    add DWORD PTR _b$[ebp+eax+8], ecx
    c[i + 3] = a[i + 3] + b[i + 3];
    mov ecx, DWORD PTR _a$[ebp+eax+12]
    add DWORD PTR _b$[ebp+eax+12], ecx
    */

    // 不可自动向量化(1203),循环体内包含了非连续内存访问
    // 将数组 x[128 * 3] 拆分为 b[128], g[128], r[128] 可实现自动向量化,见 loop4
    // Demonstration loop: applies y = A * x to 128 interleaved 3-component
    // vectors, where x and y are 3x1 vectors and A is a 3x3 matrix stored
    // row-major in A[0..8]. This is a basic image-processing operation
    // (colour transform, RGB adjustment, and so on).
    //
    // The notes in this file report that the compiler does not vectorize this
    // loop (reason 1203: the loop body contains non-contiguous memory accesses),
    // because every component is addressed with a stride of 3.
    //
    // NOTE: the third row previously multiplied all three inputs by A[6]; it now
    // uses A[6], A[7], A[8] as a matrix-vector product requires. The assembly
    // listing that follows this function in the file was generated from the
    // earlier form, so its final instructions (a single imul by A[6]) reflect
    // the old expression.
    // NOTE(review): x and A are read without being initialized; y is never read back.
    void loop3()
    {
    int x[128 * 3];
    int y[128 * 3];
    int A[9];
    for (int i = 0; i < 128; ++i)
    {
    // Row 0 of A.
    y[i * 3 + 0] = x[i * 3 + 0] * A[0] + x[i * 3 + 1] * A[1] + x[i * 3 + 2] * A[2];
    // Row 1 of A.
    y[i * 3 + 1] = x[i * 3 + 0] * A[3] + x[i * 3 + 1] * A[4] + x[i * 3 + 2] * A[5];
    // Row 2 of A.
    y[i * 3 + 2] = x[i * 3 + 0] * A[6] + x[i * 3 + 1] * A[7] + x[i * 3 + 2] * A[8];
    }

    }

    /*
    汇编结果
    y[i * 3 + 0] = x[i * 3 + 0] * A[0] + x[i * 3 + 1] * A[1] + x[i * 3 + 2] * A[2];
    mov edi, DWORD PTR _x$[ebp+eax+4]
    mov ecx, edi
    imul ecx, DWORD PTR _A$[ebp+4]
    mov esi, DWORD PTR _x$[ebp+eax]
    mov edx, esi
    imul edx, DWORD PTR _A$[ebp]
    mov ebx, DWORD PTR _x$[ebp+eax+8]
    add edx, ecx
    mov ecx, ebx
    imul ecx, DWORD PTR _A$[ebp+8]
    add edx, ecx
    y[i * 3 + 1] = x[i * 3 + 0] * A[3] + x[i * 3 + 1] * A[4] + x[i * 3 + 2] * A[5];
    mov ecx, edi
    imul ecx, DWORD PTR _A$[ebp+16]
    mov DWORD PTR _y$[ebp+eax], edx
    mov edx, esi
    imul edx, DWORD PTR _A$[ebp+12]
    add edx, ecx
    mov ecx, ebx
    imul ecx, DWORD PTR _A$[ebp+20]
    add edx, ecx
    y[i * 3 + 2] = x[i * 3 + 0] * A[6] + x[i * 3 + 1] * A[6] + x[i * 3 + 2] * A[6];
    lea ecx, DWORD PTR [esi+edi]
    add ecx, ebx
    mov DWORD PTR _y$[ebp+eax+4], edx
    imul ecx, DWORD PTR _A$[ebp+24]
    mov DWORD PTR _y$[ebp+eax+8], ecx
    add eax, 12 ; 0000000cH
    cmp eax, 1536 ; 00000600H
    jl SHORT $LL4@loop3
    */

    // 可自动向量化
    // loop4_ (below) doesn't need extra space to split the image data, and its memory consistency is better
    // Demonstration loop: the 3x3 transform applied to planar data, i.e. three
    // separate 128-element arrays b, g and r instead of one interleaved array.
    // Each element of b, g and r is replaced in place by the corresponding row
    // of A (row-major in A[0..8]) applied to the (b, g, r) triple at that index.
    // The notes in this file list this form as auto-vectorizable.
    // NOTE(review): b, g, r and A are read without being initialized.
    void loop4()
    {
    int b[128], g[128], r[128];
    int A[9];
    for (int i = 0; i < 128; ++i)
    {
    // Compute all three outputs from the original values before any of the
    // inputs is overwritten.
    int _b = b[i] * A[0] + g[i] * A[1] + r[i] * A[2];
    int _g = b[i] * A[3] + g[i] * A[4] + r[i] * A[5];
    int _r = b[i] * A[6] + g[i] * A[7] + r[i] * A[8];

    b[i] = _b;
    g[i] = _g;
    r[i] = _r;
    }
    }

    // better than loop4
    // Demonstration loop: the 3x3 transform applied in place to an interleaved
    // array of 128 three-component elements (bgr), processed in 16 chunks of 8.
    // Each chunk is de-interleaved into the small b/g/r buffers, transformed
    // with unit-stride accesses, and written back interleaved, so only
    // 3 * 8 ints of scratch space are used.
    // NOTE(review): bgr and A are read without being initialized.
    void loop4_()
    {

    int bgr[128 * 3];
    int b[8];
    int g[8];
    int r[8];
    int A[9];
    for (int i = 0; i < 128 / 8; ++i)
    {
    // split data
    // start is the index of the first component of this chunk in bgr.
    int start = i * 8 * 3;
    for (int j = 0; j < 8; ++j)
    {
    b[j] = bgr[start + j * 3];
    g[j] = bgr[start + j * 3 + 1];
    r[j] = bgr[start + j * 3 + 2];
    }

    // adjust
    // Same per-element computation as loop4, on the 8-element buffers.
    for (int j = 0; j < 8; ++j)
    {
    int _b = b[j] * A[0] + g[j] * A[1] + r[j] * A[2];
    int _g = b[j] * A[3] + g[j] * A[4] + r[j] * A[5];
    int _r = b[j] * A[6] + g[j] * A[7] + r[j] * A[8];

    b[j] = _b;
    g[j] = _g;
    r[j] = _r;
    }

    // merge data
    // Write the transformed chunk back to its interleaved positions.
    for (int j = 0; j < 8; ++j)
    {
    bgr[start + j * 3] = b[j];
    bgr[start + j * 3 + 1] = g[j];
    bgr[start + j * 3 + 2] = r[j];
    }
    }

    }

    /*
    向量化结果
    vpbroadcastd ymm1, DWORD PTR _A$[ebp]
    vpbroadcastd ymm6, DWORD PTR _A$[ebp+8]
    vpbroadcastd ymm7, DWORD PTR _A$[ebp+4]
    vmovdqu YMMWORD PTR tv886[ebp], ymm1
    vpbroadcastd ymm0, DWORD PTR _A$[ebp+20]
    vmovdqu YMMWORD PTR tv881[ebp], ymm0
    vpbroadcastd ymm0, DWORD PTR _A$[ebp+16]
    vmovdqu YMMWORD PTR tv882[ebp], ymm0
    vpbroadcastd ymm0, DWORD PTR _A$[ebp+12]
    vmovdqu YMMWORD PTR tv883[ebp], ymm0
    vpbroadcastd ymm0, DWORD PTR _A$[ebp+32]
    vmovdqu YMMWORD PTR tv878[ebp], ymm0
    vpbroadcastd ymm0, DWORD PTR _A$[ebp+28]
    vmovdqu YMMWORD PTR tv879[ebp], ymm0
    vpbroadcastd ymm0, DWORD PTR _A$[ebp+24]
    vmovdqu YMMWORD PTR tv880[ebp], ymm0
    xor eax, eax
    npad 2
    $LL4@loop3_:
    vmovdqu ymm3, YMMWORD PTR _b$[ebp+eax]
    vmovdqu ymm4, YMMWORD PTR _g$[ebp+eax]
    vmovdqu ymm5, YMMWORD PTR _r$[ebp+eax]
    vpmulld ymm2, ymm3, ymm1
    vpmulld ymm1, ymm4, ymm7
    vpmulld ymm0, ymm5, ymm6
    vpaddd ymm0, ymm0, ymm1

    vpmulld ymm1, ymm4, YMMWORD PTR tv882[ebp]
    vpaddd ymm0, ymm0, ymm2
    vpmulld ymm2, ymm3, YMMWORD PTR tv883[ebp]
    vmovdqu YMMWORD PTR __b$[ebp+eax], ymm0
    vpmulld ymm0, ymm5, YMMWORD PTR tv881[ebp]
    vpaddd ymm0, ymm0, ymm1

    vpmulld ymm1, ymm4, YMMWORD PTR tv879[ebp]
    vmovdqu ymm4, YMMWORD PTR _g$[ebp+eax+32]
    vpaddd ymm0, ymm0, ymm2
    vpmulld ymm2, ymm3, YMMWORD PTR tv880[ebp]
    vmovdqu ymm3, YMMWORD PTR _b$[ebp+eax+32]
    vmovdqu YMMWORD PTR __g$[ebp+eax], ymm0
    vpmulld ymm0, ymm5, YMMWORD PTR tv878[ebp]
    vmovdqu ymm5, YMMWORD PTR _r$[ebp+eax+32]
    vpaddd ymm0, ymm0, ymm1
    vpaddd ymm0, ymm0, ymm2
    vpmulld ymm2, ymm3, YMMWORD PTR tv886[ebp]
    vmovdqu YMMWORD PTR __r$[ebp+eax], ymm0
    vpmulld ymm1, ymm4, ymm7
    vpmulld ymm0, ymm5, ymm6
    vpaddd ymm0, ymm0, ymm1
    vpmulld ymm1, ymm4, YMMWORD PTR tv882[ebp]
    vpaddd ymm0, ymm0, ymm2
    vpmulld ymm2, ymm3, YMMWORD PTR tv883[ebp]
    vmovdqu YMMWORD PTR __b$[ebp+eax+32], ymm0
    vpmulld ymm0, ymm5, YMMWORD PTR tv881[ebp]
    vpaddd ymm0, ymm0, ymm1
    vpmulld ymm1, ymm4, YMMWORD PTR tv879[ebp]
    vpaddd ymm0, ymm0, ymm2
    vpmulld ymm2, ymm3, YMMWORD PTR tv880[ebp]
    vmovdqu YMMWORD PTR __g$[ebp+eax+32], ymm0
    vpmulld ymm0, ymm5, YMMWORD PTR tv878[ebp]
    vpaddd ymm0, ymm0, ymm1
    vmovdqu ymm1, YMMWORD PTR tv886[ebp]
    vpaddd ymm0, ymm0, ymm2
    vmovdqu YMMWORD PTR __r$[ebp+eax+32], ymm0
    add eax, 64 ; 00000040H
    cmp eax, 512 ; 00000200H
    jl $LL4@loop3_
    */

    // 内层for循环期望向量化,VS编译器没有自动向量化(1505)
    // 内层循环中使用变量i使得编译器无法优化,loop6使用step使内层循环可优化
    // Demonstration loop: element-wise sum z = x + y over three heap arrays of
    // 1024 * 1024 ints, walked as 1024 rows of 1024 elements with the flat index
    // i * 1024 + j computed inside the inner loop.
    // The notes in this file report that the inner loop is not auto-vectorized
    // (reason 1505) because the outer variable i appears in the inner subscripts.
    // NOTE(review): x and y are read without being initialized.
    void loop5()
    {
    int *x = new int[1024 * 1024];
    int *y = new int[1024 * 1024];
    int *z = new int[1024 * 1024];
    for (int i = 0; i < 1024; ++i)
    {
    for (int j = 0; j < 1024; ++j)
    {
    z[i * 1024 + j] = x[i * 1024 + j] + y[i * 1024 + j];
    }
    }

    // Release the three buffers allocated above.
    delete[]x;
    delete[]y;
    delete[]z;
    }

    // 内层for循环自动向量化
    // Demonstration loop: the same element-wise sum as z = x + y over
    // 1024 * 1024 ints, but with the row offset i * 1024 hoisted into the local
    // variable step so the inner subscripts are simply step + j.
    // The notes in this file report that the inner loop is auto-vectorized in
    // this form.
    // NOTE(review): x and y are read without being initialized.
    void loop6()
    {
    int *x = new int[1024 * 1024];
    int *y = new int[1024 * 1024];
    int *z = new int[1024 * 1024];
    for (int i = 0; i < 1024; ++i)
    {
    // Offset of row i, computed once per outer iteration.
    int step = i * 1024;
    for (int j = 0; j < 1024; ++j)
    {
    z[step + j] = x[step + j] + y[step + j];
    }
    }

    // Release the three buffers allocated above.
    delete []x;
    delete []y;
    delete []z;


    }

    // vs编译器提示循环体内包含较少计算,当加入更多计算时提示循环体内包含了非连续内存访问(1203)
    // 通过展开内层循环可以实现优化操作,见loop8
    // 该函数模拟图像相加操作
    // Demonstration loop: element-wise sum z = x + y over three heap arrays of
    // 1024 * 1024 * 3 ints, written as 1024 rows of 1024 elements with three
    // components each and one statement per component. The file's notes describe
    // it as simulating the addition of two images.
    // Per those notes the compiler first reports too little computation in the
    // loop body, and with more computation reports non-contiguous memory
    // accesses (reason 1203), since each statement steps through memory by 3.
    // NOTE(review): x and y are read without being initialized.
    void loop7()
    {
    int *x = new int[1024 * 1024 * 3];
    int *y = new int[1024 * 1024 * 3];
    int *z = new int[1024 * 1024 * 3];
    for (int i = 0; i < 1024; ++i)
    {
    // Offset of row i: 1024 elements of 3 components each.
    int step = i * 1024 * 3;
    for (int j = 0; j < 1024; ++j)
    {
    z[step + j * 3 ] = x[step + j * 3 ] + y[step + j * 3 ];
    z[step + j * 3 + 1] = x[step + j * 3 + 1] + y[step + j * 3 + 1];
    z[step + j * 3 + 2] = x[step + j * 3 + 2] + y[step + j * 3 + 2];
    }
    }

    // Release the three buffers allocated above.
    delete[]x;
    delete[]y;
    delete[]z;
    }

    // 可自动向量化
    // Demonstration loop: element-wise sum z = x + y over three heap arrays of
    // 1024 * 1024 * 3 bytes, with the inner loop running over all 1024 * 3
    // components of a row at unit stride instead of one statement per component.
    // The notes in this file list this form as auto-vectorizable.
    // The element type here is unsigned char, so each sum is reduced modulo 256
    // when it is stored into z.
    // NOTE(review): x and y are read without being initialized.
    void loop8()
    {
    unsigned char *x = new unsigned char[1024 * 1024 * 3];
    unsigned char *y = new unsigned char[1024 * 1024 * 3];
    unsigned char *z = new unsigned char[1024 * 1024 * 3];
    for (int i = 0; i < 1024; ++i)
    {
    // Offset of row i: 1024 * 3 components per row.
    int step = i * 1024 * 3;
    for (int j = 0; j < 1024 * 3; ++j)
    {
    z[step + j] = x[step + j] + y[step + j];
    }
    }

    // Release the three buffers allocated above.
    delete[]x;
    delete[]y;
    delete[]z;
    }
    /////////////////////////////////////////////////

  • 相关阅读:
    ASP.NET使用Coolite.Ext.Web.dll,显示ext"未定义"的解决方法
    浏览器引擎模式与DOCTYPE
    MVC 参数如何自动绑定数组对象
    IIS7.5 配置 PHP
    TFSDeleteProject:删除团队项目
    自动生成存储过程一
    如何更新ntext字段信息
    清除所有默认样式的CSS代码
    自定义动作过滤器属性
    IE9点击别的网页弹出空白页
  • 原文地址:https://www.cnblogs.com/luofeiju/p/15762436.html
Copyright © 2020-2023  润新知