• Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 1


    ▶ 第三章,逐步优化了一个二维卷积计算的过程

    ● 基准代码

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <string.h>
     4 #include <math.h>
     5 #include <time.h>
     6 #include <sys/time.h>
     7 #include <omp.h>
     8 #include <assert.h>
     9 #include <sys/mman.h>
    10 
    11 #define REAL    double
    12 #define WIDTH   1030
    13 #define HEIGHT  2048
    14 #define COUNT   1000
    15 #define PAD64   0
    16 #if PAD64                                                                   // 是否对齐到 64 Byte
    17     #define WIDTHP ((WIDTH * sizeof(REAL) + 63) / 64 * 64 / sizeof(REAL))
    18 #else
    19     #define WIDTHP WIDTH
    20 #endif
    21 
    22 void initbuf(REAL *fbuf, const int width, const int height)                 // 初始化矩阵
    23 {    
    24     for (int y = 0; y < height; y++) 
    25     {
    26         REAL val = (y % 10) ? 0 : 1.0;
    27         for (int x = 0; x < width; x++)         
    28             fbuf[y * WIDTHP + x] = val;
    29     }
    30     return;
    31 }
    32 
    33 // 重复计算模糊 count 次
    34 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height, 
    35     const REAL ctr, const REAL next, const REAL diag, const int count)
    36 {
    37     REAL *fin = finp, *fout = foutp;    
    38     for (int i = 0; i < count; i++) 
    39     {
    40         for (int y = 1; y < height - 1; y++)    // 不处理光环元素
    41         {            
    42             int c = 1 + y * WIDTHP;         // ?为什么要加两次 1
    43             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
    44             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
    45             for (int x = 1; x < width - 1; x++) 
    46             {
    47                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] +
    48                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];                                                  
    49                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
    50             }
    51         }
    52         REAL *ftmp = fin;
    53         fin = fout;
    54         fout = ftmp;
    55     }
    56     return;
    57 }
    58 
    59 static double dtime() 
    60 {
    61     double tseconds = 0.0;
    62     struct timeval mytime;
    63     gettimeofday(&mytime, (struct timezone *) 0);
    64     tseconds = (double)(mytime.tv_sec + (double)mytime.tv_usec * 1.0e-6);
    65     return (tseconds);
    66 }
    67 
    68 int main(int argc, char *argv[])
    69 {        
    70     REAL *fa = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT), *fb = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT);
    71     assert(fa != MAP_FAILED);
    72     assert(fb != MAP_FAILED);
    73 
    74     printf("Initializing..%d Threads, %d x %d, PAD=%d..
    
    ", omp_get_num_threads(), WIDTH, HEIGHT, WIDTHP);
    75     initbuf(fa, WIDTHP, HEIGHT);
    76     initbuf(fb, WIDTHP, HEIGHT);
    77 
    78     printf("Running stencil kernel %d times
    ", COUNT);
    79     const REAL stendiag = 0.00125, stennext = 0.00125, stenctr = 0.99;
    80     double time_b, time_e;
    81     time_b = dtime();
    82     stencil9pt(fa, fb, WIDTHP, HEIGHT, stenctr, stennext, stendiag, COUNT);
    83     time_e = dtime();
    84     printf("Elapsed time : %.3f (s)
    ", time_e - time_b);
    85     printf("FLOPS: %.3f (MFlops)
    ", (WIDTHP * HEIGHT) * 17.0 * COUNT / (time_e - time_b) * 1.0e-06);// 计算一个元素需要 17 次乘法或加法
    86 
    87     free(fa);
    88     free(fb);
    89     return 0;
    90 }

    ■ 输出结果

    Xeon:
    Running stencil kernel 1000 times
    Elapsed time: 5.754 (s)
    FLOPS       : 6232.567 (MFlops)
    
    XeonPhi:
    Running stencil kernel 1000 times
    Elapsed time: 102.042 (s)
    FLOPS       : 351.428 (MFlops)

    ● 修改计算函数,忽略指针依赖关系

     1 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height,
     2     const REAL ctr, const REAL next, const REAL diag, const int count)
     3 {
     4     REAL *fin = finp, *fout = foutp;
     5     for (int i = 0; i<count; i++) 
     6     {
     7         for (int y = 1; y < height - 1; y++) 
     8         {       
     9             int c = 1 + y * WIDTHP;            
    10             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
    11             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
    12             #pragma ivdep                       // 忽略不明显的(指针)依赖关系
    13             for (int x = 1; x < width - 1; x++) 
    14             {
    15                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] +diag * fin[se] + 
    16                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];
    17                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
    18             }
    19         }
    20         REAL *ftmp = fin;
    21         fin = fout;
    22         fout = ftmp;
    23     }
    24     return;
    25 }

    ■ 输出结果(Xeon Phi)

    Running stencil kernel 1000 times
    Elapsed time: 24.052 (s)
    FLOPS       : 1490.925 (MFlops)

    ● OpenMP

     1 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height,
     2     const REAL ctr, const REAL next, const REAL diag, const int count)
     3 {
     4     REAL *fin = finp, *fout = foutp;
     5     for (int i = 0; i<count; i++)
     6     {
     7         int y, x;
     8 #pragma omp parallel for private(x)             // 一句 OpenMp,注意 x 线程私有 
     9         for (y = 1; y < height - 1; y++)
    10         {
    11             int c = 1 + y * WIDTHP;
    12             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
    13             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
    14 #pragma ivdep
    15             for (x = 1; x < width - 1; x++)
    16             {
    17                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] +
    18                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];
    19                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
    20             }
    21         }
    22         REAL *ftmp = fin;
    23         fin = fout;
    24         fout = ftmp;
    25     }
    26     return;
    27 }

    ■ 输出结果(Xeon Phi)

    Running stencil kernel 1000 times
    Elapsed time : 0.700 (s)
    FLOPS        : 51201.595 (MFlops)

    ● 数组对齐

     1 // 打开 PAD64 
     2 #define PAD64   1
     3 
     4 // 替换 malloc 和 free
     5     REAL *fa = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT);
     6     REAL *fb = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT);
     7     ...    
     8     free(fa);
     9     free(fb);
    10 
    11 // 替换为    
    12     REAL *fa = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);
    13     REAL *fb = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);
    14     ...
    15     _mm_free(fa);
    16     _mm_free(fb);

    ■ 输出结果(Xeon Phi)

    Running stencil kernel 1000 times
    Elapsed time : 0.651 (s)
    FLOPS        : 55155.399 (MFlops)

    ● 流存储

     1 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height,
     2     const REAL ctr, const REAL next, const REAL diag, const int count)
     3 {
     4     REAL *fin = finp, *fout = foutp;
     5     for (int i = 0; i<count; i++)
     6     {
     7         int y, x;
     8 #pragma omp parallel for private(x)             
     9         for (y = 1; y < height - 1; y++)
    10         {
    11             int c = 1 + y * WIDTHP;
    12             int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1;
    13             int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1;
    14 #pragma ivdep
    15 #pragma vector nontemporal                      // 使用流存储指令(或使用编译选项 -opt-streaming-storesalways)
    16             for (x = 1; x < width - 1; x++)
    17             {
    18                 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] +
    19                     next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c];
    20                 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++;
    21             }
    22         }
    23         REAL *ftmp = fin;
    24         fin = fout;
    25         fout = ftmp;
    26     }
    27     return;
    28 }

    ■ 输出结果

    Xeon:
    Running stencil kernel 1000 times
    Elapsed time: 1.905 (s)
    FLOPS       : 18824.053 (MFlops)
    
    XeonPhi:
    Running stencil kernel 1000 times
    Elapsed time: 0.639 (s)
    FLOPS       : 56145.417 (MFlops)

    ● 2 MB 存储页(书中代码没有 munmap 的部分)。参考(https://www.aliyun.com/jiaocheng/208004.html),但是服务器上编译时找不到 MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB 等定义,无法使用

     1 // 替换 _mm_malloc 和 _mm_free
     2     REAL *fa = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);
     3     REAL *fb = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64);
     4     ...
     5     _mm_free(fa);
     6     _mm_free(fb);
     7 // 替换为    
     8     REAL *fa = (REAL *)mmap(0, WIDTHP*HEIGHT * sizeof(REAL), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0);
     9     REAL *fb = (REAL *)mmap(0, WIDTHP*HEIGHT * sizeof(REAL), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0);
    10     ...
    11     munmap(fa, WIDTHP*HEIGHT * sizeof(REAL));
    12     munmap(fb, WIDTHP*HEIGHT * sizeof(REAL));
  • 相关阅读:
    jdk源码阅读笔记之java集合框架(四)(LinkedList)
    jdk源码阅读笔记之java集合框架(二)(ArrayList)
    jdk源码阅读笔记之java集合框架(三)(modCount)
    java文件拷贝的一点思考
    mac(10.11.5 )安装pt-query-digest所遇问题总结
    关于springboot启动所需所有jar包详解
    volatile的一点理解
    java 虚拟机自动内存管理
    虚拟机运行时数据区划分
    笔记本连上wifi(WiFi完全没问题)却无法上网
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/10323542.html
Copyright © 2020-2023  润新知