▶ 第三章,逐步优化了一个二维卷积计算的过程
● 基准代码
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <string.h> 4 #include <math.h> 5 #include <time.h> 6 #include <sys/time.h> 7 #include <omp.h> 8 #include <assert.h> 9 #include <sys/mman.h> 10 11 #define REAL double 12 #define WIDTH 1030 13 #define HEIGHT 2048 14 #define COUNT 1000 15 #define PAD64 0 16 #if PAD64 // 是否对齐到 64 Byte 17 #define WIDTHP ((WIDTH * sizeof(REAL) + 63) / 64 * 64 / sizeof(REAL)) 18 #else 19 #define WIDTHP WIDTH 20 #endif 21 22 void initbuf(REAL *fbuf, const int width, const int height) // 初始化矩阵 23 { 24 for (int y = 0; y < height; y++) 25 { 26 REAL val = (y % 10) ? 0 : 1.0; 27 for (int x = 0; x < width; x++) 28 fbuf[y * WIDTHP + x] = val; 29 } 30 return; 31 } 32 33 // 重复计算模糊 count 次 34 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height, 35 const REAL ctr, const REAL next, const REAL diag, const int count) 36 { 37 REAL *fin = finp, *fout = foutp; 38 for (int i = 0; i < count; i++) 39 { 40 for (int y = 1; y < height - 1; y++) // 不处理光环元素 41 { 42 int c = 1 + y * WIDTHP; // ?为什么要加两次 1 43 int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1; 44 int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1; 45 for (int x = 1; x < width - 1; x++) 46 { 47 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] + 48 next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c]; 49 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++; 50 } 51 } 52 REAL *ftmp = fin; 53 fin = fout; 54 fout = ftmp; 55 } 56 return; 57 } 58 59 static double dtime() 60 { 61 double tseconds = 0.0; 62 struct timeval mytime; 63 gettimeofday(&mytime, (struct timezone *) 0); 64 tseconds = (double)(mytime.tv_sec + (double)mytime.tv_usec * 1.0e-6); 65 return (tseconds); 66 } 67 68 int main(int argc, char *argv[]) 69 { 70 REAL *fa = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT), *fb = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT); 71 assert(fa != MAP_FAILED); 72 assert(fb != MAP_FAILED); 73 74 printf("Initializing..%d Threads, %d x %d, PAD=%d.. ", omp_get_num_threads(), WIDTH, HEIGHT, WIDTHP); 75 initbuf(fa, WIDTHP, HEIGHT); 76 initbuf(fb, WIDTHP, HEIGHT); 77 78 printf("Running stencil kernel %d times ", COUNT); 79 const REAL stendiag = 0.00125, stennext = 0.00125, stenctr = 0.99; 80 double time_b, time_e; 81 time_b = dtime(); 82 stencil9pt(fa, fb, WIDTHP, HEIGHT, stenctr, stennext, stendiag, COUNT); 83 time_e = dtime(); 84 printf("Elapsed time : %.3f (s) ", time_e - time_b); 85 printf("FLOPS: %.3f (MFlops) ", (WIDTHP * HEIGHT) * 17.0 * COUNT / (time_e - time_b) * 1.0e-06);// 计算一个元素需要 17 次乘法或加法 86 87 free(fa); 88 free(fb); 89 return 0; 90 }
■ 输出结果
Xeon: Running stencil kernel 1000 times Elapsed time: 5.754 (s) FLOPS : 6232.567 (MFlops) XeonPhi: Running stencil kernel 1000 times Elapsed time: 102.042 (s) FLOPS : 351.428 (MFlops)
● 修改计算函数,忽略指针依赖关系
1 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height, 2 const REAL ctr, const REAL next, const REAL diag, const int count) 3 { 4 REAL *fin = finp, *fout = foutp; 5 for (int i = 0; i<count; i++) 6 { 7 for (int y = 1; y < height - 1; y++) 8 { 9 int c = 1 + y * WIDTHP; 10 int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1; 11 int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1; 12 #pragma ivdep // 忽略不明显的(指针)依赖关系 13 for (int x = 1; x < width - 1; x++) 14 { 15 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] +diag * fin[se] + 16 next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c]; 17 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++; 18 } 19 } 20 REAL *ftmp = fin; 21 fin = fout; 22 fout = ftmp; 23 } 24 return; 25 }
■ 输出结果(Xeon Phi)
Running stencil kernel 1000 times Elapsed time: 24.052 (s) FLOPS : 1490.925 (MFlops)
● OpenMP
1 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height, 2 const REAL ctr, const REAL next, const REAL diag, const int count) 3 { 4 REAL *fin = finp, *fout = foutp; 5 for (int i = 0; i<count; i++) 6 { 7 int y, x; 8 #pragma omp parallel for private(x) // 一句 OpenMp,注意 x 线程私有 9 for (y = 1; y < height - 1; y++) 10 { 11 int c = 1 + y * WIDTHP; 12 int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1; 13 int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1; 14 #pragma ivdep 15 for (x = 1; x < width - 1; x++) 16 { 17 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] + 18 next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c]; 19 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++; 20 } 21 } 22 REAL *ftmp = fin; 23 fin = fout; 24 fout = ftmp; 25 } 26 return; 27 }
■ 输出结果(Xeon Phi)
Running stencil kernel 1000 times Elapsed time : 0.700 (s) FLOPS : 51201.595 (MFlops)
● 数组对齐
1 // 打开 PAD64 2 #define PAD64 1 3 4 // 替换 malloc 和 free 5 REAL *fa = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT); 6 REAL *fb = (REAL *)malloc(sizeof(REAL)*WIDTHP*HEIGHT); 7 ... 8 free(fa); 9 free(fb); 10 11 // 替换为 12 REAL *fa = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64); 13 REAL *fb = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64); 14 ... 15 _mm_free(fa); 16 _mm_free(fb);
■ 输出结果(Xeon Phi)
Running stencil kernel 1000 times Elapsed time : 0.651 (s) FLOPS : 55155.399 (MFlops)
● 流存储
1 void stencil9pt(REAL *finp, REAL *foutp, const int width, const int height, 2 const REAL ctr, const REAL next, const REAL diag, const int count) 3 { 4 REAL *fin = finp, *fout = foutp; 5 for (int i = 0; i<count; i++) 6 { 7 int y, x; 8 #pragma omp parallel for private(x) 9 for (y = 1; y < height - 1; y++) 10 { 11 int c = 1 + y * WIDTHP; 12 int n = c - WIDTHP, s = c + WIDTHP, w = c - 1, e = c + 1; 13 int nw = n - 1, ne = n + 1, sw = s - 1, se = s + 1; 14 #pragma ivdep 15 #pragma vector nontemporal // 使用流存储指令(或使用编译选项 -opt-streaming-storesalways) 16 for (x = 1; x < width - 1; x++) 17 { 18 fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] + 19 next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c]; 20 c++; n++; s++; e++; w++; nw++; ne++; sw++; se++; 21 } 22 } 23 REAL *ftmp = fin; 24 fin = fout; 25 fout = ftmp; 26 } 27 return; 28 }
■ 输出结果
Xeon: Running stencil kernel 1000 times Elapsed time: 1.905 (s) FLOPS : 18824.053 (MFlops) XeonPhi: Running stencil kernel 1000 times Elapsed time: 0.639 (s) FLOPS : 56145.417 (MFlops)
● 2 MB 存储页(书中代码没有 munmap 的部分)。参考(https://www.aliyun.com/jiaocheng/208004.html),但是服务器上编译时找不到 MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB 等定义,无法使用
1 // 替换 _mm_malloc 和 _mm_free 2 REAL *fa = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64); 3 REAL *fb = (REAL *)_mm_malloc(sizeof(REAL)*WIDTHP*HEIGHT, 64); 4 ... 5 _mm_free(fa); 6 _mm_free(fb); 7 // 替换为 8 REAL *fa = (REAL *)mmap(0, WIDTHP*HEIGHT * sizeof(REAL), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0); 9 REAL *fb = (REAL *)mmap(0, WIDTHP*HEIGHT * sizeof(REAL), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0); 10 ... 11 munmap(fa, WIDTHP*HEIGHT * sizeof(REAL)); 12 munmap(fb, WIDTHP*HEIGHT * sizeof(REAL));