• Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 3


    ▶ 第二章,几个简单的程序

    ● 代码,单线程

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <string.h>
     4 #include <sys/time.h>
     5 
     /* Benchmark parameters. */
     #define SIZE            (1024*1024)     /* elements in each global array */
     #define MAXFLOP_ITER    100000000       /* repetitions of the timed kernel */
     #define LOOP_COUNT      128             /* elements updated per kernel pass */
     #define FLOP_PER_CALC   2               /* one multiply + one add per element update */

     /*
      * Working arrays, aligned to 64 bytes. `aligned` is the attribute name
      * documented by GCC; the `align` spelling is ignored (with a warning) by
      * GCC/Clang, which would silently drop the alignment request.
      */
     float fa[SIZE] __attribute__((aligned(64)));
     float fb[SIZE] __attribute__((aligned(64)));
    13 
    /*
     * Return the current wall-clock time, in seconds, as reported by
     * gettimeofday() (seconds plus microseconds scaled by 1e-6).
     */
    double dtime(void)
    {
        struct timeval now;

        /* The timezone argument is obsolete; a null pointer is the portable value. */
        gettimeofday(&now, NULL);
        return (double)now.tv_sec + (double)now.tv_usec * 1.0e-6;
    }
    20 
    21 int main(int argc, char *argv[])
    22 {    
    23     const float a = 1.1;
    24 
    25     printf("Initializing
    ");
    26     for (int i = 0; i < SIZE; i++)
    27     {
    28         fa[i] = (float)i + 0.1;
    29         fb[i] = (float)i + 0.2;
    30     }
    31 
    32     printf("Starting Compute
    ");
    33     double time_b, time_e;
    34     time_b = dtime();    
    35     for (int j = 0; j < MAXFLOP_ITER; j++)
    36     {    
    37         for (int k = 0; k < LOOP_COUNT; k++)
    38             fa[k] = a * fa[k] + fb[k];
    39     }
    40     time_e = dtime();
    41     
    42     double gflops = 1.0e-9 * LOOP_COUNT * MAXFLOP_ITER * FLOP_PER_CALC;
    43     printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf
    ", gflops, time_e - time_b, gflops / (time_e - time_b));
    44     
    45     return 0;
    46 }

    ■ 输出结果

    GFlops =     25.600, Secs =      1.464, GFlops per sec =     17.484

    ● 单核心两线程的 OpenMP(注意总计算量提升了,而不是固定计算量看运行时间减少)

     1 int main(int argc, char *argv[])
     2 {
     3     const float a = 1.1;
     4     int i, j, k, numthreads;                    // 循环变量放到外边来
     5 
     6     omp_set_num_threads(2);                     // 运行时设置 OpenMP 参数
     7     kmp_set_defaults("KMP_AFFINITY=compact");
     8 
     9 #pragma omp parallel
    10 #pragma omp master
    11     numthreads = omp_get_num_threads();
    12 
    13     printf("Initializing
    ");
    14 #pragma omp parallel for
    15     for (i = 0; i < SIZE; i++)
    16     {
    17         fa[i] = (float)i + 0.1;
    18         fb[i] = (float)i + 0.2;
    19     }
    20     printf("Starting Compute on %d threads
    ", numthreads);
    21     double time_b, time_e;
    22     time_b = dtime();
    23 #pragma omp parallel for private(j, k)
    24     for (i = 0; i < numthreads; i++)
    25     {
    26         int offset = i * LOOP_COUNT;
    27         for (j = 0; j < MAXFLOP_ITER; j++)
    28         {
    29             for (k = 0; k < LOOP_COUNT; k++)
    30                 fa[k + offset] = a * fa[k + offset] + fb[k + offset];
    31         }
    32     }
    33     time_e = dtime();
    34 
    35     double gflops = 1.0e-9 * numthreads * LOOP_COUNT * MAXFLOP_ITER * FLOP_PER_CALC;
    36     printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf
    ", gflops, time_e - time_b, gflops / (time_e - time_b));
    37 
    38     return 0;
    39 }

    ■ 输出结果

    GFlops =     51.200, Secs =      1.464, GFlops per sec =     34.968

    ● 线程数、线程亲缘性调整

    1 // Replace these two calls
    2     omp_set_num_threads(2);                     
    3     kmp_set_defaults("KMP_AFFINITY=compact");
    4 // with the following
    5     omp_set_num_threads(112);
    6     kmp_set_defaults("KMP_AFFINITY=scatter");

    ■ 输出结果

    GFlops =   2867.200, Secs =      1.619, GFlops per sec =   1771.298

    ● 代码,带宽测试

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <string.h>
     4 #include <sys/time.h>
     5 #include <omp.h>
     6 
     /* Benchmark parameters. */
     #define REAL            double              /* element type of the test arrays */
     #define SIZE            (1000*1000*64)      /* elements in each global array */
     #define MAXFLOP_ITER    1000                /* repetitions of the timed copy loop */
     #define FLOP_PER_CALC   2                   /* multiplier used in the byte-count formula */

     /*
      * Test arrays, aligned to 64 bytes. `aligned` is the attribute name
      * documented by GCC; the `align` spelling is ignored (with a warning) by
      * GCC/Clang, which would silently drop the alignment request.
      */
     REAL fa[SIZE] __attribute__((aligned(64)));
     REAL fb[SIZE] __attribute__((aligned(64)));
     REAL fc[SIZE] __attribute__((aligned(64)));
    15 
    /*
     * Return the current wall-clock time, in seconds, as reported by
     * gettimeofday() (seconds plus microseconds scaled by 1e-6).
     */
    double dtime(void)
    {
        struct timeval now;

        /* The timezone argument is obsolete; a null pointer is the portable value. */
        gettimeofday(&now, NULL);
        return (double)now.tv_sec + (double)now.tv_usec * 1.0e-6;
    }
    22 
    23 int main(int argc, char *argv[])
    24 {
    25     const REAL a = 1.1;
    26     int i, j;           
    27 
    28     omp_set_num_threads(112);
    29     kmp_set_defaults("KMP_AFFINITY=scatter");
    30     
    31     printf("Initializing
    ");
    32 #pragma omp parallel for
    33     for (i = 0; i < SIZE; i++)
    34     {
    35         fa[i] = (REAL)i + 0.1;
    36         fb[i] = (REAL)i + 0.2;
    37     }
    38 
    39 #pragma omp parallel
    40 #pragma omp master 
    41     printf("Starting BW Test on %d threads
    ", omp_get_num_threads());
    42     double time_b, time_e;
    43     time_b = dtime();
    44     for (i = 0; i < MAXFLOP_ITER; i++)
    45     {
    46 #pragma omp parallel for
    47         for (j = 0; j < SIZE; j++)
    48             fa[j] = fb[j];
    49     }
    50     time_e = dtime();
    51     double gbytes = 1.0e-9 * MAXFLOP_ITER * SIZE * FLOP_PER_CALC * sizeof(REAL);
    52     printf("Gbytes = %10.3lf, Secs = %10.3lf, GBytes per sec = %10.3lf
    ", gbytes, time_e - time_b, gbytes / (time_e - time_b));
    53 
    54     return 0;
    55 }

    ■ 输出结果

    Starting BW Test on 112 threads
    Gbytes =   1024.000, Secs =     10.293, GBytes per sec =     99.488

    ● 代码,offload 模式(注意全局变量和编译选项的调整)

     1 #include <stdio.h>
     2 #include <stdlib.h>
     3 #include <string.h>
     4 #include <sys/time.h>
     5 #include <omp.h>
     6 
     7 #define SIZE            (1024*512) 
     8 #define MAXFLOP_ITER    100000000
     9 #define LOOP_COUNT      128
    10 #define FLOP_PER_CALC   2     
    11 
    12 __declspec (target(mic)) float fa[SIZE] __attribute__((align(64)));  // 声明 mic 上的存储类型
    13 __declspec (target(mic)) float fb[SIZE] __attribute__((align(64)));
    14 
    /*
     * Return the current wall-clock time, in seconds, as reported by
     * gettimeofday() (seconds plus microseconds scaled by 1e-6).
     */
    double dtime(void)
    {
        struct timeval now;

        /* The timezone argument is obsolete; a null pointer is the portable value. */
        gettimeofday(&now, NULL);
        return (double)now.tv_sec + (double)now.tv_usec * 1.0e-6;
    }
    21 
    22 int main(int argc, char *argv[])
    23 {
    24     const float a = 1.1;
    25     int i, j, k, numthreads;
    26 
    27     omp_set_num_threads(112);
    28     kmp_set_defaults("KMP_AFFINITY=scatter");
    29 #pragma offload target (mic)// 声明需要使用 mic 的 offload 模式
    30 #pragma omp parallel
    31 #pragma omp master
    32     numthreads = omp_get_num_threads();
    33 
    34     printf("Initializing
    ");
    35 #pragma omp parallel for
    36     for (i = 0; i<SIZE; i++)
    37     {
    38         fa[i] = (float)i + 0.1;
    39         fb[i] = (float)i + 0.2;
    40     }
    41     printf("Starting Compute on %d threads
    ", numthreads);
    42     double time_b, time_e;
    43     time_b = dtime();
    44 #pragma offload target (mic)// 声明需要使用 mic 的 offload 模式
    45 #pragma omp parallel for private(j, k)
    46     for (i = 0; i<numthreads; i++)
    47     {        
    48         int offset = i * LOOP_COUNT;
    49         for (j = 0; j < MAXFLOP_ITER; j++)
    50         {            
    51 #pragma vector aligned// 强制向量对齐
    52             for (k = 0; k < LOOP_COUNT; k++)            
    53                 fa[k + offset] = a * fa[k + offset] + fb[k + offset];
    54         }
    55     }
    56     time_e = dtime();
    57 
    58     double gflops = 1.0e-9 * numthreads * LOOP_COUNT * MAXFLOP_ITER * FLOP_PER_CALC;
    59     printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf
    ", gflops, time_e - time_b, gflops / (time_e - time_b));
    60            
    61     return 0;
    62 }

    ■ 输出结果

    Starting Compute on 224 threads
    GFlops =   5734.400, Secs =      2.976, GFlops per sec =   1927.124
  • 相关阅读:
    kafka 配置权限
    转战 rocketmq
    从 spring-cloud-alibaba-nacos-config 进入 nacos-client
    sc 使用了配置中心后,如何设置远程和本地配置的优先级
    nacos 使用 servlet 异步处理客户端配置长轮询
    NacosValue 注解
    curl 使用 post 请求,传递 json 参数,下载文件
    nginx 代理 https 后,应用变成 http
    数据集市
    支付宝数据建模介绍
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/10323601.html
Copyright © 2020-2023  润新知