• Matrix multiplication


    ▶ Computes the matrix-matrix product A(m×n) × B(n×p) == C(m×p).

    ▶ Naive matrix multiplication: one thread computes one element of the result matrix.

      #include <stdio.h>
      #include <stdlib.h>
      #include <malloc.h>
      #include <time.h>
      #include "cuda_runtime.h"
      #include "device_launch_parameters.h"

      #define M           (1024 + 13)
      #define N           (1024 + 31)
      #define P           (1024 + 7)
      #define THREAD      16
      #define SEED        (unsigned int)clock()
      #define CEIL(x,y)   (((x) - 1) / (y) + 1)

      void checkNULL(void *input)
      {
          if (input == NULL)
          {
              printf("\n\tfind a NULL!");
              exit(1);
          }
          return;
      }

      void checkCudaError(cudaError input)
      {
          if (input != cudaSuccess)
          {
              printf("\n\tfind a cudaError!");
              exit(1);
          }
          return;
      }

      __global__ void times(float *a, float *b, float *c)// A(m×n) × B(n×p) = C(m×p)
      {
          int k, ix;
          float temp;
          int idx = blockIdx.x * blockDim.x + threadIdx.x;// exact column index in B and C
          int idy = blockIdx.y * blockDim.y + threadIdx.y;// exact row index in A and C

          for (; idy < M; idy += gridDim.y * blockDim.y)
          {
              for (ix = idx; ix < P; ix += gridDim.x * blockDim.x)// restart from idx on each row so the grid-stride loop also works when the grid does not cover all of C
              {
                  for (k = 0, temp = 0.0f; k < N; k++)
                      temp += a[idy * N + k] * b[k * P + ix];
                  c[idy * P + ix] = temp;
              }
          }
          return;
      }

      int main()
      {
          int i, j, k, flag;
          float *a, *b, *c, *dev_a, *dev_b, *dev_c, temp, elapsedTime;
          clock_t time, cputime;
          cudaEvent_t start, stop;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);

          printf("\n\tMatrix size: A(%d, %d) × B(%d, %d) == C(%d, %d)", M, N, N, P, M, P);
          printf("\n\tStart!");

          a = (float *)malloc(sizeof(float) * M * N);
          b = (float *)malloc(sizeof(float) * N * P);
          c = (float *)malloc(sizeof(float) * M * P);
          checkNULL(a);
          checkNULL(b);
          checkNULL(c);
          checkCudaError(cudaMalloc((void **)&dev_a, sizeof(float) * M * N));
          checkCudaError(cudaMalloc((void **)&dev_b, sizeof(float) * N * P));
          checkCudaError(cudaMalloc((void **)&dev_c, sizeof(float) * M * P));

          time = clock();
          srand(SEED);
          for (i = 0; i < M * N; a[i] = (float)rand() / RAND_MAX, i++);
          for (i = 0; i < N * P; b[i] = (float)rand() / RAND_MAX, i++);
          printf("\n\tInitialized! Time:\t%6.2f ms", (float)(clock() - time));

          // CPU computation
          time = clock();
          for (i = 0; i < M; i++)
          {
              for (j = 0; j < P; j++)
              {
                  for (k = 0, temp = 0.0f; k < N; k++)
                      temp += a[i * N + k] * b[k * P + j];
                  c[i * P + j] = temp;// accumulate in temp and store once; c comes from malloc, so += into it would read uninitialized memory
              }
          }
          cputime = clock() - time;
          printf("\n\tCPU cost time:\t%6.2f ms", (float)cputime);

          // GPU computation
          time = clock();
          cudaEventRecord(start, 0);

          cudaMemcpy(dev_a, a, sizeof(float) * M * N, cudaMemcpyHostToDevice);
          cudaMemcpy(dev_b, b, sizeof(float) * N * P, cudaMemcpyHostToDevice);

          times << < dim3(CEIL(P, THREAD), CEIL(M, THREAD), 1), dim3(THREAD, THREAD) >> > (dev_a, dev_b, dev_c);

          cudaMemcpy(c, dev_c, sizeof(float) * M * P, cudaMemcpyDeviceToHost);
          cudaDeviceSynchronize();

          time = clock() - time;
          cudaEventRecord(stop, 0);

          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          printf("\n\tGPU cost time by CPU API:\t%6.2f ms\n\tGPU cost time by GPU API:\t%6.2f ms\n\n", (float)time, elapsedTime);
          printf("\n\tAccRatio =\t%3.1f\n\n", (float)cputime / time);

          // Verify the result (the CPU result stored in c was overwritten by the copy back from the GPU, so recompute it here and check element by element)
          for (i = 0, flag = 1; i < M; i++)
          {
              for (j = 0; j < P; j++)
              {
                  for (k = 0, temp = 0.0f; k < N; k++)
                      temp += a[i * N + k] * b[k * P + j];
                  if (temp != c[i * P + j])// exact float comparison; it passes here (see the output below), but an epsilon test would be safer in general
                  {
                      printf("\n\tCalculate error at (%d,%d),\n\t\tc[i] == %f, CPU == %f", i, j, c[i * P + j], temp);
                      flag = 0;
                  }
              }
          }
          if (flag == 1)
              printf("\n\tCalculate correctly!");

          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cudaFree(dev_a);
          cudaFree(dev_b);
          cudaFree(dev_c);
          free(a);
          free(b);
          free(c);
          getchar();
          return 0;
      }

    ● Output

            Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031)
            Start!
            Initialized! Time:      127.00 ms
            CPU cost time:  4049.00 ms
            GPU cost time by CPU API:       103.00 ms
            GPU cost time by GPU API:       103.02 ms
    
    
            AccRatio =      39.3
    
    
            Calculate correctly!

    ● Matrix initialization time: for very large matrices, initialization itself takes a noticeable share of the run time, because the elements are filled in one by one on the CPU. A device-side alternative is sketched below.
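
      A minimal sketch of moving that initialization onto the device — my addition, not part of the original post; it assumes the cuRAND device API (curand_kernel.h) is available, and the kernel name initOnGPU is an illustrative choice:

      #include <curand_kernel.h>

      // Each thread seeds its own generator and writes one element, so the
      // element-by-element host loops are no longer needed. Calling curand_init
      // once per element is simple but not the fastest possible approach.
      __global__ void initOnGPU(float *data, int size, unsigned int seed)
      {
          int i = blockIdx.x * blockDim.x + threadIdx.x;
          if (i < size)
          {
              curandState state;
              curand_init(seed, i, 0, &state);  // independent sequence per element
              data[i] = curand_uniform(&state); // uniform float in (0, 1]
          }
      }

      // Usage (replacing the two host loops and the host-to-device copies; if the
      // host still needs a and b for the CPU reference check, copy them back once):
      //     initOnGPU << < CEIL(M * N, 256), 256 >> > (dev_a, M * N, SEED);
      //     initOnGPU << < CEIL(N * P, 256), 256 >> > (dev_b, N * P, SEED + 1);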

    ● Comparing the CPU-side and GPU-side timings of the GPU run: with the code above (the clock starts before the host-to-device copies and stops after the copy back completes), the two timing methods differ very little and can be treated as giving the same result.

    ● Against the CPU time for the same matrix multiplication shown in the output above, the GPU is currently about 39× faster. (The product takes roughly 2MNP ≈ 2.26 × 10⁹ floating-point operations, i.e. about 22 GFLOPS on the GPU at 103 ms versus about 0.56 GFLOPS on the CPU at 4049 ms.)

    ● If the memory-copy time is excluded, the GPU time drops to about 100 ms. Note that to measure this region with the CPU clock, a cudaDeviceSynchronize(); must be inserted before the second clock() call: a kernel launch returns immediately, and the host code keeps running until it reaches a synchronization point or an operation that needs the GPU's results. In the example above, without that synchronization the measured time always comes out as 0 ms. A kernel-only timing sketch follows.
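
      A minimal sketch of that kernel-only timing, reusing the variables from main above — the cudaDeviceSynchronize() before the second clock() is what makes the number meaningful:

      cudaMemcpy(dev_a, a, sizeof(float) * M * N, cudaMemcpyHostToDevice);
      cudaMemcpy(dev_b, b, sizeof(float) * N * P, cudaMemcpyHostToDevice);

      time = clock();
      times << < dim3(CEIL(P, THREAD), CEIL(M, THREAD), 1), dim3(THREAD, THREAD) >> > (dev_a, dev_b, dev_c);
      cudaDeviceSynchronize();// the launch is asynchronous; without this line clock() stops before the kernel finishes and reads ~0 ms
      time = clock() - time;

      cudaMemcpy(c, dev_c, sizeof(float) * M * P, cudaMemcpyDeviceToHost);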

    ● The thread granularity THREAD can be tuned further; the run time varies within a small range. A sketch for sweeping the block size at run time follows.
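
      A minimal sketch of such a sweep — my addition; it varies the block edge at run time for the naive kernel only, since the shared-memory versions below size their tiles with the compile-time THREAD macro:

      int edge;
      for (edge = 4; edge <= 32; edge *= 2)// 32 × 32 = 1024 threads, the per-block maximum
      {
          cudaEventRecord(start, 0);
          times << < dim3(CEIL(P, edge), CEIL(M, edge), 1), dim3(edge, edge) >> > (dev_a, dev_b, dev_c);
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          printf("\n\tblock %2d x %2d:\t%6.2f ms", edge, edge, elapsedTime);
      }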

    ▶ Improvement one: use shared memory to cut the number of global-memory accesses

      __global__ void times(float *a, float *b, float *c)// A(m×n) × B(n×p) = C(m×p)
      {
          __shared__ float shareA[THREAD * THREAD];
          __shared__ float shareB[THREAD * THREAD];

          int k, t;
          float temp;
          int idx = blockIdx.x * blockDim.x + threadIdx.x;// exact column index in B and C
          int idy = blockIdx.y * blockDim.y + threadIdx.y;// exact row index in A and C

          for (k = 0, temp = 0.0f; k < CEIL(N, THREAD); k++)// walk along the N dimension; k is the tile index
          {
              shareA[threadIdx.y * THREAD + threadIdx.x] = (idy < M && k * THREAD + threadIdx.x < N) ? a[idy * N + k * THREAD + threadIdx.x] : 0.0f;
              shareB[threadIdx.y * THREAD + threadIdx.x] = (idx < P && k * THREAD + threadIdx.y < N) ? b[(k * THREAD + threadIdx.y) * P + idx] : 0.0f;
              __syncthreads();

              for (t = 0; t < THREAD; t++)// t steps through the current tile of width THREAD
                  temp += shareA[threadIdx.y * THREAD + t] * shareB[t * THREAD + threadIdx.x];
              __syncthreads();
          }
          if (idx < P && idy < M)
              c[idy * P + idx] = temp;
          return;
      }
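
    ● With this tiling, each THREAD × THREAD tile of A and B is read from global memory once and then reused THREAD times out of shared memory, so global-memory traffic drops by a factor of roughly THREAD; the 0.0f padding in the ternary expressions handles the partial tiles at the matrix edges.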

    ● Output

            Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031)
            Start!
            Initialized! Time:      123.00 ms
            CPU cost time:  4044.00 ms
            GPU cost time by CPU API:       147.00 ms
            GPU cost time by GPU API:       146.75 ms
    
    
            AccRatio =      27.5
    
    
            Calculate correctly!

    ▶ Improvement two: adjust the shared-memory storage layout

      __global__ void times(float *a, float *b, float *c)// A(m×n) × B(n×p) = C(m×p)
      {
          __shared__ float shareA[THREAD * THREAD];
          __shared__ float shareB[THREAD * THREAD];

          int k, t;
          float temp;
          int idx = blockIdx.x * blockDim.x + threadIdx.x;// exact column index in B and C
          int idy = blockIdx.y * blockDim.y + threadIdx.y;// exact row index in A and C

          for (k = 0, temp = 0.0f; k < CEIL(N, THREAD); k++)// walk along the N dimension; k is the tile index
          {
              // shareA is now stored transposed (a is read exactly as before; only the position written in shared memory changes)
              shareA[threadIdx.x * THREAD + threadIdx.y] = (idy < M && k * THREAD + threadIdx.x < N) ? a[idy * N + k * THREAD + threadIdx.x] : 0.0f;
              shareB[threadIdx.y * THREAD + threadIdx.x] = (idx < P && k * THREAD + threadIdx.y < N) ? b[(k * THREAD + threadIdx.y) * P + idx] : 0.0f;
              __syncthreads();

              for (t = 0; t < THREAD; t++)// t steps through the current tile of width THREAD
                  // note that the row/column roles of the A tile are swapped
                  temp += shareA[t * THREAD + threadIdx.y] * shareB[t * THREAD + threadIdx.x];
              __syncthreads();
          }
          if (idx < P && idy < M)
              c[idy * P + idx] = temp;
          return;
      }

    ● Output

            Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031)
            Start!
            Initialized! Time:      121.00 ms
            CPU cost time:  4059.00 ms
            GPU cost time by CPU API:       135.00 ms
            GPU cost time by GPU API:       134.80 ms
    
    
            AccRatio =      30.1
    
    
            Calculate correctly!
  • Original article: https://www.cnblogs.com/cuancuancuanhao/p/7657668.html