• 0_Simple__matrixMulCUBLAS


    使用CUDA的线性代数库cuBLAS来计算矩阵乘法。这里主要记录调用规则,关于乘法函数中详细的参数说明和调用规则见另一篇随笔。

    ▶ 源代码:

      1 #include <assert.h>
      2 #include <helper_string.h>
      3 #include <cuda_runtime.h>
      4 #include <cublas_v2.h>
      5 #include <helper_functions.h>
      6 #include <helper_cuda.h>
      7 
      8 #ifndef min
      9 #define min(a,b) ((a < b) ? a : b)
     10 #endif
     11 #ifndef max
     12 #define max(a,b) ((a > b) ? a : b)
     13 #endif
     14 
     15 // 存放各矩阵维数的结构体
     16 typedef struct _matrixSize
     17 {
     18     unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
     19 } sMatrixSize;
     20 
     21 // CPU 计算矩阵乘法。三个参数分别用于行定位、行程定位、列定位,没有查错机制。
     22 void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
     23 {
     24     for (unsigned int i = 0; i < hA; ++i)           // 从上往下数 i 行
     25     {
     26         for (unsigned int j = 0; j < wB; ++j)       // 从左往右数 j 列
     27         {
     28             double sum = 0;
     29             for (unsigned int k = 0; k < wA; ++k)   // 行程长
     30             {
     31                 double a = A[i * wA + k];           // 中间过程用 double,结果输出 float
     32                 double b = B[k * wB + j];
     33                 sum += a * b;
     34             }
     35             C[i * wB + j] = (float)sum;
     36         }
     37     }
     38 }
     39 
     40 // 初始化数组
     41 void randomInit(float *data, int size)
     42 {
     43     for (int i = 0; i < size; ++i)
     44         data[i] = rand() / (float)RAND_MAX;
     45 }
     46 
     47 //输出两个矩阵的不相等的值及其位置,允许容差为 fListTol ,最多输出 iListLength 个
     48 void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
     49 {
     50     printf("Listing first %d Differences > %.6f...
    ", iListLength, fListTol);
     51     int i, j, k;
     52     int error_count = 0;
     53 
     54     for (j = 0; j < height; j++)
     55     {
     56         if (error_count < iListLength)
     57             printf("
      Row %d:
    ", j);
     58 
     59         for (i = 0; i < width; i++)
     60         {
     61             k = j * width + i;
     62             float fDiff = fabs(data1[k] - data2[k]);
     63             if (fDiff > fListTol)
     64             {
     65                 if (error_count < iListLength)
     66                     printf("    Loc(%d,%d)	CPU=%.5f	GPU=%.5f	Diff=%.6f
    ", i, j, data1[k], data2[k], fDiff);
     67                 error_count++;
     68             }
     69         }
     70     }
     71     printf(" 
      Total Errors = %d
    ", error_count);
     72 }
     73 
     74 // 初始化设备。包括选择设备,设定维数
     75 void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
     76 {
     77     cudaError_t error;
     78 
     79     // 选择设备,略去了检查错误部分
     80     devID = 0;
     81     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
     82     {
     83         devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
     84         error = cudaSetDevice(devID);
     85     }
     86     error = cudaGetDevice(&devID);
     87     if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
     88         iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
     89     iSizeMultiple = max(min(iSizeMultiple, 10), 1);
     90     cudaDeviceProp deviceProp;
     91     error = cudaGetDeviceProperties(&deviceProp, devID);
     92     printf("GPU Device %d: "%s" with compute capability %d.%d
    
    ", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
     93 
     94     // Fermi 以上构架的计算机使用更大的线程块(blockDim),这里用了32
     95     int block_size = (deviceProp.major < 2) ? 16 : 32;
     96 
     97     matrix_size.uiWA = 3 * block_size * iSizeMultiple;
     98     matrix_size.uiHA = 4 * block_size * iSizeMultiple;
     99     matrix_size.uiWB = 2 * block_size * iSizeMultiple;
    100     matrix_size.uiHB = 3 * block_size * iSizeMultiple;
    101     matrix_size.uiWC = 2 * block_size * iSizeMultiple;
    102     matrix_size.uiHC = 4 * block_size * iSizeMultiple;
    103 
    104     printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)
    ",                                                       
    105         matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, matrix_size.uiWC);
    106     if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC || matrix_size.uiWB != matrix_size.uiWC)
    107     {
    108        printf("ERROR: Matrix sizes do not match!
    ");
    109        exit(-1);
    110     }
    111 }
    112 
    113 // 计算矩阵乘法部分
    114 int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
    115 {
    116     cudaDeviceProp deviceProp;
    117     cudaGetDeviceProperties(&deviceProp, devID);
    118     int block_size = (deviceProp.major < 2) ? 16 : 32;
    119 
    120     unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    121     unsigned int mem_size_A = sizeof(float) * size_A;
    122     float *h_A = (float *)malloc(mem_size_A);
    123     unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    124     unsigned int mem_size_B = sizeof(float) * size_B;
    125     float *h_B = (float *)malloc(mem_size_B);
    126 
    127     srand(2006);
    128     randomInit(h_A, size_A);
    129     randomInit(h_B, size_B);
    130 
    131     float *d_A, *d_B, *d_C;
    132     unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    133     unsigned int mem_size_C = sizeof(float) * size_C;
    134     //float *h_C      = (float *) malloc(mem_size_C);           // TM 没有用!
    135     float *h_CUBLAS = (float *) malloc(mem_size_C);             // 保存 d_C 回传的结果
    136     cudaMalloc((void **) &d_A, mem_size_A);
    137     cudaMalloc((void **) &d_B, mem_size_B);
    138     cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    139     cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    140     cudaMalloc((void **) &d_C, mem_size_C);
    141 
    142     dim3 threads(block_size, block_size);
    143     dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
    144 
    145     printf("Computing result using CUBLAS...");
    146     int nIter = 30;
    147 
    148     //cuBLAS代码块
    149     {
    150         const float alpha = 1.0f;
    151         const float beta  = 0.0f;
    152         cublasHandle_t handle;
    153         cudaEvent_t start, stop;
    154 
    155         cublasCreate(&handle);
    156 
    157         //热身,注意转置的问题,不采用 <<< >>> 调用核函数
    158         cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 
    159             &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
    160 
    161         cudaEventCreate(&start);
    162         cudaEventCreate(&stop);
    163 
    164         cudaEventRecord(start, NULL);
    165 
    166         for (int j = 0; j < nIter; j++)
    167         {
    168             cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA,
    169                 &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
    170         }
    171         printf("done.
    ");
    172         cudaEventRecord(stop, NULL);
    173         cudaEventSynchronize(stop);
    174         float msecTotal = 0.0f;
    175         cudaEventElapsedTime(&msecTotal, start, stop);
    176 
    177         // 计算了耗时、操作数以及操作速度
    178         float msecPerMatrixMul = msecTotal / nIter;
    179         double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
    180         double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    181         printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops
    ", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);
    182 
    183         cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    184         cublasDestroy(handle);
    185     }
    186 
    187     printf("Computing result using host CPU...");
    188     float *reference = (float *)malloc(mem_size_C);
    189     matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    190     printf("done.
    ");
    191 
    192     bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
    193 
    194     if (resCUBLAS != true)
    195         printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    196 
    197     printf("Comparing CUBLAS Matrix Multiply with CPU results: %s
    ", (true == resCUBLAS) ? "PASS" : "FAIL");
    198     printf("
    NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
    ");
    199 
    200     free(h_A);
    201     free(h_B);
    202     free(h_CUBLAS);
    203     free(reference);
    204     cudaFree(d_A);
    205     cudaFree(d_B);
    206     cudaFree(d_C);
    207 
    208     if (resCUBLAS == true)
    209         return EXIT_SUCCESS;
    210     else
    211         return EXIT_FAILURE;
    212 }
    213 
    214 int main(int argc, char **argv)
    215 {
    216     printf("[Matrix Multiply CUBLAS] - Starting...
    ");
    217 
    218     int devID = 0, sizeMult = 5;
    219     sMatrixSize matrix_size;
    220 
    221     initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
    222 
    223     int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
    224 
    225     getchar();
    226     return matrix_result;
    227 }

    ▶ 输出结果:

    [Matrix Multiply CUBLAS] - Starting...
    GPU Device 0: "GeForce GTX 1070" with compute capability 6.1
    
    MatrixA(640,480), MatrixB(480,320), MatrixC(640,320)
    Computing result using CUBLAS...done.
    Performance= 2887.08 GFlop/s, Time= 0.068 msec, Size= 196608000 Ops
    Computing result using host CPU...done.
    Comparing CUBLAS Matrix Multiply with CPU results: PASS
    
    NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

    ▶ 涨姿势:

    ● 代码依然很烂。
    多用了一个 h_C 根本没有用上,其作用被 h_CUBLAS 取代了,而且源代码中有free(h_C)却没有free(h_CUBLAS)。

    1 float *h_C = (float *)malloc(mem_size_C);
    2 ...
    3 free(h_C);

    ● 句柄的创造与销毁,定义于cublas_api.h 中

    1 /*cublas_api.h*/
    2 typedef struct cublasContext *cublasHandle_t;
    3 
    4 /*cublas_v2.h*/
    5 #define cublasCreate cublasCreate_v2
    6 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle);
    7  
    8 #define cublasDestroy cublasDestroy_v2
    9 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);

    ● 矩阵乘法计算核心函数。实际上该函数不是用来专门计算矩阵乘法的,而且对应不同的数据类型(实数、复数)和数据精度(单精度、双精度)一共有四个函数。

     1 #define cublasSgemm cublasSgemm_v2
     2 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2
     3 (
     4     cublasHandle_t handle,
     5     cublasOperation_t transa, cublasOperation_t transb,
     6     int m, int n, int k,
     7     const float *alpha,
     8     const float *A, int lda,
     9     const float *B, int ldb,
    10     const float *beta,
    11     float *C, int ldc
    12 );


    ● 定义在 helper_image.h 中的一个函数,用于比较两个长为 len 数组是否相等,允许容差为epsilon

    1 inline bool sdkCompareL2fe(const float *reference, const float *data, const unsigned int len, const float epsilon)

    ● 调用cublas计算矩阵乘法的过程摘要(详细的参数说明和调用规则见另一篇随笔)

     1 ...// 准备d_A,d_B,d_C,其中 d_A 和 d_B 中存放了需要相乘的两个矩阵,d_C初始化自动为零矩阵
     2 
     3 // 规定使用的线程块和线程尺寸
     4 dim3 threads(1, 1);
     5 dim3 grid(1, 1);
     6 
     7 // 常数因子,计算 d_C = d_A * d_B 时设定为 α = 1.0, β = 0.0
     8 const float alpha = 1.0f;
     9 const float beta = 0.0f;
    10 
    11 // 创建句柄,需要在计算完成后销毁
    12 cublasHandle_t handle;
    13 cublasCreate(&handle);
    14 
    15 // 调用计算函数,注意参数顺序
    16 cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA, &alpha, d_B, WB, d_A, WA, &beta, d_C, WB);
    17 
    18 cublasDestroy(handle);
    19 
    20 ...// 回收计算结果,顺序可以和销毁句柄交换
  • 相关阅读:
    Python 正则表达式(分组)
    django 笔记
    Java代理和动态代理机制分析和应用
    Chrome浏览器如何调试移动端网页信息
    【数据分析】Excle中安装数据分析工具
    【BigData】Java基础_socket编程中使用多线程
    【BigData】Java基础_多线程
    【BigData】Java基础_socket编程
    财务报表之利润表
    资产负债表的会计恒等式
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/7760562.html
Copyright © 2020-2023  润新知