矩阵乘法,使用一维线程块和共享内存。并且在静态代码和运行时编译两种条件下使用。
▶ 源代码:静态使用
1 #include <stdio.h> 2 #include <assert.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <helper_functions.h> 6 #include <helper_cuda.h> 7 8 template <int BLOCK_SIZE> __global__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) 9 { 10 int bx = blockIdx.x; 11 int by = blockIdx.y; 12 int tx = threadIdx.x; 13 int ty = threadIdx.y; 14 15 int aBegin = wA * BLOCK_SIZE * by; // A的行程起点 16 int aEnd = aBegin + wA - 1; // A的行程终点 17 int aStep = BLOCK_SIZE; // A的跨度(一个 block 为宽 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素) 18 int bBegin = BLOCK_SIZE * bx; // B的行程起点 19 int bStep = BLOCK_SIZE * wB; // B的跨度(一个 block 为高 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素) 20 float Csub = 0; 21 22 for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) 23 { 24 __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 25 __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 26 As[ty][tx] = A[a + wA * ty + tx]; 27 Bs[ty][tx] = B[b + wB * ty + tx]; 28 __syncthreads(); 29 30 #pragma unroll// 循环展开为 BLOCK_SIZE 个赋值语句,提高效率 31 for (int k = 0; k < BLOCK_SIZE; ++k) 32 Csub += As[ty][k] * Bs[k][tx]; 33 __syncthreads(); 34 } 35 36 int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; 37 C[c + wB * ty + tx] = Csub; 38 } 39 40 void constantInit(float *data, int size, float val) 41 { 42 for (int i = 0; i < size; ++i) 43 data[i] = val; 44 } 45 46 int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) 47 { 48 unsigned int size_A = dimsA.x * dimsA.y; 49 unsigned int mem_size_A = sizeof(float) * size_A; 50 float *h_A = (float *)malloc(mem_size_A); 51 unsigned int size_B = dimsB.x * dimsB.y; 52 unsigned int mem_size_B = sizeof(float) * size_B; 53 float *h_B = (float *)malloc(mem_size_B); 54 constantInit(h_A, size_A, 1.0f); 55 constantInit(h_B, size_B, 0.01f); 56 dim3 dimsC(dimsB.x, dimsA.y, 1); 57 unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 58 float *h_C = (float *) malloc(mem_size_C); 59 float *d_A, *d_B, *d_C; 60 cudaMalloc((void **) &d_A, mem_size_A); 61 cudaMalloc((void **) &d_B, mem_size_B); 62 cudaMalloc((void **) &d_C, mem_size_C); 63 cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 64 cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 65 66 // 热身 67 dim3 threads(block_size, block_size); 68 dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); 69 if (block_size == 16) 70 matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 71 else 72 matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 73 printf("done "); 74 cudaDeviceSynchronize(); 75 76 printf("Computing result using CUDA Kernel... "); 77 cudaEvent_t start; 78 cudaEventCreate(&start); 79 cudaEvent_t stop; 80 cudaEventCreate(&stop); 81 cudaEventRecord(start, NULL); 82 83 int nIter = 300; 84 for (int j = 0; j < nIter; j++) 85 { 86 if (block_size == 16) 87 matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 88 else 89 matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 90 } 91 cudaEventRecord(stop, NULL); 92 cudaEventSynchronize(stop); 93 94 float msecTotal = 0.0f; 95 cudaEventElapsedTime(&msecTotal, start, stop); 96 float msecPerMatrixMul = msecTotal / nIter; 97 double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x; 98 double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 99 printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block ", 100 gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y); 101 cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 102 103 // 检查结果,要求相对误差:|<x, y>_cpu - <x,y>_gpu| / <|x|, |y|> < eps 104 printf("Checking computed result for correctness: "); 105 bool correct = true; 106 double eps = 1.e-6 ; // machine zero 107 for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) 108 { 109 double abs_err = fabs(h_C[i] - (dimsA.x * valB)); 110 double dot_length = dimsA.x; 111 double abs_val = fabs(h_C[i]); 112 double rel_err = abs_err/abs_val/dot_length ; 113 if (rel_err > eps) 114 { 115 printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E ", i, h_C[i], dimsA.x*valB, eps); 116 correct = false; 117 } 118 } 119 printf("%s ", correct ? "Result = PASS" : "Result = FAIL"); 120 121 free(h_A); 122 free(h_B); 123 free(h_C); 124 cudaFree(d_A); 125 cudaFree(d_B); 126 cudaFree(d_C); 127 printf(" NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled. "); 128 if (correct) 129 return EXIT_SUCCESS; 130 else 131 return EXIT_FAILURE; 132 } 133 134 int main(int argc, char **argv) 135 { 136 printf("[Matrix Multiply Using CUDA] - Starting... "); 137 138 if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) 139 { 140 printf("Usage -device=n (n >= 0 for deviceID) "); 141 printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A) "); 142 printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B) "); 143 printf(" Note: Outer matrix dimensions of A & B matrices must be equal. "); 144 exit(EXIT_SUCCESS); 145 } 146 147 int devID = 0;// 指定设备,默认用0号设备 148 if (checkCmdLineFlag(argc, (const char **)argv, "device")) 149 { 150 devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 151 cudaSetDevice(devID); 152 } 153 cudaDeviceProp deviceProp; 154 cudaGetDevice(&devID); 155 cudaGetDeviceProperties(&deviceProp, devID); 156 157 if (deviceProp.computeMode == cudaComputeModeProhibited) 158 { 159 fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice(). "); 160 exit(EXIT_SUCCESS); 161 } 162 163 int block_size = (deviceProp.major < 2) ? 16 : 32; 164 165 dim3 dimsA(5*2*block_size, 5*2*block_size, 1); 166 dim3 dimsB(5*4*block_size, 5*2*block_size, 1); 167 168 // 使用命令行指定的A、B的维度参数 169 if (checkCmdLineFlag(argc, (const char **)argv, "wA")) 170 dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); 171 if (checkCmdLineFlag(argc, (const char **)argv, "hA")) 172 dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); 173 if (checkCmdLineFlag(argc, (const char **)argv, "wB")) 174 dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); 175 if (checkCmdLineFlag(argc, (const char **)argv, "hB")) 176 dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); 177 if (dimsA.x != dimsB.y) 178 { 179 printf("Error: outer matrix dimensions must be equal. (%d != %d) ", 180 dimsA.x, dimsB.y); 181 exit(EXIT_FAILURE); 182 } 183 printf("MatrixA(%d,%d), MatrixB(%d,%d) ", dimsA.x, dimsA.y, dimsB.x, dimsB.y); 184 185 int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); 186 187 getchar(); 188 exit(matrix_result); 189 }
▶ 源代码:运行时编译
1 /*matrixMul_kernel.cu*/ 2 template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) 3 { 4 int bx = blockIdx.x; 5 int by = blockIdx.y; 6 int tx = threadIdx.x; 7 int ty = threadIdx.y; 8 int aBegin = wA * BLOCK_SIZE * by; 9 int aEnd = aBegin + wA - 1; 10 int aStep = BLOCK_SIZE; 11 int bBegin = BLOCK_SIZE * bx; 12 int bStep = BLOCK_SIZE * wB; 13 float Csub = 0; 14 for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) 15 { 16 __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 17 __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 18 As[ty][tx] = A[a + wA * ty + tx]; 19 Bs[ty][tx] = B[b + wB * ty + tx]; 20 __syncthreads(); 21 #pragma unroll 22 for (int k = 0; k < BLOCK_SIZE; ++k) 23 Csub += As[ty][k] * Bs[k][tx]; 24 __syncthreads(); 25 } 26 int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; 27 C[c + wB * ty + tx] = Csub; 28 } 29 30 extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB) 31 { 32 matrixMulCUDA<16>(C,A,B,wA,wB); 33 } 34 35 extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB) 36 { 37 matrixMulCUDA<32>(C,A,B,wA,wB); 38 }
1 /*matrixMul.cpp*/ 2 #include <stdio.h> 3 #include <assert.h> 4 #include <cuda_runtime.h> 5 #include "device_launch_parameters.h" 6 #include "nvrtc_helper.h" 7 #include <helper_functions.h> 8 9 void constantInit(float *data, int size, float val) 10 { 11 for (int i = 0; i < size; ++i) 12 data[i] = val; 13 } 14 15 int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) 16 { 17 // Allocate host memory for matrices A and B 18 unsigned int size_A = dimsA.x * dimsA.y; 19 unsigned int mem_size_A = sizeof(float) * size_A; 20 float *h_A = (float *)malloc(mem_size_A); 21 unsigned int size_B = dimsB.x * dimsB.y; 22 unsigned int mem_size_B = sizeof(float) * size_B; 23 float *h_B = (float *)malloc(mem_size_B); 24 const float valB = 0.01f; 25 constantInit(h_A, size_A, 1.0f); 26 constantInit(h_B, size_B, valB); 27 CUdeviceptr d_A, d_B, d_C; 28 29 char *ptx, *kernel_file; 30 size_t ptxSize; 31 kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); 32 compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize); 33 CUmodule module = loadPTX(ptx, argc, argv); 34 35 dim3 dimsC(dimsB.x, dimsA.y, 1); 36 unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 37 float *h_C = (float *) malloc(mem_size_C); 38 cuMemAlloc(&d_A, mem_size_A); 39 cuMemAlloc(&d_B, mem_size_B); 40 cuMemAlloc(&d_C, mem_size_C); 41 cuMemcpyHtoD(d_A, h_A, mem_size_A); 42 cuMemcpyHtoD(d_B, h_B, mem_size_B); 43 44 dim3 threads(block_size, block_size); 45 dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); 46 47 printf("Computing result using CUDA Kernel... "); 48 49 CUfunction kernel_addr; 50 if (block_size == 16) 51 cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"); 52 else 53 cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"); 54 55 void *arr[] = { (void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x }; 56 57 // Execute the kernel 58 int nIter = 300; 59 60 for (int j = 0; j < nIter; j++) 61 { 62 cuLaunchKernel(kernel_addr, 63 grid.x, grid.y, grid.z, 64 threads.x, threads.y, threads.z, 65 0, 0, &arr[0], 0); 66 cuCtxSynchronize(); 67 } 68 cuMemcpyDtoH(h_C, d_C, mem_size_C); 69 70 printf("Checking computed result for correctness: "); 71 bool correct = true; 72 double eps = 1.e-6 ; 73 for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) 74 { 75 double abs_err = fabs(h_C[i] - (dimsA.x * valB); 76 double dot_length = dimsA.x; 77 double abs_val = fabs(h_C[i]); 78 double rel_err = abs_err/abs_val/dot_length ; 79 if (rel_err > eps) 80 { 81 printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E ", i, h_C[i], dimsA.x*valB, eps); 82 correct = false; 83 } 84 } 85 printf("%s ", correct ? "Result = PASS" : "Result = FAIL"); 86 87 printf(" NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled. "); 88 free(h_A); 89 free(h_B); 90 free(h_C); 91 cuMemFree(d_A); 92 cuMemFree(d_B); 93 cuMemFree(d_C); 94 if (correct) 95 return EXIT_SUCCESS; 96 else 97 return EXIT_FAILURE; 98 } 99 100 int main(int argc, char **argv) 101 { 102 printf("[Matrix Multiply Using CUDA] - Starting... "); 103 104 if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) 105 { 106 printf("Usage -device=n (n >= 0 for deviceID) "); 107 printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A) "); 108 printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B) "); 109 printf(" Note: Outer matrix dimensions of A & B matrices must be equal. "); 110 exit(EXIT_SUCCESS); 111 } 112 113 int block_size = 32; 114 dim3 dimsA(5*2*block_size, 5*2*block_size, 1); 115 dim3 dimsB(5*4*block_size, 5*2*block_size, 1); 116 117 if (checkCmdLineFlag(argc, (const char **)argv, "wA")) 118 dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); 119 if (checkCmdLineFlag(argc, (const char **)argv, "hA")) 120 dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); 121 if (checkCmdLineFlag(argc, (const char **)argv, "wB")) 122 dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); 123 if (checkCmdLineFlag(argc, (const char **)argv, "hB")) 124 dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); 125 if (dimsA.x != dimsB.y) 126 { 127 printf("Error: outer matrix dimensions must be equal. (%d != %d) ", dimsA.x, dimsB.y); 128 } exit(EXIT_FAILURE); 129 printf("MatrixA(%d,%d), MatrixB(%d,%d) ", dimsA.x, dimsA.y, dimsB.x, dimsB.y); 130 131 int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); 132 133 getchar(); 134 exit(matrix_result); 135 }
▶ 输出结果:
[Matrix Multiply Using CUDA] - Starting... GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 MatrixA(320,320), MatrixB(640,320) Computing result using CUDA Kernel... done Performance= 22.95 GFlop/s, Time= 5.712 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block Checking computed result for correctness: Result = PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
▶ 涨姿势:
● 程序写得很烂,各种声明、初始化杂糅。
● 一个根据cuda错误种类返回错误描述的函数
extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
● 预编译命令展开循环
1 #pragma unroll 2 for (i = 0; i < m; i++) 3 c[i] = a[i] + b[i];
等价于
1 c[0] = a[0] + b[0]; 2 c[1] = a[1] + b[1]; 3 c[2] = a[2] + b[2]; 4 ... 5 c[m-1] = a[m-1] + b[m-1];
#pragma unroll 命令后面可接数字,表明展开前多少次迭代,例如 #pragma unroll 4
● 核函数泛型编程。可以在调用核函数时传入一个常量参数,变相使用动态数组来规定共享内存等数组的大小。
1 template <int BLOCK_SIZE> __global__ void functionName(void) 2 { 3 __shared__ int shareArray[BLOCK_SIZE]; 4 ... 5 } 6 7 cunctionName<16> << < blocksize, threadsize >> >();
● 热身,在多次重复实验前提前算一次。对缓存有帮助,有效减小实验结果(计算耗时)的方差。