GPU端耗时统计
// GPU-side timing. The start/stop events bracket the device allocations, both
// host<->device copies and the kernel launch, so the reported time covers all
// of them, not just kernel execution.
// Relies on h_idata, mem_size, num_threads and testKernel from the surrounding code.
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// Wait for any device work issued earlier to finish before the measurement begins.
checkCudaErrors(cudaDeviceSynchronize());

float gpu_time = 0.0f;
// Record the start event on stream 0; the return code is checked like every
// other runtime call in this snippet.
checkCudaErrors(cudaEventRecord(start, 0));

// Allocate device memory for the input
float *d_idata;
checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));

// Copy the host input data to device memory
checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

// Allocate device memory for the result
float *d_odata;
checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));

// Set up the execution parameters: a single block of num_threads threads
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);

// Launch the kernel. Launch arguments: grid = grid dimensions,
// threads = block dimensions, mem_size = bytes of dynamically allocated
// shared memory per block.
testKernel<<< grid, threads, mem_size >>>(d_idata, d_odata);

// Check whether the kernel launch/execution reported an error
getLastCudaError("Kernel execution failed");

// Allocate host memory for the result
float *h_odata = (float *) malloc(mem_size);
if (h_odata == NULL)
{
    // Without this check a failed allocation would be passed to cudaMemcpy below.
    fprintf(stderr, "Failed to allocate %lu bytes of host memory for h_odata\n",
            (unsigned long) mem_size);
    exit(EXIT_FAILURE);
}

// Copy the result from device to host.
// NOTE(review): this copies sizeof(float) * num_threads bytes into a buffer of
// mem_size bytes -- assumes mem_size >= sizeof(float) * num_threads; confirm.
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads,
                           cudaMemcpyDeviceToHost));

checkCudaErrors(cudaEventRecord(stop, 0));

// Poll the stop event, counting how many iterations the CPU spins while the
// event has not completed yet.
unsigned long int counter = 0;
cudaError_t query_status;
while ((query_status = cudaEventQuery(stop)) == cudaErrorNotReady)
{
    counter++;
}
// The loop also exits on a real error; surface it here instead of dropping it.
checkCudaErrors(query_status);

checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
printf("GPU执行耗时: %.2f (ms) ", gpu_time);
printf("CPU executed %lu iterations while waiting for GPU to finish ", counter);
CPU端耗时统计
// CPU-side timing. A stopwatch timer measures the host reference computation;
// the allocation of the reference buffer happens inside the timed region.
// Relies on h_idata, mem_size, num_threads and computeGold from the surrounding code.
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);

sdkStartTimer(&timer);
// Compute the reference solution on the host
float *reference = (float *) malloc(mem_size);
if (reference == NULL)
{
    // Without this check a failed allocation would be handed to computeGold.
    fprintf(stderr, "Failed to allocate %lu bytes of host memory for reference\n",
            (unsigned long) mem_size);
    exit(EXIT_FAILURE);
}
computeGold(reference, h_idata, num_threads);
sdkStopTimer(&timer);
printf("串行耗时:%f (ms) ", sdkGetTimerValue(&timer));