1.clock()函数是C/C++中的计时函数,相关的数据类型是clock_t,使用clock函数可以计算运行某一段程序所需的时间,如下所示程序计算从10000000逐渐减一直到0所需的时间。
注:每次运行所需时间可能会不一样
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>

/*
 * Demonstrates timing a piece of host code with clock()/clock_t:
 * counts a long value down from 10000000 to 0 and prints how many
 * seconds the countdown took.
 *
 * Returns 0.
 */
int main()
{
    clock_t start, end;
    /* volatile: every decrement must really be performed, otherwise an
     * optimizing compiler is free to delete the empty countdown loop and
     * the measured time collapses to (almost) zero. */
    volatile long n = 10000000L;
    double duration;

    printf("使 %ld 循环减一变为 0 所需的时间是:", n);

    /* Start time. */
    start = clock();

    /* Count down to zero, one step per iteration. */
    while (n != 0)
    {
        n = n - 1;
    }

    /* End time. */
    end = clock();

    /* Elapsed time = end - start, measured in clock ticks.
     * CLOCKS_PER_SEC (from <time.h>) is the number of ticks per second;
     * its value is implementation-defined, so a tick is not necessarily
     * one millisecond. Dividing by it converts ticks to seconds. */
    duration = (double)(end - start) / CLOCKS_PER_SEC;
    printf(" %f 秒 ", duration);
    return 0;
}
2.CLOCKS_PER_SEC是<time.h>中定义的常量,用来表示一秒钟会有多少个时钟计时单元,其定义如下:
#define CLOCKS_PER_SEC ((clock_t)1000)
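可以看到,在上面这种定义下(例如Windows/MSVC的time.h),每过千分之一秒(1毫秒),调用clock()函数返回的值就加1。注意CLOCKS_PER_SEC的取值与平台有关,例如POSIX系统规定其值为1000000,因此不要把时钟计时单元直接当作毫秒使用。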
可以使用公式(double)clock()/CLOCKS_PER_SEC来计算一个进程自身的运行时间(单位为秒)。注意要先转换为浮点数再相除,否则整数除法会把不足一秒的部分截断。
矢量求和运算
假设我们有两组数据,我们需要将这两组数据中对应的元素两两相加,并将结果保存在第三个数组中。
// CUDA headers
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// C standard headers
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 6000
#define thread_num 1024

// Abort with a readable message when a CUDA runtime call fails.
// Kernel launches return nothing, so follow them with
// CUDA_CHECK(cudaGetLastError()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// GPU kernel declaration
__global__ void add(int* a, int* b, int* c);
// CPU reference declaration
void add_CPU(int *a, int *b, int *c);

/*
 * Adds two N-element integer vectors once on the CPU and once on the GPU,
 * prints both result vectors, and reports the elapsed time of each version
 * measured two ways: with CUDA events and with the host clock().
 * All reported times are in milliseconds.
 */
int main()
{
    // Timings taken with CUDA events (milliseconds).
    float time_CPU = 0.0f, time_GPU = 0.0f;
    cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
    // Timings taken with the host clock().
    float time_cpu, time_gpu;
    clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    int a[N], b[N], c[N], c_CPU[N];
    int *dev_a, *dev_b, *dev_c;

    // Ceil-division so the last, partially filled block is still launched.
    int block_num = (N + thread_num - 1) / thread_num;

    // Allocate device memory.
    CUDA_CHECK(cudaMalloc((void**)&dev_a, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_c, N * sizeof(int)));

    // Fill the inputs on the host.
    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    // Create all timing events up front.
    CUDA_CHECK(cudaEventCreate(&start_CPU));
    CUDA_CHECK(cudaEventCreate(&stop_CPU));
    CUDA_CHECK(cudaEventCreate(&start_GPU));
    CUDA_CHECK(cudaEventCreate(&stop_GPU));

    // ---- CPU version --------------------------------------------------
    // Bracket the host computation with events recorded on the (idle)
    // default stream. This only approximates host time, but it gives
    // time_CPU a defined value instead of printing an uninitialized float.
    CUDA_CHECK(cudaEventRecord(start_CPU, 0));
    CUDA_CHECK(cudaEventSynchronize(start_CPU));
    start_cpu = clock();

    add_CPU(a, b, c_CPU);

    stop_cpu = clock();
    CUDA_CHECK(cudaEventRecord(stop_CPU, 0));
    CUDA_CHECK(cudaEventSynchronize(stop_CPU));
    CUDA_CHECK(cudaEventElapsedTime(&time_CPU, start_CPU, stop_CPU));

    // clock() ticks -> seconds -> milliseconds, to match the "(ms)" label.
    time_cpu = 1000.0f * (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC;
    printf("The time for CPU: %f(ms) ", time_cpu);

    // Print the CPU result.
    printf(" Result from CPU: ");
    for (int i = 0; i < N; i++)
    {
        printf("CPU: %d+%d=%d ", a[i], b[i], c_CPU[i]);
    }

    // ---- GPU version --------------------------------------------------
    CUDA_CHECK(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(start_GPU, 0));
    start_gpu = clock();

    // Launch the kernel (asynchronous with respect to the host).
    add<<<block_num, thread_num>>>(dev_a, dev_b, dev_c);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop_GPU, 0));
    // Wait for the kernel to finish BEFORE reading the host clock;
    // otherwise clock() would only capture the launch overhead.
    CUDA_CHECK(cudaEventSynchronize(stop_GPU));
    stop_gpu = clock();

    CUDA_CHECK(cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU));
    printf(" The time from GPU : %f(ms) ", time_GPU);

    // Copy the GPU result back to the host.
    CUDA_CHECK(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));

    // Print the GPU result.
    printf(" Result from GPU: ");
    for (int i = 0; i < N; i++)
    {
        printf("GPU: %d+%d=%d ", a[i], b[i], c[i]);
    }

    CUDA_CHECK(cudaEventDestroy(start_CPU));
    CUDA_CHECK(cudaEventDestroy(stop_CPU));
    CUDA_CHECK(cudaEventDestroy(start_GPU));
    CUDA_CHECK(cudaEventDestroy(stop_GPU));

    // Release device memory.
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));

    printf(" The time for CPU by event: %f(ms) ", time_CPU);
    printf("The time for GPU by event: %f(ms) ", time_GPU);

    time_gpu = 1000.0f * (float)(stop_gpu - start_gpu) / CLOCKS_PER_SEC;
    printf(" The time for CPU by host: %f(ms) ", time_cpu);
    printf("The time for GPU by host: %f(ms) ", time_gpu);

    return 0;
}

/*
 * GPU kernel: c[i] = a[i] + b[i] for 0 <= i < N.
 * Expects a 1D launch; each thread handles the single element at its
 * global index, and threads whose index is >= N do nothing.
 */
__global__ void add(int *a, int *b, int *c)
{
    // Global index of the element this thread is responsible for.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        c[tid] = a[tid] + b[tid];
    }
}

/*
 * CPU reference: c[i] = a[i] + b[i] for 0 <= i < N.
 */
void add_CPU(int *a, int *b, int *c)
{
    for (int i = 0; i < N; i++)
    {
        c[i] = a[i] + b[i];
    }
}
3.CUDA实现矩阵乘
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define M 1024
#define K 1024
#define N 1024
// Threads per block along each dimension (32 x 32 = 1024 threads, the
// per-block maximum on current GPUs).
#define BLOCK_DIM 32

// Abort with a readable message when a CUDA runtime call fails.
// Kernel launches return nothing, so follow them with
// CUDA_CHECK(cudaGetLastError()).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Fills a row x col matrix (stored contiguously in `list`) with
 * pseudo-random integer values in [0, 9]. rand() is left unseeded, so
 * the sequence is the same on every run.
 */
void initial(double* list, int row, int col)
{
    double *num = list;
    // srand((unsigned)time(NULL));
    for (int i = 0; i < row * col; i++)
    {
        num[i] = rand() % 10;
    }
}

/*
 * CPU reference: C = A * B, where A is M x K, B is K x N and C is M x N,
 * all stored row-major.
 */
void CpuMatrix(double *A, double *B, double *C)
{
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            double sum = 0;
            for (int k = 0; k < K; k++)
            {
                sum += A[i * K + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

/*
 * GPU kernel: dev_C = dev_A * dev_B (M x K times K x N, row-major).
 * Expects a 2D launch in which the x dimension spans the N columns of C
 * and the y dimension spans its M rows; each thread computes one element
 * of C, and threads outside the matrix do nothing.
 */
__global__ void GpuMatrix(double *dev_A, double *dev_B, double *dev_C)
{
    int ix = threadIdx.x + blockDim.x * blockIdx.x;   // column of C
    int iy = threadIdx.y + blockDim.y * blockIdx.y;   // row of C

    // ix indexes a column of C, so it is bounded by N (not K).
    if (ix < N && iy < M)
    {
        double sum = 0;
        for (int k = 0; k < K; k++)
        {
            sum += dev_A[iy * K + k] * dev_B[k * N + ix];
        }
        dev_C[iy * N + ix] = sum;
    }
}

/*
 * Prints a row x col row-major matrix, one field of width 10 per element.
 */
void printMatrix(double *list, int row, int col)
{
    double *p = list;
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%10lf", p[j]);
        }
        p = p + col;
        printf(" ");
    }
}

/*
 * Multiplies two random matrices on the CPU and on the GPU and reports
 * the elapsed time of each (in milliseconds), using both the host clock()
 * and CUDA events for the GPU version.
 */
int main(int argc, char **argv)
{
    clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    double time_cpu, time_gpu;

    float time_GPU = 0.0f;
    cudaEvent_t start_GPU, stop_GPU;

    //printf("Amatrix:(%d*%d) ",M,K);
    int Axy = M * K;
    size_t Abytes = (size_t)Axy * sizeof(double);

    // printf("Bmatrix:(%d*%d) ",K,N);
    int Bxy = K * N;
    size_t Bbytes = (size_t)Bxy * sizeof(double);

    int nxy = M * N;
    size_t nbytes = (size_t)nxy * sizeof(double);

    // host_C receives the CPU result; c_CPU (despite its name) receives
    // the result copied back from the GPU.
    double *host_A, *host_B, *host_C, *c_CPU;
    host_A = (double*)malloc(Abytes);
    host_B = (double*)malloc(Bbytes);
    host_C = (double*)malloc(nbytes);
    c_CPU = (double*)malloc(nbytes);
    if (host_A == NULL || host_B == NULL || host_C == NULL || c_CPU == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(host_A);
        free(host_B);
        free(host_C);
        free(c_CPU);
        return EXIT_FAILURE;
    }

    // Initialize A.
    initial(host_A, M, K);
    printf("A:(%d,%d): ", M, K);
    // printMatrix(host_A,M,K);

    // Initialize B.
    initial(host_B, K, N);
    printf("B:(%d,%d): ", K, N);
    // printMatrix(host_B,K,N);

    // ---- CPU version --------------------------------------------------
    start_cpu = clock();
    CpuMatrix(host_A, host_B, host_C);
    stop_cpu = clock();

    // clock() ticks -> seconds -> milliseconds, to match the "(ms)" label.
    time_cpu = 1000.0 * (double)(stop_cpu - start_cpu) / CLOCKS_PER_SEC;
    printf("The time from CPU is %f(ms) ", time_cpu);
    printf("Host_C:(%d,%d): ", M, N);
    // printMatrix(host_C,M,N);

    // ---- GPU version --------------------------------------------------
    double *dev_A, *dev_B, *dev_C;
    CUDA_CHECK(cudaMalloc((void**)&dev_A, Abytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_B, Bbytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_C, nbytes));

    // A block may hold at most 1024 threads in total, so use 32 x 32
    // threads per block and enough blocks (ceil-div) to cover all of C.
    dim3 block(BLOCK_DIM, BLOCK_DIM);
    dim3 grid((N + BLOCK_DIM - 1) / BLOCK_DIM, (M + BLOCK_DIM - 1) / BLOCK_DIM);

    CUDA_CHECK(cudaMemcpy(dev_A, host_A, Abytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_B, host_B, Bbytes, cudaMemcpyHostToDevice));

    // Create the timing events.
    CUDA_CHECK(cudaEventCreate(&start_GPU));
    CUDA_CHECK(cudaEventCreate(&stop_GPU));
    CUDA_CHECK(cudaEventRecord(start_GPU, 0));
    start_gpu = clock();

    GpuMatrix<<<grid, block>>>(dev_A, dev_B, dev_C);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop_GPU, 0));
    // Wait for the kernel to finish BEFORE reading the host clock;
    // otherwise clock() would only capture the launch overhead.
    CUDA_CHECK(cudaEventSynchronize(stop_GPU));
    stop_gpu = clock();

    // Elapsed GPU time between the two events, in milliseconds.
    CUDA_CHECK(cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU));
    printf("The time from GPU is %f(ms) ", time_GPU);

    // Destroy the events.
    CUDA_CHECK(cudaEventDestroy(start_GPU));
    CUDA_CHECK(cudaEventDestroy(stop_GPU));

    CUDA_CHECK(cudaMemcpy(c_CPU, dev_C, nbytes, cudaMemcpyDeviceToHost));
    printf("device_C:(%d,%d): ", M, N);
    // printMatrix(c_CPU,M,N);

    // Release memory.
    CUDA_CHECK(cudaFree(dev_A));
    CUDA_CHECK(cudaFree(dev_B));
    CUDA_CHECK(cudaFree(dev_C));
    free(host_A);
    free(host_B);
    free(host_C);
    free(c_CPU);

    time_gpu = 1000.0 * (double)(stop_gpu - start_gpu) / CLOCKS_PER_SEC;
    printf(" The time for CPU by host: %f(ms) ", time_cpu);
    printf("The time for GPU by host: %f(ms) ", time_gpu);
    return 0;
}