• CUDA实战2


    1.clock()函数是C/C++中的计时函数,相关的数据类型是clock_t,使用clock函数可以计算运行某一段程序所需的时间,如下所示程序计算从10000000逐渐减一直到0所需的时间。

    注:每次运行所需时间可能会不一样

     1 #include "cuda_runtime.h"
     2 #include "device_launch_parameters.h"
     3 #include <stdio.h>
     4 #include <time.h>
     5 int main()
     6 {
     7     //测试clock_t的使用
     8     clock_t start, end;
     9     long n = 10000000L;
    10     double duration;
    11     printf("使 %ld 循环减一变为 0 所需的时间是:",n);
    12     //开始时间
    13     start = clock();
    14     //循环减一
    15     while(n--);
    16     //结束时间
    17     end = clock();
    18     //计算整个过程的时间结束时间减开始时间), 
    19     //CLOCKS_PER_SEC是"time.h"文件中定义的常量,
    20     //表示一秒钟包含多少时钟计时单元(即毫秒)。
    21     duration = (double)(end-start) / CLOCKS_PER_SEC;
    22     printf(" %f 秒
    ",duration);
    23     return 0;
    24 }
    View Code

     

     

     2.

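    CLOCKS_PER_SEC,它用来表示一秒钟会有多少个时钟计时单元。它的具体取值由实现(编译器/平台)决定,例如在 Windows 的 MSVC 中定义如下:
    #define CLOCKS_PER_SEC ((clock_t)1000)
    在这种定义下,每过千分之一秒(1毫秒),调用clock()函数返回的值就加1;而在 POSIX(XSI)系统上该值为 1000000,因此不能假定一个计时单元一定是 1 毫秒。
    可以使用公式 (double)clock()/CLOCKS_PER_SEC 来计算一个进程自身已经占用的处理器时间(单位:秒)。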
     

    矢量求和运算

    假设我们有两组数据,我们需要将这两组数据中对应的元素两两相加,并将结果保存在第三个数组中。

      1 //CUDA的头文件
      2 #include "cuda_runtime.h"
      3 #include "device_launch_parameters.h"
      4 //C语言的头文件
      5 #include <stdio.h>
      6 #include <time.h>
      7 
      8 #define N 6000
      9 #define thread_num 1024
     10 
     11 //GPU函数声明
     12 __global__ void add(int* a, int* b, int* c);
     13 //CPU函数声明
     14 void add_CPU(int *a, int *b,int *c);
     15 
     16 int main()
     17 {
     18     //GPU方法计时声明
     19     float time_CPU, time_GPU;
     20     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
     21     //CPU方法计时声明
     22     float time_cpu, time_gpu;
     23     clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
     24     int a[N], b[N], c[N], c_CPU[N];
     25     int *dev_a, *dev_b, *dev_c;
     26     
     27     int block_num;
     28     block_num = (N + thread_num - 1)/thread_num;
     29     
     30     //在GPU上分配内存
     31     cudaMalloc((void**)&dev_a, N*sizeof(int));
     32     cudaMalloc((void**)&dev_b, N*sizeof(int));
     33     cudaMalloc((void**)&dev_c, N*sizeof(int));
     34     
     35     //在CPU上进行赋值
     36     for(int i = 0; i < N; i++)
     37     {
     38         a[i] = -i;
     39         b[i] = i*i;
     40     }
     41     
     42  
     43     //记录当前时间
     44     start_cpu = clock();
     45     
     46     add_CPU(a, b, c_CPU);
     47     
     48     stop_cpu = clock();
     49     //记录当前时间
     50     printf("Tne time for CPU:	%f(ms)
    ", (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC);
     51     
     52     
     53     //输出CPU结果
     54     printf("
    Result from CPU:
    ");
     55     for(int i = 0; i<N; i++)
     56     {
     57         printf("CPU:	%d+%d=%d
    ",a[i],b[i],c_CPU[i]);
     58     }
     59     
     60     //GPU计算
     61     cudaMemcpy(dev_a,a,N*sizeof(int), cudaMemcpyHostToDevice);
     62     cudaMemcpy(dev_b,b,N*sizeof(int), cudaMemcpyHostToDevice);
     63     
     64     //创建Event
     65     cudaEventCreate(&start_GPU);
     66     cudaEventCreate(&stop_GPU);
     67     
     68     //记录当时时间
     69     cudaEventRecord(start_GPU,0);
     70     start_gpu = clock();
     71     //调用核函数
     72     add<<<block_num,thread_num>>>(dev_a,dev_b,dev_c);
     73     
     74     stop_gpu = clock();
     75     //记录当时时间
     76     cudaEventRecord(stop_GPU,0);
     77     cudaEventSynchronize(start_GPU);
     78     cudaEventSynchronize(stop_GPU);
     79     cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU);
     80     printf("
    The time from GPU :	%f(ms)
    ",time_GPU);
     81     
     82     //将device复制到host
     83     cudaMemcpy(c,dev_c,N*sizeof(int),cudaMemcpyDeviceToHost);
     84     //将GPU中的结果拷贝出来
     85     cudaMemcpy(c,dev_c,N*sizeof(int),cudaMemcpyDeviceToHost);
     86     
     87     //输出
     88     printf("
    Result from GPU:
    ");
     89     for(int i = 0; i<N; i++)
     90     {
     91         printf("GPU:	%d+%d=%d
    ",a[i],b[i],c[i]);
     92     }
     93     cudaEventDestroy(start_GPU);
     94     cudaEventDestroy(stop_GPU);
     95     
     96     //释放内存
     97     cudaFree(dev_a);
     98     cudaFree(dev_b);
     99     cudaFree(dev_c);
    100     printf("
    The time for CPU by event:	%f(ms)
    ", time_CPU);
    101     printf("The time for GPU by event:	%f(ms)
    ", time_GPU);
    102     
    103     time_cpu = (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC;
    104     time_gpu = (float)(stop_gpu - start_gpu) / CLOCKS_PER_SEC;
    105     printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    106     printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    107     
    108     
    109     return 0;
    110 }
    // GPU kernel: element-wise sum c[i] = a[i] + b[i] for the first N elements.
    // Each thread handles the single element whose index equals its global thread
    // index along x; threads whose index falls at or beyond N do nothing.
    __global__ void add(int *a, int *b, int *c)
    {
        // Global index of this thread along x.
        const int tid = threadIdx.x + blockDim.x * blockIdx.x;
        if (tid >= N)
        {
            return;
        }
        c[tid] = a[tid] + b[tid];
    }

    121 //CPU函数
    122 void add_CPU(int *a, int *b, int *c)
    123 {
    124     for (int i = 0; i < N; i++)
    125     {
    126         c[i] = a[i] + b[i];
    127     }
    128 }

     3.CUDA实现矩阵乘

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 #define M 1024
      7 #define K 1024
      8 #define N 1024
      9 
     10 void initial(double* list,int row,int col)
     11 {
     12     double *num = list;
     13 //    srand((unsigned)time(NULL));
     14     for (int i=0; i<row*col; i++)
     15     {
     16         num[i] = rand()%10;
     17     }
     18 }
     19 
     20 void CpuMatrix(double *A,double *B,double *C)
     21 {
     22        int i,j,k;
     23        
     24        for( i=0; i<M; i++)
     25        {
     26            for(j=0; j<N; j++)
     27            {
     28                double sum = 0;
     29                for(int k=0; k<K; k++)
     30                {
     31                    sum += A[i*K + k] * B[k * N + j];
     32                }
     33                C[i * N + j] = sum;
     34            }
     35        }
     36 }
     37 
     38 __global__ void GpuMatrix(double *dev_A,double *dev_B,double *dev_C)
     39 {
     40     int ix = threadIdx.x + blockDim.x * blockIdx.x;
     41     int iy = threadIdx.y + blockDim.y * blockIdx.y;
     42     
     43     if(ix<K && iy<M)
     44     {
     45     double sum = 0;
     46     for( int k = 0; k < K;k++)
     47     {
     48         sum += dev_A[iy*K + k] * dev_B[k*N + ix];
     49     }
     50     dev_C[iy * N + ix] = sum;
     51    }
     52 }
     53 
     54 void printMatrix(double *list,int row,int col)
     55 {
     56     double *p = list;
     57     for(int i=0; i<row; i++)
     58     {
     59         for(int j=0; j<col; j++)
     60         {
     61             printf("%10lf",p[j]);
     62         }
     63         p = p + col;
     64         printf("
    ");
     65     }
     66 }
     67 
     68 int main(int argc,char **argv)
     69 {
     70     clock_t start_cpu,stop_cpu,start_gpu,stop_gpu;
     71     double time_cpu,time_gpu;
     72     
     73     float time_CPU, time_GPU;
     74     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
     75     
     76     //printf("Amatrix:(%d*%d)
    ",M,K);
     77     int Axy = M*K;
     78     int Abytes = Axy * sizeof(double);
     79     
     80 //    printf("Bmatrix:(%d*%d)
    ",K,N);
     81     int Bxy = K*N;
     82     int Bbytes = Bxy * sizeof(double);
     83     
     84     int nxy = M*N;
     85     int nbytes = nxy * sizeof(double);
     86     
     87     double *host_A, *host_B, *host_C, *c_CPU;
     88     host_A = (double*)malloc(Abytes);
     89     host_B = (double*)malloc(Bbytes);
     90     host_C = (double*)malloc(nbytes);
     91     c_CPU = (double*)malloc(nbytes);
     92     
     93     //初始化
     94     initial(host_A,M,K);
     95     //输出
     96     printf("A:(%d,%d):
    ",M,K);
     97 //    printMatrix(host_A,M,K);
     98     
     99     initial(host_B,K,N);
    100     //输出
    101     printf("B:(%d,%d):
    ",K,N);
    102 //    printMatrix(host_B,K,N);
    103    
    104     start_cpu = clock();
    105     CpuMatrix(host_A,host_B,host_C);
    106     stop_cpu = clock();
    107     
    108     printf("The time from CPU is 	%f(ms)
    ",(float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);
    109     //输出
    110     printf("Host_C:(%d,%d):
    ",M,N);
    111 //    printMatrix(host_C,M,N);
    112     
    113     //GPU计算
    114     double *dev_A,*dev_B,*dev_C;
    115     cudaMalloc((void**)&dev_A,Axy*sizeof(double));
    116     cudaMalloc((void**)&dev_B,Bxy*sizeof(double));
    117     cudaMalloc((void**)&dev_C,nxy*sizeof(double));
    118     dim3 block(1024,1024);
    119     dim3 grid(32,32); 
    120     cudaMemcpy(dev_A,host_A,Abytes,cudaMemcpyHostToDevice);
    121     cudaMemcpy(dev_B,host_B,Bbytes,cudaMemcpyHostToDevice);
    122     //创建Event 
    123     cudaEventCreate(&start_GPU);
    124     cudaEventCreate(&stop_GPU);
    125     cudaEventRecord(start_GPU,0);
    126     start_gpu = clock();
    127     
    128     GpuMatrix<<<grid,block>>>(dev_A,dev_B,dev_C);
    129     
    130     stop_gpu = clock();
    131     cudaEventRecord(stop_GPU,0);
    132     cudaEventSynchronize(start_GPU);
    133     cudaEventSynchronize(stop_GPU);
    134     //计算时间差
    135     cudaEventElapsedTime(&time_GPU,start_GPU,stop_GPU);
    136     printf("The time from GPU is 	%f(ms)
    ",time_GPU);
    137     //消除Event
    138     cudaEventDestroy(start_GPU);
    139     cudaEventDestroy(stop_GPU);
    140     
    141     cudaMemcpy(c_CPU,dev_C,nbytes,cudaMemcpyDeviceToHost);
    142     //输出
    143     printf("device_C:(%d,%d):
    ",M,N);
    144 //    printMatrix(c_CPU,M,N);
    145     
    146     //释放内存
    147      cudaFree(dev_A);
    148      cudaFree(dev_B);
    149      cudaFree(dev_C);
    150      free(host_A);
    151      free(host_B);
    152      free(host_C);
    153      free(c_CPU);
    154      
    155      time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    156      time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    157      printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    158      printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    159      return 0;
    160 
    161 }

     
     
  • 相关阅读:
    【foj 1746 A Water Problem】
    【hdu 1049 Climbing Worm】
    【hdu 1070 Milks】
    【B ECJTU_ACM 11级队员2012年暑假训练赛(2)】
    『C Free 5.0 注册码共享』
    【E ECJTU_ACM 11级队员2012年暑假训练赛(2)】
    【J ECJTU_ACM 11级队员2012年暑假训练赛(2)】
    【H ECJTU_ACM 11级队员2012年暑假训练赛(2)】
    【C ECJTU_ACM 11级队员2012年暑假训练赛(2)】
    『为黑客正名』
  • 原文地址:https://www.cnblogs.com/lin1216/p/12677841.html
Copyright © 2020-2023  润新知