CUDA Speed Test


    CPU ==> AMD X4 635

    GPU ==> GeForce GT 240

    Four simple tests..

    1. The naive algorithm: a single thread and a pile of nested FOR loops..

    2. Four threads (one per physical core of the 4-core CPU), each computing a quarter of the rows..

    3. GPU: a 64*64 grid of blocks, 16*16 threads per block..

    4. The cuBLAS library.

    The results (in the order above; TIMES is in milliseconds):

    6687720.500000, 6869132.500000, 6410965.000000, 6952017.500000
    TIMES: 47125

    6687720.500000, 6869132.500000, 6410965.000000, 6952017.500000
    TIMES: 14203

    6687720.500000, 6869132.500000, 6410964.500000, 6952017.000000
    TIMES: 328

    6687720.500000, 6869132.500000, 6410964.500000, 6952017.000000
    TIMES: 250

    The time ratio is roughly 1885:570:13:10 .. the GPU really is quite powerful..
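
    For scale, one 1024x1024 single-precision multiply is about 2*1024^3, roughly 2.1 GFLOP, so the 250 ms cuBLAS run works out to around 8.6 GFLOP/s and the 47125 ms naive loop to about 0.05 GFLOP/s (the two GPU totals still include context creation, cudaMalloc and the host/device copies, so the kernels themselves are faster than these figures suggest). A minimal standalone sketch of the conversion, reusing the TIMES values printed above:

/* Standalone helper (not part of the test program): convert the measured
   wall-clock times into rough GFLOP/s figures. */
#include <stdio.h>

int main( void )
{
    const double flop = 2.0 * 1024.0 * 1024.0 * 1024.0;   /* 2*N^3 for N = 1024 */
    const double ms[4] = { 47125.0, 14203.0, 328.0, 250.0 };
    const char* name[4] = { "naive CPU", "4 threads", "CUDA kernel", "cuBLAS" };
    int i;

    for ( i = 0; i < 4; i ++ )
    {
        /* GFLOP/s = FLOP / (milliseconds * 1e6) */
        printf( "%-12s %8.3f GFLOP/s\n", name[i], flop / ( ms[i] * 1e6 ) );
    }
    return 0;
}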

    I did not use Intel's library.. I'm afraid it may not be well supported on an AMD CPU.. I'll find another machine and try it some other day.
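
    For reference, that untested Intel-library path would look roughly like the sketch below. This is only an illustration assuming MKL (or any CBLAS implementation) is installed and linked; cblas_sgemm and the CblasRowMajor / CblasNoTrans constants come from the standard CBLAS interface, and RunOnCPUBlas is a hypothetical helper, not part of the program below.

/* Hypothetical CPU BLAS version (not benchmarked here), assuming an MKL/CBLAS
   installation; computes MatrixC = 1.0 * MatrixA * MatrixB + 0.0 * MatrixC
   in single precision with row-major storage. */
#include <mkl_cblas.h>   /* or <cblas.h> for other CBLAS implementations */

#define MAX_DIM 1024

extern float MatrixA[MAX_DIM][MAX_DIM];
extern float MatrixB[MAX_DIM][MAX_DIM];
extern float MatrixC[MAX_DIM][MAX_DIM];

void RunOnCPUBlas( void )
{
    cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans,
                 MAX_DIM, MAX_DIM, MAX_DIM,
                 1.0f, ( const float* )MatrixA, MAX_DIM,
                 ( const float* )MatrixB, MAX_DIM,
                 0.0f, ( float* )MatrixC, MAX_DIM );
}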

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include <conio.h>
#include <windows.h>
#include <process.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>



#define MAX_RUN      0
#define TILE_WIDTH   16
#define MAX_DIM      1024

float MatrixA[MAX_DIM][MAX_DIM];
float MatrixB[MAX_DIM][MAX_DIM];
float MatrixC[MAX_DIM][MAX_DIM];

/* Count of worker threads that have finished; updated with InterlockedIncrement */
volatile LONG thr_run;

/* Fill the two source matrices with small random values */
void FillMatrix()
{
    register int i, j;
    srand( ( unsigned int )time( NULL ) );

    for ( i = 0; i < MAX_DIM; i ++ )
    {
        for ( j = 0; j < MAX_DIM; j ++ )
        {
            MatrixA[i][j] = ( float )rand() * rand() / 100 / RAND_MAX;
            MatrixB[i][j] = ( float )rand() * rand() / 100 / RAND_MAX;
        }
    }
}


/********************************************************************/

/* Runs on the CPU: the naive triple-nested loop, single thread */
void RunOnCPU()
{
    float sum;
    register int i, j, k;

    for ( i = 0; i < MAX_DIM; ++ i )
    {
        for ( j = 0; j < MAX_DIM; ++ j )
        {
            sum = 0;
            for ( k = 0; k < MAX_DIM; ++ k )
            {
                sum += MatrixA[i][k] * MatrixB[k][j];
            }
            MatrixC[i][j] = sum;
        }
    }
}



/********************************************************************/

/* Worker thread routine: each thread computes one quarter of the result rows */
void CPUThread( void* arg )
{
    register int i, j, k;
    int dy, dy1;
    float mulResult;

    /* arg carries the thread index 0..3; pick this thread's band of rows */
    dy = ( ( int )MAX_DIM >> 2 ) * ( int )( size_t )arg;
    dy1 = dy + ( ( int )MAX_DIM >> 2 );

    for ( i = dy; i < dy1; i ++ )
    {
        for ( j = 0; j < MAX_DIM; j ++ )
        {
            mulResult = 0;
            for ( k = 0; k < MAX_DIM; k ++ )
            {
                mulResult += MatrixA[i][k] * MatrixB[k][j];
            }

            MatrixC[i][j] = mulResult;
        }
    }

    InterlockedIncrement( &thr_run );

    _endthread();
}


/* Runs on the CPU with 4 threads (the X4 635 has exactly 4 cores, all pegged at 100%) */
void RunOnCPUMulThr()
{
    int i;
    uintptr_t ret;

    thr_run = 0;

    for ( i = 0; i < 4; i ++ )
    {
        ret = _beginthread( CPUThread, 0, ( void* )( size_t )i );
        assert( ret != ( uintptr_t )-1 );
    }

    /* Wait until all four workers have reported completion */
    while ( thr_run != 4 )
    {
        Sleep( 1 );
    }
}

/********************************************************************/

/* Runs on the GPU: tiled matrix multiplication using shared memory */
__global__ void Matrix_Mul1( float* c, const float* a, const float* b )
{
    unsigned int i, j, bx, by, tx, ty;
    float mulResult;
    __shared__ float d_m[TILE_WIDTH][TILE_WIDTH];
    __shared__ float d_n[TILE_WIDTH][TILE_WIDTH];

    bx = blockIdx.x;
    by = blockIdx.y;
    tx = threadIdx.x;
    ty = threadIdx.y;

    mulResult = 0.0f;

    /* Walk across the tiles of a row of A and a column of B */
    for ( i = 0; i < gridDim.x; ++i )
    {
        /* Each thread loads one element of the A tile and one of the B tile */
        d_m[ty][tx] = *( a + ( by * blockDim.y + ty ) * MAX_DIM + i * blockDim.x + tx );
        d_n[ty][tx] = *( b + ( i * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx );
        __syncthreads();

        /* Accumulate the partial dot product for this pair of tiles */
        for ( j = 0; j < blockDim.x; ++ j )
        {
            mulResult += d_m[ty][j] * d_n[j][tx];
        }
        __syncthreads();
    }
    c[( by * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx] = mulResult;
}

void MatrixMul1( float* c, const float* a, const float* b )
{
    int cnt;
    float* dev_a;
    float* dev_b;
    float* dev_c;
    cudaError_t cudaStatus;
    /* 64 * 64 blocks, each with 16 * 16 threads */
    dim3 grid( MAX_DIM / TILE_WIDTH, MAX_DIM / TILE_WIDTH );
    dim3 blocks( TILE_WIDTH, TILE_WIDTH );

    cnt = MAX_DIM * MAX_DIM;
    dev_a = NULL;
    dev_b = NULL;
    dev_c = NULL;

    /* Select the GPU and build the CUDA context */
    cudaStatus = cudaSetDevice( 0 );
    assert( cudaStatus == cudaSuccess );

    /* Allocate device memory */
    cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );


    /* Copy the inputs from host memory to device memory */
    cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    /* Launch the kernel */
    Matrix_Mul1 <<< grid, blocks >>> ( dev_c, dev_a, dev_b );

    /* Wait for the device to finish */
    cudaStatus = cudaDeviceSynchronize();
    assert( cudaStatus == cudaSuccess );


    /* Copy the result from device memory back to host memory */
    cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
    assert( cudaStatus == cudaSuccess );

    /* Free device memory */
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );

    /* Tear down the CUDA context */
    cudaDeviceReset();
}


/********************************************************************/

/* Uses the cuBLAS library */
void MatrixMul2( float* c, const float* a, const float* b )
{
    int cnt;
    float* dev_a;
    float* dev_b;
    float* dev_c;
    cublasHandle_t handle;
    cublasStatus_t cuBlasStatus;
    cudaError_t cudaStatus;
    float alpha;
    float beta;

    dev_a = NULL;
    dev_b = NULL;
    dev_c = NULL;

    cnt = MAX_DIM * MAX_DIM;

    alpha = 1.0f;
    beta  = 0.0f;


    /* Select the GPU and build the CUDA context */
    cudaStatus = cudaSetDevice( 0 );
    assert( cudaStatus == cudaSuccess );

    /* Initialize the cuBLAS library */
    cuBlasStatus = cublasCreate( &handle );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* Allocate device memory */
    cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    /* Copy the inputs from host memory to device memory */
    cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );


    /* Compute C = A * B. cuBLAS assumes column-major storage, and a row-major
       matrix read as column-major is its transpose, so passing dev_b and dev_a
       in swapped order makes cuBLAS compute B^T * A^T = (A*B)^T, which read
       back as row-major is exactly A*B. */
    cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, \
                                MAX_DIM, MAX_DIM, MAX_DIM, &alpha, \
                                dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* The same SGEMM is issued a second time and simply overwrites dev_c with
       the same result */
    cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, \
                                MAX_DIM, MAX_DIM, MAX_DIM, &alpha, \
                                dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* Copy the result from device memory back to host memory */
    cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
    assert( cudaStatus == cudaSuccess );

    /* Free device memory and destroy the cuBLAS handle */
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );

    cuBlasStatus = cublasDestroy( handle );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* Tear down the CUDA context */
    cudaDeviceReset();
}


/********************************************************************/


int main()
{
    DWORD dwTime1, dwTime2;

    FillMatrix();

    /* 1. Naive single-threaded CPU version */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    RunOnCPU();
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    /* 2. Four CPU threads */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    RunOnCPUMulThr();
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    /* 3. Hand-written CUDA kernel */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    MatrixMul1( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    /* 4. cuBLAS */
    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    MatrixMul2( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    _getch();

    return 0;
}
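
    GetTickCount only has a resolution of about 10 to 16 ms, and the two GPU timings above include context creation, cudaMalloc and the host/device copies, so the kernel alone accounts for only part of the 328 ms / 250 ms totals. A minimal sketch of timing just the kernel with CUDA events, assuming it is dropped into MatrixMul1 around the existing launch (dev_a, dev_b, dev_c, grid and blocks as already set up there):

    /* Replace the plain launch in MatrixMul1 with an event-timed one */
    cudaEvent_t evStart, evStop;
    float elapsedMs = 0.0f;

    cudaEventCreate( &evStart );
    cudaEventCreate( &evStop );

    cudaEventRecord( evStart, 0 );
    Matrix_Mul1 <<< grid, blocks >>> ( dev_c, dev_a, dev_b );
    cudaEventRecord( evStop, 0 );
    cudaEventSynchronize( evStop );    /* wait for the kernel to finish */

    cudaEventElapsedTime( &elapsedMs, evStart, evStop );
    printf( "kernel only: %.3f ms\n", elapsedMs );

    cudaEventDestroy( evStart );
    cudaEventDestroy( evStop );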