VS2008+QT中使用CUDA的示例

各工具或库的版本：

IDE：VS2008

QT：4.8.0 (使用MSVC编译器)

CUDA：3.2

单独使用nvcc编译CUDA文件的方法，很多书上都讲过。这里主要讲如何在QT工程中使用CUDA，QT在VS2008环境下开发。

下面以宽为1024的矩阵乘法为例，再辅以QT GUI说明如何在VS2008的QT工程中使用CUDA。VS2010没试过，不过步骤应该差不多。

1.新建VS2008下QT工程

这个就不多说了，熟悉VS2008下进行QT开发的都知道怎么弄，新建一个Qt GUI程序，基类我选择的QDialog，如下：

2.设置项目属性

需要设置的项目属性包括：更改生成规则(添加对CUDA文件的编译连接支持)、添加使用CUDA需要用到的链接库(.lib文件)

首先添加生成规则：右键工程——>自定义生成规则,添加CUDA Runtime API Build Rule (v3.2),如下图：

然后添加链接库：项目——>属性——>链接器——>常规，在“附加库目录”中添加ToolKit和SDK目录里的lib目录；再在链接器——>输入的“附加依赖项”中添加需要用到的lib文件。这一步和单独使用CUDA时的做法是一样的，详见http://www.cnblogs.com/Romi/archive/2012/04/20/2459669.html

3.编写CUDA文件(.cu)

在项目中新建一个.cu的文件，加上如下代码，完成在GPU设备上进行矩阵乘法：

View Code

 1 //CUDAtest.cu
 2 
 #include <stdio.h>

 #include "cuda_runtime.h"
 4 
 // Edge length of one square thread block (and of the output tile it computes).
 // 16 x 16 = 256 threads per block. The previous value of 64 asked for
 // 64 x 64 = 4096 threads per block, which exceeds the per-block thread limit,
 // so the kernel launch failed and P was never written.
 #define TILE_WIDTH 16

 // Kernel: each thread computes one element of Pd = Md * Nd.
 //
 // All three matrices are Width x Width floats in row-major order (element
 // (r, c) lives at index r*Width + c) and must be device pointers.
 // Expected launch layout: a 2D grid of TILE_WIDTH x TILE_WIDTH blocks that
 // covers at least Width x Width threads. Threads that fall outside the matrix
 // exit without touching memory, so Width need not be a multiple of TILE_WIDTH.
 // No shared memory is used.
 __global__ static void MatrixMulKernel(const float* Md,const float* Nd,float* Pd,int Width)
 {
     // Row/column of the Pd element owned by this thread.
     int Row = blockIdx.y*TILE_WIDTH+threadIdx.y;
     int Col = blockIdx.x*TILE_WIDTH+threadIdx.x;

     // Guard the grid tail: the grid is rounded up to whole blocks.
     if (Row >= Width || Col >= Width)
         return;

     // Dot product of row `Row` of Md with column `Col` of Nd.
     float Pvalue = 0.0f; // float literal: avoids promoting the accumulation to double
     for (int k=0;k<Width;k++)
     {
         Pvalue += Md[Row*Width+k]*Nd[k*Width+Col];
     }
     Pd[Row*Width+Col]=Pvalue;
 }

 // Prints a failed CUDA runtime call to stderr.
 // Returns true when `err` is an error, false when it is cudaSuccess.
 static bool cudaCallFailed(cudaError_t err,const char* what)
 {
     if (err == cudaSuccess)
         return false;
     fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(err));
     return true;
 }

 // Matrix multiplication on the GPU: P = M * N.
 //
 // Declared extern "C" so that it can be called from a separately compiled
 // C++ source file through a matching extern "C" declaration.
 //
 // M, N  : host pointers to Width x Width row-major input matrices.
 // P     : host pointer receiving the Width x Width row-major result.
 // Width : matrix edge length; nothing is done when Width <= 0 or a pointer is NULL.
 //
 // The function cannot return a status (void interface), so every CUDA failure
 // is reported on stderr; after a failure the remaining steps are skipped, the
 // device buffers are released, and P is left unmodified.
 extern "C" void MatrixMultiplication_CUDA(const float* M,const float* N,float* P,int Width)
 {
     if (Width <= 0 || M == NULL || N == NULL || P == NULL)
         return;

     float *Md = NULL,*Nd = NULL,*Pd = NULL;
     // Byte length of one matrix, kept in size_t so it is not narrowed to int.
     const size_t size = (size_t)Width*(size_t)Width*sizeof(float);

     // Select the target GPU.
     bool failed = cudaCallFailed(cudaSetDevice(0),"cudaSetDevice");

     // Allocate the device matrices (|| short-circuits after the first failure).
     failed = failed || cudaCallFailed(cudaMalloc((void**)&Md,size),"cudaMalloc(Md)");
     failed = failed || cudaCallFailed(cudaMalloc((void**)&Nd,size),"cudaMalloc(Nd)");
     failed = failed || cudaCallFailed(cudaMalloc((void**)&Pd,size),"cudaMalloc(Pd)");

     // Upload the inputs.
     failed = failed || cudaCallFailed(cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice),"cudaMemcpy(M)");
     failed = failed || cudaCallFailed(cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice),"cudaMemcpy(N)");

     if (!failed)
     {
         // Round the grid up so that widths that are not a multiple of
         // TILE_WIDTH are fully covered; the kernel guards the excess threads.
         const unsigned int blocksPerSide = (unsigned int)((Width+TILE_WIDTH-1)/TILE_WIDTH);
         dim3 dimGrid(blocksPerSide,blocksPerSide); // grid dimensions
         dim3 dimBlock(TILE_WIDTH,TILE_WIDTH);      // block dimensions
         MatrixMulKernel<<<dimGrid,dimBlock>>> (Md,Nd,Pd,Width);

         // A launch does not return a status; an invalid configuration only
         // shows up through cudaGetLastError().
         failed = cudaCallFailed(cudaGetLastError(),"MatrixMulKernel launch");
     }

     // Blocking copy: waits for the kernel and surfaces any execution error.
     failed = failed || cudaCallFailed(cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost),"cudaMemcpy(P)");

     // Release the device matrices; cudaFree on a null pointer does nothing.
     cudaFree(Md);
     cudaFree(Nd);
     cudaFree(Pd);
 }

这里使用extern "C"声明函数，使其可以在外部（.cpp源文件中）被调用。如果在调用该函数的源文件中直接使用#include "XXX.cu"，我这里会出现编译错误（很可能是因为.cu中的核函数语法需要由nvcc编译，普通C++编译器无法识别），暂时没有解决，所以使用extern声明的方式。

4.在Qt响应源文件中添加CUDA的引用

Qt GUI设计如下图，点击“GPU计算”按钮进行CUDA计算，后面显示计算的时间：

源文件如下(包含用到的自定义函数和按钮响应函数)：

View Code

 1 //cudainqt.cpp源文件
 2   
 3   #include "cudainqt.h"
 4   #include <QProgressDialog>
 5   #include <time.h>
 6   
 7   //这里不要忘了加引用声明
 8   extern "C" void MatrixMultiplication_CUDA(const float* M,const float* N,float* P,int Width);
 9   
10   //构造函数...
11   //析构函数...
12   
// Fills the Width x Width matrix `a` (indexed as a[i*Width+j]) with
// pseudo-random values drawn from rand().
//
// Each element is a coarse term rand()/RAND_MAX in [0, 1] plus a fine term
// rand()/RAND_MAX^2 in [0, 1/RAND_MAX], so values lie in roughly [0, 1].
// Does nothing when Width <= 0.
void matgen(float* a,int Width)
{
    // Square RAND_MAX in floating point. The int product RAND_MAX*RAND_MAX
    // overflows on platforms where RAND_MAX is 2^31-1.
    const float randMax = (float)RAND_MAX;
    const float randMaxSquared = randMax*randMax;

    for (int i=0;i<Width;i++)
    {
        for (int j=0;j<Width;j++)
        {
            // Two separate statements fix the order of the rand() calls;
            // inside a single expression their order is unspecified.
            const float coarse = (float)rand()/randMax;
            const float fine = (float)rand()/randMaxSquared;
            a[i*Width+j]=coarse+fine;
        }
    }
}
25   
26   //矩阵乘法(CPU验证)
27   void MatrixMultiplication(const float* M,const float* N,float* P,int Width)
28   {
29       QProgressDialog progress("Progress", "Cancel", 0, 100);
30       int i,j,k;
31       for (i=0;i<Width;i++)
32       {
33           for (j=0;j<Width;j++)
34           {
35               double sum=0;
36               for (k=0;k<Width;k++)
37               {
38                   sum += M[i*Width+k]*N[k*Width+j];
39               }
40               P[i*Width+j]=sum;
41           }
42           if(0==i%5)
43               progress.setValue(100*i/(Width-1));
44       }
45   }
46   
47   void cudaInQt::OnButtonClicked_GPU()
48   { 
49       float *M,*N,*Pg;
50       int Width=1024; //1024×1024矩阵乘法
51       M=(float*)malloc(sizeof(float)*Width*Width);
52       N=(float*)malloc(sizeof(float)*Width*Width);
53       Pg=(float*)malloc(sizeof(float)*Width*Width); //保存GPU计算结果
54   
55       srand(0);
56   
57       matgen(M,Width); //产生矩阵M
58       matgen(N,Width); //产生矩阵N
59   
60       double timeStart,timeEnd; //定义时间，求时间差用
61       timeStart = clock();
62       MatrixMultiplication_CUDA(M,N,Pg,Width); //GPU上计算
63       timeEnd = clock();
64       ui.textEdit_GPU->setText(QString::number(timeEnd-timeStart)+"ms");
65  
66       free(M);
67       free(N);
68       free(Pg);
69   }
70   
71   void cudaInQt::OnButtonClicked_CPU()
72   {
73       float *M,*N,*Pc;
74       int Width=1024; //1024×1024矩阵乘法
75       M=(float*)malloc(sizeof(float)*Width*Width);
76       N=(float*)malloc(sizeof(float)*Width*Width);
77       Pc=(float*)malloc(sizeof(float)*Width*Width); //保存CPU计算结果
78   
79       srand(0);
80   
81       matgen(M,Width); //产生矩阵M
82       matgen(N,Width); //产生矩阵N
83   
84       double timeStart,timeEnd; //定义时间，求时间差用
85       timeStart = clock();
86       MatrixMultiplication(M,N,Pc,Width); //CPU上计算
87       timeEnd = clock();
88       ui.textEdit_CPU->setText(QString::number(timeEnd-timeStart)+"ms");
89  
90       free(M);
91       free(N);
92       free(Pc);
93   }

5.测试结果

测试时还开着其他的应用程序，另外本机配置很挫，结果仅供参考。看看吧，使用CUDA进行加速甩了传统方法几条街呢

后注：代码中有点问题，测试结果也不对，后来发现了，改过的结果见该文http://www.cnblogs.com/Romi/archive/2012/05/17/2506787.html

相关阅读:
线程的补充
 线程
 进程
 操作系统和进程
 socketserver模块实现并发和连接合法性验证
 socketserver实例化过程
 粘包现象和解决方法
 网络通信协议
 初探网络
 Python网络编程
原文地址：https://www.cnblogs.com/Romi/p/2492363.html