CUDA 例程

Scalar add (one block, one thread), followed by a Thrust host_vector/device_vector demo

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <iostream> 
// Integer add kernel: every block processes exactly one element, the one
// whose position equals the block's own blockIdx.x. Threads inside a block
// are not distinguished, so the kernel is meant to be launched with one
// thread per block and one block per element.
//   a, b : device pointers to the input values
//   c    : device pointer receiving a[i] + b[i]
__global__ void add(int *a, int *b, int *c)
{
    const unsigned int slot = blockIdx.x; // block index doubles as element index
    const int sum = a[slot] + b[slot];
    c[slot] = sum;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Two small demos in one program:
//   1. add two scalars on the device with a 1-block / 1-thread launch of add()
//      and print the result;
//   2. create, resize and print a thrust::host_vector, copy it into a
//      thrust::device_vector, modify and print that.
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
int main(void)
{
    int a = 2, b = 7, c = 0;
    int *da = 0, *db = 0, *dc = 0;        // device copies of a, b, c
    const size_t size = 1 * sizeof(int);  // each buffer holds a single int

    // The && chain stops at the first failing call, so later steps never run
    // on buffers that were not allocated or filled.
    bool ok = cudaOk(cudaMalloc((void**)&da, size), "cudaMalloc(da)")
           && cudaOk(cudaMalloc((void**)&db, size), "cudaMalloc(db)")
           && cudaOk(cudaMalloc((void**)&dc, size), "cudaMalloc(dc)")
           && cudaOk(cudaMemcpy(da, &a, size, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
           && cudaOk(cudaMemcpy(db, &b, size, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    if (ok) {
        add<<<1,1>>>(da, db, dc);
        // A launch does not return a status; configuration errors are picked
        // up with cudaGetLastError(), execution errors by the blocking copy.
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaMemcpy(&c, dc, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }
    if (ok)
        std::cout << c << std::endl;

    // Pointers that were never allocated are still 0, which cudaFree accepts.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    if (!ok)
        return 1;

    std::cout<<"hell";
    // H has storage for 4 integers
    thrust::host_vector<int> H(4);
    // initialize individual elements
    H[0] = 14; H[1] = 20; H[2] = 38; H[3] = 46;
    // H.size() returns the size of vector H
    std::cout << "H has size " << H.size() << std::endl;
    // print contents of H
    const int hCount = static_cast<int>(H.size());
    for (int i = 0; i < hCount; i++)
        std::cout << "H[" << i << "] = " << H[i] << std::endl;
    // resize H
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;
    // Copy host_vector H to device_vector D
    thrust::device_vector<int> D = H;
    // elements of D can be modified
    D[0] = 99; D[1] = 88;
    // print contents of D
    const int dCount = static_cast<int>(D.size());
    for (int i = 0; i < dCount; i++)
        std::cout << "D[" << i << "] = " << D[i] << std::endl;
    // H and D are automatically deleted when the function returns
    return 0;
}

Vector add with one block per element (or, alternatively, one thread per element)

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <iostream> 
 
// Number of elements in each vector.
const int N = 512;

// Vector add kernel, one element per block: the block with index i computes
// c[i] = a[i] + b[i]. All three pointers are device pointers.
// To use one thread per element instead, index with threadIdx.x:
//   c[threadIdx.x]=a[threadIdx.x]+b[threadIdx.x];
__global__ void add(int *a, int *b, int *c)
{
    const unsigned int elem = blockIdx.x;
    const int lhs = a[elem];
    const int rhs = b[elem];
    c[elem] = lhs + rhs;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Adds two N-element integer vectors on the device using N blocks of one
// thread each, then prints the first 20 results.
// Returns 0 on success, 1 if a host allocation, a CUDA runtime call or the
// kernel launch fails.
int main(void)
{
    const size_t size = N * sizeof(int);  // bytes per vector
    int *da = 0, *db = 0, *dc = 0;        // device vectors

    // Host vectors, zero-filled except for a few marker values.
    int *a = (int *) malloc(size);
    int *b = (int *) malloc(size);
    int *c = (int *) malloc(size);
    bool ok = (a != 0 && b != 0 && c != 0);
    if (!ok) {
        std::cerr << "host allocation failed" << std::endl;
    } else {
        memset(a, 0, size); // rand_ints(a,N);
        a[0] = 10;
        a[3] = 3;
        memset(b, 0, size); // rand_ints(b,N);
        b[0] = 2;
        b[4] = 32;
        memset(c, 0, size); // rand_ints(c,N);
    }

    // The && chain stops at the first failure so later steps never touch
    // buffers that were not allocated or filled.
    ok = ok
      && cudaOk(cudaMalloc((void**)&da, size), "cudaMalloc(da)")
      && cudaOk(cudaMalloc((void**)&db, size), "cudaMalloc(db)")
      && cudaOk(cudaMalloc((void**)&dc, size), "cudaMalloc(dc)")
      && cudaOk(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
      && cudaOk(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    if (ok) {
        // N blocks of 1 thread. The alternative is add<<<1,N>>>(da,db,dc)
        // (N threads in one block) with the kernel indexing by threadIdx.x.
        add<<<N,1>>>(da, db, dc);
        // A launch does not return a status; configuration errors are picked
        // up with cudaGetLastError(), execution errors by the blocking copy.
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    if (ok) {
        const int shown = N < 20 ? N : 20; // never print past the end of c
        for (int i = 0; i < shown; i++)
            std::cout << c[i] << std::endl;
    }

    // No __syncthreads()/cudaDeviceSynchronize() is needed before reading c:
    // the blocking cudaMemcpy above has already returned the results.

    // free(0) and cudaFree(0) are no-ops, so cleanup is safe on every path.
    free(a);
    free(b);
    free(c);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return ok ? 0 : 1;
}

Vector add combining blocks and threads (N/M blocks of M threads each)
#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <iostream> 

/*

#define N (2048*2048)
#define M 512 // THREADS_PER_BLOCK
…
add<<<N/M, M>>>(d_a, d_b, d_c);

N /M      blocks used
M   threads / block
*/

 
const int N = 2048 * 2048; // number of elements in each vector
const int M = 512;         // threads per block

// Vector add kernel for a 1D grid of 1D blocks: each thread computes one
// element, c[index] = a[index] + b[index], where index is the thread's flat
// position in the grid.
//   a, b : device pointers to the inputs (at least n elements each)
//   c    : device pointer to the output (at least n elements)
//   n    : number of valid elements
// Threads whose index falls at or beyond n do nothing, so the grid may be
// rounded up to a whole number of blocks without writing out of bounds.
__global__ void add(int *a, int *b, int *c, int n)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    // Every access to a, b and c must stay behind this guard; an unguarded
    // write here would make the bounds check pointless.
    if (index < n)
        c[index] = a[index] + b[index];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Adds two N-element integer vectors on the device using ceil(N/M) blocks of
// M threads each, then prints the first 20 results.
// Returns 0 on success, 1 if a host allocation, a CUDA runtime call or the
// kernel launch fails.
int main(void)
{
    const size_t size = N * sizeof(int);  // bytes per vector
    int *da = 0, *db = 0, *dc = 0;        // device vectors

    // Host vectors, zero-filled except for a few marker values.
    int *a = (int *) malloc(size);
    int *b = (int *) malloc(size);
    int *c = (int *) malloc(size);
    bool ok = (a != 0 && b != 0 && c != 0);
    if (!ok) {
        std::cerr << "host allocation failed" << std::endl;
    } else {
        memset(a, 0, size); // rand_ints(a,N);
        a[0] = 10;
        a[3] = 3;
        memset(b, 0, size); // rand_ints(b,N);
        b[0] = 2;
        b[4] = 32;
        memset(c, 0, size); // rand_ints(c,N);
    }

    // The && chain stops at the first failure so later steps never touch
    // buffers that were not allocated or filled.
    ok = ok
      && cudaOk(cudaMalloc((void**)&da, size), "cudaMalloc(da)")
      && cudaOk(cudaMalloc((void**)&db, size), "cudaMalloc(db)")
      && cudaOk(cudaMalloc((void**)&dc, size), "cudaMalloc(dc)")
      && cudaOk(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
      && cudaOk(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    if (ok) {
        // Round the block count up so every element is covered even when N
        // is not a multiple of M; the kernel ignores indices >= N.
        add<<<(N+M-1)/M,M>>>(da, db, dc, N);
        // A launch does not return a status; configuration errors are picked
        // up with cudaGetLastError(), execution errors by the blocking copy.
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    if (ok) {
        const int shown = N < 20 ? N : 20; // never print past the end of c
        for (int i = 0; i < shown; i++)
            std::cout << c[i] << std::endl;
    }

    // No __syncthreads()/cudaDeviceSynchronize() is needed here: the blocking
    // cudaMemcpy above has already returned the results.

    // free(0) and cudaFree(0) are no-ops, so cleanup is safe on every path.
    free(a);
    free(b);
    free(c);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return ok ? 0 : 1;
}

相关阅读:
JS-排序详解-选择排序
 JS-排序详解-快速排序
 JS-排序详解-冒泡排序
 正则表达式入门
 JS-最全的创建对象的方式
 用JS实现回文数的精准辨别！！！
基本包装类型
 引用类型之Function类型
 引用类型之Array类型
 Object类型
原文地址：https://www.cnblogs.com/huashiyiqike/p/3869093.html