#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Number of elements in each vector. */
#define N 100

/*
 * Abort with a readable message when a CUDA runtime call fails.
 * Wrapping every call keeps an early failure from surfacing later as
 * unrelated-looking garbage output.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < N.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
 * global index is >= N do nothing, so the grid may be larger than N.
 * A, B and C must each point to at least N floats accessible from the device.
 */
__global__ void vecAdd(float* A, float* B, float* C) {
    /* Global index across all blocks, not just the index within one block. */
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

/*
 * Fills two host vectors with random integers in [0, 99], adds them on the
 * device, copies the result back and prints every "index:a+b=sum" triple.
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
 * error.
 */
int main() {
    size_t size = N * sizeof(float);

    /* Host buffers. */
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    /* Device buffers. */
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, size));
    CUDA_CHECK(cudaMalloc((void**)&d_B, size));
    CUDA_CHECK(cudaMalloc((void**)&d_C, size));

    srand((unsigned int)time(NULL));
    for (int i = 0; i < N; i++) {
        h_A[i] = (float)(rand() % 100);
        h_B[i] = (float)(rand() % 100);
    }

    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    /* Ceil-division so the grid covers all N elements. */
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    /* A launch does not return an error itself; query it explicitly. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also reports any fault raised while the kernel ran. */
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) {
        /* %5d rather than %5.0d: precision 0 prints no digits for index 0. */
        printf("%5d:%.0f+%.0f=%.0f ", i, h_A[i], h_B[i], h_C[i]);
    }

    free(h_A);
    free(h_B);
    free(h_C);

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}