#include "cuda_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <string.h>

/* Dimensions of the demo matrix; it is stored row-major in a flat int array. */
enum { kRows = 3, kCols = 4 };

/*
 * Evaluate a CUDA runtime call and abort the program with a diagnostic
 * (file, line, CUDA error string) if it did not return cudaSuccess.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Fill a kRows x kCols row-major matrix so that every element holds its own
 * row index.
 *
 * Thread mapping: threadIdx.x walks the rows and threadIdx.y walks the
 * columns, each with a block-sized stride, so any block shape with at least
 * one thread in x and y covers the whole matrix and no thread writes outside
 * it. With the (kRows, kCols) block used by main(), each thread writes
 * exactly one element.
 *
 * d_int: device pointer to at least kRows * kCols ints.
 */
__global__ static void kernel(int *d_int) {
    for (int row = (int)threadIdx.x; row < kRows; row += (int)blockDim.x) {
        for (int col = (int)threadIdx.y; col < kCols; col += (int)blockDim.y) {
            d_int[row * kCols + col] = row;
        }
    }
}

/*
 * Allocate a kRows x kCols int matrix on host and device, let the kernel fill
 * it, copy it back and print it one row per line. Expected output:
 *   0 0 0 0
 *   1 1 1 1
 *   2 2 2 2
 * Returns 0 on success; exits with a failure status on any allocation or
 * CUDA error.
 */
int main() {
    const int gridsize = 1;
    const dim3 blocksize(kRows, kCols);
    const size_t bytes = sizeof(int) * kRows * kCols;

    int *h_int = (int *)malloc(bytes);
    if (h_int == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)bytes);
        return EXIT_FAILURE;
    }
    memset(h_int, 0, bytes);

    int *d_int = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_int, bytes));
    CUDA_CHECK(cudaMemset(d_int, 0, bytes));

    kernel<<<gridsize, blocksize>>>(d_int);
    /* A launch does not return a status; query it explicitly. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(h_int, d_int, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < kRows; i++) {
        for (int j = 0; j < kCols; j++) {
            printf("%d ", h_int[i * kCols + j]);
        }
        printf("\n");
    }
    printf("\n");

    CUDA_CHECK(cudaFree(d_int));
    free(h_int);

    /* Keep the console window open until the user presses a key. */
    getchar();
    return 0;
}
执行结果: