Created on 2013-8-5
URL : http://blog.sina.com.cn/s/blog_a502f1a30101mi6t.html
@author: zhxfl
转载请说明出处
// Host-side snippet in two parts:
//   1. Enumerate every CUDA device and print its general, memory and
//      multiprocessor properties.
//   2. Build two random integer matrices A (n1 x m1) and B (n2 x m2, n2 == m1)
//      and multiply them with matrixMultiply().
// `base`, `large` and `matrixMultiply` are defined outside this snippet.
cudaDeviceProp prop;

int count = 0;
cudaError_t cudaStatus = cudaGetDeviceCount(&count);
if (cudaStatus != cudaSuccess) {
    // Without a valid device count nothing below is meaningful.
    fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
            cudaGetErrorString(cudaStatus));
    exit(EXIT_FAILURE);
}

for (int i = 0; i < count; i++) {
    cudaStatus = cudaGetDeviceProperties(&prop, i);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i,
                cudaGetErrorString(cudaStatus));
        exit(EXIT_FAILURE);
    }

    // NOTE(review): clockRate and deviceOverlap are marked deprecated in
    // recent CUDA toolkits -- confirm against the toolkit version in use.
    printf(" --- General Information for device %d ---\n", i);
    printf("Name: %s\n", prop.name);
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    printf("Clock rate: %d\n", prop.clockRate);
    printf("Device copy overlap: ");
    if (prop.deviceOverlap)
        printf("Enabled\n");
    else
        printf("Disabled\n");
    printf("Kernel execution timeout : ");
    if (prop.kernelExecTimeoutEnabled)
        printf("Enabled\n");
    else
        printf("Disabled\n");

    // The memory-size fields are size_t, so they are printed with %zu;
    // %ld is wrong wherever long is narrower than size_t.
    printf(" --- Memory Information for device %d ---\n", i);
    printf("Total global mem: %zu\n", prop.totalGlobalMem);
    printf("Total constant Mem: %zu\n", prop.totalConstMem);
    printf("Max mem pitch: %zu\n", prop.memPitch);
    printf("Texture Alignment: %zu\n", prop.textureAlignment);

    printf(" --- MP Information for device %d ---\n", i);
    printf("Multiprocessor count: %d\n", prop.multiProcessorCount);
    // sharedMemPerBlock / regsPerBlock are per-block limits, so the labels
    // say "per block" rather than "per mp".
    printf("Shared mem per block: %zu\n", prop.sharedMemPerBlock);
    printf("Registers per block: %d\n", prop.regsPerBlock);
    printf("Threads in warp: %d\n", prop.warpSize);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Max thread dimensions: (%d, %d, %d)\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1],
           prop.maxThreadsDim[2]);
    printf("Max grid dimensions: (%d, %d, %d)\n",
           prop.maxGridSize[0], prop.maxGridSize[1],
           prop.maxGridSize[2]);
    printf("\n");
}

// Each dimension is drawn from [base, 2 * base - 1]; B's row count is tied to
// A's column count so that the product A * B is defined.
int n1 = rand() % base + base;
int m1 = rand() % base + base;
int n2 = m1;
int m2 = rand() % base + base;
int *g1 = new int[n1 * m1];
int *g2 = new int[n2 * m2];

printf("matrix A[%3d %3d]\n", n1, m1);
for (int i = 0; i < n1 * m1; i++)
{
    g1[i] = rand() % large;
    //printf("%5d ", g1[i]);
    //if((i + 1) % m1 == 0)printf("\n");
}

printf("matrix B[%3d %3d]\n", n2, m2);
for (int i = 0; i < n2 * m2; i++)
{
    g2[i] = rand() % large;
    //printf("%5d ", g2[i]);
    //if((i + 1) % m2 == 0)printf("\n");
}

int *g;

g = matrixMultiply(g1, n1, m1, g2, n2, m2);

// The result is n1 x m2; element printing is left disabled as in the original.
printf("matrix C[%3d %3d]\n", n1, m2);
for (int i = 0; i < n1 * m2; i++)
{
    //printf("%5d ", g[i]);
    //if((i + 1) % m2 == 0) printf("\n");
}

// Release the host-side inputs allocated above with new[].
// NOTE(review): how `g` must be released depends on how matrixMultiply
// allocates it, which is not visible here -- confirm and free accordingly.
delete[] g1;
delete[] g2;
上面是 CUDA example 的代码。其中 maxThreadsPerBlock 是每个 block 内的最大线程数,maxGridSize 是 grid 在每一维上最多可容纳的 block 数。理论上讲,最大的并发量是 maxThreadsPerBlock * maxGridSize。下面是在我本机上的运行结果,可以看到 65535 * 1024 是我想要的答案。
下面来测试一下
1)崩溃:function <<<65535,1024>>>()
2)正常:function <<<65535,1>>>()
3)崩溃:function <<<65536,1>>>()
4)正常:function <<<1,1024>>>()
5)崩溃:function <<<1,1025>>>()
其中第一种情况的崩溃让人无法理解,其他情况都符合预期。对于这种申请资源失败的情况,目前还没有较好的对策,如果有我会及时补上。