CUDA 中当元素个数超过线程个数时的处理案例

当向量元素超过线程个数时的情况

向量元素个数（33 * 1024 = 33792）约为线程总数（128 * 128 = 16384）的 2.06 倍，因此每个线程需要循环处理 2 到 3 个元素

 1 /*
 2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 3 *
 4 * NVIDIA Corporation and its licensors retain all intellectual property and
 5 * proprietary rights in and to this software and related documentation.
 6 * Any use, reproduction, disclosure, or distribution of this software
 7 * and related documentation without an express license agreement from
 8 * NVIDIA Corporation is strictly prohibited.
 9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 */
15 
16 
17 #include "../common/book.h"
18 #include "cuda.h"
19 #include "cuda_runtime.h"
20 #include "device_launch_parameters.h"
21 
22 #define N   (33 * 1024)
23 
/*
 * Element-wise integer vector addition: c[i] = a[i] + b[i] for i in [0, N).
 *
 * Written as a grid-stride loop: each thread starts at its global index and
 * advances by the total number of launched threads (blockDim.x * gridDim.x),
 * so every element is covered even when N exceeds the thread count.
 * Indexing uses only the x dimension of the block and grid.
 *
 *   a, b  input arrays, read at indices below N
 *   c     output array, written at indices below N
 */
__global__ void add(int *a, int *b, int *c) {
    // Total threads in the launch; this is the distance between two
    // consecutive elements handled by the same thread.
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N; idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}
31 
/*
 * Host driver for the vector-add example.
 *
 * Allocates three N-element int arrays on the host and on the device, fills
 * the inputs with a[i] = i and b[i] = 2 * i, runs the add kernel with fewer
 * threads than elements, copies the result back, and checks every element
 * against the sum computed on the CPU.
 *
 * Returns 0 when every element matches, 1 when host allocation fails or any
 * element mismatches. CUDA API failures are passed to HANDLE_ERROR
 * (declared in ../common/book.h; presumably it reports the error and
 * terminates -- confirm against that header).
 */
int main(void) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the CPU
    a = (int*)malloc(N * sizeof(int));
    b = (int*)malloc(N * sizeof(int));
    c = (int*)malloc(N * sizeof(int));
    if (!a || !b || !c) {
        printf("Host memory allocation failed\n");
        // free() accepts null pointers, so partial allocations are released too
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c, N * sizeof(int)));

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(int),
        cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(int),
        cudaMemcpyHostToDevice));

    /*
     * The vector has more elements than there are threads:
     * N = 33 * 1024 elements versus 128 * 128 threads, roughly 2.06x,
     * so each thread processes two or three elements inside the kernel loop.
     */
    add<<<128, 128>>>(dev_a, dev_b, dev_c);
    // A kernel launch returns no status itself; launch-configuration errors
    // are only visible through cudaGetLastError().
    HANDLE_ERROR(cudaGetLastError());

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(c, dev_c, N * sizeof(int),
        cudaMemcpyDeviceToHost));

    // verify that the GPU did the work we requested
    bool success = true;
    for (int i = 0; i < N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            printf("Error:  %d + %d != %d\n", a[i], b[i], c[i]);
            success = false;
        }
    }
    if (success)    printf("We did it!\n");

    // free the memory we allocated on the GPU
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));

    // free the memory we allocated on the CPU
    free(a);
    free(b);
    free(c);

    // report verification failure through the exit status as well
    return success ? 0 : 1;
}

相关阅读:
rest简单实例
 Rest简介
 java视频
 j2ee开发中的“java容器”和“web容器”有什么区别？
用Java实现自己的ArrayList
Java中关于枚举的7种用法
 Java多线程实现自然同步（内含演示案例）
Java实现简单的文件复制
 Java之自动拆装箱
 写一个SingleTon，（饿最终、懒同步）
原文地址：https://www.cnblogs.com/liangliangdetianxia/p/3985040.html