共享内存实现大规模点积

项目打包下载

  1 /*
  2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 *
  4 * NVIDIA Corporation and its licensors retain all intellectual property and
  5 * proprietary rights in and to this software and related documentation.
  6 * Any use, reproduction, disclosure, or distribution of this software
  7 * and related documentation without an express license agreement from
  8 * NVIDIA Corporation is strictly prohibited.
  9 *
 10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 * associated with this source code for terms and conditions that govern
 12 * your use of this NVIDIA software.
 13 *
 14 */
 15 
 16 
 17 #include "../common/book.h"
 18 #include "cuda.h"
 19 #include "cuda_runtime.h"
 20 #include "device_launch_parameters.h"
 21 #include "device_functions.h"
 // Evaluates to the smaller of a and b. Both the arguments and the whole
 // expansion are parenthesized so that operator-bearing arguments
 // (e.g. imin(x & 3, y)) and surrounding expressions parse as intended.
 // Note: as with any function-like macro, each argument may be evaluated twice.
 #define imin(a,b) ((a) < (b) ? (a) : (b))

 // Number of elements in each input vector.
 const int N = 33 * 1024;
 // Number of threads launched in every block.
 const int threadsPerBlock = 256;
 // Number of blocks launched: enough blocks to give every element its own
 // thread (ceiling division), but never more than 32.
 const int blocksPerGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
 27 
 /*
  * dot: produces one partial dot-product sum per block.
  *
  * Every thread starts at its global index and walks the inputs in steps of
  * blockDim.x * gridDim.x (the total number of launched threads), accumulating
  * a[i] * b[i] for each index below N. The per-thread sums are stored in a
  * block-local shared buffer and then folded together by repeatedly adding the
  * upper half of the active range onto the lower half. Thread 0 finally writes
  * the block total to c[blockIdx.x].
  *
  * Requirements visible from the code:
  *   - the shared buffer holds threadsPerBlock floats, so blockDim.x must not
  *     exceed threadsPerBlock;
  *   - the halving loop only covers every slot when blockDim.x is a power of 2;
  *   - c must have room for one float per launched block.
  */
 __global__ void dot(float *a, float *b, float *c) {
     // One slot per thread of the block, shared by all threads in that block.
     __shared__ float cache[threadsPerBlock];

     // This thread's slot in the shared buffer.
     int slot = threadIdx.x;

     // Running sum for this thread; it carries over from one loop pass to the
     // next, so each pass adds onto what previous passes accumulated.
     float partial = 0;
     for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
          idx < N;
          idx += blockDim.x * gridDim.x) {
         partial += a[idx] * b[idx];
     }

     // Publish the per-thread sum to shared memory.
     cache[slot] = partial;

     // Wait until every thread in the block has stored its sum before any
     // thread reads another thread's slot.
     __syncthreads();

     // Pairwise reduction: in each pass the first `half` threads add in the
     // value located `half` slots further up, then the range is halved.
     // blockDim.x must be a power of 2 for this to visit every slot.
     for (int half = blockDim.x / 2; half != 0; half /= 2) {
         if (slot < half) {
             cache[slot] += cache[slot + half];
         }
         // Kept outside the conditional so that all threads reach the barrier;
         // a pass must be complete before the next one starts.
         __syncthreads();
     }

     // The block total now sits in cache[0]; store it at this block's index.
     if (slot == 0) {
         c[blockIdx.x] = cache[0];
     }
 }
 82 
 83 
 /*
  * Host driver for the dot-product example.
  *
  * Fills two N-element vectors with a[i] = i and b[i] = 2*i, copies them to the
  * GPU, launches `dot` with blocksPerGrid blocks of threadsPerBlock threads,
  * copies the per-block partial sums back, adds them up on the host and prints
  * the result next to the closed-form value 2 * sum(i^2, i = 0..N-1).
  *
  * Returns 0 on success and EXIT_FAILURE when host allocation fails. CUDA calls
  * are wrapped in HANDLE_ERROR (provided by book.h).
  */
 int main(void) {
     float   *a, *b, c, *partial_c;
     float   *dev_a, *dev_b, *dev_partial_c;

     // allocate memory on the cpu side
     a = (float*)malloc(N * sizeof(float));
     b = (float*)malloc(N * sizeof(float));
     partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
     if (a == NULL || b == NULL || partial_c == NULL) {
         // free(NULL) is a no-op, so releasing all three is safe here
         fprintf(stderr, "host memory allocation failed\n");
         free(a);
         free(b);
         free(partial_c);
         return EXIT_FAILURE;
     }

     // allocate the memory on the GPU
     HANDLE_ERROR(cudaMalloc((void**)&dev_a,
         N * sizeof(float)));
     HANDLE_ERROR(cudaMalloc((void**)&dev_b,
         N * sizeof(float)));
     HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
         blocksPerGrid * sizeof(float)));

     // fill in the host memory with data
     for (int i = 0; i < N; i++) {
         a[i] = i;
         b[i] = i * 2;
     }

     // copy the arrays 'a' and 'b' to the GPU
     HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(float),
         cudaMemcpyHostToDevice));
     HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(float),
         cudaMemcpyHostToDevice));

     dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);
     // A kernel launch does not return a status itself; launch failures
     // (e.g. an invalid configuration) are reported by cudaGetLastError().
     HANDLE_ERROR(cudaGetLastError());

     // copy the per-block partial sums back from the GPU to the CPU
     HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
         blocksPerGrid * sizeof(float),
         cudaMemcpyDeviceToHost));

     // Finish the summation on the host: adding blocksPerGrid values is too
     // little work to be worth a GPU pass in which most resources would sit idle.
     c = 0;
     for (int i = 0; i < blocksPerGrid; i++) {
         c += partial_c[i];
     }

     // Closed form for 0^2 + 1^2 + ... + x^2; arguments are parenthesized so
     // the macro is safe for expression arguments.
 #define sum_squares(x)  ((x)*((x)+1)*(2*(x)+1)/6)
     printf("Does GPU value %.6g = %.6g?\n", c, 2 * sum_squares((float)(N - 1)));

     // free memory on the gpu side
     HANDLE_ERROR(cudaFree(dev_a));
     HANDLE_ERROR(cudaFree(dev_b));
     HANDLE_ERROR(cudaFree(dev_partial_c));

     // free memory on the cpu side
     free(a);
     free(b);
     free(partial_c);

     return 0;
 }

相关阅读:
OS X进程管理之launchctl
varnish-4.x 之varnishlog,varnishstat,varnishtop,varnishhist
Java ClassLoader详解（转载）
java线程中断[interrupt()函数] (转载)
Struts2实例详解（转载）
java 重定向和转发（转载）
Java中Class.forName()的作用（转载）
JSP页面的编码设置（转载）
Struts2 实例（转载）
Spring AOP（转载）
原文地址：https://www.cnblogs.com/liangliangdetianxia/p/3986133.html