0_Simple__MultiGPU

使用多台 GPU 进行计算

▶ 源代码。使用不同的流来控制不同 GPU 上的运算任务。

  1 #include <stdio.h>
  2 #include <timer.h>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include <helper_functions.h>
  6 #include <helper_cuda.h>
  7 #include "simpleMultiGPU.h"
  8 
  // Upper bound on the number of GPUs used; it sizes the plan array in main()
  // and the detected device count is clamped to it.
  const int MAX_GPU_COUNT = 32;
  // Total number of float elements to sum, divided among the GPUs in main().
  const int DATA_N = 32 * 1024 * 1024;
 11 
 /*
  * Partial-sum kernel: every launched thread adds up the input elements it
  * visits and stores its own subtotal. No cross-thread reduction is done here;
  * the caller is expected to add the per-thread subtotals together.
  *
  * d_Result  output array, written at index [global thread id], so it needs
  *           gridDim.x * blockDim.x entries
  * d_Input   input array, read at indices [0, N)
  * N         number of input elements
  *
  * Indexing uses only the x dimension of the grid and block.
  */
 __global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
 {
     // Total number of threads in the launch; also the step between the
     // elements visited by one thread.
     const int stride = gridDim.x * blockDim.x;
     const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

     float partial = 0;
     int pos = globalIdx;
     while (pos < N)
     {
         partial += d_Input[pos];
         pos += stride;
     }

     // Threads that visited no element (globalIdx >= N) still store 0.
     d_Result[globalIdx] = partial;
 }
 23 
 /*
  * Sums DATA_N random floats on all visible GPUs (at most MAX_GPU_COUNT), one
  * stream per device, and compares the result with a CPU reference sum.
  *
  * Flow per GPU i:
  *   plan[i].h_data -> plan[i].d_data -> reduceKernel -> plan[i].d_sum
  *   -> plan[i].h_sum_from_device -> plan[i].h_sum -> sumGPU
  *
  * Every CUDA runtime call is wrapped in checkCudaErrors (helper_cuda.h), which
  * reports the failing call and terminates the process.
  *
  * argc/argv are unused.
  * Returns 0 after the comparison has been printed (whether it passed or not),
  * or 1 if no CUDA device is available.
  */
 int main(int argc, char **argv)
 {
     printf("\n\tStart.\n");

     const int BLOCK_N = 32, THREAD_N = 256;
     // One partial sum per launched thread (see reduceKernel).
     const int ACCUM_N = BLOCK_N * THREAD_N;
     int i, j, GPU_N = 0;
     float sumGPU;
     TGPUplan plan[MAX_GPU_COUNT];

     checkCudaErrors(cudaGetDeviceCount(&GPU_N));
     GPU_N = MIN(GPU_N, MAX_GPU_COUNT);
     printf("\n\tDevice count: %i\n", GPU_N);

     // GPU_N is used as a divisor below, so a zero device count must stop here.
     if (GPU_N < 1)
     {
         fprintf(stderr, "\n\tNo CUDA device available.\n");
         return 1;
     }

     // Split the input evenly between the GPUs.
     for (i = 0; i < GPU_N; i++)
         plan[i].dataN = DATA_N / GPU_N;

     // Hand out the remainder, one extra element to each of the first GPUs.
     for (i = 0; i < DATA_N % GPU_N; i++)
         plan[i].dataN++;

     // Create per-device resources and fill h_data with random values in [0, 1].
     for (i = 0; i < GPU_N; i++)
     {
         checkCudaErrors(cudaSetDevice(i));
         checkCudaErrors(cudaStreamCreate(&plan[i].stream));
         checkCudaErrors(cudaMalloc((void **)&plan[i].d_data, plan[i].dataN * sizeof(float)));
         checkCudaErrors(cudaMalloc((void **)&plan[i].d_sum, ACCUM_N * sizeof(float)));
         // Page-locked host buffers, used as source/destination of the async copies below.
         checkCudaErrors(cudaMallocHost((void **)&plan[i].h_sum_from_device, ACCUM_N * sizeof(float)));
         checkCudaErrors(cudaMallocHost((void **)&plan[i].h_data, plan[i].dataN * sizeof(float)));

         for (j = 0; j < plan[i].dataN; j++)
             plan[i].h_data[j] = (float)rand() / (float)RAND_MAX;
     }

     StartTimer(); // start timing (timer.h)

     // Queue upload, kernel and download on each GPU's own stream. Nothing here
     // waits for completion, so the loop can move on to the next device.
     for (i = 0; i < GPU_N; i++)
     {
         checkCudaErrors(cudaSetDevice(i));
         checkCudaErrors(cudaMemcpyAsync(plan[i].d_data, plan[i].h_data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));
         reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_sum, plan[i].d_data, plan[i].dataN);
         // A kernel launch returns no status; query it explicitly.
         checkCudaErrors(cudaGetLastError());
         checkCudaErrors(cudaMemcpyAsync(plan[i].h_sum_from_device, plan[i].d_sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
     }

     // Wait for each GPU, then fold its ACCUM_N partial sums into plan[i].h_sum.
     for (i = 0; i < GPU_N; i++)
     {
         checkCudaErrors(cudaSetDevice(i));
         checkCudaErrors(cudaStreamSynchronize(plan[i].stream));

         for (j = 0, plan[i].h_sum = 0.0f; j < ACCUM_N; j++)
             plan[i].h_sum += plan[i].h_sum_from_device[j];
     }
     // Final reduction across GPUs on the CPU.
     for (i = 0, sumGPU = 0.0f; i < GPU_N; i++)
         sumGPU += plan[i].h_sum;
     printf("\n\tGPU Processing time: %f (ms)\n", GetTimer());

     // CPU reference: sum the same host data in double precision.
     double sumCPU = 0;
     for (i = 0; i < GPU_N; i++)
     {
         for (j = 0; j < plan[i].dataN; j++)
             sumCPU += plan[i].h_data[j];
     }

     // Compare using the relative difference.
     double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
     printf("\n\tGPU sum: %f\n\tCPU sum: %f\n", sumGPU, sumCPU);
     printf("\n\tRelative difference: %E, %s\n", diff, (diff < 1e-5) ? "Passed" : "Failed");

     // Release the per-device resources.
     for (i = 0; i < GPU_N; i++)
     {
         checkCudaErrors(cudaSetDevice(i));
         checkCudaErrors(cudaFreeHost(plan[i].h_data));
         checkCudaErrors(cudaFreeHost(plan[i].h_sum_from_device));
         checkCudaErrors(cudaFree(plan[i].d_sum));
         checkCudaErrors(cudaFree(plan[i].d_data));
         checkCudaErrors(cudaStreamDestroy(plan[i].stream));
     }

     getchar(); // wait for a key press before exiting
     return 0;
 }

▶ 输出结果

    Start.

    Device count: 1

    GPU Processing time: 13.726471 (ms)

    GPU sum: 16779778.000000
    CPU sum: 16779776.312309
    Relative difference: 1.005789E-07, Passed

▶ 涨姿势

● 在不同的设备上执行相关函数（包括 cudaFree 等主机端函数）之前，要先调用 cudaSetDevice() 切换到对应的设备。

相关阅读:
vue项目搭建
 iview在ie9及以上的兼容问题解决方案
 中山大学校队内部选拔赛试题试题2【New Year Gift】--------2015年2月8日
 中山大学校队选拔赛第二试题试题3【Compressed suffix array】-------2015年2月8日
 ZOJ2812------2015年2月4日
 C++STL泛型编程基础知识讲解--------2015年2月3日
 中山大学校队选拔赛第一章题4【简单数迷Simple Kakuro】-------2015年1月28日
 UVALive
UVA11375【火柴拼数Matches】-------2015年1月27日
 递推关系的运用加简单DP【UVA11137Ingenuous Cubrency】-------2015年1月27日
原文地址：https://www.cnblogs.com/cuancuancuanhao/p/7884265.html