CUDA学习4 线程协作

在CUDA学习3 Max pooling （python c++ cuda）中有一个2D grid的CUDA实现，用时141ms。

以下为2D grid 2D blocks实现，耗时进一步降低到16ms。

    // Block coordinates inside the 2D grid, thread coordinates inside the 2D block.
    int x = blockIdx.x, y = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    // Flat index of this thread across the whole launch:
    // (row-major block number) * (threads per block) + (row-major thread number).
    int threadsPerBlock = blockDim.y * blockDim.x;
    int index2 = (y * gridDim.x + x) * threadsPerBlock + ty * blockDim.x + tx;

线程索引计算方式如上。此处需要并行化的循环为（N, M, PH, PH），因此配置如下（PH*PH=144，未超出本机显卡 max threads per block=1024 的限制）。

// Grid dimensions: gridDim.x = M, gridDim.y = N, i.e. M * N blocks in total.
dim3    grid(M, N);
// Block dimensions: PH x PH threads; each thread produces one pooled output element.
dim3    threads(PH, PH);

下面是完整代码。

#include <windows.h>
#include <iostream>


// Max pooling over square feature maps using non-overlapping square windows.
//
// Expected launch layout: a 2D grid with one block per feature map
// (map number = blockIdx.y * gridDim.x + blockIdx.x) and a 2D block with one
// thread per pooled output element (threadIdx.y = output row, threadIdx.x =
// output column).
//
//   bottom_data   : input; feature maps of height * height floats stored back to back
//   height        : side length of each (square) input map
//   pooled_height : side length of the pooling window, which is also its stride
//   top_data      : output; blockDim.y * blockDim.x floats per map, row-major
//
// Windows that cross the bottom/right edge of a map are clipped to it. A thread
// whose window contains no input element writes the placeholder -10000.0f.
// NaN inputs are skipped by fmaxf unless every element of the window is NaN.
__global__ void MaxPool2d(float* bottom_data, const int height, const int pooled_height, float* top_data)
{
    // size_t keeps the flat offsets from overflowing int on large inputs.
    const size_t map = (size_t)blockIdx.y * gridDim.x + blockIdx.x;
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Top-left input element of the window handled by this thread.
    const int row0 = ty * pooled_height;
    const int col0 = tx * pooled_height;

    const size_t out = map * blockDim.y * blockDim.x + (size_t)ty * blockDim.x + tx;
    const size_t base = map * height * height + (size_t)row0 * height + col0;

    float best = -10000.0f;  // placeholder, only kept when the window is empty
    // Seed the running maximum with a real element so that windows whose values
    // are all below the placeholder still yield their true maximum.
    if (pooled_height > 0 && row0 < height && col0 < height)
        best = bottom_data[base];

    for (int u = 0; u < pooled_height && row0 + u < height; ++u)
        for (int v = 0; v < pooled_height && col0 + v < height; ++v)
            best = fmaxf(best, bottom_data[base + (size_t)u * height + v]);

    top_data[out] = best;
}

// Prints which CUDA runtime call failed and why, then terminates the program.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        exit(1);
    }
}

// Host driver: fills N*M maps of H*W floats with a ramp, runs MaxPool2d with a
// (M, N) grid of (PH, PH) blocks, prints the elapsed time and the first ten
// pooled values.
int main()
{
    const int N = 500, M = 100, H = 24, W = 24, D = 2;
    // Pooled side length, rounded up so a partial window at the edge gets an output.
    const int PH = (H + D - 1) / D;

    // Element counts and byte sizes are kept separate: new[] takes elements,
    // the CUDA calls take bytes.
    const int image_count = N*M*H*W;
    const int out_count = N*M*PH*PH;
    const size_t image_size = (size_t)image_count*sizeof(float);
    const size_t out_size = (size_t)out_count*sizeof(float);

    const float mul_by = 0.01f;
    float *input, *output, *dev_output, *dev_input;
    input = new float[image_count];
    output = new float[out_count];
    for (int i = 0; i < image_count; i++)
        input[i] = i*mul_by;

    checkCuda(cudaMalloc((void**)&dev_output, out_size), "cudaMalloc(dev_output)");
    checkCuda(cudaMalloc((void**)&dev_input, image_size), "cudaMalloc(dev_input)");
    checkCuda(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    dim3    grid(M, N);
    dim3    threads(PH, PH);

    // The timed interval covers the kernel plus the device-to-host copy. The
    // launch itself returns immediately; the blocking cudaMemcpy is what waits
    // for the kernel. GetTickCount has a coarse resolution, so the printed
    // value is only approximate.
    DWORD start_time = GetTickCount();
    MaxPool2d<<<grid, threads>>>(dev_input, H, D, dev_output);
    checkCuda(cudaGetLastError(), "MaxPool2d launch");
    checkCuda(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
    DWORD end_time = GetTickCount();

    std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
    for (int i = 0; i < 10; i++)
        std::cout << output[i] << std::endl;

    checkCuda(cudaFree(dev_input), "cudaFree(dev_input)");
    checkCuda(cudaFree(dev_output), "cudaFree(dev_output)");
    delete[] output;
    delete[] input;
    system("pause");
    return 0;
}

/*
Cost: 16ms.
0.25
0.27
0.29
0.31
0.33
0.35
0.37
0.39
0.41
0.43
*/

以下是采用3D grid 3D blocks的错误实现：同一个池化窗口内的多个线程在没有同步或原子操作的情况下同时读写同一个输出元素，因此每次比较大小时，都是和初始值-1000.0在比较，最终结果取决于最后写入的线程，而不一定是窗口内的最大值。

#include <windows.h>
#include <iostream>

// Attempted max pooling with one thread per INPUT element of each window.
//
// Launch layout used by main in this listing: grid (PH, M, N), block (D, D, PH).
//   blockIdx.x        -> output row
//   threadIdx.z (tz)  -> output column
//   threadIdx.y (ty)  -> row offset inside the pooling window
//   threadIdx.x (tx)  -> column offset inside the pooling window
//
// WARNING: this kernel contains a data race and does not reliably compute the
// maximum. index2 does not depend on tx or ty, so every thread of a window
// targets the same top_data element. Thread (tx==0, ty==0) stores the initial
// value -1000.0 there, and the other threads read that element without any
// barrier in between; the read-compare-write that follows is not atomic
// either. Which value ends up in top_data therefore depends on thread timing.
__global__ void MaxPool2d(float* bottom_data, const int height, const int pooled_height, float* top_data)
{
    
    int x = blockIdx.x;
    int y = blockIdx.y;
    int z = blockIdx.z;
    int dx = gridDim.x;
    int dy = gridDim.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int tz = threadIdx.z;
    // dtx and dty are read but never used below.
    int dtx = blockDim.x;
    int dty = blockDim.y;
    int dtz = blockDim.z;

    // Output offset: ((z*dy + y)*dx + x)*dtz + tz, shared by all (tx, ty) of a window.
    int index2 = z*dy*dx*dtz + y*dx*dtz + x*dtz + tz;
    // Input offset: map (z*dy + y), row x*pooled_height + ty, column tz*pooled_height + tx.
    int index = z*dy*height*height + y*height*height + x*pooled_height*height + tz*pooled_height + ty*height + tx;
    // Only one thread per window writes the initial value; nothing orders this
    // store before the reads performed by the other threads of the window.
    if (tx==0 && ty==0)
        *(top_data + index2) = -1000.0;
    // Skip input positions that fall past the bottom or right edge of the map.
    if (ty<height - pooled_height*x)
        if (tx<height - pooled_height*tz)
            // Unsynchronized read-compare-write on a location shared by the window's threads.
            if (*(bottom_data + index)>*(top_data + index2))
                *(top_data + index2) = *(bottom_data + index);
    // A barrier at this point comes after every read and write above, so it
    // could not order the initial store before the comparisons.
    //__syncthreads();
}

// Prints which CUDA runtime call failed and why, then terminates the program.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        exit(1);
    }
}

// Host driver: fills N*M maps of H*W floats with a descending ramp, runs
// MaxPool2d with a (PH, M, N) grid of (D, D, PH) blocks, prints the elapsed
// time and the first ten output values.
int main()
{
    const int N = 500, M = 100, H = 24, W = 24, D = 2;
    // Pooled side length, rounded up so a partial window at the edge gets an output.
    const int PH = (H + D - 1) / D;

    // Element counts and byte sizes are kept separate: new[] takes elements,
    // the CUDA calls take bytes.
    const int image_count = N*M*H*W;
    const int out_count = N*M*PH*PH;
    const size_t image_size = (size_t)image_count*sizeof(float);
    const size_t out_size = (size_t)out_count*sizeof(float);

    const float mul_by = -0.01f;
    float *input, *output, *dev_output, *dev_input;
    input = new float[image_count];
    output = new float[out_count];
    for (int i = 0; i < image_count; i++)
        input[i] = i*mul_by;

    checkCuda(cudaMalloc((void**)&dev_output, out_size), "cudaMalloc(dev_output)");
    checkCuda(cudaMalloc((void**)&dev_input, image_size), "cudaMalloc(dev_input)");
    checkCuda(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    // grid.x = output row, grid.y / grid.z select the map;
    // block.x / block.y = position inside the window, block.z = output column.
    dim3    grid(PH, M, N);
    dim3    threads(D, D, PH);

    // The timed interval covers the kernel plus the device-to-host copy. The
    // launch itself returns immediately; the blocking cudaMemcpy is what waits
    // for the kernel. GetTickCount has a coarse resolution, so the printed
    // value is only approximate.
    DWORD start_time = GetTickCount();
    MaxPool2d<<<grid, threads>>>(dev_input, H, D, dev_output);
    checkCuda(cudaGetLastError(), "MaxPool2d launch");
    checkCuda(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
    DWORD end_time = GetTickCount();

    std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
    for (int i = 0; i < 10; i++)
        std::cout << output[i] << std::endl;

    checkCuda(cudaFree(dev_input), "cudaFree(dev_input)");
    checkCuda(cudaFree(dev_output), "cudaFree(dev_output)");
    delete[] output;
    delete[] input;
    system("pause");
    return 0;
}

/*
Cost: 47ms.
-0.25
-0.27
-0.29
-0.31
-0.33
-0.35
-0.37
-0.39
-0.41
-0.43
*/

相关阅读:
linux 中more、less 和 most 的区别
 mysql数据备份之 xtrabackup
Web登录中的信心安全问题
 Yii2.0教程应用结构篇 —— 入口脚本
 HTML基础之JS
HTML基础之DOM操作
 HTML基础之CSS
HTML基础之HTML标签
 Python之unittest参数化
 Python之单元测试unittest
原文地址：https://www.cnblogs.com/qw12/p/6399421.html