• CUDA学习4 线程协作


    CUDA学习3 Max pooling (python c++ cuda)中有一个2D grid的CUDA实现,用时141ms。

    以下为2D grid 2D blocks实现,耗时进一步降低到16ms。(注意:计时使用的GetTickCount分辨率通常只有10~16ms,因此16ms基本相当于一个计时单位,只能作为粗略参考。)

        // Block coordinates inside the 2D grid.
        int x = blockIdx.x;
        int y = blockIdx.y;
        // Thread coordinates inside the 2D block.
        int tx = threadIdx.x;
        int ty = threadIdx.y;
        // Flat global thread index: blocks are numbered row-major as (y, x) across the grid,
        // each block owns blockDim.y*blockDim.x consecutive slots, and threads are numbered
        // row-major as (ty, tx) inside their block.
        int index2 = y*gridDim.x*blockDim.y*blockDim.x + x* blockDim.y*blockDim.x + ty*blockDim.x + tx;
     
    线程索引计算方式如上,此处需要的循环为(N,M,PH,PH),因此配置如下。(PH*PH=144未超出本机显卡max threads per block=1024的限制)
     
    // Grid of M x N blocks (grid.x = M, grid.y = N).
    dim3    grid(M, N);
    // Each block holds PH x PH threads (threads.x = threads.y = PH).
    dim3    threads(PH, PH);

    下面是完整代码。

    #include <windows.h>
    #include <iostream>
    
    
    // Max-pools square image planes: one block per plane, one thread per pooling window.
    //
    // Launch layout expected by the indexing below:
    //   grid  : 2D. Block (blockIdx.x, blockIdx.y) handles input plane number
    //           blockIdx.y * gridDim.x + blockIdx.x; planes are read as consecutive
    //           height*height float tiles of bottom_data.
    //   block : 2D. Thread (tx, ty) reduces the window whose top-left corner is
    //           (row ty*pooled_height, column tx*pooled_height), so the block needs
    //           ceil(height / pooled_height) threads per side to cover a plane.
    //   No shared memory is used.
    //
    // Parameters:
    //   bottom_data   - device pointer to the input planes.
    //   height        - side length of each (square) input plane.
    //   pooled_height - side length of the pooling window; also used as the stride.
    //   top_data      - device pointer to the output; each block writes
    //                   blockDim.y*blockDim.x floats, row-major in (ty, tx).
    //
    // Windows that stick out past the bottom/right border are clipped to the plane.
    // A thread whose window lies entirely outside the plane writes -infinity.
    __global__ void MaxPool2d(float* bottom_data, const int height, const int pooled_height, float* top_data)
    {
        const int tx = threadIdx.x;
        const int ty = threadIdx.y;

        // size_t keeps the flat offsets from overflowing 32-bit int on large batches.
        const size_t plane = (size_t)blockIdx.y * gridDim.x + blockIdx.x;
        const size_t out_index = (plane * blockDim.y + ty) * blockDim.x + tx;

        // Window bounds, clipped once instead of re-testing the border in every iteration.
        const int row_begin = ty * pooled_height;
        const int col_begin = tx * pooled_height;
        const int row_end = min(row_begin + pooled_height, height);
        const int col_end = min(col_begin + pooled_height, height);

        const float* plane_data = bottom_data + plane * height * height;

        // Start from -infinity (IEEE-754 bit pattern 0xff800000) rather than a finite
        // sentinel, so inputs of any magnitude produce the true maximum.
        float best = __int_as_float((int)0xff800000);
        for (int r = row_begin; r < row_end; ++r)
            for (int c = col_begin; c < col_end; ++c)
                best = fmaxf(best, plane_data[(size_t)r * height + c]);

        top_data[out_index] = best;
    }
    
    // Prints a diagnostic on stderr when a CUDA runtime call failed.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaOk(cudaError_t status, const char* what)
    {
        if (status == cudaSuccess)
            return true;
        std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
        return false;
    }

    // Fills N*M planes of H x W floats with a ramp, max-pools them on the GPU with a
    // D x D window, and prints the elapsed time and the first ten outputs.
    // Returns 0 on success and 1 if any CUDA call failed.
    int main()
    {
        const int N = 500, M = 100, H = 24, W = 24, D = 2;
        // Ceil-division: number of pooling windows per side, counting a clipped last window.
        const int PH = (H + D - 1) / D;

        // Element counts and byte counts are kept separate: new[] takes elements,
        // cudaMalloc/cudaMemcpy take bytes.
        const size_t image_count = (size_t)N * M * H * W;
        const size_t out_count = (size_t)N * M * PH * PH;
        const size_t image_size = image_count * sizeof(float);
        const size_t out_size = out_count * sizeof(float);

        const float mul_by = 0.01f;
        float* input = new float[image_count];
        float* output = new float[out_count];
        for (size_t i = 0; i < image_count; i++)
            input[i] = i * mul_by;

        // Null-initialised so the cleanup below is safe on every path
        // (cudaFree on a null pointer performs no operation).
        float* dev_input = 0;
        float* dev_output = 0;
        bool ok = cudaOk(cudaMalloc((void**)&dev_output, out_size), "cudaMalloc(dev_output)")
               && cudaOk(cudaMalloc((void**)&dev_input, image_size), "cudaMalloc(dev_input)")
               && cudaOk(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice),
                         "cudaMemcpy(host -> device)");

        DWORD start_time = 0, end_time = 0;
        if (ok)
        {
            dim3    grid(M, N);         // one block per image plane
            dim3    threads(PH, PH);    // one thread per pooled output element
            start_time = GetTickCount();
            MaxPool2d << <grid, threads >> >(dev_input, H, D, dev_output);
            // A kernel launch returns no status itself; a bad launch configuration
            // only becomes visible through cudaGetLastError().
            ok = cudaOk(cudaGetLastError(), "MaxPool2d launch")
              // The blocking copy also waits for the kernel, so the measured interval
              // covers kernel execution plus the device-to-host transfer.
              && cudaOk(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost),
                        "cudaMemcpy(device -> host)");
            end_time = GetTickCount();
        }

        if (ok)
        {
            std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
            for (int i = 0; i<10; i++)
                std::cout << *(output + i) << std::endl;
        }

        cudaFree(dev_input);
        cudaFree(dev_output);
        delete[] output;
        delete[] input;
        system("pause");
        return ok ? 0 : 1;
    }
    
    /*
    Cost: 16ms.
    0.25
    0.27
    0.29
    0.31
    0.33
    0.35
    0.37
    0.39
    0.41
    0.43
    */

     以下是采用3D grid 3D blocks的错误实现:同一个输出元素由block内的D*D个线程共同负责,但这些线程在没有任何同步的情况下各自读写top_data(数据竞争),因此每次比较大小时,实际上都是在和初始值-1000.0比较,得不到真正的最大值。

    #include <windows.h>
    #include <iostream>
    
    // Max-pools square image planes with several cooperating threads per pooling window.
    //
    // Launch layout expected by the indexing below:
    //   grid  : 3D. blockIdx.x is the pooled row; (blockIdx.z, blockIdx.y) selects the
    //           input plane, number blockIdx.z * gridDim.y + blockIdx.y.
    //   block : 3D. threadIdx.z is the pooled column; (threadIdx.y, threadIdx.x) is the
    //           (row, column) offset of one input element inside the window. For the
    //           whole window to be covered, blockDim.x and blockDim.y must both be at
    //           least pooled_height.
    //   Shared memory: 4 KB static (one float per thread, sized for up to 1024 threads).
    //
    // Parameters:
    //   bottom_data   - device pointer to the input planes, height*height floats each.
    //   height        - side length of each (square) input plane.
    //   pooled_height - side length of the pooling window; also used as the stride.
    //   top_data      - device pointer to the output; element order is
    //                   (plane, pooled row, pooled column).
    //
    // Letting every thread of a window compare against and write top_data directly is a
    // data race: the threads all observe the initial value and the last writer wins.
    // Here each thread instead stages its element in shared memory, the block meets at a
    // barrier, and one thread per window reduces the staged values and writes the result
    // exactly once.
    __global__ void MaxPool2d(float* bottom_data, const int height, const int pooled_height, float* top_data)
    {
        // One staging slot per thread; 1024 is the per-block thread limit, so every
        // legal block shape fits.
        __shared__ float staged[1024];

        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int tz = threadIdx.z;

        // Threads cooperating on one window occupy a contiguous run of slots.
        const int window_threads = blockDim.x * blockDim.y;
        const int slot = tz * window_threads + ty * blockDim.x + tx;

        // size_t keeps the flat offsets from overflowing 32-bit int on large batches.
        const size_t plane = (size_t)blockIdx.z * gridDim.y + blockIdx.y;
        const size_t out_index = (plane * gridDim.x + blockIdx.x) * blockDim.z + tz;

        // Input element this thread is responsible for.
        const int row = blockIdx.x * pooled_height + ty;
        const int col = tz * pooled_height + tx;

        // Elements past the bottom/right border contribute -infinity (IEEE-754 bit
        // pattern 0xff800000), the identity for max, instead of a finite sentinel.
        float value = __int_as_float((int)0xff800000);
        if (row < height && col < height)
            value = bottom_data[(plane * height + row) * height + col];
        staged[slot] = value;

        // Every thread of the block reaches this barrier (it is outside any branch);
        // afterwards all staged values are visible to the reducing threads.
        __syncthreads();

        if (tx == 0 && ty == 0)
        {
            float best = staged[slot];
            for (int k = 1; k < window_threads; ++k)
                best = fmaxf(best, staged[slot + k]);
            top_data[out_index] = best;
        }
    }
    
    // Prints a diagnostic on stderr when a CUDA runtime call failed.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaOk(cudaError_t status, const char* what)
    {
        if (status == cudaSuccess)
            return true;
        std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
        return false;
    }

    // Fills N*M planes of H x W floats with a negative ramp, runs MaxPool2d on the GPU
    // with a 3D grid / 3D block launch, and prints the elapsed time and the first ten
    // outputs. Returns 0 on success and 1 if any CUDA call failed.
    int main()
    {
        const int N = 500, M = 100, H = 24, W = 24, D = 2;
        // Ceil-division: number of pooling windows per side, counting a clipped last window.
        const int PH = (H + D - 1) / D;

        // Element counts and byte counts are kept separate: new[] takes elements,
        // cudaMalloc/cudaMemcpy take bytes.
        const size_t image_count = (size_t)N * M * H * W;
        const size_t out_count = (size_t)N * M * PH * PH;
        const size_t image_size = image_count * sizeof(float);
        const size_t out_size = out_count * sizeof(float);

        const float mul_by = -0.01f;
        float* input = new float[image_count];
        float* output = new float[out_count];
        for (size_t i = 0; i < image_count; i++)
            input[i] = i * mul_by;

        // Null-initialised so the cleanup below is safe on every path
        // (cudaFree on a null pointer performs no operation).
        float* dev_input = 0;
        float* dev_output = 0;
        bool ok = cudaOk(cudaMalloc((void**)&dev_output, out_size), "cudaMalloc(dev_output)")
               && cudaOk(cudaMalloc((void**)&dev_input, image_size), "cudaMalloc(dev_input)")
               && cudaOk(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice),
                         "cudaMemcpy(host -> device)");

        DWORD start_time = 0, end_time = 0;
        if (ok)
        {
            // grid  = (pooled row, M, N); block = (window column, window row, pooled column).
            // The block holds D*D*PH threads, which must stay within the device's
            // per-block thread limit, and PH must stay within the block z-dimension
            // limit; a violation is reported by the launch check below.
            dim3    grid(PH, M, N);
            dim3    threads(D, D, PH);
            start_time = GetTickCount();
            MaxPool2d << <grid, threads >> >(dev_input, H, D, dev_output);
            // A kernel launch returns no status itself; a bad launch configuration
            // only becomes visible through cudaGetLastError().
            ok = cudaOk(cudaGetLastError(), "MaxPool2d launch")
              // The blocking copy also waits for the kernel, so the measured interval
              // covers kernel execution plus the device-to-host transfer.
              && cudaOk(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost),
                        "cudaMemcpy(device -> host)");
            end_time = GetTickCount();
        }

        if (ok)
        {
            std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
            for (int i = 0; i<10; i++)
                std::cout << *(output + i) << std::endl;
        }

        cudaFree(dev_input);
        cudaFree(dev_output);
        delete[] output;
        delete[] input;
        system("pause");
        return ok ? 0 : 1;
    }
    
    /*
    Cost: 47ms.
    -0.25
    -0.27
    -0.29
    -0.31
    -0.33
    -0.35
    -0.37
    -0.39
    -0.41
    -0.43
    */
  • 相关阅读:
    linux 中more、less 和 most 的区别
    mysql数据备份之 xtrabackup
    Web登录中的信心安全问题
    Yii2.0教程应用结构篇 —— 入口脚本
    HTML基础之JS
    HTML基础之DOM操作
    HTML基础之CSS
    HTML基础之HTML标签
    Python之unittest参数化
    Python之单元测试unittest
  • 原文地址:https://www.cnblogs.com/qw12/p/6399421.html
Copyright © 2020-2023  润新知