• 积分图实现均值滤波的CUDA代码


    没想到我2010年买的笔记本显卡GT330M竟然还能跑CUDA,于是果断小试了一把:在CUDA 6.5 + VS2012环境下,用积分图实现了均值滤波,功能类似于OpenCV的blur()函数。

    使用lena.jpg做测试,效果如下:

          

    代码在此:

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <stdio.h>
    #include <string>
    #include <opencv2/opencv.hpp>
    
    using namespace std;
    using namespace cv;
    
     // In-place running sum down every column of two row-major imgW x imgH buffers.
     //
     // Launch layout: 1-D in x; the thread with global x index c owns column c and
     // walks rows 1..imgH-1 sequentially, adding the element directly above into the
     // current element. Threads with c >= imgW do nothing. No shared memory is used.
     //
     //   pIntegImgLena  float buffer, accumulated in place
     //   pPtsImg        int buffer, accumulated in place the same way
     //   imgW, imgH     buffer width (row stride) and height, in elements
     __global__ void rowAddKernel(float* pIntegImgLena,int* pPtsImg,int imgW,int imgH)
     {
         const int col = blockIdx.x*blockDim.x + threadIdx.x;
         if (col >= imgW)
         {
             return;
         }

         // `above` is the flat index of the already-accumulated element one row up.
         int above = col;
         for (int row = 1; row < imgH; ++row)
         {
             const int cur = above + imgW;
             pIntegImgLena[cur] += pIntegImgLena[above];
             pPtsImg[cur] += pPtsImg[above];
             above = cur;
         }
     }
    
      // In-place running sum along every row of two row-major imgW x imgH buffers.
      //
      // Launch layout: 1-D in y; the thread with global y index r owns row r and
      // walks columns 1..imgW-1 sequentially, adding the element to the left into the
      // current element. Threads with r >= imgH do nothing. No shared memory is used.
      //
      //   pIntegImgLena  float buffer, accumulated in place
      //   pPtsImg        int buffer, accumulated in place the same way
      //   imgW, imgH     buffer width (row stride) and height, in elements
      __global__ void colAddKernel(float* pIntegImgLena,int* pPtsImg,int imgW,int imgH)
      {
          const int row = blockIdx.y*blockDim.y + threadIdx.y;
          if (row >= imgH)
          {
              return;
          }

          // Views onto the start of this thread's row in each buffer.
          float* sumRow = pIntegImgLena + row*imgW;
          int* cntRow = pPtsImg + row*imgW;
          for (int col = 1; col < imgW; ++col)
          {
              sumRow[col] += sumRow[col-1];
              cntRow[col] += cntRow[col-1];
          }
      }
    
     // Box (mean) filter evaluated from inclusive 2-D prefix sums.
     //
     // Launch layout: 2-D; the thread at global (x, y) produces output pixel (x, y).
     // Threads outside the imgW x imgH image do nothing. No shared memory is used.
     //
     //   pImgLena       output, imgW*imgH bytes, row-major; every pixel is overwritten
     //   pIntegImgLena  inclusive 2-D prefix sum of the pixel values (row-major):
     //                  element (y, x) holds the sum over rows 0..y and columns 0..x
     //   pPtsImg        inclusive 2-D prefix sum of the per-pixel sample count
     //   imgW, imgH     image width (row stride) and height
     //   win            window radius; the window is (2*win+1)^2 and is clipped at the
     //                  image border. Negative values are treated as 0.
     //
     // The mean is truncated (not rounded) when converted to uchar. A window whose
     // sample count is not positive writes 0 instead of dividing by zero.
     __global__ void filterKernel(uchar* pImgLena,float* pIntegImgLena,int* pPtsImg,int imgW,int imgH,int win)
     {
         const int tidx=blockDim.x*blockIdx.x + threadIdx.x;
         const int tidy=blockDim.y*blockIdx.y + threadIdx.y;
         if (tidx>=imgW || tidy>=imgH)
         {
             return;
         }

         // Clamp the radius: negative radii make no sense, and anything beyond the
         // image extent is equivalent to "whole image" while avoiding int overflow
         // in tidx+radius / tidy+radius.
         const int radius=min(max(win, 0), max(imgW, imgH));

         // Inclusive window bounds, clipped to the image.
         const int left=max(tidx-radius, 0);
         const int right=min(tidx+radius, imgW-1);
         const int top=max(tidy-radius, 0);
         const int bot=min(tidy+radius, imgH-1);

         // With inclusive prefix sums the window total is
         //   S(bot,right) - S(top-1,right) - S(bot,left-1) + S(top-1,left-1),
         // where any term with index -1 is zero. Looking the corners up at (top,left)
         // instead would silently drop the window's top row and left column.
         const bool hasTop=top>0;
         const bool hasLeft=left>0;
         const int rowAbove=(top-1)*imgW;   // only dereferenced when hasTop
         const int rowBot=bot*imgW;

         float sum=pIntegImgLena[rowBot+right];
         int cnt=pPtsImg[rowBot+right];
         if (hasTop)
         {
             sum-=pIntegImgLena[rowAbove+right];
             cnt-=pPtsImg[rowAbove+right];
         }
         if (hasLeft)
         {
             sum-=pIntegImgLena[rowBot+left-1];
             cnt-=pPtsImg[rowBot+left-1];
         }
         if (hasTop && hasLeft)
         {
             sum+=pIntegImgLena[rowAbove+left-1];
             cnt+=pPtsImg[rowAbove+left-1];
         }

         const float value=(cnt>0) ? sum/cnt : 0.0f;

         pImgLena[tidy*imgW+tidx]=(uchar)value;
     }
    
    // Reports a failed CUDA runtime call on stderr; returns true when `err` is cudaSuccess.
    static bool cudaOk(cudaError_t err, const char* what)
    {
        if (err == cudaSuccess)
        {
            return true;
        }
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }

    // Demo driver: loads data/lena.jpg as grayscale, builds the prefix-sum images on
    // the GPU with rowAddKernel + colAddKernel, applies filterKernel with radius 3,
    // and shows the input and the filtered result in OpenCV windows.
    //
    // Returns 0 on success and 1 if the image cannot be loaded or any CUDA call fails.
    int main()
    {
        // Load the source image as 8-bit grayscale.
        string imgPath="data/lena.jpg";
        Mat imgLena=imread(imgPath, 0);
        if (imgLena.empty())
        {
            fprintf(stderr, "failed to load image: %s\n", imgPath.c_str());
            return 1;
        }
        const int imgH=imgLena.rows;
        const int imgW=imgLena.cols;
        // Element count in size_t so the byte-size products below cannot overflow int.
        const size_t numPix=(size_t)imgH*(size_t)imgW;
        namedWindow("lena");
        imshow("lena", imgLena);
        waitKey(0);

        // Host buffer for the filtered result.
        Mat filterLena=Mat::zeros(imgLena.size(), CV_8UC1);
        // Prefix-sum seeds: the pixel values as float, and a count image of ones.
        // Converting into a separate Mat leaves imgLena as 8-bit data.
        Mat integImgLena;
        imgLena.convertTo(integImgLena, CV_32FC1);
        Mat ptsImg(imgLena.size(), CV_32SC1, Scalar(1));

        // Device buffers. pImgLena is output-only (filterKernel overwrites every
        // pixel), so it needs no host-to-device upload.
        uchar* pImgLena=NULL;
        float* pIntegImgLena=NULL;
        int* pPtsImg=NULL;
        bool ok=cudaOk(cudaMalloc(&pImgLena, numPix*sizeof(uchar)), "cudaMalloc(pImgLena)")
             && cudaOk(cudaMalloc(&pIntegImgLena, numPix*sizeof(float)), "cudaMalloc(pIntegImgLena)")
             && cudaOk(cudaMalloc(&pPtsImg, numPix*sizeof(int)), "cudaMalloc(pPtsImg)");

        // Upload the seeds.
        ok=ok
             && cudaOk(cudaMemcpy(pIntegImgLena, integImgLena.data, numPix*sizeof(float), cudaMemcpyHostToDevice), "upload pIntegImgLena")
             && cudaOk(cudaMemcpy(pPtsImg, ptsImg.data, numPix*sizeof(int), cudaMemcpyHostToDevice), "upload pPtsImg");

        // Pass 1: running sum down each column (one thread per column, x dimension).
        if (ok)
        {
            dim3 block(128,1);
            dim3 grid((imgW+block.x-1)/block.x,1);
            rowAddKernel<<<grid, block, 0>>>(pIntegImgLena, pPtsImg, imgW, imgH);
            ok=cudaOk(cudaGetLastError(), "rowAddKernel launch")
                 && cudaOk(cudaDeviceSynchronize(), "rowAddKernel");
        }
        // Pass 2: running sum along each row (one thread per row, y dimension).
        if (ok)
        {
            dim3 block(1,128);
            dim3 grid(1,(imgH+block.y-1)/block.y);
            colAddKernel<<<grid, block, 0>>>(pIntegImgLena, pPtsImg, imgW, imgH);
            ok=cudaOk(cudaGetLastError(), "colAddKernel launch")
                 && cudaOk(cudaDeviceSynchronize(), "colAddKernel");
        }
        // Mean filter, one thread per output pixel.
        if (ok)
        {
            int win=3;
            dim3 block(16,16);
            dim3 grid((imgW+block.x-1)/block.x, (imgH+block.y-1)/block.y);
            filterKernel<<<grid, block, 0>>>(pImgLena,pIntegImgLena, pPtsImg, imgW, imgH, win);
            ok=cudaOk(cudaGetLastError(), "filterKernel launch")
                 && cudaOk(cudaDeviceSynchronize(), "filterKernel");
        }

        // Download the result.
        ok=ok
             && cudaOk(cudaMemcpy(filterLena.data, pImgLena, numPix*sizeof(uchar), cudaMemcpyDeviceToHost), "download filterLena");

        if (ok)
        {
            namedWindow("filterLena");
            imshow("filterLena", filterLena);
            waitKey(0);
        }

        // cudaFree(NULL) is a no-op, so this is safe after a partial allocation failure.
        cudaFree(pImgLena);
        cudaFree(pIntegImgLena);
        cudaFree(pPtsImg);

        return ok ? 0 : 1;
    }
    View Code
  • 相关阅读:
    spock2.x结合mockito静态mock
    线程池的拒绝策略及常见线程池
    正确关闭线程池
    对线面试官 | 字节跳动一面
    记一次oom问题排查
    MySQL索引下推,原来这么简单!
    vs2019 编译 protocol buffers
    每日一库:classList.js
    每日一库:tinycon.js
    算法: 有效的括号
  • 原文地址:https://www.cnblogs.com/riddick/p/7577293.html
Copyright © 2020-2023  润新知