• CUDA纹理内存的使用


    CUDA纹理内存本身位于显存中,但读取时会经过专用的只读纹理缓存,并且针对二维空间局部性的访问做了优化,因此处理图像数据时,使用纹理内存通常是一个提升性能的好方法。

    贴一段自己写的简单的实现两幅图像加权和的代码,使用纹理内存实现。

    输入:两幅图 lena, moon

      

    输出:两幅图像加权和

     1 #include <cstdlib>
     2 #include <iostream>
     3 #include <string>
     4 #include <opencv2/opencv.hpp>
     5 #include <cuda.h>
     6 #include <cuda_runtime.h>
     7 #include <device_launch_parameters.h>
     8 using namespace std;
     9 using namespace cv;
    10 
    11 // Declare the 2D CUDA texture references; uchar4 texels are fetched as float4 (cudaReadModeNormalizedFloat)
    12 texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex1;
    13 texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex2;
    14 // Declare the CUDA arrays that back the two textures
    15 cudaArray* cuArray1;
    16 cudaArray* cuArray2;
    17 // Channel format descriptor for uchar4 elements, used when allocating the CUDA arrays
    18 cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc<uchar4>();
    19 
    20 
    /**
     * Blends the images bound to refTex1 and refTex2 with equal weights and
     * writes the result as interleaved 8-bit pixels.
     *
     * Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
     * that fall outside imgWidth x imgHeight return without writing.
     * No shared memory is used.
     *
     * @param pDstImgData device buffer of at least imgHeight*imgWidth*channels bytes
     * @param imgHeight   image height in pixels
     * @param imgWidth    image width in pixels
     * @param channels    bytes per output pixel; three colour components are
     *                    always written, the fourth only when channels > 3
     */
    __global__ void weightAddKerkel(uchar *pDstImgData, int imgHeight, int imgWidth,int channels)
    {
        const int tidx=blockDim.x*blockIdx.x+threadIdx.x;
        const int tidy=blockDim.y*blockIdx.y+threadIdx.y;

        // The grid is rounded up to whole blocks, so edge threads may be out of range.
        if (tidx>=imgWidth || tidy>=imgHeight)
        {
            return;
        }

        // With unnormalized coordinates a texel's centre is at (i + 0.5).
        // Sampling at the bare integer index would sit on the border between two
        // texels and, under linear filtering, return their average (a blur).
        const float u=tidx+0.5f;
        const float v=tidy+0.5f;
        const float4 lenaBGR=tex2D(refTex1, u, v);
        const float4 moonBGR=tex2D(refTex2, u, v);

        const float alpha=0.5f;
        const float beta=1.0f-alpha;

        // size_t keeps the byte offset from overflowing int on large images.
        const size_t idx=((size_t)tidy*imgWidth+tidx)*channels;

        // Fetched components are scaled back to 0..255 (as in the original code);
        // +0.5f rounds to nearest instead of truncating toward zero.
        pDstImgData[idx+0]=(uchar)((alpha*lenaBGR.x+beta*moonBGR.x)*255.0f+0.5f);
        pDstImgData[idx+1]=(uchar)((alpha*lenaBGR.y+beta*moonBGR.y)*255.0f+0.5f);
        pDstImgData[idx+2]=(uchar)((alpha*lenaBGR.z+beta*moonBGR.z)*255.0f+0.5f);
        // Only touch the fourth byte when the pixel actually has one; otherwise
        // this store would land in the neighbouring pixel.
        if (channels>3)
        {
            pDstImgData[idx+3]=0;
        }
    }
    41 
    /** Prints a diagnostic and terminates the program when a CUDA runtime call fails. */
    static void checkCuda(cudaError_t status, const char *what)
    {
        if (status != cudaSuccess)
        {
            cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
            std::exit(EXIT_FAILURE);
        }
    }

    /**
     * Loads data/lena.jpg and data/moon.jpg, blends them on the GPU through two
     * 2D textures, and displays the result in a window.
     *
     * NOTE: the texture-reference API used in this file (texture<>,
     * cudaBindTextureToArray, cudaUnbindTexture) was removed in CUDA 12, so this
     * program needs a CUDA 11 or older toolkit to build.
     *
     * @return 0 on success, 1 when an input image cannot be read. Any CUDA
     *         failure terminates the program via checkCuda.
     */
    int main()
    {
        Mat Lena=imread("data/lena.jpg");
        Mat moon=imread("data/moon.jpg");
        if (Lena.empty() || moon.empty())
        {
            cerr << "Failed to load data/lena.jpg or data/moon.jpg" << endl;
            return 1;
        }

        // Both textures are sampled with the same pixel coordinates and copied with
        // the same dimensions, so the second image must match the first in size.
        if (moon.size() != Lena.size())
        {
            Mat resized;
            cv::resize(moon, resized, Lena.size());
            moon = resized;
        }

        cvtColor(Lena, Lena, CV_BGR2BGRA);
        cvtColor(moon, moon, CV_BGR2BGRA);
        int imgWidth=Lena.cols;
        int imgHeight=Lena.rows;
        int channels=Lena.channels();
        const size_t rowBytes=(size_t)imgWidth*channels*sizeof(uchar);
        const size_t imgBytes=rowBytes*imgHeight;

        // Texture settings: clamp at the borders, pixel (unnormalized) coordinates.
        refTex1.addressMode[0] = cudaAddressModeClamp;
        refTex1.addressMode[1] = cudaAddressModeClamp;
        refTex1.normalized = false;
        refTex1.filterMode = cudaFilterModeLinear;
        // Allocate the backing array and bind it to the texture.
        checkCuda(cudaMallocArray(&cuArray1, &cuDesc, imgWidth, imgHeight), "cudaMallocArray(cuArray1)");
        checkCuda(cudaBindTextureToArray(refTex1, cuArray1), "cudaBindTextureToArray(refTex1)");

        refTex2.addressMode[0] = cudaAddressModeClamp;
        refTex2.addressMode[1] = cudaAddressModeClamp;
        refTex2.normalized = false;
        refTex2.filterMode = cudaFilterModeLinear;
        checkCuda(cudaMallocArray(&cuArray2, &cuDesc, imgWidth, imgHeight), "cudaMallocArray(cuArray2)");
        checkCuda(cudaBindTextureToArray(refTex2, cuArray2), "cudaBindTextureToArray(refTex2)");

        // Copy the pixels into the CUDA arrays. The 2D copy takes the host row
        // stride explicitly, so it stays correct even if a Mat has padded rows.
        checkCuda(cudaMemcpy2DToArray(cuArray1, 0, 0, Lena.data, Lena.step, rowBytes, imgHeight, cudaMemcpyHostToDevice),
                  "cudaMemcpy2DToArray(cuArray1)");
        checkCuda(cudaMemcpy2DToArray(cuArray2, 0, 0, moon.data, moon.step, rowBytes, imgHeight, cudaMemcpyHostToDevice),
                  "cudaMemcpy2DToArray(cuArray2)");

        // Output image (host) and its device-side buffer.
        Mat dstImg=Mat::zeros(imgHeight, imgWidth, CV_8UC4);
        uchar *pDstImgData=NULL;
        checkCuda(cudaMalloc(&pDstImgData, imgBytes), "cudaMalloc(pDstImgData)");

        // Launch one thread per pixel; the grid is rounded up to whole blocks.
        dim3 block(16,16);
        dim3 grid( (imgWidth+block.x-1)/block.x, (imgHeight+block.y-1)/block.y );
        weightAddKerkel<<<grid, block, 0>>>(pDstImgData, imgHeight, imgWidth, channels);
        // A launch returns no status itself: fetch launch errors explicitly, then
        // synchronize to surface errors raised while the kernel was running.
        checkCuda(cudaGetLastError(), "weightAddKerkel launch");
        checkCuda(cudaDeviceSynchronize(), "weightAddKerkel execution");

        // Copy the result back to the host (dstImg was freshly allocated, so its rows are contiguous).
        checkCuda(cudaMemcpy(dstImg.data, pDstImgData, imgBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dstImg)");

        // Release GPU resources before blocking on the UI.
        checkCuda(cudaUnbindTexture(refTex1), "cudaUnbindTexture(refTex1)");
        checkCuda(cudaUnbindTexture(refTex2), "cudaUnbindTexture(refTex2)");
        checkCuda(cudaFreeArray(cuArray1), "cudaFreeArray(cuArray1)");
        checkCuda(cudaFreeArray(cuArray2), "cudaFreeArray(cuArray2)");
        checkCuda(cudaFree(pDstImgData), "cudaFree(pDstImgData)");

        // Display the blended image.
        namedWindow("show");
        imshow("show", dstImg);
        waitKey(0);
        return 0;
    }
  • 相关阅读:
    ArcObjects
    Dojo是什么?
    百度地图是什么坐标系?
    高德地图API
    地理POI数据爬取-以百度地图为例
    Microsoft Help Viewer&ArcGIS Server二次开发.net篇 (一) 安装
    DevOps:Docker VS Kubernetes
    JUnit测试环境搭建
    嵌入式tomcat
    如何使用ABAP发送带有PDF格式附件的电子邮件
  • 原文地址:https://www.cnblogs.com/riddick/p/7892663.html
Copyright © 2020-2023  润新知