• CUDA大作业_进行图像特征匹配V2.0


    在V1.0的基础上改变了排序方式并对部分并行代码进行了优化

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <string>
    #include <stdio.h>
    #include <stdlib.h>
    #include <iostream>
    #include <fstream>
    #include <sstream>
    #include <string>
    #include <math.h>
    #include<vector>
    #include<algorithm>
    #include<time.h>

    using namespace std;


    // Forward declarations of host helpers defined after main().

    // Reads a whitespace-separated text matrix into `matrix` (see definition below).
    template<typename T>
    void readMatrixfromTXT(const char *fileName, const int numColumn, const int numRow, T *matrix);
    // CPU reference: mean of the three smallest distances to the `test1` vector.
    float CalDist(float*mat, int row, int col);

    // Column / row counts of a data file.
    int getFileColumns(const char *fileName);
    int getFileRows(const char *fileName);

    // Per-data-set stream slots; CRunGPU copies stream[flag] into its own handle
    // before creating a fresh stream, so these entries are never assigned here.
    cudaStream_t stream[3];
    // Device ordinals handed to cudaSetDevice() by CRunGPU, indexed by its `flag`.
    // NOTE(review): hard-coded to devices 3, 4 and 5 - confirm against the target machine.
    int gpu[3] = { 3,4,5 };

    // Value/index pair used as the element type of shared-memory reductions.
    // (Declared as a plain struct: the previous `typedef struct ... {};` had no
    // declarator, so the typedef was ignored and only produced a warning.)
    struct data_ind{
        float data;  // candidate value being compared
        float i;     // slot index the value came from (stored as float, cast to int on use)
    };

    // Sentinel "infinitely large" squared distance used to pad reductions.
    #define inf 1000000
    // Evaluates a CUDA runtime call once; on failure prints the error string with
    // the call site and terminates the process with exit code 1.
    // Wrapped in do { } while (0) so that `CUDA_CHECK_RETURN(x);` is one statement.
    #define CUDA_CHECK_RETURN(value) do {                                  \
        cudaError_t _m_cudaStat = (value);                                 \
        if (_m_cudaStat != cudaSuccess) {                                  \
            fprintf(stderr, "Error %s at line %d in file %s ",             \
                cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__);      \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

    // Fetches (and clears) the last CUDA runtime error. If one is pending, prints
    // it to stderr prefixed with `msg` and terminates the process with exit code -1.
    void checkCUDAError(const char *msg)
    {
        const cudaError_t lastStatus = cudaGetLastError();
        if (lastStatus == cudaSuccess)
            return;

        fprintf(stderr, "Cuda error: %s: %s. ", msg, cudaGetErrorString(lastStatus));
        exit(-1);
    }


    // Expected matrix dimensions. N, N1 and N2 are passed to CRunGPU as the column
    // counts; M, M1 and M2 are not referenced elsewhere in this listing.
    const int M = 181, N = 14, M1 = 168, N1 = 14, M2 = 168, N2 = 14;
    // Query feature vector in constant memory, read by CalDis_Sort: element 0 is
    // compared against column 0 (tolerance 10), elements 1..12 feed the squared distance.
    __constant__ float test[13] = { 131.0000, 5.2004, 0.5679, 1.5263, 2.1928, 2.78, 5.2777, 4.2365, 5.2146, 3.3337, 2.9507, 1.7726, 3.1895 };

    // Host copy of the same query vector, read by the CPU reference CalDist().
    const float test1[13] = { 131.0000, 5.2004, 0.5679, 1.5263, 2.1928, 2.78, 5.2777, 4.2365, 5.2146, 3.3337, 2.9507, 1.7726, 3.1895 };


    // Inserts `v` into the ascending triple (lo, mid, hi), keeping only the three
    // smallest values seen so far. Equal values are kept as separate entries.
    static __device__ __forceinline__ void reduceCompInsert3(float v, float &lo, float &mid, float &hi)
    {
        if (v < lo)       { hi = mid; mid = lo; lo = v; }
        else if (v < mid) { hi = mid; mid = v; }
        else if (v < hi)  { hi = v; }
    }

    // Per-block "three smallest values" reduction.
    //
    // Block b scans g_idata[b*2*blockSize, min((b+1)*2*blockSize, N)) and writes the
    // three smallest values found, in ascending order, to g_odata[3*b .. 3*b+2].
    // Missing entries (fewer than three inputs) are reported as `inf`.
    //
    // Launch: 1-D grid of ceil(N / (2*blockSize)) blocks, 1-D blocks of any size up
    // to 1024 threads. Uses 12 KB of static shared memory; no dynamic shared memory.
    //
    // Each thread keeps a sorted triple in registers and triples are merged pairwise
    // through shared memory with a barrier after every step. This replaces the
    // earlier value/index scheme, which lost the original index of the minimum
    // during the tree stages, masked the wrong element before the 2nd/3rd pass,
    // raced on shared memory between passes and relied on implicit warp synchrony.
    __global__ void reduceComp (float *g_idata, float *g_odata,int N, int blockSize)
    {
        __shared__ float smin0[1024], smin1[1024], smin2[1024];

        const unsigned int tid = threadIdx.x;
        const unsigned int nThreads = blockDim.x;
        const unsigned int total = N > 0 ? (unsigned int)N : 0u;
        const unsigned int span = blockSize > 0 ? 2u * (unsigned int)blockSize : 0u;
        const unsigned int segBegin = blockIdx.x * span;
        unsigned int segEnd = segBegin + span;
        if (segEnd > total) segEnd = total;

        // Thread-local top three over this block's segment (strided, coalesced loads).
        float lo = inf, mid = inf, hi = inf;
        for (unsigned int k = segBegin + tid; k < segEnd; k += nThreads)
            reduceCompInsert3(g_idata[k], lo, mid, hi);

        smin0[tid] = lo; smin1[tid] = mid; smin2[tid] = hi;
        __syncthreads();

        // Largest power of two strictly below the thread count: first merge stride.
        unsigned int stride = 1;
        while (stride * 2 < nThreads) stride *= 2;

        for (; stride > 0; stride >>= 1)
        {
            // Readers touch slots >= stride, writers slots < stride: no overlap in a step.
            if (tid < stride && tid + stride < nThreads)
            {
                reduceCompInsert3(smin0[tid + stride], lo, mid, hi);
                reduceCompInsert3(smin1[tid + stride], lo, mid, hi);
                reduceCompInsert3(smin2[tid + stride], lo, mid, hi);
                smin0[tid] = lo; smin1[tid] = mid; smin2[tid] = hi;
            }
            __syncthreads();  // reached by every thread of the block
        }

        if (tid == 0)
        {
            g_odata[3*blockIdx.x]   = lo;
            g_odata[3*blockIdx.x+1] = mid;
            g_odata[3*blockIdx.x+2] = hi;
        }
    }

    // Rank-sort selection of the three smallest values of a[0..N).
    //
    // a         : candidate values (e.g. the 3-per-block output of CalDis_Sort)
    // first_thr : receives the three smallest values in ascending order
    // N         : number of valid elements in `a`
    //
    // Launch: 1-D grid with at least N threads in total (and at least 3 threads when
    // N < 3, so that the unused output slots can be filled with `inf`).
    //
    // Every element's rank is unique (ties are broken by index), so when N >= 3 each
    // of first_thr[0..2] is written by exactly one thread. The previous version let
    // every thread reset all three slots to `inf`, which raced with the rank writes
    // of other threads and blocks.
    __global__ void global_sort(float * a/*first three*blockNum1 element find by CalDis_Sort */, float*first_thr/*first three minimum */, int N)//improved rank sort
    {
        int index = blockDim.x*blockIdx.x + threadIdx.x;

        // Slots that have no corresponding rank (only when N < 3) get the sentinel.
        if (index < 3 && index >= N)
            first_thr[index] = inf;

        if (index<N){
            float temp = a[index];
            int k = 0;  // rank of a[index]: number of elements ordered before it
            // Stop as soon as the rank is known to be outside the top three.
            for (int j = 0; j < N && k < 3; j++)
            {
                if (temp > a[j] || (temp == a[j] && index > j))
                    ++k;
            }
            if (k<3) first_thr[k] = temp;
        }
    }


    // Threads per block for CalDis_Sort. Also sizes its shared arrays and is the
    // multiple to which matrix row counts are padded by the host code.
    const int thdsPerblock_cal_dis=512;


    // Inserts `v` into the ascending triple (lo, mid, hi), keeping only the three
    // smallest values seen so far. Equal values are kept as separate entries.
    static __device__ __forceinline__ void calDisInsert3(float v, float &lo, float &mid, float &hi)
    {
        if (v < lo)       { hi = mid; mid = lo; lo = v; }
        else if (v < mid) { hi = mid; mid = v; }
        else if (v < hi)  { hi = v; }
    }

    // Computes, for every matrix row, the squared Euclidean distance to the constant
    // query vector `test`, then reduces each block to its three smallest distances.
    //
    // b         : matrix stored column by column with a column stride of R
    //             (element (row, col) is b[col*R + row])
    // tosort    : output; block k writes its three smallest squared distances, in
    //             ascending order, to tosort[3*k .. 3*k+2] (`inf` where missing)
    // R         : number of rows, which is also the column stride
    // C         : number of columns (unused; columns 0..12 are read)
    // blockSize : kept for interface compatibility; the reduction width is taken
    //             from blockDim.x
    //
    // A row only takes part when |test[0] - column0| <= 10; the distance is taken
    // over columns 1..12.
    //
    // Launch: 1-D grid of ceil(R / blockDim.x) blocks, 1-D blocks of up to 1024
    // threads. Uses 12 KB of static shared memory; no dynamic shared memory.
    //
    // The reduction merges sorted triples pairwise with a barrier after every step.
    // It replaces the earlier value/index scheme, which lost the index of the
    // minimum during the tree stages, raced on shared memory between the three
    // passes and relied on implicit warp synchrony.
    __global__ void CalDis_Sort( float *b/*matrix to cal Euclidean dis*/, float *tosort, int R/*Matrix row number*/, int C/*colum number*/,int blockSize)
    {
        __shared__ float smin0[1024], smin1[1024], smin2[1024];

        const unsigned int tid = threadIdx.x;
        const unsigned int nThreads = blockDim.x;
        const int index = tid + blockIdx.x*blockDim.x;

        /********************** CalDis ****************************/
        float lo = inf, mid = inf, hi = inf;
        if (index < R && fabsf(test[0] - b[index]) <= 10.0f)
        {
            float temp_add = 0.0f;
            for (int i = 1; i < 13; i++)
            {
                const float addup = test[i] - b[i*R + index];
                temp_add += addup*addup;
            }
            lo = temp_add;
        }

        smin0[tid] = lo; smin1[tid] = mid; smin2[tid] = hi;
        __syncthreads();

        /********************** Top-3 reduction ****************************/
        // Largest power of two strictly below the thread count: first merge stride.
        unsigned int stride = 1;
        while (stride * 2 < nThreads) stride *= 2;

        for (; stride > 0; stride >>= 1)
        {
            // Readers touch slots >= stride, writers slots < stride: no overlap in a step.
            if (tid < stride && tid + stride < nThreads)
            {
                calDisInsert3(smin0[tid + stride], lo, mid, hi);
                calDisInsert3(smin1[tid + stride], lo, mid, hi);
                calDisInsert3(smin2[tid + stride], lo, mid, hi);
                smin0[tid] = lo; smin1[tid] = mid; smin2[tid] = hi;
            }
            __syncthreads();  // reached by every thread of the block
        }

        if (tid == 0)
        {
            tosort[3*blockIdx.x]   = lo;
            tosort[3*blockIdx.x+1] = mid;
            tosort[3*blockIdx.x+2] = hi;
        }
    }


    // Owns everything needed to match one data file against the query vector on one
    // GPU: the host copy of the matrix, its device copy, the reduction scratch
    // buffers and a dedicated stream.
    //
    // Usage: construct, call compute() (asynchronous on temp_stream) any number of
    // times, then getResult() to fetch the mean of the three smallest distances.
    class CRunGPU{
    public:
        cudaEvent_t start_gpu, end_gpu;   // reserved for event timing (not used)
        float consume_gpu;                // reserved for event timing (not used)
        int row;                          // R rounded up to a multiple of thdsPerblock_cal_dis
        float *BTR_train;                 // host matrix, column stride `row`

        float *dev_first_thr;             // device ping-pong buffer for the reduction
        float *dev_BTR_train;             // device copy of BTR_train
        float *dev_tosort;                // device: 3 candidates per CalDis_Sort block
        float *tosort_host;               // host scratch (3 floats)
        float *sort_block;                // host buffer receiving the final three values
        int temp_gpu;                     // device ordinal actually used
        cudaStream_t temp_stream;         // stream owned by this object


        int threadsPerBlock ;             // CalDis_Sort block size
        int blocksPerGrid ;               // CalDis_Sort grid size

        int threadsPerBlock2;             // reduceComp block size
        int blocksPerGrid2 ;              // reduceComp grid size of the first pass

        int flag,flag2;                   // flag: data-set slot 0..2; flag2: unused
        int R;                            // number of data rows in the file
        int C;                            // number of columns
        const char*fileName;
        float *dev_result;                // device buffer holding the last reduction output

        // Loads `fileName` (R rows, C columns), selects the GPU for slot `flag`,
        // creates the stream and uploads the matrix. Exits the process on any failure.
        CRunGPU(const char*fileName, int R/*row*/, int C/*column*/,int flag)//construc function
        {
            this->R=R;
            this->C=C;
            this->fileName=fileName;
            this->flag=flag;
            flag2 = 1;
            consume_gpu = 0;

            if (R <= 0 || C <= 0)
            {
                fprintf(stderr, "CRunGPU: %s has no data (rows=%d, columns=%d)\n", fileName, R, C);
                exit(1);
            }
            if (flag < 0 || flag > 2)
            {
                fprintf(stderr, "CRunGPU: flag must be 0, 1 or 2 (got %d)\n", flag);
                exit(1);
            }

            row = thdsPerblock_cal_dis * int((R + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);
            BTR_train = (float*)malloc(row*C*sizeof(float));
            tosort_host = (float *)malloc(3 * sizeof(float));
            if (!BTR_train || !tosort_host)
            {
                fprintf(stderr, "CRunGPU: out of host memory for %s\n", fileName);
                exit(1);
            }

            threadsPerBlock = thdsPerblock_cal_dis;
            blocksPerGrid = (R + threadsPerBlock - 1) / threadsPerBlock;

            threadsPerBlock2 = 512;
            // Each reduceComp block consumes 2*threadsPerBlock2 inputs.
            blocksPerGrid2 = ((blocksPerGrid * 3+1)/2 + threadsPerBlock2 - 1) / threadsPerBlock2;

            cout<<"blocksPerGrid: "<<blocksPerGrid<<endl;
            cout<<"blocksPerGrid2: "<<blocksPerGrid2<<endl;

            cout << "BTR_train" << row << "C" << C << endl;
            readMatrixfromTXT<float>(fileName, C, R, BTR_train);

            // Pick the configured device; fall back to an existing one when the
            // machine has fewer GPUs than the hard-coded ordinals assume.
            temp_gpu = gpu[flag];
            int deviceCount = 0;
            CUDA_CHECK_RETURN(cudaGetDeviceCount(&deviceCount));
            if (deviceCount <= 0)
            {
                fprintf(stderr, "CRunGPU: no CUDA device available\n");
                exit(1);
            }
            if (temp_gpu < 0 || temp_gpu >= deviceCount)
            {
                const int fallback = ((temp_gpu % deviceCount) + deviceCount) % deviceCount;
                fprintf(stderr, "CRunGPU: GPU %d not present (%d device(s)), using GPU %d\n",
                        temp_gpu, deviceCount, fallback);
                temp_gpu = fallback;
            }

            cout << "malloc and copy " << endl;
            CUDA_CHECK_RETURN(cudaSetDevice(temp_gpu));
            CUDA_CHECK_RETURN(cudaStreamCreate(&temp_stream));

            CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_first_thr, 3*blocksPerGrid2* sizeof(float)));
            CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_BTR_train, C*row*sizeof(float)));
            CUDA_CHECK_RETURN(cudaMemcpyAsync(dev_BTR_train, BTR_train, C*row*sizeof(float), cudaMemcpyHostToDevice, temp_stream));
            cout << "start calculate" << endl;
            CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_tosort, blocksPerGrid * 3* sizeof(float)));
            dev_result = dev_first_thr;

            sort_block = (float *)malloc(blocksPerGrid2 * 3 * sizeof(float));
            if (!sort_block)
            {
                fprintf(stderr, "CRunGPU: out of host memory for %s\n", fileName);
                exit(1);
            }
            cout<<"malloc sort_block successful"<<endl;
        }

        // Enqueues the distance kernel followed by as many reduction passes as are
        // needed to get down to three values. Everything is asynchronous on
        // temp_stream; the final three values end up in `dev_result`.
        void compute()
        {
            CUDA_CHECK_RETURN(cudaSetDevice(temp_gpu));

            // The kernels only use static shared memory, so none is requested here.
            CalDis_Sort<<<blocksPerGrid, threadsPerBlock, 0, temp_stream>>>(dev_BTR_train, dev_tosort, row, C, thdsPerblock_cal_dis);
            checkCUDAError("CalDis_Sort");

            // Ping-pong between the two device buffers. Every pass shrinks the
            // candidate count to 3 per block, so the destination always fits.
            float *src = dev_tosort;
            float *dst = dev_first_thr;
            int num = blocksPerGrid * 3;
            const int inputsPerBlock = 2 * threadsPerBlock2;
            do
            {
                const int blocks = (num + inputsPerBlock - 1) / inputsPerBlock;
                reduceComp<<<blocks, threadsPerBlock2, 0, temp_stream>>>(src, dst, num, threadsPerBlock2);
                checkCUDAError("reduceComp");

                dev_result = dst;   // remember where the latest output lives
                num = blocks * 3;
                float *swapTmp = src; src = dst; dst = swapTmp;
            } while (num > 3);
        }

        // Copies the three smallest squared distances back, waits for the stream
        // and returns the mean of their square roots.
        float getResult()
        {
            cout<<"get result"<<endl;
            CUDA_CHECK_RETURN(cudaSetDevice(temp_gpu));
            CUDA_CHECK_RETURN(cudaMemcpyAsync(sort_block, dev_result, 3* sizeof(float), cudaMemcpyDeviceToHost, temp_stream));
            // The copy is asynchronous: wait before touching sort_block on the host.
            CUDA_CHECK_RETURN(cudaStreamSynchronize(temp_stream));
            cout << fileName << "distance is" << endl;
            float result=(sqrt(sort_block[0])+sqrt(sort_block[1])+sqrt(sort_block[2]))/3;
            printf(":%f ",result);
            return result;
        }

        // Host helper: bubbles the three smallest elements of pData[0..count) to
        // the front in ascending order. The order of the remaining elements changes.
        void BubbleSortFirst3(float* pData/*array*/, int count/*the dimension of array*/)
        {
            float temp;
            for (int i = 0; i < 3; i++)
            {
                for (int j = count - 1; j > i; --j)
                {
                    if (pData[j] < pData[j - 1])
                    {
                        temp = pData[j - 1];
                        pData[j - 1] = pData[j];
                        pData[j] = temp;
                    }
                }
            }
        }

        // Releases device buffers, the stream and host buffers.
        virtual ~CRunGPU()
        {
            cudaSetDevice(temp_gpu);
            CUDA_CHECK_RETURN(cudaFree(dev_BTR_train));
            free(BTR_train);
            free(tosort_host);
            CUDA_CHECK_RETURN(cudaFree(dev_tosort));
            CUDA_CHECK_RETURN(cudaFree(dev_first_thr));
            // The stream created in the constructor was previously leaked.
            CUDA_CHECK_RETURN(cudaStreamDestroy(temp_stream));
            free(sort_block);
        }

    private:
        // The object owns raw device/host resources: copying is not supported.
        CRunGPU(const CRunGPU&);
        CRunGPU& operator=(const CRunGPU&);
    };


    //void runCPU(const char*fileName){}

    // Scale factor applied to the CPU scratch buffer size in CalDist().
    #define MULNUM 1
    // Number of timed repetitions of the GPU pipeline in main().
    #define GPUCYCLE 1000
    // Number of timed repetitions of the CPU reference in main().
    #define CPUCYCLE 10

    // Data files, one per candidate class (BTR, BMP, T).
    const char *fileNameBTR = "data/BTR.txt";const char *fileNameBMP = "data/BMP.txt";const char *fileNameT = "data/T.txt";

    // Loads `fileName` into a freshly malloc'ed matrix laid out the way
    // readMatrixfromTXT() produces it (row count padded to a multiple of
    // thdsPerblock_cal_dis). Reports the padded row count and the column count
    // through the out-parameters. Exits the process if the file has no rows or the
    // allocation fails. The caller owns the returned buffer (free()).
    static float *loadCpuMatrix(const char *fileName, int *paddedRows, int *cols)
    {
        *cols = getFileColumns(fileName);
        const int rows = getFileRows(fileName);
        *paddedRows = thdsPerblock_cal_dis * int((rows + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);

        float *matrix = NULL;
        if (rows > 0)
            matrix = (float*)malloc((*cols)*(*paddedRows)*sizeof(float));
        if (!matrix)
        {
            fprintf(stderr, "Cannot load matrix from %s (rows=%d)\n", fileName, rows);
            exit(1);
        }
        readMatrixfromTXT<float>(fileName, *cols, rows, matrix);
        return matrix;
    }

    // Benchmarks the GPU pipeline against the CPU reference on the three data sets
    // and prints the class whose CPU distance to the query vector is smallest.
    int main()
    {
        /********************************************
        / GPU go first
        /*******************************************/
        printf("***********************START OF GPU*********************** ");
        int row_M = getFileRows(fileNameBTR);
        int row_M1 = getFileRows(fileNameBMP);
        int row_M2 = getFileRows(fileNameT);
        cout << row_M << endl;
        cout << "creating array of stream and gpu success" << endl;

        CRunGPU btr(fileNameBTR, row_M, N,0) ;
        CRunGPU bmp(fileNameBMP, row_M1, N1,1) ;
        CRunGPU t(fileNameT, row_M2, N2,2) ;

        clock_t gpuParelledTime_b,gpuParelledTime_e;
        gpuParelledTime_b=clock();
        for(int i=0;i<GPUCYCLE;i++)
        {
            // Enqueue all three pipelines first so they can overlap, then wait.
            btr.compute();
            bmp.compute();
            t.compute();
            CUDA_CHECK_RETURN(cudaStreamSynchronize(btr.temp_stream));
            CUDA_CHECK_RETURN(cudaStreamSynchronize(bmp.temp_stream));
            CUDA_CHECK_RETURN(cudaStreamSynchronize(t.temp_stream));
        }
        gpuParelledTime_e=clock();
        double consume_gpu=(double)( gpuParelledTime_e - gpuParelledTime_b ) / CLOCKS_PER_SEC * 1000 /GPUCYCLE;
        printf("gpu_time=%f ", consume_gpu);
        btr.getResult();
        bmp.getResult();
        t.getResult();

        printf("***********************END OF GPU************************* ");
        printf(" ");
        printf(" ");

        /********************************************
        / CPU go second
        /*******************************************/

        printf("***********************START OF CPU*********************** ");
        int col = 0;
        int r_btr = 0, r_bmp = 0, r_t = 0;
        float distBMP = 0, distBTR = 0, distT = 0;

        float *dataMatrix1 = loadCpuMatrix(fileNameBTR, &r_btr, &col);
        cout << "col" << col << endl;
        float *dataMatrix2 = loadCpuMatrix(fileNameBMP, &r_bmp, &col);
        float *dataMatrix3 = loadCpuMatrix(fileNameT, &r_t, &col);

        clock_t start_cpu = clock();
        for (int cnt = 0; cnt<CPUCYCLE; ++cnt){
            distBTR = CalDist(dataMatrix1, r_btr, col);
            distBMP = CalDist(dataMatrix2, r_bmp, col);
            distT = CalDist(dataMatrix3, r_t, col);
        }
        clock_t end_cpu = clock();
        clock_t consume_cpu = (end_cpu - start_cpu);

        printf("The distance of BTR is %f ", distBTR);
        printf("The distance of BMP is %f ", distBMP);
        printf("The distance of T is %f ", distT);
        free(dataMatrix1);
        free(dataMatrix2);
        free(dataMatrix3);

        // Smallest distance wins; ties fall through to the later candidate.
        if (distBMP < distBTR)
        {
            if (distBMP < distT)
                printf("The type of picture is BMP ");
            else
                printf("The type of picture is T ");
        }
        else
        {
            if (distBTR < distT)
                printf("The type of picture is BTR ");
            else
                printf("The type of picture is T ");
        }
        printf("cpu_time=%f ", (double)consume_cpu / CLOCKS_PER_SEC * 1000 / CPUCYCLE);
        printf("***********************END OF CPU************************* ");
        printf("cpu_time/gpu_time=%f ", (double)consume_cpu / CLOCKS_PER_SEC * 1000 / consume_gpu / CPUCYCLE);

        return 0;
    }


    // Partial bubble sort: runs three bubbling passes from the back of the array so
    // that the three smallest of pData[0..count) end up, in ascending order, in
    // pData[0..2]. The remaining elements are left in whatever order the passes
    // produce. Arrays with fewer than two elements are left untouched.
    void BubbleSort(float* pData/*array*/, int count/*the dimension of array*/)
    {
        int pass = 1;
        while (pass <= 3)
        {
            // Walk from the tail towards slot `pass - 1`, sinking the smaller value.
            int pos = count - 1;
            while (pos >= pass)
            {
                float &front = pData[pos - 1];
                float &back = pData[pos];
                if (back < front)
                {
                    const float held = front;
                    front = back;
                    back = held;
                }
                --pos;
            }
            ++pass;
        }
    }

    // Reads a whitespace-separated text matrix (one row per line) into `matrix`.
    //
    // fileName  : path of the text file
    // numColumn : number of columns to read per line; extra values are ignored
    // numRow    : number of data rows in the file
    // matrix    : output buffer of at least numColumn * numRow2 elements, where
    //             numRow2 is numRow rounded up to a multiple of thdsPerblock_cal_dis
    //
    // Layout: element (row, col) is stored at matrix[col * numRow2 + row].
    // Rows numRow .. numRow2-1 are padding: they repeat the last data line, except
    // column 0, which is set to 1000.
    // Lines holding fewer than numColumn values leave the missing entries untouched.
    // Exits the process if the file cannot be opened; does nothing when numRow <= 0.
    template<typename T>
    void readMatrixfromTXT(const char *fileName, const int numColumn, const int numRow, T *matrix)
    {
        std::ifstream fin(fileName);
        if (!fin)
        {
            std::cerr << "Failed to open matrix file: " << fileName << std::endl;
            exit(1);
        }
        // Without rows there is nothing to read (previously this wrote matrix[-1]).
        if (numRow <= 0)
            return;

        const int numRow2 = thdsPerblock_cal_dis * int((numRow + thdsPerblock_cal_dis-1) / thdsPerblock_cal_dis);
        std::string line;
        float tmp;

        // All rows but the last: one line each, capped at numColumn values so an
        // over-long line cannot run past the end of the buffer.
        for (int i = 0; i < numRow - 1; i++){
            std::getline(fin, line);
            std::istringstream istr(line);
            for (int j = 0; j < numColumn && (istr >> tmp); ++j)
                matrix[j*numRow2 + i] = tmp;
        }

        // Last row: parse once, then replicate into every padding row.
        std::getline(fin, line);
        fin.close();
        std::vector<float> lastRow;
        {
            std::istringstream istr(line);
            while ((int)lastRow.size() < numColumn && (istr >> tmp))
                lastRow.push_back(tmp);
        }
        for (int r = numRow - 1; r < numRow2; ++r)
            for (int j = 0; j < (int)lastRow.size(); ++j)
                matrix[j*numRow2 + r] = lastRow[j];

        // Overwrite column 0 of the padding rows with 1000.
        for (int i = numRow; i < numRow2; i++)
            matrix[i] = 1000;
    }

    // Returns the column count of a data file. The file is never inspected: the
    // count is fixed at 14 regardless of `fileName`.
    int getFileColumns(const char *fileName){
        (void)fileName;
        static const int kColumnsPerRow = 14;
        return kColumnsPerRow;
    }

    // Returns the number of lines (rows) in `fileName`, or 0 when the file cannot
    // be opened. A final line without a trailing newline is counted as well.
    int getFileRows(const char *fileName){
        std::ifstream fileStream(fileName, std::ios::in);
        if (!fileStream)
            return 0;

        // Count newline-terminated records. The earlier ' ' delimiter counted
        // space-separated tokens instead of rows.
        std::string tmp;
        int count = 0;
        while (std::getline(fileStream, tmp))
            ++count;
        return count;
    }

    // CPU reference for the GPU pipeline: mean of the three smallest Euclidean
    // distances between the query vector `test1` and the rows of `mat`.
    //
    // mat : matrix with element (r, c) at mat[c*row + r]
    // row : number of rows, which is also the column stride
    // col : number of columns (unused; columns 0..12 are read)
    //
    // A row only takes part when |test1[0] - column0| <= 10; the distance is taken
    // over columns 1..12. When fewer than three rows qualify, the missing distances
    // are taken as sqrt(inf), mirroring the `inf` padding of the GPU reduction
    // (previously uninitialised memory was read in that case).
    float CalDist(float*mat, int row, int col){
        (void)col;

        // Sized by the actual candidate count instead of a fixed 181-float buffer,
        // which overflowed as soon as more rows passed the filter.
        std::vector<float> dists;
        if (row > 0)
            dists.reserve(row);

        for (int i = 0; i < row; i++){
            // fabsf: plain abs() may resolve to the int overload and truncate.
            if (fabsf(test1[0] - mat[i]) > 10.0f)
                continue;
            float sum = 0;
            for (int k = 1; k < 13; k++){
                const float diff = test1[k] - mat[i + k*row];
                sum += diff*diff;
            }
            dists.push_back(sqrtf(sum));
        }

        while (dists.size() < 3)
            dists.push_back(sqrtf((float)inf));

        // Only the three smallest values are needed, in ascending order.
        std::partial_sort(dists.begin(), dists.begin() + 3, dists.end());
        return (dists[0] + dists[1] + dists[2])/3;
    }

  • 相关阅读:
    jQuery使用(十一):jQuery实例遍历与索引
    jQuery使用(十):jQuery实例方法之位置、坐标、图形(BOM)
    BOM:浏览器对象模型之浏览器剖析入门
    源码来袭:bind手写实现
    源码来袭:call、apply手写实现与应用
    浏览器UI多线程及JavaScript单线程运行机制的理解
    jQuery使用(九):队列及实现原理、基于队列模拟实现animate()
    原生JavaScript运动功能系列(五):定时定点运动
    原生JavaScript运动功能系列(四):多物体多值链式运动
    原生JavaScript运动功能系列(三):多物体多值运动
  • 原文地址:https://www.cnblogs.com/Erdos001/p/4544775.html
Copyright © 2020-2023  润新知