• cuda实践5---cuda kdtree、octree


     cuda kdtree

    前言:将kdtree 查询部分移植到GPU端,在很多应用中对提高算法的执行效率很有帮助,本文使用英伟达GPU语言cuda,完成了kdtree GPU端的移植。

    步骤比较简单:1、cpu端 创建kdtree; 2、迁移kdtree node 节点到GPU端;3、GPU端实现临近检索   (注:里面会有很多处理小技巧,望相互学习)

    核心代码:

    cuda_kdtree.cu

    //main.cu
    #include <stdio.h>
    #include <iostream>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <math.h>
    
    #include "cuda_kdtree.h"
    
    void CheckCUDAError(const char *msg)
    {
        cudaError_t err = cudaGetLastError();
        if (cudaSuccess != err) {
            fprintf(stderr, "Cuda error: %s: %s.
    ", msg, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }
    
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    __device__ float distance3d(float* pt1, float* pt2)
    {
        float temp = sqrtf(
            (pt1[0] - pt2[0])*(pt1[0] - pt2[0]) +
            (pt1[1] - pt2[1])*(pt1[1] - pt2[1]) +
            (pt1[2] - pt2[2])*(pt1[2] - pt2[2]));
        return temp;
    }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    __device__ int findLeafNode_GPU(float* searchPoint,int current_node ,int node_num, CUDA_KDNode* kdnode_vector) {
    
        if (node_num == 0)
        {
            //std::cout << "CUDA_KDNode is empty ..." << std::endl;
            return -1;
        }
    
        while (true)
        {
            if (kdnode_vector[current_node].left == -1 && kdnode_vector[current_node].right == -1)   //leaf node
            {
                break;
            }
    
            int slipnum0 = 1;
            slipnum0 = kdnode_vector[current_node].dim;
            switch (slipnum0)
            {
            case 1:
                if (searchPoint[0] < kdnode_vector[current_node].split_value)
                {
                    current_node = kdnode_vector[current_node].left;
                }
                else
                {
                    current_node = kdnode_vector[current_node].right;
                }
                break;
    
            case 2:
                if (searchPoint[1] < kdnode_vector[current_node].split_value)
                {
                    current_node = kdnode_vector[current_node].left;
                }
                else
                {
                    current_node = kdnode_vector[current_node].right;
                }
                break;
    
            case 3:
                if (searchPoint[2] < kdnode_vector[current_node].split_value)
                {
                    current_node = kdnode_vector[current_node].left;
                }
                else
                {
                    current_node = kdnode_vector[current_node].right;
                }
                break;
            }
        }
    
        return current_node;
    }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    __device__ int findNode_GPU(float* searchPoint, float searchRadius, int current_node, int node_num, CUDA_KDNode* kdnode_vector) {
    
        if (node_num == 0)
        {
            //std::cout << "CUDA_KDNode is empty ..." << std::endl;
            return -1;
        }
    
        while (true)
        {
            if (kdnode_vector[current_node].left == -1 || kdnode_vector[current_node].right == -1)   //leaf node
            {
                break;
            }
    
            int slipnum0 = 1;
            slipnum0 = kdnode_vector[current_node].dim;
            float dis_temp = fabs(searchPoint[slipnum0 - 1] - kdnode_vector[current_node].split_value);  
            if (dis_temp < searchRadius)
            {
                break;
            }
    
            if (searchPoint[slipnum0-1] < kdnode_vector[current_node].split_value)
            {
                current_node = kdnode_vector[current_node].left;
            }
            else
            {
                current_node = kdnode_vector[current_node].right;
            }        
        }
    
        return current_node;
    }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    __device__ int searchRadius_GPU(
        float* pointCloud,int* pt_indexs, int node_num, CUDA_KDNode* kdnode_vector, 
        float* searchPoint,float searchRadius,
        int* searchIndex, float* searchDistance) 
    {
        if (node_num == NULL)
        {
            //std::cout << "CUDA_KDNode is empty ..." << std::endl;
            return -1;
        }
        int current_node = 0;   //根节点
        int templeafnode = 0;// findNode_GPU(searchPoint, searchRadius, current_node, node_num, kdnode_vector);
    
        float distanceTemp = 0.0;
        int temp = 0;
        for (int j = 0; j < kdnode_vector[templeafnode].numOfData; j++)
        {
            float pt2[3];
    
            pt2[0] = pointCloud[3 * pt_indexs[kdnode_vector[templeafnode].start_index + j]];
            pt2[1] = pointCloud[3 * pt_indexs[kdnode_vector[templeafnode].start_index + j] + 1];
            pt2[2] = pointCloud[3 * pt_indexs[kdnode_vector[templeafnode].start_index + j] + 2];
    
            distanceTemp = distance3d(searchPoint, pt2);
    
            if (searchRadius > distanceTemp && temp < maxN)
            {
                searchIndex[temp] = pt_indexs[kdnode_vector[templeafnode].start_index + j];
                searchDistance[temp] = distanceTemp;
                temp++;
            }
        }
    
        while (true)
        {
            int indexTemp = -1;
            float tempdis;
            int changetimes = 0;
    
            if (temp == 0)
            {
                break;
            }
    
            for (int i = 0; i < temp-1; i++)
            {
                if (searchDistance[i] > searchDistance[i + 1])  
                {
                    tempdis = searchDistance[i];
                    searchDistance[i] = searchDistance[i + 1];
                    searchDistance[i + 1] = tempdis;
    
                    indexTemp = searchIndex[i];
                    searchIndex[i] = searchIndex[i + 1];
                    searchIndex[i + 1] = indexTemp;
    
                    changetimes++;
                }
            }
    
            if (changetimes == 0)
            {
                break;   
            }
        }
    
        return temp;
    }
    
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    template <int BLOCK_SIZE> __global__ void runKNNSearchRadius_GPU(float* pointCloud, int* pt_indexs, int node_num, CUDA_KDNode* kdnode_vector,float searchRadius, 
        int* searchIndex_d, float* searchDistance_d)
    {
        int searchIndex[maxN];
        float searchDistance[maxN];
        for (int i = 0; i < maxN; i++)
        {
            searchIndex[i] = -999;
            searchDistance[i] = 999.;
        }
    
        int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
        int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
        
        float searchPoint[2];
        searchPoint[0] = Col * 1.0 / imgWidth_d;
        searchPoint[1] = (imgHeight_d - Row) * 1.0 / imgHeight_d;
    
        int searchNum = searchRadius_GPU(
            pointCloud, pt_indexs, node_num, kdnode_vector,
            searchPoint, searchRadius,
            searchIndex, searchDistance);
    
        int threadId = Row * imgWidth_d + Col;
        for (int i = 0; i < maxN; i++)
        {
            searchIndex_d[maxN*threadId + i] = searchIndex[i];
            searchDistance_d[maxN*threadId + i] = searchDistance[i];
        }
    
    }
    
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    CUDA_KDTREE::~CUDA_KDTREE()
    {
        cudaFree(m_gpu_nodes);
        cudaFree(m_gpu_indexes);
        cudaFree(m_gpu_points);
    }
    
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    void CUDA_KDTREE::createKDTree(std::vector<Point3d>& pointCloud, vector<CUDA_KDNode>& cpu_nodes, vector<int>& indexes)
    {
        kd::KDTree<Point3d> tree;
        tree.setInputPointCloud(pointCloud);
        tree.setNumOfLeafData(100);
        tree.buildKDTree();
    
        std::queue<kd::kdnode*> kdnodePtrQue;
        kdnodePtrQue.push(tree.ptree->root);
    
        std::vector<kd::kdnode*> kdnode_vect;
    
        //go through kdtree node
        int node_t = 0;
        while (!kdnodePtrQue.empty())
        {
            kd::kdnode* tempkdnode = kdnodePtrQue.front();
    
            if (tempkdnode==NULL)
            {
                kdnodePtrQue.pop();
                continue;
            }
    
            tempkdnode->node_index = node_t;
            kdnode_vect.push_back(tempkdnode);
    
            CUDA_KDNode gpu_node_temp;
            gpu_node_temp.dim = tempkdnode->dim;
            gpu_node_temp.nodelr = tempkdnode->nodelr;
            gpu_node_temp.numOfData = tempkdnode->numOfData;
            gpu_node_temp.split_value = tempkdnode->split_value;
            cpu_nodes.push_back(gpu_node_temp);
    
            kdnodePtrQue.pop();
            if (tempkdnode->left!=NULL)
            {
                kdnodePtrQue.push(tempkdnode->left);
            }
            if (tempkdnode->right != NULL)
            {
                kdnodePtrQue.push(tempkdnode->right);
            }
    
            node_t++;
        }
        
        int index_t = 0;
        for (int i = 0; i < node_t; i++)
        {
            if (kdnode_vect[i]->father==NULL)
                cpu_nodes[i].father = -1;
            else
                cpu_nodes[i].father = kdnode_vect[i]->father->node_index;
    
            if (kdnode_vect[i]->left == NULL)
                cpu_nodes[i].left = -1;
            else
                cpu_nodes[i].left = kdnode_vect[i]->left->node_index;
    
            if (kdnode_vect[i]->right == NULL)
                cpu_nodes[i].right = -1;
            else
                cpu_nodes[i].right = kdnode_vect[i]->right->node_index;
    
            cpu_nodes[i].start_index = index_t;
            index_t += cpu_nodes[i].numOfData;
    
            for (int j = 0; j < cpu_nodes[i].numOfData; j++)
            {
                indexes.push_back(kdnode_vect[i]->data[j]);
            }
        }
    }
    
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    int CUDA_KDTREE::search(std::vector<Point3d>& pointCloud, vector<CUDA_KDNode>& cpu_nodes, vector<int>& indexes, int* queries_indexes, float *queries_dists)
    {
        // Create the nodes again on the CPU, laid out nicely for the GPU transfer
        int m_num_index = indexes.size();
        int num_nodes = cpu_nodes.size();
        int point_num = pointCloud.size();
    
        //int* queries_indexes = NULL;
        //float *queries_dists = NULL;
        //queries_indexes = (int*)malloc(sizeof(int) * imgHeight_d * imgWidth_d* maxN);
        //queries_dists = (float*)malloc(sizeof(int) * imgHeight_d * imgWidth_d* maxN);
    
        int* searchIndex_d = NULL;
        float* searchDistance_d = NULL;
    
        cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode)*num_nodes);
        cudaMalloc((void**)&m_gpu_indexes, sizeof(int)*m_num_index);
        cudaMalloc((void**)&m_gpu_points, sizeof(float) * 3 * point_num);
    
        cudaMalloc((void**)&searchIndex_d, sizeof(int) * imgHeight_d * imgWidth_d* maxN);
        cudaMalloc((void**)&searchDistance_d, sizeof(float) * imgHeight_d * imgWidth_d* maxN);
    
        cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode) * num_nodes, cudaMemcpyHostToDevice);
        cudaMemcpy(m_gpu_indexes, &indexes[0], sizeof(int) * m_num_index, cudaMemcpyHostToDevice);
        cudaMemcpy(m_gpu_points, &pointCloud[0], sizeof(float) * 3 * point_num, cudaMemcpyHostToDevice);
    
        int BLOCK_SIZE = 16;
        // Setup execution parameters
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);   //16*16 或者 32*32  block大小
        dim3 grid(imgWidth_d / block.x, imgHeight_d / block.y);   //计算grid 大小
    
        runKNNSearchRadius_GPU<16><<<grid, block>>> (m_gpu_points, m_gpu_indexes, num_nodes, m_gpu_nodes, 0.02, searchIndex_d, searchDistance_d);
    
        cudaError_t cudaStatus;
        cudaStatus = cudaGetLastError();
        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.
        cudaStatus = cudaDeviceSynchronize();
    
        cudaMemcpy(&queries_indexes[0], searchIndex_d, sizeof(int)*imgHeight_d * imgWidth_d* maxN, cudaMemcpyDeviceToHost);
        cudaMemcpy(&queries_dists[0], searchDistance_d, sizeof(float)*imgHeight_d * imgWidth_d* maxN, cudaMemcpyDeviceToHost);
    
        CheckCUDAError("CreateKDTree");
    
        // free 
        cudaFree(searchIndex_d);
        cudaFree(searchDistance_d);
    
        return 0;
    }

    参考:

    https://github.com/vincentfpgarcia/kNN-CUDA

    http://raytracingdiary.blogspot.com/2011/04/cuda-kd-tree-implementations.html

    http://nghiaho.com/?p=437   

  • 相关阅读:
    leetcode & lintcode 题解
    部署 Flask 应用时,为什么会需要 gunicorn 或 uWSGI?
    ubuntu vim python配置
    深度学习Momentum(动量方法)
    spark shuffle原理
    c++多态特性总结
    FM/FFM原理
    hadoop streaming怎么设置key
    归一化的优点和方法
    九章算法强化
  • 原文地址:https://www.cnblogs.com/lovebay/p/13454226.html
Copyright © 2020-2023  润新知