• [CUDA Programming] Sum Reduction


    Parallel reduction in CUDA

    atomicAdd call error

    Calling atomicAdd in CUDA code kept failing with an "undefined identifier" error. After searching online, I made the following change:

    Right-click the project and open Properties -> Configuration Properties -> CUDA C/C++ -> Device -> Code Generation, add compute_20,sm_20, and uncheck the "Inherit from parent or project defaults" box below it.
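
    The equivalent when compiling with nvcc directly is the -gencode flag. A minimal sketch of a kernel that needs this setting (the file name in the comment is illustrative; atomicAdd on float requires compute capability 2.0 or higher):

    // atomic_demo.cu -- build with e.g.: nvcc -gencode arch=compute_20,code=sm_20 atomic_demo.cu
    __global__ void add_all(float *data, float *sum, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            atomicAdd(sum, data[i]);   // one atomic accumulation per element
    }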

    Timing on the CPU and GPU

    // CPU timing
    #include <time.h>
    clock_t start,end;
    start  = clock();
    //cpu codes
    end = clock();
    printf("CPU Time: %.5f
    ", (float)(end-start));
    
    // GPU timing
    cudaEvent_t st,ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    cudaEventRecord(st,0);
    //gpu codes
    cudaEventRecord(ed,0);
    cudaEventSynchronize(ed);
    float gpu_time;
    cudaEventElapsedTime(&gpu_time,st,ed);
    printf("GPU Time: %.5f
    ",gpu_time);
    
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
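
    Note that clock() returns ticks rather than milliseconds; dividing by CLOCKS_PER_SEC puts the CPU measurement in the same units as cudaEventElapsedTime. A small sketch, reusing start/end from the CPU snippet above:

    // clock() counts ticks; CLOCKS_PER_SEC converts ticks to seconds
    float cpu_ms = (float)(end - start) / CLOCKS_PER_SEC * 1000.0f;
    printf("CPU Time: %.5f ms\n", cpu_ms);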
    
    

    Sum reduction

    #include <stdlib.h>
    #include <stdio.h>
    #include <cuda.h>
    #include <device_launch_parameters.h>
    #include <cuda_runtime.h>
    #include <book.h>
    
    const int Size = 256;
    const int block = 8;
    const int thread = 32;
    
    __global__ void calc(float *in, float *out){
    	unsigned int tid = threadIdx.x;
    	unsigned int bid = blockIdx.x;
    
    	//target array
    	float * target = in + blockIdx.x * blockDim.x;
    
    	//bounding
    	if(tid >= thread)
    		return;
    
    	for(int stride = 1 ; stride < blockDim.x ; stride *= 2)
    	{
    		if(tid % (stride*2) == 0)
    		{
    			target[tid] += target[tid+stride];
    		}
    		__syncthreads();
    	}
    
    	if(tid == 0)
    	{
    		out[blockIdx.x] = target[tid];
    	}
    }
    
    __global__ void calc2(float *in, float *out)
    {
    	unsigned int tid = threadIdx.x;
    	unsigned int bid = tid + blockIdx.x*blockDim.x;
    
    	float * target = in + blockIdx.x * blockDim.x;
    
    	//bounding
    	if(tid >= thread)
    		return;
    	//stride = 1,2,4,...,blockDim.x/2
    	for(int stride = 1 ; stride < blockDim.x ; stride *= 2)
    	{
    		unsigned int index = 2*stride*tid;
    		if(index < blockDim.x)
    			target[index] += target[index+stride];
    		__syncthreads();
    	}
    
    	if(tid == 0)
    	{
    		out[blockIdx.x] = target[tid];
    	}
    }
    
    //strided reduction: stride starts at blockDim.x/2 and halves each pass
    __global__ void calc3(float *in, float *out)
    {
    	unsigned int tid = threadIdx.x;
    	unsigned int bid = tid + blockIdx.x*blockDim.x;
    
    	float * target = in + blockIdx.x * blockDim.x;
    
    	//bounding
    	if(tid >= thread)
    		return;
    
    	for(int stride = blockDim.x/2 ; stride > 0 ; stride /=2)
    	{
    		if(tid < stride)
    			target[tid] += target[tid+stride];
    		__syncthreads();
    	}
    	if(tid == 0)
    	{
    		out[blockIdx.x] = target[tid];
    	}
    } 
    
    __global__ void calc4(float *in, float *out)
    {
    	int tid = threadIdx.x;
    	int bid = blockIdx.x;
    	
    	float * target=in + bid * blockDim.x;
    
    	if(tid >= thread)	// guard: only threads 0..thread-1 participate
    		return;
    	__shared__ float share_in[thread];
    
    	share_in[tid] = target[tid];
    
    	__syncthreads();
    
    	for(int stride = blockDim.x/2 ; stride > 0; stride /= 2)
    	{
    		if(tid < stride)
    		{
    			share_in[tid] += share_in[tid+stride];
    		}
    		__syncthreads();
    	}
    	if(tid == 0)
    	{
    		out[blockIdx.x] = share_in[tid];
    	}
    }
    
    int main()
    {
    	//host
    	float * indata; // Size
    	float * outdata; // block
    	float * ans; // 1
    
    	// device
    	float * dev_indata; // Size
    	float * dev_outdata; // block
    
    	// host malloc
    	indata = (float*)malloc(sizeof(float)*Size);
    	outdata = (float*)malloc(sizeof(float)*block);
    	ans = (float*)malloc(sizeof(float));
    
    	// device malloc
    	cudaMalloc((void**)&dev_indata,sizeof(float)*Size);
    	cudaMalloc((void**)&dev_outdata,sizeof(float)*block);
    
    	// init & generate data
    	for(int i = 0 ; i < Size ; i++)
    	{
    		indata[i] = i;
    	}
    	*ans = 0;
    
    	// time start
    	cudaEvent_t st,ed;
    	cudaEventCreate(&st);
    	cudaEventCreate(&ed);
    	cudaEventRecord(st,0);
    
    	// memcpy to device
    	HANDLE_ERROR(cudaMemcpy(dev_indata,indata,sizeof(float)*Size,cudaMemcpyHostToDevice));
    
    	// kernel function
    	cudaDeviceSynchronize();
    	calc4<<<block,thread>>>(dev_indata,dev_outdata);
    	cudaDeviceSynchronize();
    
    	// memcpy to host
    	HANDLE_ERROR(cudaMemcpy(outdata,dev_outdata,sizeof(float)*block,cudaMemcpyDeviceToHost));
    
    	// time end
    	cudaEventRecord(ed,0);
    	cudaEventSynchronize(ed);
    
    	float gpu_time;
    	cudaEventElapsedTime(&gpu_time,st,ed);
    	
    	// test output
    	for(int i = 0 ; i < block ; i++)
    	{
    		//printf("%.3f
    ",outdata[i]);
    		*ans += outdata[i];
    	}
    	printf("GPU Time: %.5f
    Ans: %.5f
    ",gpu_time,*ans);
    
    	//time destroy
    	cudaEventDestroy(st);
    	cudaEventDestroy(ed);
    
    	//free device and host memory
    	cudaFree(dev_indata);
    	cudaFree(dev_outdata);
    	free(indata);
    	free(outdata);
    	free(ans);
    
    	getchar();
    
    	return 0;
    }
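
    The program includes book.h from "CUDA by Example" only for the HANDLE_ERROR macro. If that header is not available, a minimal stand-in along these lines should work (a sketch, not the book's exact code):

    static void HandleError(cudaError_t err, const char *file, int line)
    {
        if (err != cudaSuccess)
        {
            printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
            exit(EXIT_FAILURE);
        }
    }
    #define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))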
    

    Matrix multiplication

    #include <stdlib.h>
    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <cuda.h>
    #include <device_launch_parameters.h>
    
    const int N = 20;
    
    __global__ void mul(int *a,int* b,int *out)
    {
      unsigned int tidx = threadIdx.x;
      unsigned int tidy = threadIdx.y;
    
      unsigned int offset = tidx*N + tidy;
    
      if(offset >= N*N)return;
    
      int t = 0;
      for(int i = 0 ; i < N ; i++)
      {
        t += a[tidx*N+i]*b[i*N+tidy];
      }
      out[offset] = t;
    }
    
    int main()
    {
      //host
      int * matrix1;
      int * matrix2;
      int * output;
    
      //device
      int * dev_matrix1;
      int * dev_matrix2;
      int * dev_output;
    
      //host malloc
      matrix1 = (int*)malloc(sizeof(int)*N*N);
      matrix2 = (int*)malloc(sizeof(int)*N*N);
      output = (int*)malloc(sizeof(int)*N*N);
    
      //device malloc
      cudaMalloc((void**)&dev_matrix1,sizeof(int)*N*N);
      cudaMalloc((void**)&dev_matrix2,sizeof(int)*N*N);
      cudaMalloc((void**)&dev_output,sizeof(int)*N*N);
    
      //init generate data
      for(int i = 0 ; i < N*N ; i++)
      {
        matrix1[i] = i+1;
        matrix2[i] = i+1;
        output[i] = 0;
      }
    
      //CPU
      for(int i = 0 ; i < N ; i++)
      {
        for(int j = 0 ; j < N ; j++){
          int tp = 0;
          for(int k = 0 ; k < N ; k++)
          {
            tp += matrix1[i*N+k] * matrix2[k*N+j];
          }
          printf("%d ",tp);
        }
      }
      printf("
    ----------
    ");
    
      //time start
      cudaEvent_t st,ed;
      cudaEventCreate(&st);
      cudaEventCreate(&ed);
      cudaEventRecord(st,0);
    
      //memcpy to device
      cudaMemcpy(dev_matrix1,matrix1,sizeof(int)*N*N,cudaMemcpyHostToDevice);
      cudaMemcpy(dev_matrix2,matrix2,sizeof(int)*N*N,cudaMemcpyHostToDevice);
    
      //kernel function (the kernel ignores blockIdx, so one block of N*N threads covers the whole matrix)
      mul<<<1,dim3(N,N)>>>(dev_matrix1,dev_matrix2,dev_output);
    
      //memcpy to host
      cudaMemcpy(output,dev_output,sizeof(int)*N*N,cudaMemcpyDeviceToHost);
    
      //output
      for(int i = 0 ; i < N*N ; i++)
      {
        printf("%d ",output[i]);
      }
      printf("
    ");
    
      //time end
      cudaEventRecord(ed,0);
      cudaEventSynchronize(ed);
      float gpu_time;
      cudaEventElapsedTime(&gpu_time,st,ed);
      printf("gpu time: %.5f
    ",gpu_time);
    
      //time destroy
      cudaEventDestroy(st);
      cudaEventDestroy(ed);
    
      //free device and host memory
      cudaFree(dev_matrix1);
      cudaFree(dev_matrix2);
      cudaFree(dev_output);
      free(matrix1);
      free(matrix2);
      free(output);
    
      return 0;
    }
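
    The kernel above derives its indices from threadIdx only, so it is limited to a single block of N*N threads. A sketch of a blockIdx-aware variant that would scale to larger matrices (the launch shape in the trailing comment is an illustrative assumption):

    __global__ void mul_blocked(int *a, int *b, int *out)
    {
      // global row/column of the output element this thread computes
      unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
      unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
      if(row >= N || col >= N) return;

      int t = 0;
      for(int i = 0 ; i < N ; i++)
        t += a[row*N+i] * b[i*N+col];
      out[row*N+col] = t;
    }
    // possible launch:
    //   dim3 bs(16,16);
    //   dim3 gs((N+bs.x-1)/bs.x, (N+bs.y-1)/bs.y);
    //   mul_blocked<<<gs,bs>>>(dev_matrix1,dev_matrix2,dev_output);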
    
    

    Matrix transpose

    #include <iostream>
    #include <stdlib.h>
    #include <stdio.h>
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    const int N = 5;
    
    void output(int * arr)
    {
      for(int i = 0 ; i < N*N ; i++)
      {
        printf("%d	",arr[i]);
        if((i+1) % N == 0)
          printf("
    ");
      }
      printf("
    ");
    }
    
    __global__ void trans(int * in, int * out)
    {
    
      unsigned int xIndex = threadIdx.x + blockDim.x * blockIdx.x;
      unsigned int yIndex = threadIdx.y + blockDim.y * blockIdx.y;
    
      if(xIndex < N && yIndex < N)
      {
        unsigned int index_in = xIndex + N * yIndex;
        unsigned int index_out = yIndex + N * xIndex;
        out[index_out] = in[index_in];
      }
    }
    
    __global__ void trans2(int * in , int * out)
    {
      __shared__ int block[N][N];   // tile buffer; int to match the data type
      unsigned int xIndex = blockIdx.x * N + threadIdx.x;
      unsigned int yIndex = blockIdx.y * N + threadIdx.y;
      if((xIndex < N) && (yIndex < N))
      {
        unsigned int index_in = yIndex * N +xIndex;
        block[threadIdx.x][threadIdx.y] = in[index_in];
      }
    
      __syncthreads();
    
      xIndex = blockIdx.y * N + threadIdx.x;
      yIndex = blockIdx.x * N + threadIdx.y;
      if((xIndex < N) && (yIndex < N))
      {
        unsigned int index_out = yIndex * N + xIndex;
        out[index_out] = block[threadIdx.x][threadIdx.y];
      }
    }
    
    
    int main()
    {
      //host
      int * in;
      int * out;
    
      //device
      int * dev_in;    
      int * dev_out;
    
      //host malloc
      in = (int*)malloc(sizeof(int)*N*N);
      out = (int*)malloc(sizeof(int)*N*N);
    
      //device cudaMalloc
      cudaMalloc((void**)&dev_in,sizeof(int)*N*N);
      cudaMalloc((void**)&dev_out,sizeof(int)*N*N);
    
      //init
      for(int i = 0 ; i < N*N ; i++){
        in[i] = i+1;
      }
    
      //cudaMemcpy
      cudaMemcpy(dev_in,in,sizeof(int)*N*N,cudaMemcpyHostToDevice);
    
      //kernel functions
      trans<<<1,dim3(N,N)>>>(dev_in,dev_out);
    
      //memcpy back
      cudaMemcpy(out,dev_out,sizeof(int)*N*N,cudaMemcpyDeviceToHost);
    
      //print input and result
      output(in);
      printf("\n--------\n");
      output(out);
    
      //cudaFree
      cudaFree(dev_in);
      cudaFree(dev_out);
      free(in);
      free(out);
    
      return 0;
    }
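
    trans2 (the shared-memory version) is defined but never launched in main; a minimal sketch of trying it with the same single-block grid, which works here because the tile size equals N:

      trans2<<<1,dim3(N,N)>>>(dev_in,dev_out);
      cudaDeviceSynchronize();

    A common refinement is to pad the shared array (e.g. block[N][N+1]) so that column accesses do not all hit the same shared-memory bank.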
    
    

    Counting occurrences (histogram)

    #include <iostream>
    #include <stdlib.h>
    #include <stdio.h>
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    const int N = 26;
    const int L = 128;
    const int S = L*4;
    const int block = 4;
    const int thread = 32;
    
    __global__ void rec(char* book, int * record)
    {
      unsigned int tid = threadIdx.x;
    
      __shared__ int temp[N];
    
      if(tid < N)                 // temp has N (=26) counters but the block has `thread` (=32) threads
        temp[tid] = 0;
    
      __syncthreads();
    
      int index = tid + blockIdx.x * blockDim.x;
      int offset = blockDim.x * gridDim.x;
      //printf("%d-%d
    ",index,offset);
      while(index < S)
      {
        atomicAdd(&(temp[book[index]]),1);
        index += offset;
      }
      __syncthreads();
      atomicAdd(&(record[tid]),temp[tid]);
    }
    
    int main()
    {
       //host
       char * book;
       int * record;
    
       //device
       char * dev_book;
       int * dev_record;
    
       //host malloc
       book = (char*)malloc(sizeof(char)*S);
       record = (int*)malloc(sizeof(int)*N);
    
       //device malloc
       cudaMalloc((void**)&dev_book,sizeof(char)*S);
       cudaMalloc((void**)&dev_record,sizeof(int)*N);
    
       //init
       for(int i = 0 ; i < S ; i++)
       {
          srand(i+rand());
          book[i] = (i+i*i+rand())%26;
       }
    
       //cpu
       int tp[N]={0};
       for(int i = 0 ; i < S ; i++)
       {
         tp[book[i]]++;
       }
       for(int i = 0 ; i < N ; i++)
          printf("%d ",tp[i]);
       printf("
    ");
    
       //memcpy To device
       cudaMemcpy(dev_book,book,sizeof(char)*S,cudaMemcpyHostToDevice);
    
       //kernel function
       cudaMemset(dev_record,0,sizeof(int)*N);   // the kernel accumulates into dev_record, so zero it first
       rec<<<block,thread>>>(dev_book,dev_record);
       //memcpy To host
       cudaMemcpy(record,dev_record,sizeof(int)*N,cudaMemcpyDeviceToHost);
       //output
       for(int i = 0 ; i < N ; i++)
       {
         printf("%d ",record[i]);
       }
       printf("
    ");
    
       //cleanup
       cudaFree(dev_book);
       cudaFree(dev_record);
       free(book);
       free(record);
    
      return 0;
    }
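
    A small host-side check (a sketch) could go in main after both result arrays are printed, to confirm the GPU counts match the CPU counts in tp:

       int match = 1;
       for(int i = 0 ; i < N ; i++)
         if(tp[i] != record[i]) match = 0;
       printf(match ? "CPU and GPU counts match\n" : "counts differ\n");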
    
    

    Sum of squares

    Per-thread chunk partitioning

    #include <iostream>
    #include <stdlib.h>
    #include <stdio.h>
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <time.h>
    
    /*
    *  author : pprp
    *  theme : sum of squares
    */
    const int N = 128;
    const int block = 4;
    const int thread = 32;
    
    __global__ void calc0(int * arr, int * result)
    {
      int tid = threadIdx.x;
      int Size = N / block;
      int sum = 0;
      for(int i = tid * Size ; i <(tid+1)*Size; i++)
      {
        sum += arr[i]*arr[i];
      }
      result[tid] = sum;
      //printf("sum: %d
    ",sum);
    }
    
    int main()
    {
      //host
      int * arr;
      int * result;
    
      //device
      int * dev_arr;
      int * dev_result;
    
      //host malloc
      arr = (int*)malloc(sizeof(int)*N);
      result = (int*)malloc(sizeof(int)*block);
    
      //device malloc
      cudaMalloc((void**)&dev_arr,sizeof(int)*N);
      cudaMalloc((void**)&dev_result,sizeof(int)*block);
    
      //init
      for(int i = 0 ; i < N ; i++)
      {
        arr[i] = i+1;
        if(i < block)
        {
          result[i] = 0;
        }
      }
    
      //cpu
      clock_t start,end;
      start = clock();
      unsigned int res = 0;
      for(int i = 0 ; i < N ; i++)
      {
        res += arr[i]*arr[i];
      }
      end = clock();
      printf("cpu ans : %d
    cpu time: %.5f
    ",res,float(end-start));
    
      //time start
      cudaEvent_t st,ed;
      cudaEventCreate(&st);
      cudaEventCreate(&ed);
      cudaEventRecord(st,0);
    
      //memcpy to device
      cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);
    
      //kernel function: one block of `block` threads, each summing a contiguous chunk
      calc0<<<1,block>>>(dev_arr,dev_result);
      //memcpy to host
      cudaMemcpy(result,dev_result,sizeof(int)*block,cudaMemcpyDeviceToHost);
    
      //output
      int res2=0;
      for(int i = 0 ; i < block ; i++)
      {
        res2 += result[i];
        //printf("test: %d
    ",result[i]);
      }
    
      //time end
      cudaEventRecord(ed,0);
      cudaEventSynchronize(ed);
      float gpu_time;
      cudaEventElapsedTime(&gpu_time,st,ed);
      printf("gpu ans :%d
    gpu time: %.5f
    ",res2,gpu_time);
    
      //time destroy
      cudaEventDestroy(st);
      cudaEventDestroy(ed);
    
      //device free
      cudaFree(dev_arr);
      cudaFree(dev_result);
      free(arr);
      free(result);
    
      return 0;
    }
    
    

    Adjacent threads (interleaved access)

    #include <iostream>
    #include <stdlib.h>
    #include <stdio.h>
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <time.h>
    
    /*
    *  author : pprp
    *  theme : sum of squares
    */
    const int N = 128;
    const int block = 4;
    const int thread = 32;
    
    __global__ void calc0(int * arr, int * result)
    {
      int tid = threadIdx.x;
    
      if(tid >= block)return;
    
      int sum = 0;
      for(int i = tid; i < N ; i+=block)
      {
        sum += arr[i]*arr[i];
      }
      result[tid] = sum;
    }
    
    int main()
    {
      //host
      int * arr;
      int * result;
    
      //device
      int * dev_arr;
      int * dev_result;
    
      //host malloc
      arr = (int*)malloc(sizeof(int)*N);
      result = (int*)malloc(sizeof(int)*block);
    
      //device malloc
      cudaMalloc((void**)&dev_arr,sizeof(int)*N);
      cudaMalloc((void**)&dev_result,sizeof(int)*block);
    
      //init
      for(int i = 0 ; i < N ; i++)
      {
        arr[i] = i+1;
        if(i < block)
        {
          result[i] = 0;
        }
      }
    
      //cpu
      clock_t start,end;
      start = clock();
      unsigned int res = 0;
      for(int i = 0 ; i < N ; i++)
      {
        res += arr[i]*arr[i];
      }
      end = clock();
      printf("cpu ans : %d
    cpu time: %.5f
    ",res,float(end-start));
    
      //time start
      cudaEvent_t st,ed;
      cudaEventCreate(&st);
      cudaEventCreate(&ed);
      cudaEventRecord(st,0);
    
      //memcpy to device
      cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);
    
      //kernel functions
      calc0<<<1,block>>>(dev_arr,dev_result);
      //memcpy to host
      cudaMemcpy(result,dev_result,sizeof(int)*block,cudaMemcpyDeviceToHost);
    
      //output
      int res2=0;
      for(int i = 0 ; i < block ; i++)
      {
        res2 += result[i];
        //printf("test: %d
    ",result[i]);
      }
    
      //time end
      cudaEventRecord(ed,0);
      cudaEventSynchronize(ed);
      float gpu_time;
      cudaEventElapsedTime(&gpu_time,st,ed);
      printf("gpu ans :%d
    gpu time: %.5f
    ",res2,gpu_time);
    
      //time destroy
      cudaEventDestroy(st);
      cudaEventDestroy(ed);
    
      //device free
      cudaFree(dev_arr);
      cudaFree(dev_result);
      free(arr);
      free(result);
    
      return 0;
    }
    
    

    Multi-block computation

    #include <iostream>
    #include <stdlib.h>
    #include <stdio.h>
    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <time.h>
    
    /*
    *  author : pprp
    *  theme : sum of squares
    */
    const int N = 32;
    const int block = 4;
    const int thread = 8;
    
    __global__ void calc0(int * arr, int * result)
    {
      int tid = threadIdx.x;
      int bid = blockIdx.x;
     
    
      int sum = 0;
      for(int i = bid*blockDim.x+tid; i < N ; i += blockDim.x*gridDim.x)
      {
        sum += arr[i]*arr[i];
      }
      __syncthreads();
      result[bid*blockDim.x+tid] = sum;
      printf("++%d 
    ",sum);
    }
    
    int main()
    {
      //host
      int * arr;
      int * result;
    
      //device
      int * dev_arr;
      int * dev_result;
    
      //host malloc
      arr = (int*)malloc(sizeof(int)*N);
      result = (int*)malloc(sizeof(int)*N);
    
      //device malloc
      cudaMalloc((void**)&dev_arr,sizeof(int)*N);
      cudaMalloc((void**)&dev_result,sizeof(int)*N);
    
      //init
      for(int i = 0 ; i < N ; i++)
      {
        arr[i] = i+1;
        if(i < thread)
        {
          result[i] = 0;
        }
      }
    
      //cpu
      clock_t start,end;
      start = clock();
      unsigned int res = 0;
      for(int i = 0 ; i < N ; i++)
      {
        res += arr[i]*arr[i];
      }
      end = clock();
      printf("cpu ans : %d
    cpu time: %.5f
    ",res,float(end-start));
    
      //time start
      cudaEvent_t st,ed;
      cudaEventCreate(&st);
      cudaEventCreate(&ed);
      cudaEventRecord(st,0);
    
      //memcpy to device
      cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);
    
      //kernel functions
      calc0<<<block,thread>>>(dev_arr,dev_result);
      //memcpy to host
      cudaMemcpy(result,dev_result,sizeof(int)*N,cudaMemcpyDeviceToHost);
    
      //output
      int res2=0;
      for(int i = 0 ; i < N ; i++)
      {
        res2 += result[i];
        //printf("test: %d
    ",result[i]);
      }
    
      //time end
      cudaEventRecord(ed,0);
      cudaEventSynchronize(ed);
      float gpu_time;
      cudaEventElapsedTime(&gpu_time,st,ed);
      printf("gpu ans :%d
    gpu time: %.5f
    ",res2,gpu_time);
    
      //time destroy
      cudaEventDestroy(st);
      cudaEventDestroy(ed);
    
      //device free
      cudaFree(dev_arr);
      cudaFree(dev_result);
      free(arr);
      free(result);
    
      return 0;
    }
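
    The host still has to add up the copied-back partial results. Tying back to the atomicAdd section at the top, a sketch of a variant that folds everything into a single device scalar instead (assumes dev_total is a device int zeroed with cudaMemset before the launch):

    __global__ void calc_atomic(int * arr, int * dev_total)
    {
      int sum = 0;
      // grid-stride loop: each thread sums the squares of its share of elements
      for(int i = blockIdx.x*blockDim.x + threadIdx.x ; i < N ; i += blockDim.x*gridDim.x)
        sum += arr[i]*arr[i];
      atomicAdd(dev_total, sum);   // one atomic per thread merges the partial sums
    }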
    
    
  • Original post: https://www.cnblogs.com/pprp/p/9960146.html