• CUDA 例程


    scalar add

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <cstdio>
    #include <cstdlib>
    #include <iostream>

    // Abort with file/line and the CUDA error string when a runtime call fails.
    // Without this, a failed call is silent and every later call fails mysteriously.
    #define CUDA_CHECK(call)                                                   \
        do {                                                                   \
            cudaError_t err_ = (call);                                         \
            if (err_ != cudaSuccess) {                                         \
                fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                        cudaGetErrorString(err_));                             \
                exit(EXIT_FAILURE);                                            \
            }                                                                  \
        } while (0)

    // Adds one pair of ints per block: block i computes c[i] = a[i] + b[i].
    // There is no bounds check, so the grid must have exactly as many blocks
    // as there are elements. All three pointers must be device pointers.
    __global__ void add(int *a, int *b, int *c)
    {
        c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
    }

    // Demo 1: add two scalars on the GPU with a hand-written kernel.
    // Demo 2: basic thrust::host_vector / thrust::device_vector usage.
    int main(void)
    {
        // ---- Demo 1: scalar add -------------------------------------------
        int a = 2;
        int b = 7;
        int c = 0;
        int *da = NULL;
        int *db = NULL;
        int *dc = NULL;
        const size_t size = 1 * sizeof(int);  // a single int (scalar)

        CUDA_CHECK(cudaMalloc((void **)&da, size));
        CUDA_CHECK(cudaMalloc((void **)&db, size));
        CUDA_CHECK(cudaMalloc((void **)&dc, size));

        CUDA_CHECK(cudaMemcpy(da, &a, size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(db, &b, size, cudaMemcpyHostToDevice));

        // One block of one thread is enough for a single element.
        add<<<1, 1>>>(da, db, dc);
        // A kernel launch returns nothing; launch errors must be polled explicitly.
        CUDA_CHECK(cudaGetLastError());

        // The copy is checked too, so an in-kernel fault is reported here.
        CUDA_CHECK(cudaMemcpy(&c, dc, size, cudaMemcpyDeviceToHost));
        std::cout << c << std::endl;

        CUDA_CHECK(cudaFree(da));
        CUDA_CHECK(cudaFree(db));
        CUDA_CHECK(cudaFree(dc));

        std::cout << "hell";

        // ---- Demo 2: Thrust vectors ---------------------------------------
        // H has storage for 4 integers
        thrust::host_vector<int> H(4);

        // initialize individual elements
        H[0] = 14;
        H[1] = 20;
        H[2] = 38;
        H[3] = 46;

        // H.size() returns the size of vector H
        std::cout << "H has size " << H.size() << std::endl;

        // print contents of H
        for (size_t i = 0; i < H.size(); i++)
            std::cout << "H[" << i << "] = " << H[i] << std::endl;

        // resize H
        H.resize(2);
        std::cout << "H now has size " << H.size() << std::endl;

        // Copy host_vector H to device_vector D
        thrust::device_vector<int> D = H;

        // elements of D can be modified
        D[0] = 99;
        D[1] = 88;

        // print contents of D
        for (size_t i = 0; i < D.size(); i++)
            std::cout << "D[" << i << "] = " << D[i] << std::endl;

        // H and D are automatically deleted when the function returns
        return 0;
    }
    block or thread

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <iostream>

    // Abort with file/line and the CUDA error string when a runtime call fails.
    #define CUDA_CHECK(call)                                                   \
        do {                                                                   \
            cudaError_t err_ = (call);                                         \
            if (err_ != cudaSuccess) {                                         \
                fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                        cudaGetErrorString(err_));                             \
                exit(EXIT_FAILURE);                                            \
            }                                                                  \
        } while (0)

    const int N = 512;  // number of elements in each vector

    // Element-wise vector add, one element per block: block i computes
    // c[i] = a[i] + b[i]. Launch as add<<<N, 1>>>; there is no bounds check,
    // so the number of blocks must equal the number of elements.
    __global__ void add(int *a, int *b, int *c)
    {
        c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
        // Thread-parallel variant (pair it with an add<<<1, N>>> launch):
        // c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
    }

    // Adds two N-element vectors on the GPU and prints the first 20 results.
    int main(void)
    {
        const size_t size = N * sizeof(int);

        // Host buffers, zero-filled; only a few entries are set below.
        int *a = (int *)malloc(size);
        int *b = (int *)malloc(size);
        int *c = (int *)malloc(size);
        if (a == NULL || b == NULL || c == NULL) {
            fprintf(stderr, "host allocation of %zu bytes failed\n", size);
            free(a);  // free(NULL) is a no-op
            free(b);
            free(c);
            return EXIT_FAILURE;
        }
        memset(a, 0, size);
        memset(b, 0, size);
        memset(c, 0, size);
        a[0] = 10;
        a[3] = 3;
        b[0] = 2;
        b[4] = 32;

        int *da = NULL;
        int *db = NULL;
        int *dc = NULL;
        CUDA_CHECK(cudaMalloc((void **)&da, size));
        CUDA_CHECK(cudaMalloc((void **)&db, size));
        CUDA_CHECK(cudaMalloc((void **)&dc, size));

        CUDA_CHECK(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice));

        // N blocks of 1 thread each. The alternative, add<<<1, N>>> (1 block of
        // N threads), needs the threadIdx.x body in the kernel instead.
        add<<<N, 1>>>(da, db, dc);
        // A kernel launch returns nothing; launch errors must be polled explicitly.
        CUDA_CHECK(cudaGetLastError());
        // Wait for the kernel so in-kernel faults are reported here rather than
        // at some later, unrelated call.
        CUDA_CHECK(cudaDeviceSynchronize());

        CUDA_CHECK(cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost));

        for (int i = 0; i < 20; i++)
            std::cout << c[i] << std::endl;

        free(a);
        free(b);
        free(c);
        CUDA_CHECK(cudaFree(da));
        CUDA_CHECK(cudaFree(db));
        CUDA_CHECK(cudaFree(dc));
        return 0;
    }
    block+thread
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <iostream>

    /*
     * Combining blocks and threads.
     *
     *   N = 2048 * 2048   total number of elements
     *   M = 512           threads per block
     *
     *   add<<<(N + M - 1) / M, M>>>(d_a, d_b, d_c, N);
     *
     * (N + M - 1) / M is the ceiling of N / M, so the grid covers every
     * element even when N is not a multiple of M. The surplus threads in the
     * last block are masked off by the index < n test inside the kernel.
     */

    // Abort with file/line and the CUDA error string when a runtime call fails.
    #define CUDA_CHECK(call)                                                   \
        do {                                                                   \
            cudaError_t err_ = (call);                                         \
            if (err_ != cudaSuccess) {                                         \
                fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                        cudaGetErrorString(err_));                             \
                exit(EXIT_FAILURE);                                            \
            }                                                                  \
        } while (0)

    const int N = 2048 * 2048;  // number of elements in each vector
    const int M = 512;          // threads per block

    // Element-wise vector add over a 1D grid of 1D blocks: the thread with
    // global index i computes c[i] = a[i] + b[i] for i < n.
    //   a, b, c : device pointers to at least n ints each
    //   n       : number of valid elements; threads with index >= n do nothing
    __global__ void add(int *a, int *b, int *c, int n)
    {
        int index = threadIdx.x + blockIdx.x * blockDim.x;
        // Every access must sit behind the guard. An unguarded store ahead of
        // this test would run out of bounds whenever the grid over-covers n.
        if (index < n)
            c[index] = a[index] + b[index];
    }

    // Adds two N-element vectors on the GPU and prints the first 20 results.
    int main(void)
    {
        const size_t size = (size_t)N * sizeof(int);

        // Host buffers, zero-filled; only a few entries are set below.
        int *a = (int *)malloc(size);
        int *b = (int *)malloc(size);
        int *c = (int *)malloc(size);
        if (a == NULL || b == NULL || c == NULL) {
            fprintf(stderr, "host allocation of %zu bytes failed\n", size);
            free(a);  // free(NULL) is a no-op
            free(b);
            free(c);
            return EXIT_FAILURE;
        }
        memset(a, 0, size);
        memset(b, 0, size);
        memset(c, 0, size);
        a[0] = 10;
        a[3] = 3;
        b[0] = 2;
        b[4] = 32;

        int *da = NULL;
        int *db = NULL;
        int *dc = NULL;
        CUDA_CHECK(cudaMalloc((void **)&da, size));
        CUDA_CHECK(cudaMalloc((void **)&db, size));
        CUDA_CHECK(cudaMalloc((void **)&dc, size));

        CUDA_CHECK(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice));

        // Ceil-div so a partial final block is still launched.
        const int blocks = (N + M - 1) / M;
        add<<<blocks, M>>>(da, db, dc, N);
        // A kernel launch returns nothing; launch errors must be polled explicitly.
        CUDA_CHECK(cudaGetLastError());
        // Wait for the kernel so in-kernel faults are reported here rather than
        // at some later, unrelated call.
        CUDA_CHECK(cudaDeviceSynchronize());

        CUDA_CHECK(cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost));

        for (int i = 0; i < 20; i++)
            std::cout << c[i] << std::endl;

        free(a);
        free(b);
        free(c);
        CUDA_CHECK(cudaFree(da));
        CUDA_CHECK(cudaFree(db));
        CUDA_CHECK(cudaFree(dc));
        return 0;
    }
  • 相关阅读:
    JS-排序详解-选择排序
    JS-排序详解-快速排序
    JS-排序详解-冒泡排序
    正则表达式入门
    JS-最全的创建对象的方式
    用JS实现回文数的精准辨别!!!
    基本包装类型
    引用类型之Function类型
    引用类型之Array类型
    Object类型
  • 原文地址:https://www.cnblogs.com/huashiyiqike/p/3869093.html
Copyright © 2020-2023  润新知