• CUDA实战


    1.第一个程序,输出hello world,1个Block块中含有5个线程

     1 #include <stdio.h>
     2 #include "cuda_runtime.h"
     3 
     /*
      * Kernel: each launched thread prints one greeting line via device-side printf.
      *
      * Takes no parameters and touches no memory; the number of lines printed is
      * determined only by the launch configuration (one line per thread).
      * Device printf output is buffered and becomes visible on the host at a later
      * synchronization point.
      */
     __global__ void hello(void)
     {
         printf("hello world from GPU!\n");
     }
     /*
      * Host entry point: prints a greeting on the CPU, launches the hello kernel
      * with 1 block of 5 threads, waits for it, then resets the device.
      *
      * Returns 0 on success and 1 if the launch or the kernel execution reported
      * a CUDA error.
      */
     int main()
     {
         printf("hello world from CPU!\n");

         hello<<<1, 5>>>();

         // A kernel launch returns no status; configuration errors surface here.
         cudaError_t status = cudaGetLastError();
         if (status == cudaSuccess) {
             // Wait for the kernel so execution errors are reported and the
             // device-side printf buffer is flushed before teardown.
             status = cudaDeviceSynchronize();
         }
         if (status != cudaSuccess) {
             fprintf(stderr, "hello kernel failed: %s\n", cudaGetErrorString(status));
         }

         // Reset the CUDA device to release the resources held by this program.
         cudaDeviceReset();
         return status == cudaSuccess ? 0 : 1;
     }
    View Code

    2.参数的传入:向核函数按值传入两个整数,并在GPU端输出它们的和

     1 #include <stdio.h>
     2 #include "cuda_runtime.h"
     3 #include "device_launch_parameters.h"
     /*
      * Kernel: adds two integers passed by value and prints the sum with
      * device-side printf.
      *
      * i, j - the two addends (copied to the device as kernel arguments).
      * Every launched thread prints its own line, so the output is repeated once
      * per thread in the launch configuration.
      */
     __global__ void add(int i, int j)
     {
         const int count = i + j;
         printf("\n Sum is %d\n", count);
     }
    10 
    /*
     * Host entry point: launches the add kernel with a single thread on the
     * values 10 and 20, waits for it to finish and resets the device.
     *
     * Returns 0 on success and 1 if the launch or execution reported an error.
     */
    int main()
    {
        add<<<1, 1>>>(10, 20);

        // The launch itself returns nothing; pick up configuration errors first,
        // then wait for the kernel so execution errors and printf output arrive.
        cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess) {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess) {
            fprintf(stderr, "add kernel failed: %s\n", cudaGetErrorString(status));
        }

        cudaDeviceReset();
        return status == cudaSuccess ? 0 : 1;
    }
    View Code

     3.数据的传入与传出:数据需要先从主机内存拷贝到显存,计算完成后再把结果从显存拷贝回主机内存

     1 #include <stdio.h>
     2 #include "cuda_runtime.h"
     3 #include "device_launch_parameters.h"
     4 
     /*
      * Kernel: writes the sum of two by-value integers to a device location.
      *
      * a, b - operands, passed by value.
      * c    - pointer to a single int that receives the result; it is
      *        dereferenced inside the kernel.
      *
      * NOTE(review): despite the name "decrease", the stored value is a + b.
      * Every launched thread writes the same location, so the kernel is meant
      * for a single-thread launch.
      */
     __global__ void decrease(int a, int b, int *c)
     {
         const int result = a + b;
         c[0] = result;
     }
     /*
      * Host entry point: runs the decrease kernel on (15, 20) with one thread,
      * copies the single int result back to the host and prints it.
      *
      * Returns 0 on success and 1 if the host allocation or any CUDA call fails.
      */
     int main()
     {
         int *c = 0;
         int *dev_c = 0;

         // Allocate the host-side result buffer.
         c = (int*)malloc(sizeof(int));
         if (c == 0) {
             fprintf(stderr, "host malloc failed\n");
             return 1;
         }

         // Allocate the device-side result buffer.
         cudaError_t status = cudaMalloc((void**)&dev_c, sizeof(int));

         if (status == cudaSuccess) {
             // Launch the kernel; launch errors are only visible via cudaGetLastError.
             decrease<<<1, 1>>>(15, 20, dev_c);
             status = cudaGetLastError();
         }
         if (status == cudaSuccess) {
             // Wait until all device work has finished.
             status = cudaDeviceSynchronize();
         }
         if (status == cudaSuccess) {
             // Copy the result from device to host.
             status = cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
         }

         if (status == cudaSuccess) {
             printf(" c = %d\n", *c);
         } else {
             // Do not print *c here: it was never written on a failed run.
             fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
         }

         // Release memory; cudaFree on a null pointer performs no operation.
         cudaFree(dev_c);
         free(c);
         return status == cudaSuccess ? 0 : 1;
     }
    View Code

     4.传入的值全改为指针类型

     1 #include <stdio.h>
     2 #include "cuda_runtime.h"
     3 #include "device_launch_parameters.h"
     4 
     /*
      * Kernel: reads one int from each of a and b and stores their difference
      * (*a - *b) into *c. All three pointers are dereferenced inside the kernel.
      *
      * NOTE(review): the name says "add" but the operation is a subtraction;
      * confirm which one is intended.
      * Every launched thread writes the same location, so the kernel is meant
      * for a single-thread launch.
      */
     __global__ void addCuda(int* a, int* b, int* c)
     {
         const int lhs = a[0];
         const int rhs = b[0];
         c[0] = lhs - rhs;
     }
     9 
    /*
     * Host wrapper around the addCuda kernel.
     *
     * c - host pointer that receives the kernel result (one int).
     * a - host pointer to the first operand (one int).
     * b - host pointer to the second operand (one int).
     *
     * Allocates three single-int device buffers, copies *a and *b to the device,
     * runs addCuda with one thread, and copies the result back into *c.
     * If any CUDA call fails, a message is written to stderr, *c is left
     * untouched, and the device buffers are still released.
     */
    void addWithCuda(int *c, int *a, int *b)
    {
        int *dev_c = 0;
        int *dev_a = 0;
        int *dev_b = 0;

        // Allocate device memory; each step runs only if the previous one succeeded.
        cudaError_t status = cudaMalloc((void**)&dev_c, sizeof(int));
        if (status == cudaSuccess) {
            status = cudaMalloc((void**)&dev_a, sizeof(int));
        }
        if (status == cudaSuccess) {
            status = cudaMalloc((void**)&dev_b, sizeof(int));
        }

        // Copy the operands from host to device.
        if (status == cudaSuccess) {
            status = cudaMemcpy(dev_a, a, sizeof(int), cudaMemcpyHostToDevice);
        }
        if (status == cudaSuccess) {
            status = cudaMemcpy(dev_b, b, sizeof(int), cudaMemcpyHostToDevice);
        }

        // Launch the kernel; a launch reports errors only through cudaGetLastError.
        if (status == cudaSuccess) {
            addCuda<<<1, 1>>>(dev_a, dev_b, dev_c);
            status = cudaGetLastError();
        }
        if (status == cudaSuccess) {
            status = cudaDeviceSynchronize();
        }

        // Copy the result back to the host.
        if (status == cudaSuccess) {
            status = cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
        }

        if (status != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed: %s\n", cudaGetErrorString(status));
        }

        // Always release device memory; cudaFree on a null pointer does nothing.
        cudaFree(dev_c);
        cudaFree(dev_a);
        cudaFree(dev_b);
    }
    37 
    /*
     * Host entry point: passes the addresses of three local ints to
     * addWithCuda (result first, then the two operands), resets the device and
     * prints the result.
     */
    int main()
    {
        int a, b, c;
        a = 30;
        b = 15;
        c = 10;   // placeholder; addWithCuda stores the result here on success

        // Pass the variables by address.
        addWithCuda(&c, &a, &b);

        // Reset the CUDA device to release the resources held by this program.
        cudaDeviceReset();
        printf("Value is %d\n", c);

        return 0;
    }
    View Code

    5.传入的值全改为指针类型(主机端变量改用malloc动态分配,并直接在main中完成显存申请、数据拷贝与核函数调用)

     1 #include <stdio.h>
     2 #include "cuda_runtime.h"
     3 #include "device_launch_parameters.h"
     4 
     /*
      * Kernel: stores the difference of the ints pointed to by a and b
      * (*a - *b) into *c. All three pointers are dereferenced inside the kernel.
      * Every launched thread writes the same location, so the kernel is meant
      * for a single-thread launch.
      */
     __global__ void deCuda(int* a, int* b, int* c)
     {
         const int minuend = a[0];
         const int subtrahend = b[0];
         c[0] = minuend - subtrahend;
     }
     /*
      * Host entry point: heap-allocates three host ints (10, 5 and a zeroed
      * result), mirrors them in device memory, runs deCuda with one thread and
      * prints the value copied back from the device.
      *
      * Returns 0 on success and 1 if a host allocation or any CUDA call fails.
      */
     int main()
     {
         int *a = (int*)malloc(sizeof(int));
         int *b = (int*)malloc(sizeof(int));
         int *c = (int*)malloc(sizeof(int));
         if (a == 0 || b == 0 || c == 0) {
             fprintf(stderr, "host malloc failed\n");
             free(a);   // free on a null pointer is a no-op
             free(b);
             free(c);
             return 1;
         }
         *a = 10;
         *b = 5;
         *c = 0;

         int *dev_c = 0;
         int *dev_a = 0;
         int *dev_b = 0;

         // Request device memory; each step runs only if the previous one succeeded.
         cudaError_t status = cudaMalloc((void**)&dev_c, sizeof(int));
         if (status == cudaSuccess) {
             status = cudaMalloc((void**)&dev_a, sizeof(int));
         }
         if (status == cudaSuccess) {
             status = cudaMalloc((void**)&dev_b, sizeof(int));
         }

         // Copy the operands from host to device.
         if (status == cudaSuccess) {
             status = cudaMemcpy(dev_a, a, sizeof(int), cudaMemcpyHostToDevice);
         }
         if (status == cudaSuccess) {
             status = cudaMemcpy(dev_b, b, sizeof(int), cudaMemcpyHostToDevice);
         }

         // Launch the kernel; launch errors are only visible via cudaGetLastError.
         if (status == cudaSuccess) {
             deCuda<<<1, 1>>>(dev_a, dev_b, dev_c);
             status = cudaGetLastError();
         }

         // The blocking copy also waits for the kernel to finish.
         if (status == cudaSuccess) {
             status = cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
         }

         if (status == cudaSuccess) {
             printf("Value is %d\n", *c);
         } else {
             fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
         }

         cudaFree(dev_c);
         cudaFree(dev_a);
         cudaFree(dev_b);
         free(a);
         free(b);
         free(c);

         // Reset the CUDA device to release the resources held by this program.
         cudaDeviceReset();
         return status == cudaSuccess ? 0 : 1;
     }
    View Code

     6. 程序实现向量的加法操作,一个block中含有512个线程

     1 #include <stdio.h>
     2 #include <cuda_runtime.h>
     /*
      * Kernel: element-wise vector addition, dev_c[i] = dev_a[i] + dev_b[i],
      * one element per thread.
      *
      * dev_a, dev_b - input arrays, indexed by the global thread index.
      * dev_c        - output array, indexed the same way.
      *
      * Launch layout: 1D grid of 1D blocks. The index combines block and thread
      * coordinates, so a single-block launch behaves exactly as indexing by
      * threadIdx.x alone, and multi-block launches cover distinct elements.
      *
      * Precondition: the kernel takes no length argument and performs no bounds
      * check, so the total number of launched threads must not exceed the
      * number of elements in the arrays.
      */
     __global__ void add(int *dev_a, int *dev_b, int *dev_c)
     {
         const int i = blockIdx.x * blockDim.x + threadIdx.x;
         dev_c[i] = dev_a[i] + dev_b[i];
     }
     /*
      * Host entry point: adds two 512-element int vectors on the GPU using one
      * block of 512 threads and prints every element of the result.
      *
      * host_a[i] = i and host_b[i] = i << 1, so each output line shows
      * i + 2*i. Returns 0 on success and 1 if any CUDA call fails; device
      * buffers are released on every path.
      */
     int main()
     {
         const int kCount = 512;
         int host_a[kCount], host_b[kCount], host_c[kCount];
         for (int i = 0; i < kCount; i++)
         {
             host_a[i] = i;
             host_b[i] = i << 1;
         }

         int *dev_a = 0, *dev_b = 0, *dev_c = 0;

         // Check each allocation on its own; a later success must not hide an
         // earlier failure.
         cudaError_t err = cudaMalloc((void**)&dev_a, sizeof(int) * kCount);
         if (err == cudaSuccess) {
             err = cudaMalloc((void**)&dev_b, sizeof(int) * kCount);
         }
         if (err == cudaSuccess) {
             err = cudaMalloc((void**)&dev_c, sizeof(int) * kCount);
         }
         if (err != cudaSuccess)
         {
             fprintf(stderr, "cudaMalloc on GPU failed: %s\n", cudaGetErrorString(err));
             // cudaFree on a null pointer performs no operation.
             cudaFree(dev_c);
             cudaFree(dev_b);
             cudaFree(dev_a);
             return 1;
         }

         printf("SUCCESS\n");

         // Host to device.
         err = cudaMemcpy(dev_a, host_a, sizeof(host_a), cudaMemcpyHostToDevice);
         if (err == cudaSuccess) {
             err = cudaMemcpy(dev_b, host_b, sizeof(host_b), cudaMemcpyHostToDevice);
         }

         // Launch the kernel; launch errors are only visible via cudaGetLastError.
         if (err == cudaSuccess) {
             add<<<1, kCount>>>(dev_a, dev_b, dev_c);
             err = cudaGetLastError();
         }

         // Device to host; the blocking copy also waits for the kernel.
         if (err == cudaSuccess) {
             err = cudaMemcpy(host_c, dev_c, sizeof(host_c), cudaMemcpyDeviceToHost);
         }

         if (err == cudaSuccess) {
             for (int i = 0; i < kCount; i++)
             {
                 printf("host_a[%d] + host_b[%d] = %d + %d = %d\n",
                        i, i, host_a[i], host_b[i], host_c[i]);
             }
         } else {
             fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
         }

         // Release device memory.
         cudaFree(dev_c);
         cudaFree(dev_b);
         cudaFree(dev_a);

         return err == cudaSuccess ? 0 : 1;
     }
    View Code

           

     

  • 相关阅读:
    UITableView学习笔记
    IOS基础之设置APP的名字、设置图标、添加等待加载时的图片
    UIScrollView,UIPageControl
    UIPickerView基本用法
    最大公约数和最小公倍数
    快速幂、快速乘
    素数筛
    最小生成树
    BZOJ1070 [SCOI2007]修车
    BZOJ1109 [POI2007]堆积木Klo
  • 原文地址:https://www.cnblogs.com/lin1216/p/12672994.html
Copyright © 2020-2023  润新知