• CUDA学习入门1


    关于俺的显卡

    NVidia NVS 4200M
    support opencl, directx11, DirectCompute, OpenGL 2.1
    Memory Amount 1GB
    CUDA compute capability 2.1

    步骤

    1. download cuda5 from https://developer.nvidia.com/thrust (这是 Thrust 的页面;CUDA Toolkit 的下载页是 https://developer.nvidia.com/cuda-downloads)
    2. install
    中间有一步安装toolkit失败。。(为啥呀为啥?)


    3. 编译例子程序成功,但是运行失败
    F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\CUDA Sam
    ples\v5.0\bin\win32\Debug>vectorAdd.exe
    [Vector addition of 50000 elements]
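    Failed to allocate device vector A (error code CUDA driver version is insufficient for CUDA runtime version)! (为啥呀为啥? 原因:已安装的显卡驱动太旧(后面 oclDeviceQuery 的输出显示驱动版本为 268.24),不支持 CUDA 5.0 运行时;cudaMalloc 只是第一个触发运行时初始化的调用,所以错误才报在"分配内存"上。见第9步,升级驱动后即可正常运行)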

    以下是例子程序

    /**
     * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
     *
     * Please refer to the NVIDIA end user license agreement (EULA) associated
     * with this source code for terms and conditions that govern your use of
     * this software. Any use, reproduction, disclosure, or distribution of
     * this software and related documentation outside the terms of the EULA
     * is strictly prohibited.
     *
     */
    
    /**
     * Vector addition: C = A + B.
     *
     * This sample is a very basic sample that implements element by element
     * vector addition. It is the same as the sample illustrating Chapter 2
     * of the programming guide with some additions like error checking.
     */
    
    #include <stdio.h>
    
    // For the CUDA runtime routines (prefixed with "cuda", e.g. cudaMalloc)
    #include <cuda_runtime.h>
    
    /**
     * CUDA Kernel Device code
     *
     * Computes the vector addition of A and B into C. The 3 vectors have the same
     * number of elements numElements.
     *
     * Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
     * index and then advances by the total number of launched threads, so every
     * element is processed for any grid/block size (including a single thread).
     * No shared memory is used. A, B and C must each hold at least numElements
     * floats; a non-positive numElements makes the kernel a no-op.
     */
    __global__ void
    vectorAdd(const float *A, const float *B, float *C, int numElements)
    {
        // Index math is done in 64 bits: blockDim.x * blockIdx.x is an unsigned
        // 32-bit product that, once narrowed to int, can wrap to a negative value
        // on very large grids and slip past the bounds check below.
        const long long stride = (long long)blockDim.x * gridDim.x;

        for (long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x;
             i < numElements;
             i += stride)
        {
            C[i] = A[i] + B[i];
        }
    }
    
    /**
     * Abort with a diagnostic if a CUDA runtime call failed.
     *
     * When err is not cudaSuccess, prints
     * "Failed to <action> (error code <description>)!" to stderr and exits with
     * EXIT_FAILURE. Does nothing when err is cudaSuccess.
     */
    static void
    checkCuda(cudaError_t err, const char *action)
    {
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }

    /**
     * Host main routine
     *
     * Fills two host vectors with random values, adds them on the device with
     * the vectorAdd kernel, copies the result back and verifies it against a
     * host-side sum. Returns 0 on success; any failure prints a message to
     * stderr and exits with EXIT_FAILURE.
     */
    int
    main(void)
    {
        // Print the vector length to be used, and compute its size
        int numElements = 50000;
        size_t size = numElements * sizeof(float);
        printf("[Vector addition of %d elements]\n", numElements);

        // Probe the runtime before doing any real work. Without this the first
        // runtime call is cudaMalloc, so a driver/runtime mismatch or a missing
        // device is reported as "Failed to allocate device vector A", which
        // points at the wrong problem.
        int deviceCount = 0;
        checkCuda(cudaGetDeviceCount(&deviceCount), "initialize the CUDA runtime");

        if (deviceCount == 0)
        {
            fprintf(stderr, "No CUDA-capable device found!\n");
            exit(EXIT_FAILURE);
        }

        // Allocate the host input vectors A and B and the host output vector C
        float *h_A = (float *)malloc(size);
        float *h_B = (float *)malloc(size);
        float *h_C = (float *)malloc(size);

        // Verify that allocations succeeded
        if (h_A == NULL || h_B == NULL || h_C == NULL)
        {
            fprintf(stderr, "Failed to allocate host vectors!\n");
            exit(EXIT_FAILURE);
        }

        // Initialize the host input vectors
        for (int i = 0; i < numElements; ++i)
        {
            h_A[i] = rand()/(float)RAND_MAX;
            h_B[i] = rand()/(float)RAND_MAX;
        }

        // Allocate the device input vectors A and B and the device output vector C
        float *d_A = NULL;
        float *d_B = NULL;
        float *d_C = NULL;
        checkCuda(cudaMalloc((void **)&d_A, size), "allocate device vector A");
        checkCuda(cudaMalloc((void **)&d_B, size), "allocate device vector B");
        checkCuda(cudaMalloc((void **)&d_C, size), "allocate device vector C");

        // Copy the host input vectors A and B in host memory to the device input vectors in
        // device memory
        printf("Copy input data from the host memory to the CUDA device\n");
        checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                  "copy vector A from host to device");
        checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                  "copy vector B from host to device");

        // Launch the Vector Add CUDA Kernel (ceil-div so the tail is covered)
        int threadsPerBlock = 256;
        int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
        printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

        // cudaGetLastError reports launch-configuration errors; the explicit
        // synchronize reports faults raised while the kernel runs, so they are
        // not misattributed to the device-to-host copy that follows.
        checkCuda(cudaGetLastError(), "launch vectorAdd kernel");
        checkCuda(cudaDeviceSynchronize(), "execute vectorAdd kernel");

        // Copy the device result vector in device memory to the host result vector
        // in host memory.
        printf("Copy output data from the CUDA device to the host memory\n");
        checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                  "copy vector C from device to host");

        // Verify that the result vector is correct
        for (int i = 0; i < numElements; ++i)
        {
            if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
            {
                fprintf(stderr, "Result verification failed at element %d!\n", i);
                exit(EXIT_FAILURE);
            }
        }

        // Free device global memory
        checkCuda(cudaFree(d_A), "free device vector A");
        checkCuda(cudaFree(d_B), "free device vector B");
        checkCuda(cudaFree(d_C), "free device vector C");

        // Free host memory
        free(h_A);
        free(h_B);
        free(h_C);

        // Reset the device and exit
        cudaError_t err = cudaDeviceReset();

        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

        printf("Done\n");
        return 0;
    }
    View Code

     4. 安装后,有个DeviceQuery.exe,运行失败

    F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA G
    PU Computing SDK 4.0\C\bin\win32\Release>deviceQuery.exe
    [deviceQuery.exe] starting...
    deviceQuery.exe Starting...
    
     CUDA Device Query (Runtime API) version (CUDART static linking)
    
    cudaGetDeviceCount returned 35
    -> CUDA driver version is insufficient for CUDA runtime version
    [deviceQuery.exe] test results...
    FAILED
    View Code

    5. 俺看到还有个oclDeviceQuery.exe,这应该是opencl版本的测试程序。点了一下这个能运行,以下是输出信息。

    [oclDeviceQuery.exe] starting...
    F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA G
    PU Computing SDK 4.0\OpenCL\Bin\Win32\release\oclDeviceQuery.exe Starting...
    
    OpenCL SW Info:
    
     CL_PLATFORM_NAME:      NVIDIA CUDA
     CL_PLATFORM_VERSION:   OpenCL 1.0 CUDA 3.2.1
     OpenCL SDK Revision:   7027912
    
    
    OpenCL Device Info:
    
     1 devices found supporting OpenCL:
    
     ---------------------------------
     Device NVS 4200M
     ---------------------------------
      CL_DEVICE_NAME:                       NVS 4200M
      CL_DEVICE_VENDOR:                     NVIDIA Corporation
      CL_DRIVER_VERSION:                    268.24
      CL_DEVICE_VERSION:                    OpenCL 1.0 CUDA
      CL_DEVICE_TYPE:                       CL_DEVICE_TYPE_GPU
      CL_DEVICE_MAX_COMPUTE_UNITS:          1
      CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:   3
      CL_DEVICE_MAX_WORK_ITEM_SIZES:        1024 / 1024 / 64
      CL_DEVICE_MAX_WORK_GROUP_SIZE:        1024
      CL_DEVICE_MAX_CLOCK_FREQUENCY:        1620 MHz
      CL_DEVICE_ADDRESS_BITS:               32
      CL_DEVICE_MAX_MEM_ALLOC_SIZE:         255 MByte
      CL_DEVICE_GLOBAL_MEM_SIZE:            1023 MByte
      CL_DEVICE_ERROR_CORRECTION_SUPPORT:   no
      CL_DEVICE_LOCAL_MEM_TYPE:             local
      CL_DEVICE_LOCAL_MEM_SIZE:             48 KByte
      CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:   64 KByte
      CL_DEVICE_QUEUE_PROPERTIES:           CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
      CL_DEVICE_QUEUE_PROPERTIES:           CL_QUEUE_PROFILING_ENABLE
      CL_DEVICE_IMAGE_SUPPORT:              1
      CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
      CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
      CL_DEVICE_SINGLE_FP_CONFIG:           denorms INF-quietNaNs round-to-nearest r
    ound-to-zero round-to-inf fma
    
      CL_DEVICE_IMAGE <dim>                 2D_MAX_WIDTH     4096
                                            2D_MAX_HEIGHT    32768
                                            3D_MAX_WIDTH     2048
                                            3D_MAX_HEIGHT    2048
                                            3D_MAX_DEPTH     2048
    
      CL_DEVICE_EXTENSIONS:                 cl_khr_byte_addressable_store
                                            cl_khr_icd
                                            cl_khr_gl_sharing
                                            cl_nv_d3d9_sharing
                                            cl_nv_compiler_options
                                            cl_nv_device_attribute_query
                                            cl_nv_pragma_unroll
                                            cl_khr_global_int32_base_atomics
                                            cl_khr_global_int32_extended_atomics
                                            cl_khr_local_int32_base_atomics
                                            cl_khr_local_int32_extended_atomics
                                            cl_khr_fp64
    
    
      CL_DEVICE_COMPUTE_CAPABILITY_NV:      2.1
      NUMBER OF MULTIPROCESSORS:            1
      NUMBER OF CUDA CORES:                 48
      CL_DEVICE_REGISTERS_PER_BLOCK_NV:     32768
      CL_DEVICE_WARP_SIZE_NV:               32
      CL_DEVICE_GPU_OVERLAP_NV:             CL_TRUE
      CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:     CL_TRUE
      CL_DEVICE_INTEGRATED_MEMORY_NV:       CL_FALSE
      CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>  CHAR 1, SHORT 1, INT 1, LONG 1, FLOAT 1,
     DOUBLE 1
    
    
      ---------------------------------
      2D Image Formats Supported (71)
      ---------------------------------
      #     Channel Order   Channel Type
    
      1     CL_R            CL_FLOAT
      2     CL_R            CL_HALF_FLOAT
      3     CL_R            CL_UNORM_INT8
      4     CL_R            CL_UNORM_INT16
      5     CL_R            CL_SNORM_INT16
      6     CL_R            CL_SIGNED_INT8
      7     CL_R            CL_SIGNED_INT16
      8     CL_R            CL_SIGNED_INT32
      9     CL_R            CL_UNSIGNED_INT8
      10    CL_R            CL_UNSIGNED_INT16
      11    CL_R            CL_UNSIGNED_INT32
      12    CL_A            CL_FLOAT
      13    CL_A            CL_HALF_FLOAT
      14    CL_A            CL_UNORM_INT8
      15    CL_A            CL_UNORM_INT16
      16    CL_A            CL_SNORM_INT16
      17    CL_A            CL_SIGNED_INT8
      18    CL_A            CL_SIGNED_INT16
      19    CL_A            CL_SIGNED_INT32
      20    CL_A            CL_UNSIGNED_INT8
      21    CL_A            CL_UNSIGNED_INT16
      22    CL_A            CL_UNSIGNED_INT32
      23    CL_RG           CL_FLOAT
      24    CL_RG           CL_HALF_FLOAT
      25    CL_RG           CL_UNORM_INT8
      26    CL_RG           CL_UNORM_INT16
      27    CL_RG           CL_SNORM_INT16
      28    CL_RG           CL_SIGNED_INT8
      29    CL_RG           CL_SIGNED_INT16
      30    CL_RG           CL_SIGNED_INT32
      31    CL_RG           CL_UNSIGNED_INT8
      32    CL_RG           CL_UNSIGNED_INT16
      33    CL_RG           CL_UNSIGNED_INT32
      34    CL_RA           CL_FLOAT
      35    CL_RA           CL_HALF_FLOAT
      36    CL_RA           CL_UNORM_INT8
      37    CL_RA           CL_UNORM_INT16
      38    CL_RA           CL_SNORM_INT16
      39    CL_RA           CL_SIGNED_INT8
      40    CL_RA           CL_SIGNED_INT16
      41    CL_RA           CL_SIGNED_INT32
      42    CL_RA           CL_UNSIGNED_INT8
      43    CL_RA           CL_UNSIGNED_INT16
      44    CL_RA           CL_UNSIGNED_INT32
      45    CL_RGBA         CL_FLOAT
      46    CL_RGBA         CL_HALF_FLOAT
      47    CL_RGBA         CL_UNORM_INT8
      48    CL_RGBA         CL_UNORM_INT16
      49    CL_RGBA         CL_SNORM_INT16
      50    CL_RGBA         CL_SIGNED_INT8
      51    CL_RGBA         CL_SIGNED_INT16
      52    CL_RGBA         CL_SIGNED_INT32
      53    CL_RGBA         CL_UNSIGNED_INT8
      54    CL_RGBA         CL_UNSIGNED_INT16
      55    CL_RGBA         CL_UNSIGNED_INT32
      56    CL_BGRA         CL_UNORM_INT8
      57    CL_BGRA         CL_SIGNED_INT8
      58    CL_BGRA         CL_UNSIGNED_INT8
      59    CL_ARGB         CL_UNORM_INT8
      60    CL_ARGB         CL_SIGNED_INT8
      61    CL_ARGB         CL_UNSIGNED_INT8
      62    CL_INTENSITY    CL_FLOAT
      63    CL_INTENSITY    CL_HALF_FLOAT
      64    CL_INTENSITY    CL_UNORM_INT8
      65    CL_INTENSITY    CL_UNORM_INT16
      66    CL_INTENSITY    CL_SNORM_INT16
      67    CL_LUMINANCE    CL_FLOAT
      68    CL_LUMINANCE    CL_HALF_FLOAT
      69    CL_LUMINANCE    CL_UNORM_INT8
      70    CL_LUMINANCE    CL_UNORM_INT16
      71    CL_LUMINANCE    CL_SNORM_INT16
    
      ---------------------------------
      3D Image Formats Supported (71)
      ---------------------------------
      #     Channel Order   Channel Type
    
      1     CL_R            CL_FLOAT
      2     CL_R            CL_HALF_FLOAT
      3     CL_R            CL_UNORM_INT8
      4     CL_R            CL_UNORM_INT16
      5     CL_R            CL_SNORM_INT16
      6     CL_R            CL_SIGNED_INT8
      7     CL_R            CL_SIGNED_INT16
      8     CL_R            CL_SIGNED_INT32
      9     CL_R            CL_UNSIGNED_INT8
      10    CL_R            CL_UNSIGNED_INT16
      11    CL_R            CL_UNSIGNED_INT32
      12    CL_A            CL_FLOAT
      13    CL_A            CL_HALF_FLOAT
      14    CL_A            CL_UNORM_INT8
      15    CL_A            CL_UNORM_INT16
      16    CL_A            CL_SNORM_INT16
      17    CL_A            CL_SIGNED_INT8
      18    CL_A            CL_SIGNED_INT16
      19    CL_A            CL_SIGNED_INT32
      20    CL_A            CL_UNSIGNED_INT8
      21    CL_A            CL_UNSIGNED_INT16
      22    CL_A            CL_UNSIGNED_INT32
      23    CL_RG           CL_FLOAT
      24    CL_RG           CL_HALF_FLOAT
      25    CL_RG           CL_UNORM_INT8
      26    CL_RG           CL_UNORM_INT16
      27    CL_RG           CL_SNORM_INT16
      28    CL_RG           CL_SIGNED_INT8
      29    CL_RG           CL_SIGNED_INT16
      30    CL_RG           CL_SIGNED_INT32
      31    CL_RG           CL_UNSIGNED_INT8
      32    CL_RG           CL_UNSIGNED_INT16
      33    CL_RG           CL_UNSIGNED_INT32
      34    CL_RA           CL_FLOAT
      35    CL_RA           CL_HALF_FLOAT
      36    CL_RA           CL_UNORM_INT8
      37    CL_RA           CL_UNORM_INT16
      38    CL_RA           CL_SNORM_INT16
      39    CL_RA           CL_SIGNED_INT8
      40    CL_RA           CL_SIGNED_INT16
      41    CL_RA           CL_SIGNED_INT32
      42    CL_RA           CL_UNSIGNED_INT8
      43    CL_RA           CL_UNSIGNED_INT16
      44    CL_RA           CL_UNSIGNED_INT32
      45    CL_RGBA         CL_FLOAT
      46    CL_RGBA         CL_HALF_FLOAT
      47    CL_RGBA         CL_UNORM_INT8
      48    CL_RGBA         CL_UNORM_INT16
      49    CL_RGBA         CL_SNORM_INT16
      50    CL_RGBA         CL_SIGNED_INT8
      51    CL_RGBA         CL_SIGNED_INT16
      52    CL_RGBA         CL_SIGNED_INT32
      53    CL_RGBA         CL_UNSIGNED_INT8
      54    CL_RGBA         CL_UNSIGNED_INT16
      55    CL_RGBA         CL_UNSIGNED_INT32
      56    CL_BGRA         CL_UNORM_INT8
      57    CL_BGRA         CL_SIGNED_INT8
      58    CL_BGRA         CL_UNSIGNED_INT8
      59    CL_ARGB         CL_UNORM_INT8
      60    CL_ARGB         CL_SIGNED_INT8
      61    CL_ARGB         CL_UNSIGNED_INT8
      62    CL_INTENSITY    CL_FLOAT
      63    CL_INTENSITY    CL_HALF_FLOAT
      64    CL_INTENSITY    CL_UNORM_INT8
      65    CL_INTENSITY    CL_UNORM_INT16
      66    CL_INTENSITY    CL_SNORM_INT16
      67    CL_LUMINANCE    CL_FLOAT
      68    CL_LUMINANCE    CL_HALF_FLOAT
      69    CL_LUMINANCE    CL_UNORM_INT8
      70    CL_LUMINANCE    CL_UNORM_INT16
      71    CL_LUMINANCE    CL_SNORM_INT16
    
    oclDeviceQuery, Platform Name = NVIDIA CUDA, Platform Version = OpenCL 1.0 CUDA
    3.2.1, SDK Revision = 7027912, NumDevs = 1, Device = NVS 4200M
    
    System Info:
    
     Local Time/Date = 13:19:53, 5/25/2013
     CPU Arch: 0
     CPU Level: 6
     # of CPU processors: 4
     Windows Build: 2600
     Windows Ver: 5.1
    
    
    [oclDeviceQuery.exe] test results...
    PASSED
    
    Press ENTER to exit...
    View Code

    6. opencl的带宽测试程序

    PU Computing SDK 4.0\OpenCL\Bin\Win32\release\oclBandwidthTest.exe Starting...
    
    Running on...
    
    NVS 4200M
    
    Quick Mode
    
    Host to Device Bandwidth, 1 Device(s), Paged memory, direct access
       Transfer Size (Bytes)        Bandwidth(MB/s)
       33554432                     4131.2
    
    Device to Host Bandwidth, 1 Device(s), Paged memory, direct access
       Transfer Size (Bytes)        Bandwidth(MB/s)
       33554432                     3485.6
    
    Device to Device Bandwidth, 1 Device(s)
       Transfer Size (Bytes)        Bandwidth(MB/s)
       33554432                     8901.4
    
    [oclBandwidthTest.exe] test results...
    PASSED
    View Code

    7. 关于安装driver失败,在stackoverflow上有个人说:

    http://stackoverflow.com/questions/11913320/installing-cuda-nvidia-graphic-driver-failed
    I have a VAIO too and I had the same problem. Don't download notebook version, try Desktop version of Nvidia Driver. I also had to disable my another Graphic card (Intel). It worked for me.
    View Code

    不过也有人说要修改inf文件才行

    Unfortunately, there are many NVIDIA GPUs for which the driver from the NVIDIA website will not install (especially for GPU versions that are specifically OEM'd for Sony, Lenovo, etc and the OEM wants to control the driver experience). This is most likely the case for you.
    
    In those cases, you can edit the .inf file to add your GPU into the list of GPUs for which the driver will install. However, it is a bit tricky and typically requires editing 3 different sections of the INF file. You can search around for details on how to mod NVIDIA inf files; there are a number of sites that do that.
    
    Of course, you have to have the appropriate CUDA driver before you can run CUDA stuff. So first things first... you've gotta get the driver installed.
    View Code

    这些俺暂时没有测试过是否有效

    8. 既然cuda用不了,而opencl貌似可以

    那俺还是转移到opencl上吧,首先测试一个例子 http://www.kimicat.com/opencl-1/opencl-jiao-xue-yi

    // OpenCL tutorial 1
    
    #include <iostream>
    #include <string>
    #include <vector>
    
    #ifdef __APPLE__
    #include <OpenCL/opencl.h>
    #else
    #include <CL/cl.h>
    #endif
    
    
    /**
     * Print the name of the first device in a default-type context created on
     * the first OpenCL platform.
     *
     * Returns 0 on success. Returns 1 (after writing a message to std::cerr)
     * when no platform or device is available or when any OpenCL query fails;
     * the context is released on every path that created it.
     */
    int main()
    {
        // First call only asks how many platforms exist so the ID list can be sized.
        cl_uint num = 0;
        cl_int err = clGetPlatformIDs(0, 0, &num);
        if(err != CL_SUCCESS || num == 0) {
            std::cerr << "Unable to get platforms\n";
            return 1;
        }

        // num > 0 is guaranteed here, so &platforms[0] is a valid pointer.
        std::vector<cl_platform_id> platforms(num);
        err = clGetPlatformIDs(num, &platforms[0], &num);
        if(err != CL_SUCCESS) {
            std::cerr << "Unable to get platform ID\n";
            return 1;
        }

        cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[0]), 0 };
        cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_DEFAULT, NULL, NULL, NULL);
        if(context == 0) {
            std::cerr << "Can't create OpenCL context\n";
            return 1;
        }

        // Query the byte size of the device list, then the list itself.
        size_t cb = 0;
        err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
        if(err != CL_SUCCESS || cb < sizeof(cl_device_id)) {
            std::cerr << "Unable to get context devices\n";
            clReleaseContext(context);
            return 1;
        }

        std::vector<cl_device_id> devices(cb / sizeof(cl_device_id));
        err = clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, &devices[0], 0);
        if(err != CL_SUCCESS) {
            std::cerr << "Unable to get context devices\n";
            clReleaseContext(context);
            return 1;
        }

        // Same two-step pattern for the device name: size first, then contents.
        err = clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 0, NULL, &cb);
        if(err != CL_SUCCESS || cb == 0) {
            std::cerr << "Unable to get device name\n";
            clReleaseContext(context);
            return 1;
        }

        std::string devname;
        devname.resize(cb);
        err = clGetDeviceInfo(devices[0], CL_DEVICE_NAME, cb, &devname[0], 0);
        if(err != CL_SUCCESS) {
            std::cerr << "Unable to get device name\n";
            clReleaseContext(context);
            return 1;
        }

        // c_str() stops at the first NUL, so a terminator counted in cb is not printed.
        std::cout << "Device: " << devname.c_str() << "\n";

        clReleaseContext(context);
        return 0;
    }
    View Code

    注意设置好头文件(include)路径和库文件(lib)路径,分别如下:

    F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.0\OpenCL\common\inc

    F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.0\OpenCL\common\lib\Win32

    正常编译运行成功!

    Device: NVS 4200M
    请按任意键继续. . .
    View Code

     9. 注: 后来俺在BIOS里面把integrated GPU disable后,成功安装307.83-quadro-notebook-winxp-32bit-international-whql.exe
    安装cuda_5.0.35_winxp_general_32-3.msi还是有错误
    但是执行cuda程序貌似都正常了

    以下是bandwidthTest.exe测试结果,比opencl版本的快了很多(注意:这里用的是PINNED内存传输,而前面opencl测试用的是Paged内存,所以两者的主机<->设备带宽并不是同等条件下的对比)

    [CUDA Bandwidth Test] - Starting...
    Running on...
    
     Device 0: NVS 4200M
     Quick Mode
    
     Host to Device Bandwidth, 1 Device(s)
     PINNED Memory Transfers
       Transfer Size (Bytes)        Bandwidth(MB/s)
       33554432                     6241.5
    
     Device to Host Bandwidth, 1 Device(s)
     PINNED Memory Transfers
       Transfer Size (Bytes)        Bandwidth(MB/s)
       33554432                     6302.9
    
     Device to Device Bandwidth, 1 Device(s)
     PINNED Memory Transfers
       Transfer Size (Bytes)        Bandwidth(MB/s)
       33554432                     10330.3
    View Code

    devicequery结果

    ples\v5.0\bin\win32\Release\deviceQuery.exe Starting...
    
     CUDA Device Query (Runtime API) version (CUDART static linking)
    
    Detected 1 CUDA Capable device(s)
    
    Device 0: "NVS 4200M"
      CUDA Driver Version / Runtime Version          5.0 / 5.0
      CUDA Capability Major/Minor version number:    2.1
      Total amount of global memory:                 1024 MBytes (1073283072 bytes)
      ( 1) Multiprocessors x ( 48) CUDA Cores/MP:    48 CUDA Cores
      GPU Clock rate:                                1620 MHz (1.62 GHz)
      Memory Clock rate:                             800 Mhz
      Memory Bus Width:                              64-bit
      L2 Cache Size:                                 65536 bytes
      Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3
    D=(2048,2048,2048)
      Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16
    384) x 2048
      Total amount of constant memory:               65536 bytes
      Total amount of shared memory per block:       49152 bytes
      Total number of registers available per block: 32768
      Warp size:                                     32
      Maximum number of threads per multiprocessor:  1536
      Maximum number of threads per block:           1024
      Maximum sizes of each dimension of a block:    1024 x 1024 x 64
      Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
      Maximum memory pitch:                          2147483647 bytes
      Texture alignment:                             512 bytes
      Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
      Run time limit on kernels:                     Yes
      Integrated GPU sharing Host Memory:            No
      Support host page-locked memory mapping:       Yes
      Alignment requirement for Surfaces:            Yes
      Device has ECC support:                        Disabled
      CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Mo
    del)
      Device supports Unified Addressing (UVA):      No
      Device PCI Bus ID / PCI location ID:           1 / 0
      Compute Mode:
         < Default (multiple host threads can use ::cudaSetDevice() with device simu
    ltaneously) >
    
    deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.0, CUDA Runtime Versi
    on = 5.0, NumDevs = 1, Device0 = NVS 4200M
    View Code
  • 相关阅读:
    【实验】利用系统自带脚本utlsampl.sql创建scott用户及样本数据
    有哪些优秀的沟通思路?
    srand()以及rand()函数用法
    微信公众号
    Sublime Text 3 全程详细图文原创教程(持续更新中。。。)
    Android应用的缓冲界面启动界面
    ListView技巧
    android线性布局参数
    CocoaPods的一波三则
    003.开发者账号异同
  • 原文地址:https://www.cnblogs.com/cutepig/p/3098621.html
Copyright © 2020-2023  润新知