• cuBLAS GEMM benchmark: fp32 / fp16 / int8


    Build command: nvcc 4.cpp -o test_gemm -lcudart -lcuda -lcublas -std=c++11

    #include <sys/time.h>
    #include <cuda_profiler_api.h>
    #include <cublas_v2.h>
    #include <cuda.h>
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdint.h>       // int8_t, int32_t
    #include <type_traits>    // std::is_same
     
    // Symmetric quantization: clamp to [-127, 127] *before* narrowing;
    // converting to int8_t first can wrap on overflow, making the clamp useless.
    int8_t float2int8(float f, float scale) {
        float v = f * scale;
        if (v < -127) v = -127;
        if (v > 127) v = 127;
        return int8_t(v);
    }
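    // Note on the int8 path: with both inputs quantized at scale s = 127, the
    // int32 GEMM accumulates (s*a)*(s*b), i.e. roughly s^2 * (a*b), which is
    // why main() below divides the int32 results by 127*127 before comparing.
    // A minimal dequantization helper mirroring float2int8 (a sketch added
    // here, not part of the original benchmark):
    float int32_to_float(int32_t v, float scale) {
        return float(v) / (scale * scale);
    }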
     
    template <typename T, typename S>
    void allocate_memory(int m, int n, int k, T **A, T **B, S **C) {
        cudaMallocManaged(A, m * k * sizeof(T));
        cudaMallocManaged(B, k * n * sizeof(T));
        cudaMallocManaged(C, m * n * sizeof(S));
    }
     
    template <typename T, typename S>
    void free_memory(T *A, T *B, S *C) {
        cudaFree(A);
        cudaFree(B);
        cudaFree(C);
    }
     
    template <typename T, typename S>
    int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transA, cublasOperation_t transB,
                       int m, int n, int k, T *A, T *B, S *C, int lda, int ldb, int ldc,
                       S *alpha, S *beta, int algo) {
        cudaDataType_t AType, BType, CType, ComputeType;
        if (std::is_same<T, float>::value) {
            AType = BType = CType = ComputeType = CUDA_R_32F;
        } else if (std::is_same<T, __half>::value) {
            AType = BType = CType = ComputeType = CUDA_R_16F;  // fp16 accumulation
        } else if (std::is_same<T, int8_t>::value) {
            AType = BType = CUDA_R_8I;
            CType = ComputeType = CUDA_R_32I;  // int8 inputs, int32 accumulation
        } else {
            printf("Unsupported data type.\n");
            return -1;
        }
        cublasStatus_t status;
        status = cublasGemmEx(handle,
                              transA,
                              transB,
                              m,
                              n,
                              k,
                              alpha,
                              A,
                              AType,
                              lda,
                              B,
                              BType,
                              ldb,
                              beta,
                              C,
                              CType,
                              ldc,
                              ComputeType,
                              static_cast<cublasGemmAlgo_t>(algo));
        
        if (status == CUBLAS_STATUS_SUCCESS)
            return 1;
        else
            return -1;
    }
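    // Note (assumes CUDA 11+ / cuBLAS 11): the native cublasGemmEx signature
    // takes a cublasComputeType_t (CUBLAS_COMPUTE_32F, CUBLAS_COMPUTE_16F,
    // CUBLAS_COMPUTE_32I, ...) for the compute type; the cudaDataType_t form
    // used above is kept for backward compatibility. The fp16 dispatch could
    // also request fp32 accumulation (compute type CUDA_R_32F with fp16
    // inputs and float alpha/beta), trading some speed for accuracy.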
     
    template <typename T, typename S>
    void test_gemm(cublasHandle_t handle, int m, int n, int k, T *A, T *B, S *C,
                   S *alpha, S *beta, int algo, int iteration) {
        float total_time = 0;
        for (int i = 0; i < iteration; ++i) {
            struct timeval start, end;
            cudaDeviceSynchronize();
            cudaProfilerStart();
            gettimeofday(&start, NULL);
            // cuBLAS is column-major; swapping A/B and m/n computes the
            // row-major product C = A * B without explicit transposes.
            int success = cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                         n, m, k, B, A, C, n, k, n,
                                         alpha, beta, algo);
            cudaDeviceSynchronize();
            gettimeofday(&end, NULL);
            cudaProfilerStop();
            // Skip the first iteration as warmup.
            if (success > 0 && i > 0)
                total_time += (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
        }
        if (total_time > 0)
            printf("algo %d: %.3f ms\n", algo, total_time / (iteration - 1));
    }
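    // Alternative timing (a sketch, not in the original benchmark): CUDA
    // events measure elapsed GPU time directly, instead of host wall time
    // around cudaDeviceSynchronize. `launch` is any callable that issues
    // the GEMM, e.g. a lambda wrapping cublas_gemm_ex.
    template <typename F>
    float time_kernel_ms(F launch) {
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
        launch();
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);  // block until the GEMM has finished
        float ms = 0;
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return ms;
    }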
     
    int main() {
        int m = 4096, n = 8192, k = 1024;
        printf("shape: (%d, %d) x (%d, %d)
    ", m, k, k, n);
        int start_algo = CUBLAS_GEMM_DEFAULT;
        int end_algo = CUBLAS_GEMM_ALGO23;
        int start_algo_t_op = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
        int end_algo_t_op = CUBLAS_GEMM_ALGO15_TENSOR_OP;
        int iteration = 10;
     
        float *fA, *fB, *fC;
        __half *hA, *hB, *hC;
        int8_t *iA, *iB; int32_t *iC;
        float f_alpha = 1, f_beta = 0;
        __half h_alpha = __float2half_rn(1.0), h_beta = __float2half_rn(0.0);
        int32_t i_alpha = 1, i_beta = 0;
        allocate_memory(m, n, k, &fA, &fB, &fC);
        allocate_memory(m, n, k, &hA, &hB, &hC);
        allocate_memory(m, n, k, &iA, &iB, &iC);
        for (int i = 0; i < m * k; ++i) {
            fA[i] = float(i % 255 - 127) / 127;
            hA[i] = __float2half_rn(fA[i]);
            iA[i] = float2int8(fA[i], 127);
        } 
        for (int i = 0; i < k * n; ++i) {
            fB[i] = float(i % 255 - 127) / 127;
            hB[i] = __float2half_rn(fB[i]);
            iB[i] = float2int8(fB[i], 127);
        }
        cublasHandle_t handle;
        cublasCreate(&handle);
        
        printf(">>>>>>>>>>>>>>>>> test fp32 >>>>>>>>>>>>>>>>>
    ");
        for (int algo = start_algo; algo <= end_algo; ++algo)
            test_gemm(handle, m, n, k, fA, fB, fC, &f_alpha, &f_beta, algo, iteration);
        for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)
            test_gemm(handle, m, n, k, fA, fB, fC, &f_alpha, &f_beta, algo, iteration);
        
     
        printf(">>>>>>>>>>>>>>>>> test fp16 >>>>>>>>>>>>>>>>>
    ");
        for (int algo = start_algo; algo <= end_algo; ++algo)
            test_gemm(handle, m, n, k, hA, hB, hC, &h_alpha, &h_beta, algo, iteration);
        for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)
            test_gemm(handle, m, n, k, hA, hB, hC, &h_alpha, &h_beta, algo, iteration);
     
        printf(">>>>>>>>>>>>>>>>> test int8 >>>>>>>>>>>>>>>>>
    ");
        for (int algo = start_algo; algo <= end_algo; ++algo)
            test_gemm(handle, m, n, k, iA, iB, iC, &i_alpha, &i_beta, algo, iteration);
        for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)
            test_gemm(handle, m, n, k, iA, iB, iC, &i_alpha, &i_beta, algo, iteration);
        
        printf(">>>>>>>>>>>>>>>>> compare result >>>>>>>>>>>>>>>>>
    ");
        printf("fp32: ");
        for (int i = 0; i < 10; ++i)
            printf("%.5f%c", fC[i], " 
    "[i==9]);
        printf("fp16: ");
        for (int i = 0; i < 10; ++i)
            printf("%.5f%c", float(hC[i]), " 
    "[i==9]);
        printf("int8: ");
        for (int i = 0; i < 10; ++i)
            printf("%.5f%c", float(iC[i])/127/127, " 
    "[i==9]);
     
        cublasDestroy(handle);
        free_memory(iA, iB, iC);
        free_memory(fA, fB, fC);
        free_memory(hA, hB, hC);
        return 0;
    }
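    To sanity-check the results beyond eyeballing the first ten values, a
    naive CPU reference can be compared against fC for a few rows. A minimal
    sketch using the same row-major layout as the benchmark (check_rows is a
    helper name introduced here, not part of the original):

    #include <cmath>

    // Naive row-major reference: ref = sum_p A[i*k+p] * B[p*n+j], compared
    // against the first `rows` rows of the GPU result C.
    bool check_rows(const float *A, const float *B, const float *C,
                    int m, int n, int k, int rows, float tol = 1e-3f) {
        for (int i = 0; i < rows && i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                float ref = 0;
                for (int p = 0; p < k; ++p)
                    ref += A[i * k + p] * B[p * n + j];
                if (std::fabs(ref - C[i * n + j]) > tol * std::fabs(ref) + tol) {
                    printf("mismatch at (%d, %d): cpu %f vs gpu %f\n",
                           i, j, ref, C[i * n + j]);
                    return false;
                }
            }
        }
        return true;
    }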
• Original article: https://www.cnblogs.com/lin1216/p/15535621.html