• 20140824


    Matrix transpose:

    __global__ void TransDtD(float* des, float* src, int srcH, int srcW)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If srcH*srcW > BLOCK_NUM*THREAD_NUM, use the grid-stride for loop;
        // otherwise the plain if-guard below is enough.
        for (int i = idx; i < srcH * srcW; i += BLOCK_NUM * THREAD_NUM)
        {
            int row = i / srcW;          // row index in the source matrix
            int col = i - row * srcW;    // column index in the source matrix
            des[col * srcH + row] = src[i];
        }
        /*
        if (idx < srcH * srcW)
        {
            int row = idx / srcW;
            int col = idx - row * srcW;
            des[col * srcH + row] = src[idx];
        }
        */
    }
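
    The writes above go to des with stride srcH, so they are uncoalesced. A shared-memory tiled variant is the usual remedy; a minimal sketch, assuming a 16x16 tile (TILE and TransTiled are names introduced here, not from the original):

    #define TILE 16   // assumed tile width

    // Launch with dim3 block(TILE, TILE), dim3 grid((srcW+TILE-1)/TILE, (srcH+TILE-1)/TILE).
    __global__ void TransTiled(float* des, const float* src, int srcH, int srcW)
    {
        __shared__ float tile[TILE][TILE + 1];     // +1 pads away shared-memory bank conflicts

        int x = blockIdx.x * TILE + threadIdx.x;   // column in src
        int y = blockIdx.y * TILE + threadIdx.y;   // row in src
        if (x < srcW && y < srcH)
            tile[threadIdx.y][threadIdx.x] = src[y * srcW + x];
        __syncthreads();

        // Write the tile out transposed; reads and writes are now both coalesced.
        x = blockIdx.y * TILE + threadIdx.x;       // row in src, offset within a des row
        y = blockIdx.x * TILE + threadIdx.y;       // column in src, row in des
        if (x < srcH && y < srcW)
            des[y * srcH + x] = tile[threadIdx.x][threadIdx.y];
    }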
    void TransDeviceToDevice(float* src, int srcH, int srcW)
    {
        float* des = 0;
        CUDA_CALL(cudaMalloc((void**)&des, sizeof(float) * srcH * srcW));

        // The kernel writes every element of des, so no initial copy is needed.
        TransDtD<<<BLOCK_NUM, THREAD_NUM>>>(des, src, srcH, srcW);

        // Copy the transposed result back over the source buffer.
        CUDA_CALL(cudaMemcpy(src, des, sizeof(float) * srcH * srcW, cudaMemcpyDeviceToDevice));
        CUDA_CALL(cudaFree(des));
    }
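
    These snippets depend on constants and a CUDA_CALL macro the post never shows. A minimal sketch of plausible definitions; every value below is an assumption:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    // Assumed launch configuration and network sizes (not stated in the original).
    #define BLOCK_NUM  32
    #define THREAD_NUM 256    // the MSE reduction below requires exactly 256
    #define Num_In     64     // placeholder layer sizes
    #define Num_Hide   128
    #define Num_Out    10
    #define ALFA       0.1f   // learning rate

    // Minimal error-checking macro matching how CUDA_CALL is used above.
    #define CUDA_CALL(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
        printf("CUDA error %s at line %d\n", cudaGetErrorString(e), __LINE__); \
        exit(EXIT_FAILURE); } } while (0)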

    ---------------------------------------------------------

    MSE (computing the mean squared error):

    Record the result in an MSE[BLOCK_NUM] array, one partial sum per block; the MSE computation can run concurrently with the BP stream. The host still has to add up the BLOCK_NUM partial values (see the sketch after the kernel).
    __global__ void MSE(float* mse, float* targets, float* output, const int Sample_Num)
    {
        const size_t thID  = threadIdx.x;
        const size_t bloID = blockIdx.x;

        __shared__ float sharedData[THREAD_NUM];
        sharedData[thID] = 0.0f;   // shared memory is uninitialized; zero it before accumulating

        for (size_t i = bloID * THREAD_NUM + thID; i < Sample_Num * Num_Out; i += BLOCK_NUM * THREAD_NUM)
        {
            sharedData[thID] += 0.5f * (targets[i] - output[i]) * (targets[i] - output[i]);
        }
        __syncthreads();

        // Tree reduction; assumes THREAD_NUM == 256.
        if (thID < 128) sharedData[thID] += sharedData[thID + 128];
        __syncthreads();
        if (thID < 64) sharedData[thID] += sharedData[thID + 64];
        __syncthreads();

        // Within the last warp. Relying on implicit warp-synchronous execution is
        // unsafe on newer architectures, so keep syncing between steps.
        if (thID < 32) sharedData[thID] += sharedData[thID + 32];
        __syncthreads();
        if (thID < 16) sharedData[thID] += sharedData[thID + 16];
        __syncthreads();
        if (thID < 8) sharedData[thID] += sharedData[thID + 8];
        __syncthreads();
        if (thID < 4) sharedData[thID] += sharedData[thID + 4];
        __syncthreads();
        if (thID < 2) sharedData[thID] += sharedData[thID + 2];
        __syncthreads();
        if (thID < 1) sharedData[thID] += sharedData[thID + 1];

        if (thID == 0)   // thread 0 writes this block's partial sum
        {
            mse[bloID] = sharedData[0];
        }
    }
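
    The kernel leaves BLOCK_NUM partial sums in mse; the final reduction is not shown in the post. A minimal host-side sketch (SumMSEOnHost is a name introduced here):

    // Copy the BLOCK_NUM partial sums back and finish the reduction on the CPU.
    float SumMSEOnHost(const float* d_mse)   // d_mse: device array of length BLOCK_NUM
    {
        float h_mse[BLOCK_NUM];
        CUDA_CALL(cudaMemcpy(h_mse, d_mse, sizeof(float) * BLOCK_NUM, cudaMemcpyDeviceToHost));
        float total = 0.0f;
        for (int b = 0; b < BLOCK_NUM; ++b)
            total += h_mse[b];
        return total;   // sum of 0.5*(t-o)^2 over all outputs; divide by Sample_Num for a mean
    }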

    --------------------------------------------------

    Parallel bias-add & SIGMOD activation:

    __global__ void SIGMOD(float* IO, float* Bias, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use the for loop; otherwise the if-guard.
        /*
        for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        {
            int row = i % NodeNum;
            IO[i] = 1.0f / (1.0f + expf(-IO[i] - Bias[row]));
        }
        */
        if (idx < NodeNum * SampleNum)
        {
            // IO is column-major NodeNum x SampleNum, so idx % NodeNum is the node (row) index.
            int row = idx % NodeNum;
            IO[idx] = 1.0f / (1.0f + expf(-IO[idx] - Bias[row]));
        }
    }
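
    To sanity-check the layout assumption (IO is column-major NodeNum x SampleNum, one bias per node), a plain CPU reference of the same operation (SigmoidRef is a name introduced here):

    #include <math.h>

    // CPU reference for SIGMOD: same column-major layout, same arithmetic.
    void SigmoidRef(float* IO, const float* Bias, int NodeNum, int SampleNum)
    {
        for (int s = 0; s < SampleNum; ++s)
            for (int n = 0; n < NodeNum; ++n)
            {
                int i = s * NodeNum + n;   // column-major linear index
                IO[i] = 1.0f / (1.0f + expf(-IO[i] - Bias[n]));
            }
    }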

    ---------------------------------------------

    FeedForward: produces each layer's outputs O0, O1, O2

    //Note: the samples must be preprocessed into column-major storage, i.e. a NUM_IN x SAMPLE_NUM matrix
    //Targets must be formatted as a NUM_OUT x SAMPLE_NUM sparse (one-hot) matrix
    void FeedForward(float* O0, float* O1, float* O2,
                     float* pSamples, const int SAMPLE_NUM,
                     float* weight0, float* weight1, float* weight2,
                     float* bias0, float* bias1, float* bias2)
    {
        cublasHandle_t handle;
        cublasStatus_t ret;
        ret = cublasCreate(&handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
        const float alpha = 1.0f;
        const float beta = 0.0f;

        // Input layer to first layer: O0 = weight0 * pSamples, then add bias0 and activate.
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Num_In, SAMPLE_NUM, Num_In, &alpha, weight0, Num_In, pSamples, Num_In, &beta, O0, Num_In);
        SIGMOD<<<BLOCK_NUM, THREAD_NUM>>>(O0, bias0, Num_In, SAMPLE_NUM);

        // First layer to second layer: O1 = weight1 * O0, then add bias1 and activate.
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Num_Hide, SAMPLE_NUM, Num_In, &alpha, weight1, Num_Hide, O0, Num_In, &beta, O1, Num_Hide);
        SIGMOD<<<BLOCK_NUM, THREAD_NUM>>>(O1, bias1, Num_Hide, SAMPLE_NUM);

        // Second layer to third layer: O2 = weight2 * O1, then add bias2 and activate.
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Num_Out, SAMPLE_NUM, Num_Hide, &alpha, weight2, Num_Out, O1, Num_Hide, &beta, O2, Num_Out);
        //ret = cublasSaxpy(handle, Num_Out, &alpha, bias2, 1, O2, 1);   // bias is handled inside SIGMOD instead
        SIGMOD<<<BLOCK_NUM, THREAD_NUM>>>(O2, bias2, Num_Out, SAMPLE_NUM);

        ret = cublasDestroy(handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasDestroy returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
    }
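
    A usage sketch, assuming the activations live on the device as column-major matrices with one column per sample (the buffer names are illustrative):

    // Allocate per-layer activation buffers and run one forward pass.
    float *O0, *O1, *O2;
    CUDA_CALL(cudaMalloc((void**)&O0, sizeof(float) * Num_In   * SAMPLE_NUM));
    CUDA_CALL(cudaMalloc((void**)&O1, sizeof(float) * Num_Hide * SAMPLE_NUM));
    CUDA_CALL(cudaMalloc((void**)&O2, sizeof(float) * Num_Out  * SAMPLE_NUM));

    // pSamples, the weights, and the biases are assumed to be on the device already.
    FeedForward(O0, O1, O2, pSamples, SAMPLE_NUM,
                weight0, weight1, weight2,
                bias0, bias1, bias2);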

    -------------------------------------------------------------------------------

    O*(1-O) (may turn out to be unnecessary):

    /*
    __global__ void OMulitply1_O(float* OutPut, float* InPut, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use the for loop; otherwise the if-guard.
        //for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        //{
        //    OutPut[i] = InPut[i] * (1 - InPut[i]);
        //}

        if (idx < NodeNum * SampleNum)
        {
            OutPut[idx] = InPut[idx] * (1 - InPut[idx]);
        }
    }
    */

    ---------------------------------------------------------------------------------

    Computing the last layer's err and bias in parallel:
    __global__ void ErroBiasLastlayer(float* err, float* bias, float* Targets, float* O, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use a grid-stride for loop instead:
        /*
        for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        {
            err[i] = (Targets[i] - O[i]) * O[i] * (1 - O[i]);
        }
        */
        if (idx < NodeNum * SampleNum)
        {
            // delta of the output layer: (t - o) * o * (1 - o)
            err[idx] = (Targets[idx] - O[idx]) * O[idx] * (1 - O[idx]);
        }
        __syncthreads();

        // WARNING: __syncthreads() only synchronizes within a block. The loop below
        // reads err values written by other blocks, so this phase is only correct when
        // the kernel runs as a single block; see the two-kernel split sketched below.
        if (idx < NodeNum)
        {
            for (int i = 0; i < SampleNum; ++i)
            {
                bias[idx] += ALFA * err[idx + i * NodeNum];
            }
        }
    }
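
    Because the bias phase reads err written by other blocks, a safer arrangement is two kernels, letting the launch boundary act as the grid-wide sync. A minimal sketch (both kernel names are introduced here):

    // Pass 1: compute the output-layer deltas over the whole grid.
    __global__ void ErrLastLayer(float* err, const float* Targets, const float* O, int N)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += BLOCK_NUM * THREAD_NUM)
            err[i] = (Targets[i] - O[i]) * O[i] * (1 - O[i]);
    }

    // Pass 2: one thread per node accumulates the bias update across samples.
    __global__ void BiasFromErr(float* bias, const float* err, int NodeNum, int SampleNum)
    {
        int node = blockIdx.x * blockDim.x + threadIdx.x;
        if (node < NodeNum)
        {
            float acc = 0.0f;
            for (int s = 0; s < SampleNum; ++s)
                acc += err[node + s * NodeNum];
            bias[node] += ALFA * acc;
        }
    }

    // On the same stream the second launch cannot start before the first finishes:
    // ErrLastLayer<<<BLOCK_NUM, THREAD_NUM>>>(err, Targets, O, NodeNum * SampleNum);
    // BiasFromErr<<<BLOCK_NUM, THREAD_NUM>>>(bias, err, NodeNum, SampleNum);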

    ---------------------------------------------

    Interface for the last layer's weight, err, and bias:

    void BackPropagationLastLayer(float* weight, float* err, float* bias, float* Targets, float* ONext, float* OPre, const int NodeNum_Next, const int NodeNum_Pre, const int SampleNum)
    {
        ErroBiasLastlayer<<<BLOCK_NUM, THREAD_NUM>>>(err, bias, Targets, ONext, NodeNum_Next, SampleNum);

        cublasHandle_t handle;
        cublasStatus_t ret;
        ret = cublasCreate(&handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
        const float alpha = ALFA;
        const float beta = 1.0f;   // accumulate: weight += ALFA * err * OPre^T (beta = 0 would overwrite the weights)

        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, NodeNum_Next, NodeNum_Pre, SampleNum, &alpha, err, NodeNum_Next, OPre, NodeNum_Pre, &beta, weight, NodeNum_Next);

        /*************ADD************/

        ret = cublasDestroy(handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasDestroy returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
    }

    void BackPropagationCore(float* weight, float* err, float* bias, float* errNext, float* ONext, float* OPre, const int NodeNum, const int SampleNum)
    {
        // TODO: inner-layer update; see the sketch below.
    }
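
    One plausible implementation, consistent with the last-layer routine and the ErroBias kernel in the next section. The extended signature (the next layer's weights and all three node counts) is an assumption; the original stub omits them:

    // Hedged sketch of an inner-layer update.
    void BackPropagationCoreSketch(float* weight, float* err, float* bias,
                                   float* weightNext, float* errNext,
                                   float* OThis, float* OPre,
                                   const int NodeNum, const int NodeNum_Next,
                                   const int NodeNum_Pre, const int SampleNum)
    {
        cublasHandle_t handle;
        cublasCreate(&handle);
        const float one = 1.0f, zero = 0.0f, alpha = ALFA;

        // Err_Weight = weightNext^T * errNext, a NodeNum x SampleNum matrix.
        float* Err_Weight = 0;
        CUDA_CALL(cudaMalloc((void**)&Err_Weight, sizeof(float) * NodeNum * SampleNum));
        cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NodeNum, SampleNum, NodeNum_Next,
                    &one, weightNext, NodeNum_Next, errNext, NodeNum_Next, &zero, Err_Weight, NodeNum);

        // err = Err_Weight .* OThis .* (1 - OThis); bias += ALFA * row-sums of err.
        ErroBias<<<BLOCK_NUM, THREAD_NUM>>>(err, bias, Err_Weight, OThis, NodeNum, SampleNum);

        // weight += ALFA * err * OPre^T (beta = 1 accumulates into the weights).
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, NodeNum, NodeNum_Pre, SampleNum,
                    &alpha, err, NodeNum, OPre, NodeNum_Pre, &one, weight, NodeNum);

        CUDA_CALL(cudaFree(Err_Weight));
        cublasDestroy(handle);
    }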

    void BackPropagation(float* O0, float* O1, float* O2, float* Targets,
                         float* weight0, float* weight1, float* weight2,
                         float* bias0, float* bias1, float* bias2,
                         float* err0, float* err1, float* err2)
    {
        // TODO: full backward pass; a possible wiring is sketched below.
    }
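
    The driver is also empty. A hedged sketch of the intended call order (output layer first, then inward), reusing the routines above; the extra pSamples/SampleNum parameters and the core's extended signature are assumptions:

    void BackPropagationSketch(float* O0, float* O1, float* O2, float* Targets, float* pSamples,
                               float* weight0, float* weight1, float* weight2,
                               float* bias0, float* bias1, float* bias2,
                               float* err0, float* err1, float* err2, const int SampleNum)
    {
        // Output layer: err2, bias2, weight2 from Targets and O2; OPre is O1.
        BackPropagationLastLayer(weight2, err2, bias2, Targets, O2, O1,
                                 Num_Out, Num_Hide, SampleNum);

        // Hidden layer: back-propagate through weight2/err2; OPre is O0.
        BackPropagationCoreSketch(weight1, err1, bias1, weight2, err2, O1, O0,
                                  Num_Hide, Num_Out, Num_In, SampleNum);

        // First layer: back-propagate through weight1/err1; OPre is the input samples.
        BackPropagationCoreSketch(weight0, err0, bias0, weight1, err1, O0, pSamples,
                                  Num_In, Num_Hide, Num_In, SampleNum);
    }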

    -------------------------------------------------------------------------------

    Err and bias for the inner layers:

    __global__ void ErroBias(float* err, float* bias, float* Err_Weight, float* OThis, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use a grid-stride for loop instead:
        /*
        for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        {
            err[i] = Err_Weight[i] * OThis[i] * (1 - OThis[i]);
        }
        */
        if (idx < NodeNum * SampleNum)
        {
            // delta of an inner layer: (W^T * errNext) .* o .* (1 - o)
            err[idx] = Err_Weight[idx] * OThis[idx] * (1 - OThis[idx]);
        }
        __syncthreads();

        // Same caveat as ErroBiasLastlayer: __syncthreads() is block-local, so the
        // accumulation below is only safe with a single block (or a two-kernel split).
        if (idx < NodeNum)
        {
            for (int i = 0; i < SampleNum; ++i)
            {
                bias[idx] += ALFA * err[idx + i * NodeNum];
            }
        }
    }
