• 20140824


    Matrix transpose:

    __global__ void TransDtD(float* des, float* src, int srcH, int srcW)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If srcH*srcW > BLOCK_NUM*THREAD_NUM, use the grid-stride for loop;
        // otherwise the plain if-guard below is enough.
        for (int i = idx; i < srcH * srcW; i += BLOCK_NUM * THREAD_NUM)
        {
            int row = i / srcW;          // row index in the source matrix
            int col = i - row * srcW;    // column index in the source matrix
            des[col * srcH + row] = src[i];
        }
        /*
        if (idx < srcH * srcW)
        {
            int row = idx / srcW;
            int col = idx - row * srcW;
            des[col * srcH + row] = src[idx];
        }
        */
    }
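
    The writes above go to des with stride srcH, so they are uncoalesced. A shared-memory tiled variant is the usual remedy; a minimal sketch, assuming a 16x16 tile (TILE and TransTiled are names introduced here, not from the original):

    #define TILE 16   // assumed tile width

    // Launch with dim3 block(TILE, TILE), dim3 grid((srcW+TILE-1)/TILE, (srcH+TILE-1)/TILE).
    __global__ void TransTiled(float* des, const float* src, int srcH, int srcW)
    {
        __shared__ float tile[TILE][TILE + 1];     // +1 pads away shared-memory bank conflicts

        int x = blockIdx.x * TILE + threadIdx.x;   // column in src
        int y = blockIdx.y * TILE + threadIdx.y;   // row in src
        if (x < srcW && y < srcH)
            tile[threadIdx.y][threadIdx.x] = src[y * srcW + x];
        __syncthreads();

        // Write the tile out transposed; reads and writes are now both coalesced.
        x = blockIdx.y * TILE + threadIdx.x;       // row in src, offset within a des row
        y = blockIdx.x * TILE + threadIdx.y;       // column in src, row in des
        if (x < srcH && y < srcW)
            des[y * srcH + x] = tile[threadIdx.x][threadIdx.y];
    }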
    void TransDeviceToDevice(float* src, int srcH, int srcW)
    {
        float* des = 0;
        CUDA_CALL(cudaMalloc((void**)&des, sizeof(float) * srcH * srcW));

        // The kernel writes every element of des, so no initial copy is needed.
        TransDtD<<<BLOCK_NUM, THREAD_NUM>>>(des, src, srcH, srcW);

        // Copy the transposed result back over the source buffer.
        CUDA_CALL(cudaMemcpy(src, des, sizeof(float) * srcH * srcW, cudaMemcpyDeviceToDevice));
        CUDA_CALL(cudaFree(des));
    }
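
    These snippets depend on constants and a CUDA_CALL macro the post never shows. A minimal sketch of plausible definitions; every value below is an assumption:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    // Assumed launch configuration and network sizes (not stated in the original).
    #define BLOCK_NUM  32
    #define THREAD_NUM 256    // the MSE reduction below requires exactly 256
    #define Num_In     64     // placeholder layer sizes
    #define Num_Hide   128
    #define Num_Out    10
    #define ALFA       0.1f   // learning rate

    // Minimal error-checking macro matching how CUDA_CALL is used above.
    #define CUDA_CALL(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
        printf("CUDA error %s at line %d\n", cudaGetErrorString(e), __LINE__); \
        exit(EXIT_FAILURE); } } while (0)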

    ---------------------------------------------------------

    MSE (computing the mean squared error):

    Record the result in an MSE[BLOCK_NUM] array, one partial sum per block; the MSE computation can run concurrently with the BP stream. The host still has to add up the BLOCK_NUM partial values (see the sketch after the kernel).
    __global__ void MSE(float* mse, float* targets, float* output, const int Sample_Num)
    {
        const size_t thID  = threadIdx.x;
        const size_t bloID = blockIdx.x;

        __shared__ float sharedData[THREAD_NUM];
        sharedData[thID] = 0.0f;   // shared memory is uninitialized; zero it before accumulating

        for (size_t i = bloID * THREAD_NUM + thID; i < Sample_Num * Num_Out; i += BLOCK_NUM * THREAD_NUM)
        {
            sharedData[thID] += 0.5f * (targets[i] - output[i]) * (targets[i] - output[i]);
        }
        __syncthreads();

        // Tree reduction; assumes THREAD_NUM == 256.
        if (thID < 128) sharedData[thID] += sharedData[thID + 128];
        __syncthreads();
        if (thID < 64) sharedData[thID] += sharedData[thID + 64];
        __syncthreads();

        // Within the last warp. Relying on implicit warp-synchronous execution is
        // unsafe on newer architectures, so keep syncing between steps.
        if (thID < 32) sharedData[thID] += sharedData[thID + 32];
        __syncthreads();
        if (thID < 16) sharedData[thID] += sharedData[thID + 16];
        __syncthreads();
        if (thID < 8) sharedData[thID] += sharedData[thID + 8];
        __syncthreads();
        if (thID < 4) sharedData[thID] += sharedData[thID + 4];
        __syncthreads();
        if (thID < 2) sharedData[thID] += sharedData[thID + 2];
        __syncthreads();
        if (thID < 1) sharedData[thID] += sharedData[thID + 1];

        if (thID == 0)   // thread 0 writes this block's partial sum
        {
            mse[bloID] = sharedData[0];
        }
    }
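
    The kernel leaves BLOCK_NUM partial sums in mse; the final reduction is not shown in the post. A minimal host-side sketch (SumMSEOnHost is a name introduced here):

    // Copy the BLOCK_NUM partial sums back and finish the reduction on the CPU.
    float SumMSEOnHost(const float* d_mse)   // d_mse: device array of length BLOCK_NUM
    {
        float h_mse[BLOCK_NUM];
        CUDA_CALL(cudaMemcpy(h_mse, d_mse, sizeof(float) * BLOCK_NUM, cudaMemcpyDeviceToHost));
        float total = 0.0f;
        for (int b = 0; b < BLOCK_NUM; ++b)
            total += h_mse[b];
        return total;   // sum of 0.5*(t-o)^2 over all outputs; divide by Sample_Num for a mean
    }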

    --------------------------------------------------

    Parallel bias-add & SIGMOD activation:

    __global__ void SIGMOD(float* IO, float* Bias, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use the for loop; otherwise the if-guard.
        /*
        for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        {
            int row = i % NodeNum;
            IO[i] = 1.0f / (1.0f + expf(-IO[i] - Bias[row]));
        }
        */
        if (idx < NodeNum * SampleNum)
        {
            // IO is column-major NodeNum x SampleNum, so idx % NodeNum is the node (row) index.
            int row = idx % NodeNum;
            IO[idx] = 1.0f / (1.0f + expf(-IO[idx] - Bias[row]));
        }
    }
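
    To sanity-check the layout assumption (IO is column-major NodeNum x SampleNum, one bias per node), a plain CPU reference of the same operation (SigmoidRef is a name introduced here):

    #include <math.h>

    // CPU reference for SIGMOD: same column-major layout, same arithmetic.
    void SigmoidRef(float* IO, const float* Bias, int NodeNum, int SampleNum)
    {
        for (int s = 0; s < SampleNum; ++s)
            for (int n = 0; n < NodeNum; ++n)
            {
                int i = s * NodeNum + n;   // column-major linear index
                IO[i] = 1.0f / (1.0f + expf(-IO[i] - Bias[n]));
            }
    }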

    ---------------------------------------------

    FeedForward: produces each layer's outputs O0, O1, O2

    //Note: the samples must be preprocessed into column-major storage, i.e. a NUM_IN x SAMPLE_NUM matrix
    //Targets must be formatted as a NUM_OUT x SAMPLE_NUM sparse (one-hot) matrix
    void FeedForward(float* O0, float* O1, float* O2,
                     float* pSamples, const int SAMPLE_NUM,
                     float* weight0, float* weight1, float* weight2,
                     float* bias0, float* bias1, float* bias2)
    {
        cublasHandle_t handle;
        cublasStatus_t ret;
        ret = cublasCreate(&handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
        const float alpha = 1.0f;
        const float beta = 0.0f;

        // Input layer to first layer: O0 = weight0 * pSamples, then add bias0 and activate.
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Num_In, SAMPLE_NUM, Num_In, &alpha, weight0, Num_In, pSamples, Num_In, &beta, O0, Num_In);
        SIGMOD<<<BLOCK_NUM, THREAD_NUM>>>(O0, bias0, Num_In, SAMPLE_NUM);

        // First layer to second layer: O1 = weight1 * O0, then add bias1 and activate.
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Num_Hide, SAMPLE_NUM, Num_In, &alpha, weight1, Num_Hide, O0, Num_In, &beta, O1, Num_Hide);
        SIGMOD<<<BLOCK_NUM, THREAD_NUM>>>(O1, bias1, Num_Hide, SAMPLE_NUM);

        // Second layer to third layer: O2 = weight2 * O1, then add bias2 and activate.
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Num_Out, SAMPLE_NUM, Num_Hide, &alpha, weight2, Num_Out, O1, Num_Hide, &beta, O2, Num_Out);
        //ret = cublasSaxpy(handle, Num_Out, &alpha, bias2, 1, O2, 1);   // bias is handled inside SIGMOD instead
        SIGMOD<<<BLOCK_NUM, THREAD_NUM>>>(O2, bias2, Num_Out, SAMPLE_NUM);

        ret = cublasDestroy(handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasDestroy returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
    }
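
    A usage sketch, assuming the activations live on the device as column-major matrices with one column per sample (the buffer names are illustrative):

    // Allocate per-layer activation buffers and run one forward pass.
    float *O0, *O1, *O2;
    CUDA_CALL(cudaMalloc((void**)&O0, sizeof(float) * Num_In   * SAMPLE_NUM));
    CUDA_CALL(cudaMalloc((void**)&O1, sizeof(float) * Num_Hide * SAMPLE_NUM));
    CUDA_CALL(cudaMalloc((void**)&O2, sizeof(float) * Num_Out  * SAMPLE_NUM));

    // pSamples, the weights, and the biases are assumed to be on the device already.
    FeedForward(O0, O1, O2, pSamples, SAMPLE_NUM,
                weight0, weight1, weight2,
                bias0, bias1, bias2);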

    -------------------------------------------------------------------------------

    O*(1-O) (may turn out to be unnecessary):

    /*
    __global__ void OMulitply1_O(float* OutPut, float* InPut, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use the for loop; otherwise the if-guard.
        //for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        //{
        //    OutPut[i] = InPut[i] * (1 - InPut[i]);
        //}

        if (idx < NodeNum * SampleNum)
        {
            OutPut[idx] = InPut[idx] * (1 - InPut[idx]);
        }
    }
    */

    ---------------------------------------------------------------------------------

    Computing the last layer's err and bias in parallel:
    __global__ void ErroBiasLastlayer(float* err, float* bias, float* Targets, float* O, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use a grid-stride for loop instead:
        /*
        for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        {
            err[i] = (Targets[i] - O[i]) * O[i] * (1 - O[i]);
        }
        */
        if (idx < NodeNum * SampleNum)
        {
            // delta of the output layer: (t - o) * o * (1 - o)
            err[idx] = (Targets[idx] - O[idx]) * O[idx] * (1 - O[idx]);
        }
        __syncthreads();

        // WARNING: __syncthreads() only synchronizes within a block. The loop below
        // reads err values written by other blocks, so this phase is only correct when
        // the kernel runs as a single block; see the two-kernel split sketched below.
        if (idx < NodeNum)
        {
            for (int i = 0; i < SampleNum; ++i)
            {
                bias[idx] += ALFA * err[idx + i * NodeNum];
            }
        }
    }
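
    Because the bias phase reads err written by other blocks, a safer arrangement is two kernels, letting the launch boundary act as the grid-wide sync. A minimal sketch (both kernel names are introduced here):

    // Pass 1: compute the output-layer deltas over the whole grid.
    __global__ void ErrLastLayer(float* err, const float* Targets, const float* O, int N)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += BLOCK_NUM * THREAD_NUM)
            err[i] = (Targets[i] - O[i]) * O[i] * (1 - O[i]);
    }

    // Pass 2: one thread per node accumulates the bias update across samples.
    __global__ void BiasFromErr(float* bias, const float* err, int NodeNum, int SampleNum)
    {
        int node = blockIdx.x * blockDim.x + threadIdx.x;
        if (node < NodeNum)
        {
            float acc = 0.0f;
            for (int s = 0; s < SampleNum; ++s)
                acc += err[node + s * NodeNum];
            bias[node] += ALFA * acc;
        }
    }

    // On the same stream the second launch cannot start before the first finishes:
    // ErrLastLayer<<<BLOCK_NUM, THREAD_NUM>>>(err, Targets, O, NodeNum * SampleNum);
    // BiasFromErr<<<BLOCK_NUM, THREAD_NUM>>>(bias, err, NodeNum, SampleNum);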

    ---------------------------------------------

    Interface for the last layer's weight, err, and bias:

    void BackPropagationLastLayer(float* weight, float* err, float* bias, float* Targets, float* ONext, float* OPre, const int NodeNum_Next, const int NodeNum_Pre, const int SampleNum)
    {
        ErroBiasLastlayer<<<BLOCK_NUM, THREAD_NUM>>>(err, bias, Targets, ONext, NodeNum_Next, SampleNum);

        cublasHandle_t handle;
        cublasStatus_t ret;
        ret = cublasCreate(&handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
        const float alpha = ALFA;
        const float beta = 1.0f;   // accumulate: weight += ALFA * err * OPre^T (beta = 0 would overwrite the weights)

        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, NodeNum_Next, NodeNum_Pre, SampleNum, &alpha, err, NodeNum_Next, OPre, NodeNum_Pre, &beta, weight, NodeNum_Next);

        /*************ADD************/

        ret = cublasDestroy(handle);
        if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasDestroy returned error code %d, line(%d) ", ret, __LINE__); exit(EXIT_FAILURE); }
    }

    void BackPropagationCore(float* weight, float* err, float* bias, float* errNext, float* ONext, float* OPre, const int NodeNum, const int SampleNum)
    {
        // TODO: inner-layer update; see the sketch below.
    }
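
    One plausible implementation, consistent with the last-layer routine and the ErroBias kernel in the next section. The extended signature (the next layer's weights and all three node counts) is an assumption; the original stub omits them:

    // Hedged sketch of an inner-layer update.
    void BackPropagationCoreSketch(float* weight, float* err, float* bias,
                                   float* weightNext, float* errNext,
                                   float* OThis, float* OPre,
                                   const int NodeNum, const int NodeNum_Next,
                                   const int NodeNum_Pre, const int SampleNum)
    {
        cublasHandle_t handle;
        cublasCreate(&handle);
        const float one = 1.0f, zero = 0.0f, alpha = ALFA;

        // Err_Weight = weightNext^T * errNext, a NodeNum x SampleNum matrix.
        float* Err_Weight = 0;
        CUDA_CALL(cudaMalloc((void**)&Err_Weight, sizeof(float) * NodeNum * SampleNum));
        cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NodeNum, SampleNum, NodeNum_Next,
                    &one, weightNext, NodeNum_Next, errNext, NodeNum_Next, &zero, Err_Weight, NodeNum);

        // err = Err_Weight .* OThis .* (1 - OThis); bias += ALFA * row-sums of err.
        ErroBias<<<BLOCK_NUM, THREAD_NUM>>>(err, bias, Err_Weight, OThis, NodeNum, SampleNum);

        // weight += ALFA * err * OPre^T (beta = 1 accumulates into the weights).
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, NodeNum, NodeNum_Pre, SampleNum,
                    &alpha, err, NodeNum, OPre, NodeNum_Pre, &one, weight, NodeNum);

        CUDA_CALL(cudaFree(Err_Weight));
        cublasDestroy(handle);
    }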

    void BackPropagation(float* O0, float* O1, float* O2, float* Targets,
                         float* weight0, float* weight1, float* weight2,
                         float* bias0, float* bias1, float* bias2,
                         float* err0, float* err1, float* err2)
    {
        // TODO: full backward pass; a possible wiring is sketched below.
    }
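
    The driver is also empty. A hedged sketch of the intended call order (output layer first, then inward), reusing the routines above; the extra pSamples/SampleNum parameters and the core's extended signature are assumptions:

    void BackPropagationSketch(float* O0, float* O1, float* O2, float* Targets, float* pSamples,
                               float* weight0, float* weight1, float* weight2,
                               float* bias0, float* bias1, float* bias2,
                               float* err0, float* err1, float* err2, const int SampleNum)
    {
        // Output layer: err2, bias2, weight2 from Targets and O2; OPre is O1.
        BackPropagationLastLayer(weight2, err2, bias2, Targets, O2, O1,
                                 Num_Out, Num_Hide, SampleNum);

        // Hidden layer: back-propagate through weight2/err2; OPre is O0.
        BackPropagationCoreSketch(weight1, err1, bias1, weight2, err2, O1, O0,
                                  Num_Hide, Num_Out, Num_In, SampleNum);

        // First layer: back-propagate through weight1/err1; OPre is the input samples.
        BackPropagationCoreSketch(weight0, err0, bias0, weight1, err1, O0, pSamples,
                                  Num_In, Num_Hide, Num_In, SampleNum);
    }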

    -------------------------------------------------------------------------------

    Err and bias for the inner layers:

    __global__ void ErroBias(float* err, float* bias, float* Err_Weight, float* OThis, const int NodeNum, const int SampleNum)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;

        // If NodeNum*SampleNum > BLOCK_NUM*THREAD_NUM, use a grid-stride for loop instead:
        /*
        for (int i = idx; i < NodeNum * SampleNum; i += BLOCK_NUM * THREAD_NUM)
        {
            err[i] = Err_Weight[i] * OThis[i] * (1 - OThis[i]);
        }
        */
        if (idx < NodeNum * SampleNum)
        {
            // delta of an inner layer: (W^T * errNext) .* o .* (1 - o)
            err[idx] = Err_Weight[idx] * OThis[idx] * (1 - OThis[idx]);
        }
        __syncthreads();

        // Same caveat as ErroBiasLastlayer: __syncthreads() is block-local, so the
        // accumulation below is only safe with a single block (or a two-kernel split).
        if (idx < NodeNum)
        {
            for (int i = 0; i < SampleNum; ++i)
            {
                bias[idx] += ALFA * err[idx + i * NodeNum];
            }
        }
    }
