• Caffe参数交换源码分析




    1 . . . . . 
    2 if (gpus.size() > 1) {
    3     caffe::P2PSync<float> sync(solver, NULL, solver->param());
    4     sync.run(gpus);
    5   } else {
    6     LOG(INFO) << "Starting Optimization";
    7     solver->Solve();
    8   }

    当GPU的个数>1时,会先构造P2PSync对象sync(solver, NULL, solver->param()),即执行P2PSync类的构造函数,然后调用sync.run(gpus)。run()函数的代码如下:

     1 void P2PSync<Dtype>::run(const vector<int>& gpus) {
     2   vector<DevicePair> pairs;
     3   DevicePair::compute(gpus, &pairs);
     4   SolverParameter param(solver_->param());
     5   vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
     7   // Build the GPU tree by finding the parent for each solver
     8   for (int attempts = 0; attempts < pairs.size(); ++attempts) {. . . . . . .
     9   }
    10   for (int i = 1; i < syncs.size(); ++i) {
    11 syncs[i]->StartInternalThread();
    12   }
    13   solver_->Solve();
    14   for (int i = 1; i < syncs.size(); ++i) {
    15     syncs[i]->StopInternalThread();
    16   }
    17 }

    在run()函数中,首先会执行compute()函数,该函数的作用是产生GPU Pairs,GPU Pairs的含义是[parent:child],对于2个GPU而言,GPU Pairs为[-1:0],[0:1],其中根GPU的parent为-1,表示根GPU没有父节点。然后通过一个for循环构建GPU树,对于2个GPU而言,GPU树如下图所示:



    1 void InternalThread::StartInternalThread() {. . . . . 
    2   try {
    3     thread_.reset(new boost::thread(&InternalThread::entry, this, device, mode,
    4           rand_seed, solver_count, root_solver));
    5   }. . . . . . .
    6 }


    1 void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed,
    2 int solver_count, bool root_solver) {
    3 . . . . . .
    4   InternalThreadEntry();
    5 }


     1 void P2PSync<Dtype>::InternalThreadEntry() {
     2   Caffe::SetDevice(solver_->param().device_id());
     3   CHECK(Caffe::root_solver());
     4   Caffe::set_root_solver(false);
     5   // See if there is a defined seed and reset random state if so
     6   if (solver_->param().random_seed() >= 0) {
     7     Caffe::set_random_seed(
     8         solver_->param().random_seed() + solver_->param().device_id());
     9   }
    10   solver_->Step(solver_->param().max_iter() - initial_iter_);
    11 }


     1 void Solver<Dtype>::Step(int iters) {
     2   . . . . . . . . . .
     3   while (iter_ < stop_iter) {
     4 . . . . . . . . . .
     5     for (int i = 0; i < callbacks_.size(); ++i) {
     6       callbacks_[i]->on_start();
     7     }
     8     const bool display = param_.display() && iter_ % param_.display() == 0;
     9     net_->set_debug_info(display && param_.debug_info());
    10     // accumulate the loss and gradient
    11     Dtype loss = 0;
    12     for (int i = 0; i < param_.iter_size(); ++i) {
    13       loss += net_->ForwardBackward(bottom_vec);//计算loss,一次前后向
    14     }
    15     loss /= param_.iter_size();//loss归一化
    16     . . . . . . . 
    17     for (int i = 0; i < callbacks_.size(); ++i) {
    18       callbacks_[i]->on_gradients_ready();
    19     }
    20     ApplyUpdate();
    21     . . . . . . . . . .
    22     ++iter_;
    23   }
    24 }


    1 void P2PSync<Dtype>::on_start() {
    2   . . . . . . .
    3   // Wait for update from parent
    4   if (parent_) {
    5     P2PSync<Dtype> *parent = queue_.pop();//从队列中取出父GPU节点;队列为空时pop()会阻塞,直到父GPU把自己push进来
    6     CHECK(parent == parent_);
    7   }
    8   . . . . . .


    1 T BlockingQueue<T>::pop(const string& log_on_wait) {
    2   boost::mutex::scoped_lock lock(sync_->mutex_);
    3   while (queue_.empty()) {
    4     if (!log_on_wait.empty()) {
    5       LOG_EVERY_N(INFO, 1000)<< log_on_wait;
    6     }
    7     sync_->condition_.wait(lock);//如果queue_为空,就一直阻塞。
    8   }


    1 void Solver<Dtype>::Solve(const char* resume_file) {
    2   int start_iter = iter_;
    3 . . . . .
    4   //LOG(INFO) <<"This is the sign of the train begin?********Ni****Jian*********";  //test for nijian
    5   Step(param_.max_iter() - iter_);  
    6   . . . . .
    7 }


     1  . . . . .
     2 // Update children
     3   for (int i = children_.size() - 1; i >= 0; i--) {
     4     Dtype* src = data_;
     5     Dtype* dst = children_[i]->data_;
     6 #ifdef DEBUG
     7    . . . .
     8 #endif
     9     CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),
    10         cudaMemcpyDeviceToDevice, cudaStreamDefault));//把本GPU(父节点)的data_异步拷贝到子GPU的data_
    11     CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));//等待上面的异步拷贝完成,然后再通知子GPU
    12     children_[i]->queue_.push(this);
    13   }
    14 #endif
    15 }


    1 void BlockingQueue<T>::push(const T& t) {
    2   boost::mutex::scoped_lock lock(sync_->mutex_);
    3   queue_.push(t);
    4   lock.unlock();
    5   sync_->condition_.notify_one();
    6 }


    1 Dtype ForwardBackward(const vector<Blob<Dtype>* > & bottom) {
    2     Dtype loss;
    3     Forward(bottom, &loss);
    4     Backward();
    5     return loss;
    6   }


     1 void P2PSync<Dtype>::on_gradients_ready() {. . . . . . . .
     2   // Sum children gradients as they appear in the queue
     3   for (int i = 0; i < children_.size(); ++i) {
     4     P2PSync<Dtype> *child = queue_.pop();
     5     Dtype* src = child->parent_grads_;
     6     Dtype* dst = diff_;
     7 #ifdef DEBUG
     8     cudaPointerAttributes attributes;
     9     CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    10     CHECK(attributes.device == device);
    11     CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    12     CHECK(attributes.device == device);
    13 #endif
    14     caffe_gpu_add(size_, src, dst, dst);
    15   }  


     1   if (parent_) {
     2     Dtype* src = diff_;
     3     Dtype* dst = parent_grads_;
     4 #ifdef DEBUG
     5 #endif
     6     CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
     7         cudaMemcpyDeviceToDevice, cudaStreamDefault));
     8     CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
     9     parent_->queue_.push(this);
    10   } else {
    11     // Loss functions divide gradients by the batch size, so to compensate
    12     // for split batch, the root solver divides by number of solvers.
    13     caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
    14   }

















  • 相关阅读:
    洛谷P3165 [CQOI2014]排序机械臂
    Cognos 图表用图片取代”没有数据显示”
    A shallow summary of oracle log miner
    smarty 截取字符串,调用php中的方法,foreach循环
  • 原文地址:https://www.cnblogs.com/liuzhongfeng/p/7809689.html
Copyright © 2020-2023  润新知