• tfs主控与数据节点读写流程—源码解读


    1. tfs节点的管理类介绍
    1.1 数据节点
       数据节点主要进行实际数据的存储与读写,其管理类是DataService,其职责如下:


       相关数据流的任务函数处理由handlePacketQueue函数完成,如下:

        // One data node. Abridged member list of the data node's service class.
        // DataService::handlePacketQueue (listed below) dispatches incoming packets
        // to task_manager_, client_request_server_ and check_manager_.
        class DataService
        {
          OpManager op_manager_;
          LeaseManager *lease_manager_;
          DataHelper data_helper_;
          TaskManager task_manager_;                     // receives the replicate/compact/EC task packets
          BlockManager *block_manager_;
          TrafficControl traffic_control_;
          ClientRequestServer client_request_server_;    // receives the file/block read-write packets
          WritableBlockManager writable_block_manager_;
          CheckManager check_manager_;                   // receives the check-block packets
          SyncManager*  sync_manager_;
          MigrateManager* migrate_manager_;
          TimeoutThreadHelperPtr  timeout_thread_;
          RunTaskThreadHelperPtr  task_thread_;
          RunCheckThreadHelperPtr check_thread_;
        };
        
        /**
         * Dispatch one queued packet to the component that handles its pcode.
         *
         * BaseService::handlePacketQueue() runs first; its result is returned
         * unchanged and nothing else happens when it is false. LOCAL_PACKET is
         * never dispatched. When the chosen handler (or the unknown-pcode default)
         * yields anything other than TFS_SUCCESS, reply_error_packet() is invoked
         * on the packet.
         *
         * @param packet  packet taken from the queue
         * @param args    opaque argument forwarded to BaseService::handlePacketQueue
         * @return the value returned by BaseService::handlePacketQueue
         */
        bool DataService::handlePacketQueue(tbnet::Packet* packet, void* args)
        {
          bool bret = BaseService::handlePacketQueue(packet, args);
          if (bret)
          {
            int32_t pcode = packet->getPCode();
            int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : TFS_SUCCESS;
            if (TFS_SUCCESS == ret)
            {
              switch (pcode)
              {
                case LIST_BLOCK_MESSAGE:
                  ret = list_blocks(dynamic_cast<ListBlockMessage*>(packet));
                  break;
                // block-level tasks are all routed to the task manager
                case REPLICATE_BLOCK_MESSAGE:
                case COMPACT_BLOCK_MESSAGE:
                case DS_COMPACT_BLOCK_MESSAGE:
                case DS_REPLICATE_BLOCK_MESSAGE:
                case RESP_DS_COMPACT_BLOCK_MESSAGE:
                case RESP_DS_REPLICATE_BLOCK_MESSAGE:
                case REQ_EC_MARSHALLING_MESSAGE:
                case REQ_EC_REINSTATE_MESSAGE:
                case REQ_EC_DISSOLVE_MESSAGE:
                case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
                  ret = task_manager_.handle(dynamic_cast<BaseTaskMessage*>(packet));
                  break;
                case GET_BLOCK_INFO_MESSAGE_V2:
                  ret = get_block_info(dynamic_cast<GetBlockInfoMessageV2*>(packet));
                  break;
                case GET_SERVER_STATUS_MESSAGE:
                  ret = get_server_status(dynamic_cast<GetServerStatusMessage*>(packet));
                  break;
                case STATUS_MESSAGE:
                  ret = get_ping_status(dynamic_cast<StatusMessage*>(packet));
                  break;
                case CLIENT_CMD_MESSAGE:
                  ret = client_command(dynamic_cast<ClientCmdMessage*>(packet));
                  break;
                // file/block read-write requests are all routed to the client request server
                case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
                case STAT_FILE_MESSAGE_V2:
                case READ_FILE_MESSAGE_V2:
                case WRITE_FILE_MESSAGE_V2:
                case CLOSE_FILE_MESSAGE_V2:
                case UNLINK_FILE_MESSAGE_V2:
                case NEW_BLOCK_MESSAGE_V2:
                case REMOVE_BLOCK_MESSAGE_V2:
                case READ_RAWDATA_MESSAGE_V2:
                case WRITE_RAWDATA_MESSAGE_V2:
                case READ_INDEX_MESSAGE_V2:
                case WRITE_INDEX_MESSAGE_V2:
                case QUERY_EC_META_MESSAGE:
                case COMMIT_EC_META_MESSAGE:
                case GET_ALL_BLOCKS_HEADER_MESSAGE:
                  ret = client_request_server_.handle(packet);
                  break;
                case REQ_CHECK_BLOCK_MESSAGE:
                case REPORT_CHECK_BLOCK_MESSAGE:
                  ret = check_manager_.handle(packet);
                  break;
                default:
                  // the literal was split over two lines in the listing; the line break is the "\n" escape
                  TBSYS_LOG(ERROR, "process packet pcode: %d\n", pcode);
                  ret = TFS_ERROR;
                  break;
              }
              if (common::TFS_SUCCESS != ret)
              {
                common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
                // dynamic_cast yields NULL when the packet is not a BasePacket; do not dereference it then
                if (NULL != msg)
                {
                  msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed");
                }
              }
            }
          }
          return bret;
        }
    

        其中的主要职责由task_manager_和client_request_server_完成:

      a. task_manager_ 负责执行数据节点的均衡、数据迁移、清理压缩、整理等操作,满足系统集群的平衡迁移、复制容错、节点扩容等需求;
      b. client_request_server_ 负责执行数据节点上文件的读、写、查询、删除等操作,直接响应客户端的请求。相应的代码如下:

        /**
         * Route a task message to the matching add_*_task / handle_complete routine.
         *
         * When the routine returns TFS_SUCCESS, a StatusMessage(STATUS_MESSAGE_OK)
         * reply is sent on the packet; otherwise no reply is sent here and the
         * error code is handed back to the caller.
         *
         * @param packet  task message; a NULL pointer is rejected with TFS_ERROR
         * @return TFS_SUCCESS, TFS_ERROR for NULL/unknown pcode, or the routine's error code
         */
        int TaskManager::handle(BaseTaskMessage* packet)
        {
          // callers pass the result of a dynamic_cast, which may be NULL
          if (NULL == packet)
          {
            return TFS_ERROR;
          }

          int pcode = packet->getPCode();
          int ret = TFS_SUCCESS;
          switch(pcode)
          {
            case REPLICATE_BLOCK_MESSAGE:
              ret = add_replicate_task(dynamic_cast<ReplicateBlockMessage*>(packet));
              break;
            case COMPACT_BLOCK_MESSAGE:
              ret = add_compact_task(dynamic_cast<NsRequestCompactBlockMessage*>(packet));
              break;
            case DS_REPLICATE_BLOCK_MESSAGE:
              ret = add_ds_replicate_task(dynamic_cast<DsReplicateBlockMessage*>(packet));
              break;
            case DS_COMPACT_BLOCK_MESSAGE:
              ret = add_ds_compact_task(dynamic_cast<DsCompactBlockMessage*>(packet));
              break;
            case REQ_EC_MARSHALLING_MESSAGE:
              ret = add_marshalling_task(dynamic_cast<ECMarshallingMessage*>(packet));
              break;
            case REQ_EC_REINSTATE_MESSAGE:
              ret = add_reinstate_task(dynamic_cast<ECReinstateMessage*>(packet));
              break;
            case REQ_EC_DISSOLVE_MESSAGE:
              ret = add_dissolve_task(dynamic_cast<ECDissolveMessage*>(packet));
              break;
            case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
              ret = add_resolve_conflict_task(dynamic_cast<NsReqResolveBlockVersionConflictMessage*>(packet));
              // break (not return) so that success is acknowledged like every other case
              break;
            case RESP_DS_REPLICATE_BLOCK_MESSAGE:
            case RESP_DS_COMPACT_BLOCK_MESSAGE:
              ret = handle_complete(packet);
              break;
            default:
              ret = TFS_ERROR;
              TBSYS_LOG(WARN, "unknown pcode : %d",  pcode);
              break;
          }

          if (TFS_SUCCESS == ret)
          {
            packet->reply(new StatusMessage(STATUS_MESSAGE_OK));
          }

          return ret;
        }
    
        /**
         * Route a client-facing packet to the matching file/block routine.
         *
         * @param packet  incoming packet; NULL yields EXIT_POINTER_NULL
         * @return the routine's result, EXIT_POINTER_NULL for a NULL packet, or
         *         EXIT_UNKNOWN_MSGTYPE when the pcode is not handled here
         */
        int ClientRequestServer::handle(tbnet::Packet* packet)
        {
          int ret = (NULL == packet) ? EXIT_POINTER_NULL : TFS_SUCCESS;
          if (TFS_SUCCESS == ret)
          {
            int32_t pcode = packet->getPCode();
            switch (pcode)
            {
              case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
                ret = report_block(dynamic_cast<CallDsReportBlockRequestMessage*>(packet));
                break;
              case STAT_FILE_MESSAGE_V2:
                ret = stat_file(dynamic_cast<StatFileMessageV2*>(packet));
                break;
              case READ_FILE_MESSAGE_V2:
                ret = read_file(dynamic_cast<ReadFileMessageV2*>(packet));
                break;
              case WRITE_FILE_MESSAGE_V2:
                ret = write_file(dynamic_cast<WriteFileMessageV2*>(packet));
                break;
              case CLOSE_FILE_MESSAGE_V2:
                ret = close_file(dynamic_cast<CloseFileMessageV2*>(packet));
                break;
              case UNLINK_FILE_MESSAGE_V2:
                ret = unlink_file(dynamic_cast<UnlinkFileMessageV2*>(packet));
                break;
              case NEW_BLOCK_MESSAGE_V2:
                ret = new_block(dynamic_cast<NewBlockMessageV2*>(packet));
                break;
              case REMOVE_BLOCK_MESSAGE_V2:
                ret = remove_block(dynamic_cast<RemoveBlockMessageV2*>(packet));
                break;
              case READ_RAWDATA_MESSAGE_V2:
                ret = read_raw_data(dynamic_cast<ReadRawdataMessageV2*>(packet));
                break;
              case WRITE_RAWDATA_MESSAGE_V2:
                ret = write_raw_data(dynamic_cast<WriteRawdataMessageV2*>(packet));
                break;
              case READ_INDEX_MESSAGE_V2:
                ret = read_index(dynamic_cast<ReadIndexMessageV2*>(packet));
                break;
              case WRITE_INDEX_MESSAGE_V2:
                ret = write_index(dynamic_cast<WriteIndexMessageV2*>(packet));
                break;
              case QUERY_EC_META_MESSAGE:
                ret = query_ec_meta(dynamic_cast<QueryEcMetaMessage*>(packet));
                break;
              case COMMIT_EC_META_MESSAGE:
                ret = commit_ec_meta(dynamic_cast<CommitEcMetaMessage*>(packet));
                break;
              case GET_ALL_BLOCKS_HEADER_MESSAGE:
                ret = get_all_blocks_header(dynamic_cast<GetAllBlocksHeaderMessage*>(packet));
                break;
              default:
                // the literal was split over two lines in the listing; the line break is the "\n" escape
                TBSYS_LOG(WARN, "process packet pcode: %d\n", pcode);
                ret = EXIT_UNKNOWN_MSGTYPE;
                break;
            }
          }

          return ret;
        }
    

     1.2 主控节点

       主控节点主要职责分为三块: 
        a. 各个控制流的任务管理与下发,如Block的创建,删除,复制,均衡,整理;
        b. 各个数据节点健康状态的检查;
        c. 元数据及元数据与各个数据节点上block的检索管理。
        其管理类是NameServer,负责的相关任务同样由handlePacketQueue函数处理,如下:

      // Abridged member list of the name server's service class.
      // NameServer::handlePacketQueue (listed below) reaches the client request
      // server through layout_manager_.
      class NameServer
        {
          LayoutManager layout_manager_;
          NameServerHeartManager master_slave_heart_manager_;
          HeartManagement heart_manager_;
        };
        
        
        // Abridged member list of the name server's layout manager, which
        // aggregates the block, server, task, oplog-sync and family managers
        // together with the client request server and a GC object manager.
        class LayoutManager
        {
            BlockManager block_manager_;
            ServerManager server_manager_;
            TaskManager task_manager_;
            OpLogSyncManager oplog_sync_mgr_;
            FamilyManager  family_manager_;
            ClientRequestServer client_request_server_;
            common::GCObjectManager<LayoutManager, common::BaseObject> gc_manager_;
        };
        
        /**
         * Dispatch one queued packet on the name server.
         *
         * BaseService::handlePacketQueue() runs first; its result is returned
         * unchanged and nothing else happens when it is false. LOCAL_PACKET is
         * never dispatched. When the chosen handler (or the unknown-pcode default)
         * yields anything other than TFS_SUCCESS, reply_error_packet() is invoked
         * on the message.
         *
         * @param packet  packet taken from the queue
         * @param args    opaque argument forwarded to BaseService::handlePacketQueue
         * @return the value returned by BaseService::handlePacketQueue
         */
        bool NameServer::handlePacketQueue(tbnet::Packet *packet, void *args)
        {
          bool bret = BaseService::handlePacketQueue(packet, args);
          if (bret)
          {
            int32_t pcode = packet->getPCode();
            int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : common::TFS_SUCCESS;
            if (TFS_SUCCESS == ret)
            {
              //TBSYS_LOG(DEBUG, "PCODE: %d", pcode);
              common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
              switch (pcode)
              {
                case GET_BLOCK_INFO_MESSAGE_V2:
                  ret = open(msg);
                  break;
                case BATCH_GET_BLOCK_INFO_MESSAGE_V2:
                  ret = batch_open(msg);
                  break;
                case REPLICATE_BLOCK_MESSAGE:
                case BLOCK_COMPACT_COMPLETE_MESSAGE:
                case REQ_EC_MARSHALLING_COMMIT_MESSAGE:
                case REQ_EC_REINSTATE_COMMIT_MESSAGE:
                case REQ_EC_DISSOLVE_COMMIT_MESSAGE:
                  ret = layout_manager_.get_client_request_server().handle(msg);
                  break;
                case SHOW_SERVER_INFORMATION_MESSAGE:
                  ret = show_server_information(msg);
                  break;
                case STATUS_MESSAGE:
                  ret = ping(msg);
                  break;
                case DUMP_PLAN_MESSAGE:
                  ret = dump_plan(msg);
                  break;
                case CLIENT_NS_KEEPALIVE_MESSAGE:
                  ret = client_keepalive(msg);
                  break;
                case CLIENT_CMD_MESSAGE:
                  ret = client_control_cmd(msg);
                  break;
                case REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
                  {
                    // named task_msg so that it no longer shadows the `packet` parameter
                    BaseTaskMessage* task_msg = dynamic_cast<BaseTaskMessage*>(msg);
                    if (NULL == task_msg)
                      ret = TFS_ERROR; // not a task message: report failure instead of dereferencing NULL
                    else if (0 == task_msg->get_seqno())
                      ret = resolve_block_version_conflict(msg);
                    else
                      ret = layout_manager_.get_client_request_server().handle(msg);
                  }
                  break;
                case REQ_GET_FAMILY_INFO_MESSAGE:
                  ret = get_family_info(msg);
                  break;
                case REPAIR_BLOCK_MESSAGE_V2:
                  ret = repair(msg);
                  break;
                case DS_APPLY_BLOCK_MESSAGE:
                  ret = apply_block(msg);
                  break;
                case DS_APPLY_BLOCK_FOR_UPDATE_MESSAGE:
                  ret = apply_block_for_update(msg);
                  break;
                case DS_GIVEUP_BLOCK_MESSAGE:
                  ret = giveup_block(msg);
                  break;
                default:
                  ret = EXIT_UNKNOWN_MSGTYPE;
                  TBSYS_LOG(WARN, "unknown msg type: %d", pcode);
                  break;
              }
              // msg is NULL when the packet is not a BasePacket; there is nothing to reply on then
              if (common::TFS_SUCCESS != ret && NULL != msg)
              {
                msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed, pcode: %d", pcode);
              }
            }
          }
          return bret;
        }
    

     2. 文件物理结构与逻辑结构设计介绍

      2.1 文件存储设计  
       文件存储设计官方相关公布图如下:


         在tfs系统里面,个人理解数据存储设计概念分为三层:
             a. 应用层:用户最关心的业务数据文件,如图片、文档等file_id。
             b. 逻辑层:开发人员最关心的,用LogicBlock表示。
             c. 物理层:运维人员最关心的,用BasePhysicalBlock表示。
          Block是逻辑上的概念,一个block包含多个bucket,每个bucket包含多个slot,每个slot与一个file_id一一对应,file_id在block内唯一。BasePhysicalBlock是物理磁盘上的概念,一个block包含一个或多个BasePhysicalBlock,具体数量与磁盘的碎片程度有关。
     2.2 关于Block
          每个block_id标识着一个Block,tfs逻辑上是以Block的方式管理文件内容的。block_id_在集群中是全局唯一的,由nameserver统一生成与管理;file_id_在每个block中是局部唯一的,所以通过二元组<block_id_,file_id_>可以定位集群中用户所需要的数据。
          另外,每个block的大小基本是固定配置的,例如默认64M。由于tfs与HDFS的需求不同,主要用于存储小文件,所以数据文件偏小;但当需要存储大文件时,也会将其拆分成多个小文件片段FileSegment。因此读取文件数据时的检索流程为:FileSegment-->LogicBlock-->FileInfoV2-->BasePhysicalBlock

     2.3 结构表达
        各个相应的类结构表达如下:

        // Information carried by a user upload request.
        // The <block_id_, file_id_> pair is what the surrounding text uses to
        // locate data in the cluster.
        struct FileSegment
        {
          uint64_t block_id_;   // id of the block
          uint64_t file_id_;    // id of the file inside that block
          int32_t offset_;      // presumably the segment's offset within the file — confirm
          int32_t length_;      // presumably the segment's length in bytes — confirm
        };
        
        // Index entry tying a logic block id to a physical block, with prev/next
        // links to other index entries and a set of small status bit-fields.
        // NOTE(review): the 20-bit fields are declared on a signed type, so the
        // largest positive value they can hold is 524287; the "<=1048575" remarks
        // would need an unsigned field — confirm against the real on-disk format
        // before changing anything here.
        struct BlockIndex
        {
          uint64_t logic_block_id_;
          int32_t  physical_block_id_:20;//<=1048575
          int32_t  physical_file_name_id_:20;//<=1048575 number 1~1048575
          int32_t  next_index_:20;//<=1048575
          int32_t  prev_index_:20;//<=1048575
          int8_t   index_:7;// 0 ~36(0: main block, 1~36: ext block)
          int8_t   status_:2;//0: uncomplete, 1: complete
          int8_t   split_flag_:2;//0: unsplit, 1:split
          int8_t   split_status_:2;//0: split uncomplete, 1: split complete
          int8_t   reserve_:3;//reserve
        };
        
        // Per-file metadata record kept in a block's index.
        // The trailing "30" is the author's size remark; presumably the packed
        // size in bytes — confirm, since no packing directive is shown here.
        struct FileInfoV2 //30
        {
          uint64_t id_; //file id
          int32_t  offset_; //offset in block file
          int32_t  size_:28;// file size
          int8_t   status_:4;//delete flag
          uint32_t crc_; // checksum
          int32_t  modify_time_;//modify time
          int32_t create_time_; // create time
          uint16_t next_;      //next index
        };
     
        // Header of a block's index: block info, throughput counters, offsets,
        // a sequence number and bucket/index counts.
        // The numeric remarks (56, 72, 48) are the author's; presumably byte
        // sizes of the corresponding parts — confirm.
        struct IndexHeaderV2
        {
          common::BlockInfoV2 info_;//56
          ThroughputV2 throughput_;//72
          int32_t used_offset_;//12 * 4 = 48
          int32_t avail_offset_;
          int32_t marshalling_offset_;
          uint32_t seq_no_;
          // the two names share the same 16-bit storage
          union
          {
            uint16_t file_info_bucket_size_;
            uint16_t index_num_;
          };
          uint16_t used_file_info_bucket_size_;
          int8_t  max_index_num_;
          int8_t  reserve_[27];
        };
        
        // A physical block: a byte range [start_, end_) style window of a block
        // file on disk, identified by physical_block_id_.
        class BasePhysicalBlock : public GCObject
        {
          public:
            int32_t physical_block_id_;
            int32_t start_;//the data start offset of this block file
            int32_t end_;//the data end offset of this block file
            // The listing wrote `file_op_(path)`, which is not a valid member
            // declaration; presumably the member is constructed from the block
            // file path in the constructor — confirm.
            FileOperation file_op_;
        };
        
        // A logic block: an index handle, a data handle and the list of physical
        // blocks that back it.
        class LogicBlock
        {
            BaseIndexHandle* index_handle_;
            DataHandle data_handle_;
            std::vector<PhysicalBlock*> physical_block_list_; //the physical block list of this logic block
        };
        
        // Keeps the logic blocks in a vector sorted with LogicBlockIdCompare.
        class LogicBlockManager
        {
            common::TfsSortedVector<BaseLogicBlock*, LogicBlockIdCompare> logic_blocks_;
        };
        
        // Keeps two vectors of physical blocks, both sorted with
        // PhysicalBlockIdCompare.
        class PhysicalBlockManager
        {
            common::TfsSortedVector<BasePhysicalBlock*,PhysicalBlockIdCompare> physical_blocks_;
            // presumably the subset that has already been allocated — confirm
            common::TfsSortedVector<BasePhysicalBlock*,PhysicalBlockIdCompare> alloc_physical_blocks_;
        };
        
        // Data node block manager: combines the logic and physical block
        // managers with a GC object manager behind one read-write lock.
        class BlockManager
        {
            LogicBlockManager logic_block_manager_;
            PhysicalBlockManager physical_block_manager_;
            GCObjectManager   gc_manager_;
            mutable common::RWLock mutex_;   // mutable so that const members can lock it
        };
    

    3. 读取流程

       3.1 客户端

        int64_t TfsClientImplV2::read(const int fd, void* buf, const int64_t count)
        {
          int64_t ret = TFS_SUCCESS;
          if ((fd < 0) || (NULL == buf) || (count < 0))
          {
            ret = EXIT_PARAMETER_ERROR;
          }
          else
          {
            TfsFile* tfs_file = get_file(fd);
            if (NULL == tfs_file)
            {
              ret = EXIT_INVALIDFD_ERROR;
            }
            else
            {
              ret = tfs_file->read(buf, count);
              tfs_file->get_session()->update_stat(ST_READ, ret > 0);
            }
          }
          return ret;
        }
    
        
        /**
         * Pick the data server to read from.
         *
         * @return ds_[read_index_ % ds_.size()], or 0 when ds_ is empty
         */
        uint64_t File::get_read_ds() const
        {
          uint64_t server_id = 0;
          if (ds_.size() > 0)
          {
            // the modulo keeps the index in range whatever read_index_ holds
            server_id = ds_[read_index_%ds_.size()];
          }
          // the listing fell off the end of a non-void function here
          return server_id;
        }
    

     3.2 服务端

        /**
         * Read data: serve a client read request on the data node.
         *
         * Steps visible here: validate ids/offset/length, stat the file (through
         * the block manager, or through the data helper's degrade path when the
         * request carries a valid family id), truncate the requested length to
         * the file size, read the bytes into a response message and reply.
         * A length truncated to 0 is answered with an empty response so that the
         * client learns it reached the end of the file.
         *
         * @param message  the read request
         * @return TFS_SUCCESS, EXIT_PARAMETER_ERROR, or the error returned by the
         *         stat/read/reply step that failed (no reply is sent here then)
         */
        int ClientRequestServer::read_file(ReadFileMessageV2* message)
        {
          TIMER_START();
          uint64_t block_id = message->get_block_id();
          uint64_t attach_block_id = message->get_attach_block_id();
          uint64_t file_id = message->get_file_id();
          int32_t length = message->get_length();
          int32_t offset = message->get_offset();
          int8_t flag = message->get_flag();
          // not referenced again in this excerpt
          uint64_t peer_id = message->get_connection()->getPeerId();
          const FamilyInfoExt& family_info = message->get_family_info();

          int ret = ((INVALID_BLOCK_ID == block_id) || (INVALID_FILE_ID == file_id) ||
              (offset < 0) || (length <= 0)) ? EXIT_PARAMETER_ERROR : TFS_SUCCESS;

          if (TFS_SUCCESS == ret)
          {
            FileInfoV2 file_info;
            file_info.id_ = file_id;
            if (INVALID_FAMILY_ID == family_info.family_id_)
            {
              ret = get_block_manager().stat(file_info,
                  FORCE_STAT, block_id, message->get_attach_block_id());
            }
            else
            {
              ret = get_data_helper().stat_file_degrade(block_id,
                  file_id, FORCE_STAT, family_info, file_info);
            }

            if (TFS_SUCCESS == ret)
            {
              // truncate read length; written as a subtraction so that two large
              // non-negative int32 values cannot overflow when added
              if (length > file_info.size_ - offset)
              {
                length = file_info.size_ - offset;
              }

              // offset beyond the end of the file leaves a negative length
              ret = (length < 0) ? EXIT_PARAMETER_ERROR: TFS_SUCCESS;
              if (TFS_SUCCESS == ret)
              {
                ReadFileRespMessageV2* resp_msg = new (std::nothrow) ReadFileRespMessageV2();
                // nothrow new yields NULL on failure: check the response, not the request
                assert(NULL != resp_msg);

                // if length is truncated to 0
                // reply a packet with length 0 to tell client that it already reach to the end of file
                if (0 == length)
                {
                  resp_msg->set_length(0);
                }
                else
                {
                  char* buffer = resp_msg->alloc_data(length);
                  assert(NULL != buffer);
                  if (INVALID_FAMILY_ID == family_info.family_id_)
                  {
                    TimeStat timer;
                    timer.start();
                    ret = get_block_manager().read(buffer,
                        length, offset, file_id, flag, block_id, attach_block_id);
                    timer.end();
                    // a non-negative result is folded into TFS_SUCCESS
                    ret = (ret < 0) ? ret: TFS_SUCCESS;

                    // log slow read request
                    if (TFS_SUCCESS == ret && timer.duration() > SYSPARAM_DATASERVER.max_io_warn_time_)
                    {
                      // arguments follow the order of the format string: blockid first, attach_blockid second
                      TBSYS_LOG(WARN, "slow read request. blockid: %"PRI64_PREFIX"u, "
                          "attach_blockid: %"PRI64_PREFIX"u, fileid: %"PRI64_PREFIX"u, cost: %"PRI64_PREFIX"d",
                          block_id, attach_block_id, file_id, timer.duration());
                    }
                  }
                  else
                  {
                    ret = get_data_helper().read_file_degrade(block_id,
                        file_info, buffer, length, offset, flag, family_info);
                  }
                }

                if (TFS_SUCCESS != ret)
                {
                  // upper layer will reply error packet to client
                  tbsys::gDelete(resp_msg);
                }
                else
                {
                  // readv2 support
                  if (flag & READ_DATA_OPTION_WITH_FINFO)
                  {
                    resp_msg->set_file_info(file_info);
                  }
                  ret = message->reply(resp_msg);
                  if (TFS_SUCCESS == ret)
                  {
                    get_traffic_control().rw_traffic_stat(false, length);
                  }
                }
              }
            }
          }

          TIMER_END();
          return ret;
        }
        
        // b BlockManager读取buf,从logic_block_id
        int BlockManager::read(char* buf, int32_t& nbytes, const int32_t offset,
            const uint64_t fileid, const int8_t flag, const uint64_t logic_block_id, const uint64_t attach_logic_block_id)
        {
          int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0 && INVALID_FILE_ID != fileid && flag >= 0
              && INVALID_BLOCK_ID != logic_block_id && INVALID_BLOCK_ID != attach_logic_block_id) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
          if (TFS_SUCCESS == ret)
          {
            BaseLogicBlock* logic_block = get(logic_block_id);
            ret = (NULL != logic_block) ? TFS_SUCCESS  : EXIT_NO_LOGICBLOCK_ERROR;
            if (TFS_SUCCESS == ret)
            {
              ret = logic_block->read(buf, nbytes, offset, fileid, flag, attach_logic_block_id);
            }
          }
          return ret;
        }
        
        //c 从physical_block读取真实数据,一个logic_block可能包含多个physic_blocks
        int DataHandle::pread(char* buf, const int32_t nbytes, const int32_t offset)
        {
          int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
          if (TFS_SUCCESS == ret)
          {
            PhysicalBlock* physical_block = NULL;
            int32_t inner_offset = 0, length = nbytes, current_offset = offset;
            int32_t inner_length = 0, mem_offset = 0,  total_read_length = 0;
            while ((TFS_SUCCESS == ret) && (current_offset < (offset + nbytes)))
            {
              inner_length  = length;
              ret = logic_block_.choose_physic_block(physical_block, inner_length, inner_offset, current_offset);
              if (TFS_SUCCESS == ret)
              {
                length = std::min(length, inner_length);
                ret = physical_block->pread((buf + mem_offset), length, inner_offset);
                ret = ret >= 0 ? TFS_SUCCESS : ret;
                if (TFS_SUCCESS == ret)
                {
                  current_offset += length;
                  mem_offset     += length;
                  total_read_length += length;
                  length         =  nbytes - total_read_length;
                }
              }
            }
          }
          return TFS_SUCCESS == ret ? nbytes : ret;
        }
    

     4. 写流程

       4.1 客户端

        /**
         * Client side write (abridged): build a WriteFileMessageV2 from the
         * file's state and send it to the data server returned by
         * file_.get_write_ds().
         *
         * @param buf    data to write
         * @param count  number of bytes in buf
         * @return TFS_SUCCESS, EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR when no
         *         client could be created, or the result of send_msg_to_server
         */
        int TfsFile::write_ex(const char* buf, int64_t count)
        {
          int ret = TFS_SUCCESS;
          uint64_t server = 0;
          tbnet::Packet* resp_msg = NULL;
          NewClient* client = NewClientManager::get_instance().create_client();
          if (NULL == client)
          {
            ret = EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR;
            TBSYS_LOG(WARN, "create new client fail.");
          }
          else
          {
            WriteFileMessageV2 msg;
            msg.set_block_id(fsname_.get_block_id());
            // block id and attach block id are both taken from the file name here
            msg.set_attach_block_id(fsname_.get_block_id());
            msg.set_file_id(fsname_.get_file_id());
            msg.set_offset(file_.offset_);
            msg.set_length(count);
            msg.set_lease_id(file_.lease_id_);
            msg.set_master_id(file_.get_write_ds());
            msg.set_version(file_.version_);
            msg.set_flag(file_.opt_flag_);
            msg.set_ds(file_.ds_);
            msg.set_data(buf);
            if (file_.has_family())
            {
              msg.set_family_info(file_.family_info_);
            }
            server = file_.get_write_ds();
            ret = send_msg_to_server(server, client, &msg, resp_msg, ClientConfig::wait_timeout_);
            // NOTE(review): this excerpt neither inspects resp_msg nor releases
            // client; presumably the full implementation does both — confirm.
          }
          // the listing fell off the end of a non-void function here
          return ret;
        }
        
          // Data server that receives writes: the first entry of ds_, or 0 when
          // ds_ is empty.
          uint64_t File::get_write_ds() const
        {
          if (ds_.size() > 0)
          {
            return ds_[0];
          }
          const uint64_t no_server = 0;
          return no_server;
        }
    

      4.2 服务端

       a. NameServer逻辑过程

    	// Create a new block on behalf of the system (name server side).
    	//
    	// Visible steps: require block_id == 0 on entry, obtain a fresh id from
    	// get_alive_block_id_(), insert a BlockCollect into the block manager,
    	// top the target server list up to max_replication_ entries, send the
    	// creation message, build the block/server relation and write the log.
    	// The inserted block is removed again when no target server is available
    	// or when sending the message fails.
    	//
    	// Returns the new block, or NULL when any step failed.
    	//
    	// NOTE(review): block_id, server and now are used but not declared in this
    	// listing; presumably they are parameters trimmed from the signature —
    	// confirm against the real source.
    	BlockCollect* LayoutManager::add_new_block_helper_create_by_system_()
    	{
          BlockCollect* block = NULL;
          // a caller-supplied block id is not accepted on this path
          int32_t ret =  (0 == block_id) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
          if (TFS_SUCCESS == ret)
          {
            // result: all chosen servers; news: only the ones chosen in this call
            uint64_t result[MAX_REPLICATION_NUM];
            ArrayHelper<uint64_t> helper(MAX_REPLICATION_NUM, result);
            uint64_t news[MAX_REPLICATION_NUM];
            ArrayHelper<uint64_t> news_helper(MAX_REPLICATION_NUM, news);
            if (NULL != server)
              helper.push_back(server->id());
    
            block_id = get_alive_block_id_(false);
            ret = (INVALID_BLOCK_ID == block_id) ? EXIT_BLOCK_ID_INVALID_ERROR : TFS_SUCCESS;
            if (TFS_SUCCESS == ret)
            {
              //add block collect object
              block = get_block_manager().insert(block_id, now, true);
              ret = (NULL != block) ? TFS_SUCCESS : EXIT_NO_BLOCK;
            }
    
            if (TFS_SUCCESS == ret)
            {
              // number of replicas still missing from the target list
              int32_t count = SYSPARAM_NAMESERVER.max_replication_ - helper.get_array_index();
              if (count > 0)
              {
                get_server_manager().choose_create_block_target_server(helper, news_helper, count);
              }
              BlockCollect* pobject = NULL;
              ret = !helper.empty() ? TFS_SUCCESS : EXIT_CHOOSE_CREATE_BLOCK_TARGET_SERVER_ERROR;
              if (TFS_SUCCESS == ret)//add block collect object successful
              {
                ret = add_new_block_helper_send_msg_(block_id, helper);
                if (TFS_SUCCESS == ret)
                {
                  //build relation
                  ret = add_new_block_helper_build_relation_(block, helper, now);
                  if (TFS_SUCCESS == ret)
                  {
                    add_new_block_helper_write_log_(block_id, helper, now);
                  }
                }//end send message to dataserver successful
                else
                {
                  get_block_manager().remove(pobject, block_id);//rollback
                }
              }
              else
              {
                get_block_manager().remove(pobject, block_id);//rollback
              }
              // NOTE(review): pobject is still NULL here unless a rollback ran;
              // presumably the GC manager tolerates NULL — confirm.
              get_gc_manager().insert(pobject, now);
            }
          }//end if (TFS_SUCCESS == ret) check parameter
          return TFS_SUCCESS == ret ? block : NULL;
        }
    	
    	//a.生成block_id
    	uint64_t BlockIdFactory::generation(const bool verify)
        {
          mutex_.lock();
          ++count_;
          uint64_t id = ++global_id_;
          assert(id <= MAX_BLOCK_ID);
          bool flush_flag = false;
          if (count_ >= SKIP_BLOCK_NUMBER)
          {
            flush_flag = true;
            count_ = 0;
          }
          mutex_.unlock();
          int32_t ret = common::TFS_SUCCESS;
          if (flush_flag)
          {
            ret = flush_(id);
            if (common::TFS_SUCCESS != ret)
            {
              TBSYS_LOG(WARN, "update global block id failed, id: %"PRI64_PREFIX"u, ret: %d", id, ret);
            }
          }
          if (common::TFS_SUCCESS == ret)
          {
            if (verify)
              id |= 0x8000000000000000;
          }
          return id;
        }
    	
    	// b. Choose data nodes.
    	//
    	// Makes up to `count` attempts to pick one more server for a new block.
    	// Every server picked is appended to both `news` (cleared on entry) and
    	// `result`, and its lan is added to the set of lans already in use, which
    	// is seeded from `result` through get_lans_().
    	//
    	// Returns count minus the number of servers picked in this call.
    	int ServerManager::choose_create_block_target_server(common::ArrayHelper<uint64_t>& result,
            common::ArrayHelper<uint64_t>& news, const int32_t count) const
        {
          news.clear();
          std::set<uint32_t> used_lans;
          get_lans_(used_lans, result);

          for (int32_t attempts_left = count; attempts_left > 0; --attempts_left)
          {
            ServerCollect* candidate = NULL;
            if (TFS_SUCCESS != choose_replciate_random_choose_server_base_lock_(candidate, result, used_lans))
            {
              continue;
            }
            assert(NULL != candidate);
            news.push_back(candidate->id());
            result.push_back(candidate->id());
            const uint32_t candidate_lan = Func::get_lan(candidate->id(), SYSPARAM_NAMESERVER.group_mask_);
            used_lans.insert(candidate_lan);
          }

          return count - news.get_array_index();
        }
    	
    	// Randomly probe servers_ under a read lock until one qualifies.
    	//
    	// A server qualifies when it is not full, is not listed in `except`, has
    	// disk type DATASERVER_DISK_TYPE_FULL and, if `lans` is non-empty, its lan
    	// is not in `lans`. The number of probes is the smaller of servers_.size()
    	// and choose_target_server_random_max_nums_.
    	//
    	// Returns TFS_SUCCESS with `result` set, or EXIT_NO_DATASERVER with
    	// `result` left NULL.
    	int ServerManager::choose_replciate_random_choose_server_base_lock_(ServerCollect*& result,
            const common::ArrayHelper<uint64_t>& except, const std::set<uint32_t>& lans) const
        {
          result = NULL;
          RWLock::Lock lock(rwmutex_, READ_LOCKER);
          int64_t size = std::min(servers_.size(), SYSPARAM_NAMESERVER.choose_target_server_random_max_nums_);

          for (int64_t probes_left = size; probes_left > 0 && NULL == result; --probes_left)
          {
            const int64_t pick = random() % servers_.size();
            ServerCollect* candidate = servers_.at(pick);
            assert(NULL != candidate);

            bool eligible = !candidate->is_full();
            eligible = eligible && !except.exist(candidate->id());
            eligible = eligible && (DATASERVER_DISK_TYPE_FULL == candidate->get_disk_type());

            // the lan filter only applies once some lan is already in use
            if (eligible && !lans.empty())
            {
              const uint32_t candidate_lan = Func::get_lan(candidate->id(), SYSPARAM_NAMESERVER.group_mask_);
              eligible = (0 == lans.count(candidate_lan));
            }

            if (eligible)
            {
              result = candidate;
            }
          }

          if (NULL == result)
          {
            return EXIT_NO_DATASERVER;
          }
          return TFS_SUCCESS;
    	}
    

       b. 数据节点逻辑过程   

         //服务端的响应分为三步

      1. prepare_op 校验lease_id合法性、准备block空间等;
      2. forward_op 异步发送数据至从副本节点data_slaves、同时写入本主副本节点上;
      3. write_file_callback 核实各个副本数据节点的写入成功与否、校验各个副本数据节点的版本一致性、返回写入结果消息。

    // Handles one write request (WriteFileMessageV2) on this data server.
    //
    // Steps, in the order performed below:
    //   1. reject requests with NULL data, a negative offset or a non-positive length;
    //   2. prepare the operation: prepare_op on the first write to the master or to a
    //      slave, reset_op on any later write;
    //   3. on the master, when more than one server is listed, post the request to
    //      the slaves with forward_op;
    //   4. write the data locally and record the local result with update_op;
    //   5. master: call write_file_callback when preparation succeeded, otherwise
    //      reply with a StatusMessage; slave: reply with a SlaveDsRespMessage that
    //      carries its server id, local block info and status.
    //
    // On the first write to the master the message itself is updated (block id,
    // attach block id, file id, lease id, flag) because the callback and the
    // slaves read those fields.
    //
    // Always returns TFS_SUCCESS; the outcome of the write is reported through the
    // reply message and the log line, not through the return value.
    int ClientRequestServer::write_file(WriteFileMessageV2* message)
        {
          TIMER_START();
          uint64_t block_id = message->get_block_id();
          uint64_t attach_block_id = message->get_attach_block_id();
          uint64_t file_id = message->get_file_id();
          uint64_t lease_id = message->get_lease_id();
          int32_t offset = message->get_offset();
          int32_t length = message->get_length();
          VUINT64 servers = message->get_ds(); // will copy vector
          const char* data = message->get_data();
          uint64_t master_id = message->get_master_id();
          uint64_t peer_id = message->get_connection()->getPeerId();
          int32_t flag = message->get_flag();
          const FamilyInfoExt& family_info = message->get_family_info();
          int64_t family_id = family_info.family_id_;
          DsRuntimeGlobalInformation& ds_info = DsRuntimeGlobalInformation::instance();
          // this server is the master when the request names its own id as master
          bool is_master = (master_id == ds_info.information_.id_);
          int32_t version = is_master ? -1 : message->get_version();  // master won't check version

          bool prepare_ok = false;
          int ret = TFS_SUCCESS;
          if ((NULL == data) || (offset < 0) || (length <= 0))
          {
            ret = EXIT_PARAMETER_ERROR;
          }

          // tbnet already receive this packet from network,
          // so the traffic is counted even when the parameters are rejected
          get_traffic_control().rw_traffic_stat(true, length);

          if (TFS_SUCCESS == ret)
          {
            if (is_master && INVALID_LEASE_ID == lease_id)
            {
              // first write to master
              ret = get_op_manager().prepare_op(attach_block_id,
                  file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
              if (TFS_SUCCESS == ret)
              {
                // callback & slave will use
                if (INVALID_BLOCK_ID == block_id) // data block
                {
                  block_id = attach_block_id;
                }
                message->set_block_id(block_id);
                message->set_attach_block_id(attach_block_id);
                message->set_file_id(file_id);
                message->set_lease_id(lease_id);
                message->set_flag(TFS_FILE_FIRST_WRITE_TO_SLAVE);
              }
            }
            else if (!is_master && (flag & TFS_FILE_FIRST_WRITE_TO_SLAVE))
            {
              // first write to slave
              ret = get_op_manager().prepare_op(attach_block_id,
                  file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
            }
            else
            {
              // not the first write, just reset operation
              ret = get_op_manager().reset_op(attach_block_id, file_id, lease_id, servers);
            }

            // async op prepare work finished
            prepare_ok = (TFS_SUCCESS == ret);
          }

          // post request to slaves
          if ((TFS_SUCCESS == ret) && (servers.size() > 1U) && is_master)
          {
            ret = get_op_manager().forward_op(message, attach_block_id, family_id, servers);
          }

          // local write file; the local result is recorded in the op via update_op
          BlockInfoV2 local;
          if (TFS_SUCCESS == ret)
          {
            ret = get_op_manager().write_file(block_id, attach_block_id,
                file_id, lease_id, data, length, offset, version, local);
            get_op_manager().update_op(attach_block_id, file_id, lease_id, ret, local);
          }

          // master check if all successful
          // slave response to master
          if (is_master)
          {
            if (prepare_ok)
            {
              // the callback is invoked even if forward_op or the local write failed
              write_file_callback(message);
            }
            else
            {
              message->reply(new StatusMessage(ret, "master prepare op fail"));
            }
          }
          else
          {
            SlaveDsRespMessage* resp_msg = new (std::nothrow) SlaveDsRespMessage();
            assert(NULL != resp_msg);
            resp_msg->set_server_id(ds_info.information_.id_);
            resp_msg->set_block_info(local);
            resp_msg->set_status(ret);
            message->reply(resp_msg);

            // slave write fail, release op
            if (TFS_SUCCESS != ret)
            {
              get_op_manager().release_op(attach_block_id, file_id, lease_id);
            }
          }

          TIMER_END();

          // Whitespace around PRI64_PREFIX is required: since C++11 a string literal
          // directly followed by an identifier is lexed as a user-defined literal.
          // The resulting format string is unchanged.
          TBSYS_LOG_DW(ret, "write file %s, blockid: %" PRI64_PREFIX "u, attach_blockid: %" PRI64_PREFIX "u, "
              "fileid: %" PRI64_PREFIX "u, leaseid: %" PRI64_PREFIX "u, length: %d, offset: %d, "
              "version: %d, role: %s, peer ip: %s, cost: %" PRI64_PREFIX "d, ret: %d",
              (TFS_SUCCESS == ret) ? "success" : "fail", block_id, attach_block_id, file_id, lease_id,
              length, offset, version, is_master ? "master": "slave",
              tbsys::CNetUtil::addrToString(peer_id).c_str(), TIMER_DURATION(), ret);

          return TFS_SUCCESS;
        }
        
        // Completion step of a write request, called by write_file on the master.
        //
        // Asks the op manager (check_op) whether the operation has finished. When it
        // has not, nothing is replied or logged here and the function just returns.
        // When it has:
        //   - failure: on a block version conflict a resolution is requested first;
        //     then an error packet is sent, the op is released and the failed write
        //     is reported to rw_stat;
        //   - success: a WriteFileRespMessageV2 with block id, file id and lease id
        //     is sent back.
        //
        // Always returns TFS_SUCCESS; the outcome is delivered through the reply.
        int ClientRequestServer::write_file_callback(WriteFileMessageV2* message)
        {
          uint64_t attach_block_id = message->get_attach_block_id();
          uint64_t file_id = message->get_file_id();
          uint64_t lease_id = message->get_lease_id();
          // signed, to match the %d conversions in the log below and the
          // int32_t locals that write_file uses for the same fields
          int32_t length = message->get_length();
          int32_t offset = message->get_offset();
          uint64_t peer_id = message->get_connection()->getPeerId();
          OpStat op_stat;

          int ret = TFS_SUCCESS;
          bool all_finish = get_op_manager().check_op(attach_block_id, file_id, lease_id, op_stat);
          if (all_finish)
          {
            ret = op_stat.status_;
            if (TFS_SUCCESS != ret)
            {
              // req ns resolve version conflict
              if (EXIT_BLOCK_VERSION_CONFLICT_ERROR == ret)
              {
                get_op_manager().resolve_block_version_conflict(attach_block_id, file_id, lease_id);
              }
              message->reply_error_packet(TBSYS_LOG_LEVEL(WARN), ret, op_stat.error_.str().c_str());

              // if fail, close will never happen, release op, expire writable block
              get_op_manager().release_op(attach_block_id, file_id, lease_id, ret);

              // only failed writes are reported to rw_stat from this function
              get_traffic_control().rw_stat(RW_STAT_TYPE_WRITE, ret, 0 == offset, length);
            }
            else
            {
              WriteFileRespMessageV2* resp_msg = new (std::nothrow) WriteFileRespMessageV2();
              assert(NULL != resp_msg);
              resp_msg->set_block_id(attach_block_id);
              resp_msg->set_file_id(file_id);
              resp_msg->set_lease_id(lease_id);
              message->reply(resp_msg);
            }

            // Whitespace around PRI64_PREFIX is required: since C++11 a string literal
            // directly followed by an identifier is lexed as a user-defined literal.
            // The resulting format string is unchanged.
            TBSYS_LOG_IW(ret, "WRITE file %s, ret: %d. blockid: %" PRI64_PREFIX "u, "
                "fileid: %" PRI64_PREFIX "u, leaseid: %" PRI64_PREFIX "u, "
                "length: %d, offset: %d, peer ip: %s, cost: %" PRI64_PREFIX "d",
                (TFS_SUCCESS == ret) ? "success": "fail", ret, attach_block_id,
                file_id, lease_id, length, offset,
                tbsys::CNetUtil::addrToString(peer_id).c_str(), op_stat.cost_);
          }

          return TFS_SUCCESS;
        }
    

    参考

         http://code.taobao.org/p/tfs/src/

  • 相关阅读:
    tips
    数学建模-预测模型优缺(搬运)
    数学建模-灰色预测模型GM(1,1)_MATLAB
    Floyd算法_MATLAB
    第二章 运算方法与运算器(浮点数的加减法,IEEE754标准32/64浮点规格化数)
    面向对象
    for循环
    if---else
    airflow的web任务管理
    airflow原理
  • 原文地址:https://www.cnblogs.com/gisorange/p/4948749.html
Copyright © 2020-2023  润新知