1. Overview of TFS Node Management Classes

1.1 Data Node

The data node performs the actual storage, reading, and writing of data; its management class is DataService. Its duties are dispatched by the handlePacketQueue function, shown below:
// a data node
class DataService
{
  OpManager op_manager_;
  LeaseManager* lease_manager_;
  DataHelper data_helper_;
  TaskManager task_manager_;
  BlockManager* block_manager_;
  TrafficControl traffic_control_;
  ClientRequestServer client_request_server_;
  WritableBlockManager writable_block_manager_;
  CheckManager check_manager_;
  SyncManager* sync_manager_;
  MigrateManager* migrate_manager_;
  TimeoutThreadHelperPtr timeout_thread_;
  RunTaskThreadHelperPtr task_thread_;
  RunCheckThreadHelperPtr check_thread_;
};

bool DataService::handlePacketQueue(tbnet::Packet* packet, void* args)
{
  bool bret = BaseService::handlePacketQueue(packet, args);
  if (bret)
  {
    int32_t pcode = packet->getPCode();
    int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : TFS_SUCCESS;
    if (TFS_SUCCESS == ret)
    {
      switch (pcode)
      {
        case LIST_BLOCK_MESSAGE:
          ret = list_blocks(dynamic_cast<ListBlockMessage*>(packet));
          break;
        case REPLICATE_BLOCK_MESSAGE:
        case COMPACT_BLOCK_MESSAGE:
        case DS_COMPACT_BLOCK_MESSAGE:
        case DS_REPLICATE_BLOCK_MESSAGE:
        case RESP_DS_COMPACT_BLOCK_MESSAGE:
        case RESP_DS_REPLICATE_BLOCK_MESSAGE:
        case REQ_EC_MARSHALLING_MESSAGE:
        case REQ_EC_REINSTATE_MESSAGE:
        case REQ_EC_DISSOLVE_MESSAGE:
        case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
          ret = task_manager_.handle(dynamic_cast<BaseTaskMessage*>(packet));
          break;
        case GET_BLOCK_INFO_MESSAGE_V2:
          ret = get_block_info(dynamic_cast<GetBlockInfoMessageV2*>(packet));
          break;
        case GET_SERVER_STATUS_MESSAGE:
          ret = get_server_status(dynamic_cast<GetServerStatusMessage*>(packet));
          break;
        case STATUS_MESSAGE:
          ret = get_ping_status(dynamic_cast<StatusMessage*>(packet));
          break;
        case CLIENT_CMD_MESSAGE:
          ret = client_command(dynamic_cast<ClientCmdMessage*>(packet));
          break;
        case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
        case STAT_FILE_MESSAGE_V2:
        case READ_FILE_MESSAGE_V2:
        case WRITE_FILE_MESSAGE_V2:
        case CLOSE_FILE_MESSAGE_V2:
        case UNLINK_FILE_MESSAGE_V2:
        case NEW_BLOCK_MESSAGE_V2:
        case REMOVE_BLOCK_MESSAGE_V2:
        case READ_RAWDATA_MESSAGE_V2:
        case WRITE_RAWDATA_MESSAGE_V2:
        case READ_INDEX_MESSAGE_V2:
        case WRITE_INDEX_MESSAGE_V2:
        case QUERY_EC_META_MESSAGE:
        case COMMIT_EC_META_MESSAGE:
        case GET_ALL_BLOCKS_HEADER_MESSAGE:
          ret = client_request_server_.handle(packet);
          break;
        case REQ_CHECK_BLOCK_MESSAGE:
        case REPORT_CHECK_BLOCK_MESSAGE:
          ret = check_manager_.handle(packet);
          break;
        default:
          TBSYS_LOG(ERROR, "process packet pcode: %d ", pcode);
          ret = TFS_ERROR;
          break;
      }
      if (common::TFS_SUCCESS != ret)
      {
        common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
        msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed");
      }
    }
  }
  return bret;
}
The core responsibilities are carried out by task_manager_ and client_request_server_:

a. task_manager_ executes balancing, data migration, compaction, and reorganization operations on the data node, serving cluster-level needs such as load balancing and migration, replication for fault tolerance, and node scale-out;

b. client_request_server_ executes file reads, writes, stats, and unlinks on the data node, responding directly to client requests. The corresponding code is as follows:
int TaskManager::handle(BaseTaskMessage* packet)
{
  int pcode = packet->getPCode();
  int ret = TFS_SUCCESS;
  switch (pcode)
  {
    case REPLICATE_BLOCK_MESSAGE:
      ret = add_replicate_task(dynamic_cast<ReplicateBlockMessage*>(packet));
      break;
    case COMPACT_BLOCK_MESSAGE:
      ret = add_compact_task(dynamic_cast<NsRequestCompactBlockMessage*>(packet));
      break;
    case DS_REPLICATE_BLOCK_MESSAGE:
      ret = add_ds_replicate_task(dynamic_cast<DsReplicateBlockMessage*>(packet));
      break;
    case DS_COMPACT_BLOCK_MESSAGE:
      ret = add_ds_compact_task(dynamic_cast<DsCompactBlockMessage*>(packet));
      break;
    case REQ_EC_MARSHALLING_MESSAGE:
      ret = add_marshalling_task(dynamic_cast<ECMarshallingMessage*>(packet));
      break;
    case REQ_EC_REINSTATE_MESSAGE:
      ret = add_reinstate_task(dynamic_cast<ECReinstateMessage*>(packet));
      break;
    case REQ_EC_DISSOLVE_MESSAGE:
      ret = add_dissolve_task(dynamic_cast<ECDissolveMessage*>(packet));
      break;
    case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
      ret = add_resolve_conflict_task(dynamic_cast<NsReqResolveBlockVersionConflictMessage*>(packet));
      return ret;
    case RESP_DS_REPLICATE_BLOCK_MESSAGE:
    case RESP_DS_COMPACT_BLOCK_MESSAGE:
      ret = handle_complete(packet);
      break;
    default:
      ret = TFS_ERROR;
      TBSYS_LOG(WARN, "unknown pcode : %d", pcode);
      break;
  }
  if (TFS_SUCCESS == ret)
  {
    packet->reply(new StatusMessage(STATUS_MESSAGE_OK));
  }
  return ret;
}

int ClientRequestServer::handle(tbnet::Packet* packet)
{
  int ret = (NULL == packet) ? EXIT_POINTER_NULL : TFS_SUCCESS;
  if (TFS_SUCCESS == ret)
  {
    int32_t pcode = packet->getPCode();
    switch (pcode)
    {
      case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
        ret = report_block(dynamic_cast<CallDsReportBlockRequestMessage*>(packet));
        break;
      case STAT_FILE_MESSAGE_V2:
        ret = stat_file(dynamic_cast<StatFileMessageV2*>(packet));
        break;
      case READ_FILE_MESSAGE_V2:
        ret = read_file(dynamic_cast<ReadFileMessageV2*>(packet));
        break;
      case WRITE_FILE_MESSAGE_V2:
        ret = write_file(dynamic_cast<WriteFileMessageV2*>(packet));
        break;
      case CLOSE_FILE_MESSAGE_V2:
        ret = close_file(dynamic_cast<CloseFileMessageV2*>(packet));
        break;
      case UNLINK_FILE_MESSAGE_V2:
        ret = unlink_file(dynamic_cast<UnlinkFileMessageV2*>(packet));
        break;
      case NEW_BLOCK_MESSAGE_V2:
        ret = new_block(dynamic_cast<NewBlockMessageV2*>(packet));
        break;
      case REMOVE_BLOCK_MESSAGE_V2:
        ret = remove_block(dynamic_cast<RemoveBlockMessageV2*>(packet));
        break;
      case READ_RAWDATA_MESSAGE_V2:
        ret = read_raw_data(dynamic_cast<ReadRawdataMessageV2*>(packet));
        break;
      case WRITE_RAWDATA_MESSAGE_V2:
        ret = write_raw_data(dynamic_cast<WriteRawdataMessageV2*>(packet));
        break;
      case READ_INDEX_MESSAGE_V2:
        ret = read_index(dynamic_cast<ReadIndexMessageV2*>(packet));
        break;
      case WRITE_INDEX_MESSAGE_V2:
        ret = write_index(dynamic_cast<WriteIndexMessageV2*>(packet));
        break;
      case QUERY_EC_META_MESSAGE:
        ret = query_ec_meta(dynamic_cast<QueryEcMetaMessage*>(packet));
        break;
      case COMMIT_EC_META_MESSAGE:
        ret = commit_ec_meta(dynamic_cast<CommitEcMetaMessage*>(packet));
        break;
      case GET_ALL_BLOCKS_HEADER_MESSAGE:
        ret = get_all_blocks_header(dynamic_cast<GetAllBlocksHeaderMessage*>(packet));
        break;
      default:
        TBSYS_LOG(WARN, "process packet pcode: %d ", pcode);
        ret = EXIT_UNKNOWN_MSGTYPE;
        break;
    }
  }
  return ret;
}
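Note that TaskManager::handle only validates and enqueues a task before immediately replying STATUS_MESSAGE_OK; the replication, compaction, and EC work itself runs asynchronously on the task_thread_ member shown above, and completion is reported back later through the RESP_DS_REPLICATE_BLOCK_MESSAGE / RESP_DS_COMPACT_BLOCK_MESSAGE packets handled in handle_complete.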
1.2 Master Node

The master node's responsibilities fall into three areas:

a. managing and dispatching the control-flow tasks, such as block creation, deletion, replication, balancing, and compaction;

b. checking the health of each data node;

c. managing the metadata and its mapping to the blocks on each data node.

Its management class is NameServer; as on the data node, the relevant tasks are dispatched by a handlePacketQueue function, shown below:
class NameServer
{
  LayoutManager layout_manager_;
  NameServerHeartManager master_slave_heart_manager_;
  HeartManagement heart_manager_;
};

class LayoutManager
{
  BlockManager block_manager_;
  ServerManager server_manager_;
  TaskManager task_manager_;
  OpLogSyncManager oplog_sync_mgr_;
  FamilyManager family_manager_;
  ClientRequestServer client_request_server_;
  common::GCObjectManager<LayoutManager, common::BaseObject> gc_manager_;
};

bool NameServer::handlePacketQueue(tbnet::Packet* packet, void* args)
{
  bool bret = BaseService::handlePacketQueue(packet, args);
  if (bret)
  {
    int32_t pcode = packet->getPCode();
    int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : common::TFS_SUCCESS;
    if (TFS_SUCCESS == ret)
    {
      //TBSYS_LOG(DEBUG, "PCODE: %d", pcode);
      common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
      switch (pcode)
      {
        case GET_BLOCK_INFO_MESSAGE_V2:
          ret = open(msg);
          break;
        case BATCH_GET_BLOCK_INFO_MESSAGE_V2:
          ret = batch_open(msg);
          break;
        case REPLICATE_BLOCK_MESSAGE:
        case BLOCK_COMPACT_COMPLETE_MESSAGE:
        case REQ_EC_MARSHALLING_COMMIT_MESSAGE:
        case REQ_EC_REINSTATE_COMMIT_MESSAGE:
        case REQ_EC_DISSOLVE_COMMIT_MESSAGE:
          ret = layout_manager_.get_client_request_server().handle(msg);
          break;
        case SHOW_SERVER_INFORMATION_MESSAGE:
          ret = show_server_information(msg);
          break;
        case STATUS_MESSAGE:
          ret = ping(msg);
          break;
        case DUMP_PLAN_MESSAGE:
          ret = dump_plan(msg);
          break;
        case CLIENT_NS_KEEPALIVE_MESSAGE:
          ret = client_keepalive(msg);
          break;
        case CLIENT_CMD_MESSAGE:
          ret = client_control_cmd(msg);
          break;
        case REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
        {
          BaseTaskMessage* packet = dynamic_cast<BaseTaskMessage*>(msg);
          if (0 == packet->get_seqno())
            ret = resolve_block_version_conflict(msg);
          else
            ret = layout_manager_.get_client_request_server().handle(msg);
        }
        break;
        case REQ_GET_FAMILY_INFO_MESSAGE:
          ret = get_family_info(msg);
          break;
        case REPAIR_BLOCK_MESSAGE_V2:
          ret = repair(msg);
          break;
        case DS_APPLY_BLOCK_MESSAGE:
          ret = apply_block(msg);
          break;
        case DS_APPLY_BLOCK_FOR_UPDATE_MESSAGE:
          ret = apply_block_for_update(msg);
          break;
        case DS_GIVEUP_BLOCK_MESSAGE:
          ret = giveup_block(msg);
          break;
        default:
          ret = EXIT_UNKNOWN_MSGTYPE;
          TBSYS_LOG(WARN, "unknown msg type: %d", pcode);
          break;
      }
      if (common::TFS_SUCCESS != ret)
      {
        msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed, pcode: %d", pcode);
      }
    }
  }
  return bret;
}
2. Design of the File Physical and Logical Structures

2.1 File Storage Design

The officially published diagram of the file storage design is as follows:

In TFS, as I understand it, the data storage design is divided conceptually into three layers:

a. Application layer: the business data files users care about, such as images and documents, each identified by a file_id.

b. Logical layer: the layer developers care about most, represented by LogicBlock.

c. Physical layer: the layer operations staff care about most, represented by BasePhysicalBlock.

A Block is a logical concept: one block contains multiple buckets, each bucket contains multiple slots, and each slot corresponds one-to-one with a file_id, which is unique within the block. A BasePhysicalBlock is an on-disk concept: a block is backed by one or more BasePhysicalBlocks, depending on how fragmented the disk is.
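To make the bucket/slot idea concrete, here is a minimal illustrative sketch (not TFS code; Slot, find_in_block, and the chaining scheme are assumptions for illustration, loosely mirroring the next_ field of FileInfoV2 shown in section 2.3) of resolving a file_id inside one block through a hash bucket with chained slots:

#include <cstdint>
#include <vector>

// Illustrative slot: records a file_id plus a link to the next slot in
// the same bucket chain (compare FileInfoV2::next_ in section 2.3).
struct Slot
{
  uint64_t file_id_;
  uint16_t next_;   // index of the next slot in the chain, 0 == end of chain
};

// Hash the file_id onto a bucket, then walk the slot chain until the
// file_id matches. Returns the slot index, or -1 if not present.
int find_in_block(const std::vector<Slot>& slots,
                  const std::vector<uint16_t>& buckets,
                  const uint64_t file_id)
{
  uint16_t cursor = buckets[file_id % buckets.size()];
  while (0 != cursor)
  {
    if (slots[cursor].file_id_ == file_id)
      return cursor;
    cursor = slots[cursor].next_;
  }
  return -1;
}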
2.2 About Block

Each block_id identifies one Block; logically, TFS manages file content in units of blocks. block_id_ is globally unique within the cluster and is generated and managed solely by the nameserver, while file_id_ is unique only within its block, so the tuple <block_id_, file_id_> locates any piece of user data in the cluster.

In addition, the size of each block is essentially fixed by configuration, 64 MB by default. Unlike HDFS, TFS mainly manages and stores small files, so data files tend to be small; when a large file must be stored, it is split into multiple small FileSegments. When reading file data, the lookup flow is therefore: FileSegment --> LogicBlock --> FileInfoV2 --> BasePhysicalBlock.
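A hedged sketch of that lookup flow, with heavily simplified stand-ins for the real classes of section 2.3 (the map-based index and read_segment are illustrative assumptions, not TFS code):

#include <cstdint>
#include <cstring>
#include <map>

struct FileInfoV2Lite { int32_t offset_; int32_t size_; }; // in-block location

// Stand-in for LogicBlock: an index from file_id to its in-block location,
// plus the block's data. A real LogicBlock would map the in-block offset
// onto one of its BasePhysicalBlocks instead of a flat buffer.
struct LogicBlockLite
{
  std::map<uint64_t, FileInfoV2Lite> index_;
  const char* data_;

  int32_t read(const uint64_t file_id, char* buf, const int32_t len) const
  {
    std::map<uint64_t, FileInfoV2Lite>::const_iterator it = index_.find(file_id);
    if (index_.end() == it)
      return -1; // file not in this block
    const int32_t n = len < it->second.size_ ? len : it->second.size_;
    std::memcpy(buf, data_ + it->second.offset_, n);
    return n;
  }
};

// FileSegment --> LogicBlock --> FileInfoV2 --> physical data
int32_t read_segment(const std::map<uint64_t, LogicBlockLite>& blocks,
                     const uint64_t block_id, const uint64_t file_id,
                     char* buf, const int32_t len)
{
  std::map<uint64_t, LogicBlockLite>::const_iterator it = blocks.find(block_id);
  return blocks.end() == it ? -1 : it->second.read(file_id, buf, len);
}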
2.3 Structure Definitions

The corresponding class and struct definitions are as follows:
// information carried in a user upload request
struct FileSegment
{
  uint64_t block_id_;
  uint64_t file_id_;
  int32_t offset_;
  int32_t length_;
};

struct BlockIndex
{
  uint64_t logic_block_id_;
  int32_t physical_block_id_:20;     //<=1048575
  int32_t physical_file_name_id_:20; //<=1048575 number 1~1048575
  int32_t next_index_:20;            //<=1048575
  int32_t prev_index_:20;            //<=1048575
  int8_t index_:7;                   // 0~36 (0: main block, 1~36: ext block)
  int8_t status_:2;                  // 0: uncomplete, 1: complete
  int8_t split_flag_:2;              // 0: unsplit, 1: split
  int8_t split_status_:2;            // 0: split uncomplete, 1: split complete
  int8_t reserve_:3;                 // reserve
};

struct FileInfoV2 // 30
{
  uint64_t id_;         // file id
  int32_t offset_;      // offset in block file
  int32_t size_:28;     // file size
  int8_t status_:4;     // delete flag
  uint32_t crc_;        // checksum
  int32_t modify_time_; // modify time
  int32_t create_time_; // create time
  uint16_t next_;       // next index
};

struct IndexHeaderV2
{
  common::BlockInfoV2 info_; //56
  ThroughputV2 throughput_;  //72
  int32_t used_offset_;      //12 * 4 = 48
  int32_t avail_offset_;
  int32_t marshalling_offset_;
  uint32_t seq_no_;
  union
  {
    uint16_t file_info_bucket_size_;
    uint16_t index_num_;
  };
  uint16_t used_file_info_bucket_size_;
  int8_t max_index_num_;
  int8_t reserve_[27];
};

class BasePhysicalBlock : public GCObject
{
  public:
    int32_t physical_block_id_;
    int32_t start_;         // the data start offset of this block file
    int32_t end_;           // the data end offset of this block file
    FileOperation file_op_; // file operation handle opened on the block file path
};

class LogicBlock
{
  BaseIndexHandle* index_handle_;
  DataHandle data_handle_;
  std::vector<PhysicalBlock*> physical_block_list_; // the physical block list of this logic block
};

class LogicBlockManager
{
  common::TfsSortedVector<BaseLogicBlock*, LogicBlockIdCompare> logic_blocks_;
};

class PhysicalBlockManager
{
  common::TfsSortedVector<BasePhysicalBlock*, PhysicalBlockIdCompare> physical_blocks_;
  common::TfsSortedVector<BasePhysicalBlock*, PhysicalBlockIdCompare> alloc_physical_blocks_;
};

class BlockManager
{
  LogicBlockManager logic_block_manager_;
  PhysicalBlockManager physical_block_manager_;
  GCObjectManager gc_manager_;
  mutable common::RWLock mutex_;
};
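BlockIndex's next_index_/prev_index_ fields chain a logic block's extension blocks behind its main block. A minimal sketch of walking that chain to collect every physical block backing one logic block (illustrative; it assumes, as the field comments suggest, that the index array is addressed by physical block id and that 0 terminates the chain):

#include <cstdint>
#include <vector>

// Reduced view of BlockIndex: just the linkage needed for the walk.
struct BlockIndexLite
{
  uint64_t logic_block_id_;
  int32_t next_index_; // physical block id of the next ext block, 0 == end
};

// Start at the logic block's main physical block and follow next_index_
// through the ext blocks, collecting each physical block id on the way.
std::vector<int32_t> physical_blocks_of(const std::vector<BlockIndexLite>& index,
                                        const int32_t main_physical_id)
{
  std::vector<int32_t> chain;
  int32_t cursor = main_physical_id;
  while (0 != cursor)
  {
    chain.push_back(cursor);
    cursor = index[cursor].next_index_; // index[] addressed by physical block id
  }
  return chain;
}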
3. Read Flow

3.1 Client Side
int64_t TfsClientImplV2::read(const int fd, void* buf, const int64_t count)
{
  int64_t ret = TFS_SUCCESS;
  if ((fd < 0) || (NULL == buf) || (count < 0))
  {
    ret = EXIT_PARAMETER_ERROR;
  }
  else
  {
    TfsFile* tfs_file = get_file(fd);
    if (NULL == tfs_file)
    {
      ret = EXIT_INVALIDFD_ERROR;
    }
    else
    {
      ret = tfs_file->read(buf, count);
      tfs_file->get_session()->update_stat(ST_READ, ret > 0);
    }
  }
  return ret;
}

uint64_t File::get_read_ds() const
{
  uint64_t server_id = 0;
  if (ds_.size() > 0)
  {
    server_id = ds_[read_index_ % ds_.size()];
  }
  return server_id;
}
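get_read_ds rotates reads across the replica list through read_index_. A hedged sketch of the failover pattern this enables (illustrative; the real retry logic lives inside TfsFile and differs in detail): when a read fails, the client advances read_index_ and retries against the next replica.

#include <cstdint>
#include <functional>
#include <vector>

// Illustrative failover loop: try each replica at most once, selecting
// servers the same way File::get_read_ds() does with read_index_.
int read_with_failover(const std::vector<uint64_t>& ds, int32_t& read_index,
                       const std::function<int(uint64_t)>& read_from)
{
  int ret = -1; // no replica available
  for (size_t tries = 0; tries < ds.size() && ret < 0; ++tries)
  {
    const uint64_t server_id = ds[read_index % ds.size()];
    ret = read_from(server_id); // e.g. send a READ_FILE_MESSAGE_V2 to this replica
    if (ret < 0)
      ++read_index;             // rotate to the next replica on failure
  }
  return ret;
}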
3.2 Server Side
// read data
int ClientRequestServer::read_file(ReadFileMessageV2* message)
{
  TIMER_START();
  uint64_t block_id = message->get_block_id();
  uint64_t attach_block_id = message->get_attach_block_id();
  uint64_t file_id = message->get_file_id();
  int32_t length = message->get_length();
  int32_t offset = message->get_offset();
  int8_t flag = message->get_flag();
  uint64_t peer_id = message->get_connection()->getPeerId();
  const FamilyInfoExt& family_info = message->get_family_info();
  int ret = ((INVALID_BLOCK_ID == block_id) || (INVALID_FILE_ID == file_id) ||
      (offset < 0) || (length <= 0)) ? EXIT_PARAMETER_ERROR : TFS_SUCCESS;
  if (TFS_SUCCESS == ret)
  {
    FileInfoV2 file_info;
    file_info.id_ = file_id;
    if (INVALID_FAMILY_ID == family_info.family_id_)
    {
      ret = get_block_manager().stat(file_info, FORCE_STAT, block_id, message->get_attach_block_id());
    }
    else
    {
      ret = get_data_helper().stat_file_degrade(block_id, file_id, FORCE_STAT, family_info, file_info);
    }
    if (TFS_SUCCESS == ret)
    {
      // truncate read length
      if (offset + length > file_info.size_)
      {
        length = file_info.size_ - offset;
      }
      ret = (length < 0) ? EXIT_PARAMETER_ERROR : TFS_SUCCESS;
      if (TFS_SUCCESS == ret)
      {
        ReadFileRespMessageV2* resp_msg = new (std::nothrow) ReadFileRespMessageV2();
        assert(NULL != resp_msg);
        // if length is truncated to 0
        // reply a packet with length 0 to tell client that it already reach to the end of file
        if (0 == length)
        {
          resp_msg->set_length(0);
        }
        else
        {
          char* buffer = resp_msg->alloc_data(length);
          assert(NULL != buffer);
          if (INVALID_FAMILY_ID == family_info.family_id_)
          {
            TimeStat timer;
            timer.start();
            ret = get_block_manager().read(buffer, length, offset, file_id, flag, block_id, attach_block_id);
            timer.end();
            ret = (ret < 0) ? ret : TFS_SUCCESS;
            // log slow read request
            if (TFS_SUCCESS == ret && timer.duration() > SYSPARAM_DATASERVER.max_io_warn_time_)
            {
              TBSYS_LOG(WARN, "slow read request. blockid: %"PRI64_PREFIX"u, "
                  "attach_blockid: %"PRI64_PREFIX"u, fileid: %"PRI64_PREFIX"u, cost: %"PRI64_PREFIX"d",
                  attach_block_id, block_id, file_id, timer.duration());
            }
          }
          else
          {
            ret = get_data_helper().read_file_degrade(block_id, file_info, buffer, length, offset, flag, family_info);
          }
        }
        if (TFS_SUCCESS != ret)
        {
          // upper layer will reply error packet to client
          tbsys::gDelete(resp_msg);
        }
        else
        {
          // readv2 support
          if (flag & READ_DATA_OPTION_WITH_FINFO)
          {
            resp_msg->set_file_info(file_info);
          }
          ret = message->reply(resp_msg);
          if (TFS_SUCCESS == ret)
          {
            get_traffic_control().rw_traffic_stat(false, length);
          }
        }
      }
    }
  }
  TIMER_END();
  return ret;
}

// b. BlockManager reads into buf, addressed by logic_block_id
int BlockManager::read(char* buf, int32_t& nbytes, const int32_t offset, const uint64_t fileid,
    const int8_t flag, const uint64_t logic_block_id, const uint64_t attach_logic_block_id)
{
  int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0 && INVALID_FILE_ID != fileid && flag >= 0 &&
      INVALID_BLOCK_ID != logic_block_id && INVALID_BLOCK_ID != attach_logic_block_id) ?
      TFS_SUCCESS : EXIT_PARAMETER_ERROR;
  if (TFS_SUCCESS == ret)
  {
    BaseLogicBlock* logic_block = get(logic_block_id);
    ret = (NULL != logic_block) ? TFS_SUCCESS : EXIT_NO_LOGICBLOCK_ERROR;
    if (TFS_SUCCESS == ret)
    {
      ret = logic_block->read(buf, nbytes, offset, fileid, flag, attach_logic_block_id);
    }
  }
  return ret;
}

// c. read the actual data from the physical blocks; one logic block may span several physical blocks
int DataHandle::pread(char* buf, const int32_t nbytes, const int32_t offset)
{
  int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
  if (TFS_SUCCESS == ret)
  {
    PhysicalBlock* physical_block = NULL;
    int32_t inner_offset = 0, length = nbytes, current_offset = offset;
    int32_t inner_length = 0, mem_offset = 0, total_read_length = 0;
    while ((TFS_SUCCESS == ret) && (current_offset < (offset + nbytes)))
    {
      inner_length = length;
      ret = logic_block_.choose_physic_block(physical_block, inner_length, inner_offset, current_offset);
      if (TFS_SUCCESS == ret)
      {
        length = std::min(length, inner_length);
        ret = physical_block->pread((buf + mem_offset), length, inner_offset);
        ret = ret >= 0 ? TFS_SUCCESS : ret;
        if (TFS_SUCCESS == ret)
        {
          current_offset += length;
          mem_offset += length;
          total_read_length += length;
          length = nbytes - total_read_length;
        }
      }
    }
  }
  return TFS_SUCCESS == ret ? nbytes : ret;
}
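choose_physic_block is what lets pread cross physical-block boundaries: given an offset inside the logic block, it picks the physical block covering that offset and reports the offset inside it (inner_offset) plus the bytes remaining in it (inner_length), which the caller clamps its read length to. A hedged sketch of that mapping, assuming each physical block exposes its [start_, end_) extent in the logic block's address space (compare start_/end_ in BasePhysicalBlock):

#include <cstdint>
#include <vector>

struct Extent { int32_t start_; int32_t end_; }; // extent in logic-block space

// Illustrative choose_physic_block: locate the physical block covering
// 'offset' and report where inside it the read must begin and how many
// bytes that block can still supply.
int choose_physic_block(const std::vector<Extent>& extents, const int32_t offset,
                        int32_t& index, int32_t& inner_offset, int32_t& inner_length)
{
  for (size_t i = 0; i < extents.size(); ++i)
  {
    if (offset >= extents[i].start_ && offset < extents[i].end_)
    {
      index = static_cast<int32_t>(i);
      inner_offset = offset - extents[i].start_;
      inner_length = extents[i].end_ - offset; // bytes left in this physical block
      return 0;  // success
    }
  }
  return -1;     // offset lies beyond every physical block
}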
4. Write Flow

4.1 Client Side
// client writes data
int TfsFile::write_ex(const char* buf, int64_t count)
{
  int ret = TFS_SUCCESS;
  uint64_t server = 0;
  tbnet::Packet* resp_msg = NULL;
  NewClient* client = NewClientManager::get_instance().create_client();
  if (NULL == client)
  {
    ret = EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR;
    TBSYS_LOG(WARN, "create new client fail.");
  }
  else
  {
    WriteFileMessageV2 msg;
    msg.set_block_id(fsname_.get_block_id());
    msg.set_attach_block_id(fsname_.get_block_id());
    msg.set_file_id(fsname_.get_file_id());
    msg.set_offset(file_.offset_);
    msg.set_length(count);
    msg.set_lease_id(file_.lease_id_);
    msg.set_master_id(file_.get_write_ds());
    msg.set_version(file_.version_);
    msg.set_flag(file_.opt_flag_);
    msg.set_ds(file_.ds_);
    msg.set_data(buf);
    if (file_.has_family())
    {
      msg.set_family_info(file_.family_info_);
    }
    server = file_.get_write_ds();
    ret = send_msg_to_server(server, client, &msg, resp_msg, ClientConfig::wait_timeout_);
    // ... response handling omitted in this excerpt
  }
  return ret;
}

uint64_t File::get_write_ds() const
{
  uint64_t server_id = 0;
  if (ds_.size() > 0)
  {
    server_id = ds_[0]; // writes always go to the first replica, the master
  }
  return server_id;
}
4.2 Server Side

a. NameServer logic
// create a new block and choose its replica servers
BlockCollect* LayoutManager::add_new_block_helper_create_by_system_(uint64_t block_id, ServerCollect* server, const time_t now)
{
  BlockCollect* block = NULL;
  int32_t ret = (0 == block_id) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
  if (TFS_SUCCESS == ret)
  {
    uint64_t result[MAX_REPLICATION_NUM];
    ArrayHelper<uint64_t> helper(MAX_REPLICATION_NUM, result);
    uint64_t news[MAX_REPLICATION_NUM];
    ArrayHelper<uint64_t> news_helper(MAX_REPLICATION_NUM, news);
    if (NULL != server)
      helper.push_back(server->id());
    block_id = get_alive_block_id_(false);
    ret = (INVALID_BLOCK_ID == block_id) ? EXIT_BLOCK_ID_INVALID_ERROR : TFS_SUCCESS;
    if (TFS_SUCCESS == ret)
    {
      // add block collect object
      block = get_block_manager().insert(block_id, now, true);
      ret = (NULL != block) ? TFS_SUCCESS : EXIT_NO_BLOCK;
    }
    if (TFS_SUCCESS == ret)
    {
      int32_t count = SYSPARAM_NAMESERVER.max_replication_ - helper.get_array_index();
      if (count > 0)
      {
        get_server_manager().choose_create_block_target_server(helper, news_helper, count);
      }
      BlockCollect* pobject = NULL;
      ret = !helper.empty() ? TFS_SUCCESS : EXIT_CHOOSE_CREATE_BLOCK_TARGET_SERVER_ERROR;
      if (TFS_SUCCESS == ret) // add block collect object successful
      {
        ret = add_new_block_helper_send_msg_(block_id, helper);
        if (TFS_SUCCESS == ret)
        {
          // build relation
          ret = add_new_block_helper_build_relation_(block, helper, now);
          if (TFS_SUCCESS == ret)
          {
            add_new_block_helper_write_log_(block_id, helper, now);
          }
        } // end send message to dataserver successful
        else
        {
          get_block_manager().remove(pobject, block_id); // rollback
        }
      }
      else
      {
        get_block_manager().remove(pobject, block_id); // rollback
      }
      get_gc_manager().insert(pobject, now);
    }
  } // end if (TFS_SUCCESS == ret) check parameter
  return TFS_SUCCESS == ret ? block : NULL;
}

// a. generate a block_id
uint64_t BlockIdFactory::generation(const bool verify)
{
  mutex_.lock();
  ++count_;
  uint64_t id = ++global_id_;
  assert(id <= MAX_BLOCK_ID);
  bool flush_flag = false;
  if (count_ >= SKIP_BLOCK_NUMBER)
  {
    flush_flag = true;
    count_ = 0;
  }
  mutex_.unlock();
  int32_t ret = common::TFS_SUCCESS;
  if (flush_flag)
  {
    ret = flush_(id);
    if (common::TFS_SUCCESS != ret)
    {
      TBSYS_LOG(WARN, "update global block id failed, id: %"PRI64_PREFIX"u, ret: %d", id, ret);
    }
  }
  if (common::TFS_SUCCESS == ret)
  {
    if (verify)
      id |= 0x8000000000000000;
  }
  return id;
}

// b. choose the target data nodes
int ServerManager::choose_create_block_target_server(common::ArrayHelper<uint64_t>& result,
    common::ArrayHelper<uint64_t>& news, const int32_t count) const
{
  news.clear();
  std::set<uint32_t> lans;
  get_lans_(lans, result);
  ServerCollect* pserver = NULL;
  int32_t index = count;
  while (index-- > 0)
  {
    pserver = NULL;
    if (TFS_SUCCESS == choose_replciate_random_choose_server_base_lock_(pserver, result, lans))
    {
      assert(NULL != pserver);
      news.push_back(pserver->id());
      result.push_back(pserver->id());
      uint32_t lan = Func::get_lan(pserver->id(), SYSPARAM_NAMESERVER.group_mask_);
      lans.insert(lan);
    }
  }
  return count - news.get_array_index();
}

int ServerManager::choose_replciate_random_choose_server_base_lock_(ServerCollect*& result,
    const common::ArrayHelper<uint64_t>& except, const std::set<uint32_t>& lans) const
{
  result = NULL;
  RWLock::Lock lock(rwmutex_, READ_LOCKER);
  int64_t size = std::min(servers_.size(), SYSPARAM_NAMESERVER.choose_target_server_random_max_nums_);
  int64_t index = size, random_index = 0;
  while (index-- > 0 && NULL == result)
  {
    random_index = random() % servers_.size();
    ServerCollect* pserver = servers_.at(random_index);
    assert(NULL != pserver);
    bool valid = ((!pserver->is_full()) && (!except.exist(pserver->id())) &&
        (DATASERVER_DISK_TYPE_FULL == pserver->get_disk_type()));
    if (valid && !lans.empty())
    {
      uint32_t lan = Func::get_lan(pserver->id(), SYSPARAM_NAMESERVER.group_mask_);
      valid = lans.find(lan) == lans.end();
    }
    if (valid)
    {
      result = pserver;
    }
  }
  return (NULL != result) ? TFS_SUCCESS : EXIT_NO_DATASERVER;
}
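The lans set is what keeps replicas of one block out of the same failure group: each chosen server's LAN id, derived from its address and group_mask_, is excluded from later picks. A minimal sketch of that grouping idea (illustrative; the exact bit layout of the 64-bit server id and of Func::get_lan is an assumption here):

#include <cstdint>
#include <set>

// Assumption: the low 32 bits of the server id hold the IPv4 address, so
// masking them with group_mask yields a subnet/rack group id.
uint32_t get_lan(const uint64_t server_id, const uint32_t group_mask)
{
  return static_cast<uint32_t>(server_id & 0xFFFFFFFFu) & group_mask;
}

// A candidate is rejected when its group already holds a chosen replica.
bool violates_group_rule(const uint64_t candidate, const uint32_t group_mask,
                         const std::set<uint32_t>& lans)
{
  return lans.find(get_lan(candidate, group_mask)) != lans.end();
}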
b. Data node logic

The server-side handling consists of three steps:

1. prepare_op: validate the lease_id and prepare block space;

2. forward_op: asynchronously forward the data to the slave replica nodes (data_slaves) while also writing to the local master replica;

3. write_file_callback: verify whether each replica wrote successfully, check version consistency across the replicas, and reply with the result of the write.
int ClientRequestServer::write_file(WriteFileMessageV2* message)
{
  TIMER_START();
  uint64_t block_id = message->get_block_id();
  uint64_t attach_block_id = message->get_attach_block_id();
  uint64_t file_id = message->get_file_id();
  uint64_t lease_id = message->get_lease_id();
  int32_t offset = message->get_offset();
  int32_t length = message->get_length();
  VUINT64 servers = message->get_ds(); // will copy vector
  const char* data = message->get_data();
  uint64_t master_id = message->get_master_id();
  uint64_t peer_id = message->get_connection()->getPeerId();
  int32_t flag = message->get_flag();
  const FamilyInfoExt& family_info = message->get_family_info();
  int64_t family_id = family_info.family_id_;
  DsRuntimeGlobalInformation& ds_info = DsRuntimeGlobalInformation::instance();
  bool is_master = (master_id == ds_info.information_.id_);
  int32_t version = is_master ? -1 : message->get_version(); // master won't check version
  bool prepare_ok = false;
  int ret = TFS_SUCCESS;
  if ((NULL == data) || (offset < 0) || (length <= 0))
  {
    ret = EXIT_PARAMETER_ERROR;
  }
  // tbnet already receive this packet from network
  get_traffic_control().rw_traffic_stat(true, length);
  if (TFS_SUCCESS == ret)
  {
    if (is_master && INVALID_LEASE_ID == lease_id)
    {
      // first write to master
      ret = get_op_manager().prepare_op(attach_block_id, file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
      if (TFS_SUCCESS == ret)
      {
        // callback & slave will use
        if (INVALID_BLOCK_ID == block_id) // data block
        {
          block_id = attach_block_id;
        }
        message->set_block_id(block_id);
        message->set_attach_block_id(attach_block_id);
        message->set_file_id(file_id);
        message->set_lease_id(lease_id);
        message->set_flag(TFS_FILE_FIRST_WRITE_TO_SLAVE);
      }
    }
    else if (!is_master && (flag & TFS_FILE_FIRST_WRITE_TO_SLAVE))
    {
      // first write to slave
      ret = get_op_manager().prepare_op(attach_block_id, file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
    }
    else
    {
      // not the first write, just reset operation
      ret = get_op_manager().reset_op(attach_block_id, file_id, lease_id, servers);
    }
    // async op prepare work finished
    prepare_ok = (TFS_SUCCESS == ret);
  }
  // post request to slaves
  if ((TFS_SUCCESS == ret) && (servers.size() > 1U) && is_master)
  {
    ret = get_op_manager().forward_op(message, attach_block_id, family_id, servers);
  }
  // local write file
  BlockInfoV2 local;
  if (TFS_SUCCESS == ret)
  {
    ret = get_op_manager().write_file(block_id, attach_block_id, file_id, lease_id, data, length, offset, version, local);
    get_op_manager().update_op(attach_block_id, file_id, lease_id, ret, local);
  }
  // master check if all successful
  // slave response to master
  if (is_master)
  {
    if (prepare_ok)
    {
      write_file_callback(message);
    }
    else
    {
      message->reply(new StatusMessage(ret, "master prepare op fail"));
    }
  }
  else
  {
    SlaveDsRespMessage* resp_msg = new (std::nothrow) SlaveDsRespMessage();
    assert(NULL != resp_msg);
    resp_msg->set_server_id(ds_info.information_.id_);
    resp_msg->set_block_info(local);
    resp_msg->set_status(ret);
    message->reply(resp_msg);
    // slave write fail, release op
    if (TFS_SUCCESS != ret)
    {
      get_op_manager().release_op(attach_block_id, file_id, lease_id);
    }
  }
  TIMER_END();
  TBSYS_LOG_DW(ret, "write file %s, blockid: %"PRI64_PREFIX"u, attach_blockid: %"PRI64_PREFIX"u, "
      "fileid: %"PRI64_PREFIX"u, leaseid: %"PRI64_PREFIX"u, length: %d, offset: %d, "
      "version: %d, role: %s, peer ip: %s, cost: %"PRI64_PREFIX"d, ret: %d",
      (TFS_SUCCESS == ret) ? "success" : "fail", block_id, attach_block_id, file_id, lease_id,
      length, offset, version, is_master ? "master" : "slave",
      tbsys::CNetUtil::addrToString(peer_id).c_str(), TIMER_DURATION(), ret);
  return TFS_SUCCESS;
}

int ClientRequestServer::write_file_callback(WriteFileMessageV2* message)
{
  uint64_t attach_block_id = message->get_attach_block_id();
  uint64_t file_id = message->get_file_id();
  uint64_t lease_id = message->get_lease_id();
  uint32_t length = message->get_length();
  uint32_t offset = message->get_offset();
  uint64_t peer_id = message->get_connection()->getPeerId();
  OpStat op_stat;
  int ret = TFS_SUCCESS;
  bool all_finish = get_op_manager().check_op(attach_block_id, file_id, lease_id, op_stat);
  if (all_finish)
  {
    ret = op_stat.status_;
    if (TFS_SUCCESS != ret)
    {
      // req ns resolve version conflict
      if (EXIT_BLOCK_VERSION_CONFLICT_ERROR == ret)
      {
        get_op_manager().resolve_block_version_conflict(attach_block_id, file_id, lease_id);
      }
      message->reply_error_packet(TBSYS_LOG_LEVEL(WARN), ret, op_stat.error_.str().c_str());
      // if fail, close will never happen, release op, expire writable block
      get_op_manager().release_op(attach_block_id, file_id, lease_id, ret);
    }
    else
    {
      WriteFileRespMessageV2* resp_msg = new (std::nothrow) WriteFileRespMessageV2();
      assert(NULL != resp_msg);
      resp_msg->set_block_id(attach_block_id);
      resp_msg->set_file_id(file_id);
      resp_msg->set_lease_id(lease_id);
      message->reply(resp_msg);
    }
    if (TFS_SUCCESS != ret)
    {
      get_traffic_control().rw_stat(RW_STAT_TYPE_WRITE, ret, 0 == offset, length);
    }
    TBSYS_LOG_IW(ret, "WRITE file %s, ret: %d. blockid: %"PRI64_PREFIX"u, "
        "fileid: %"PRI64_PREFIX"u, leaseid: %"PRI64_PREFIX"u, "
        "length: %d, offset: %d, peer ip: %s, cost: %"PRI64_PREFIX"d",
        (TFS_SUCCESS == ret) ? "success" : "fail", ret, attach_block_id, file_id, lease_id,
        length, offset, tbsys::CNetUtil::addrToString(peer_id).c_str(), op_stat.cost_);
  }
  return TFS_SUCCESS;
}