• raft 学习笔记


    raft

    中文翻译

    选举过程

    跟随者在一个心跳周期内没有收到心跳时,心跳定时器到期并检查是否需要选举;但为了选举的安全性,只有与领导者失联的时间超过一个心跳周期,才会真正开始选举

    状态变化

    跟随者 ---> 候选者 ---> 领导者

    处理交互请求,例如 追加日志,选举投票等

    // processRPC validates the header of an incoming RPC and then hands the
    // request to the handler that matches its command type. Commands of an
    // unknown type are logged and answered with an error. This must only be
    // called from the main thread.
    func (r *Raft) processRPC(rpc RPC) {
    	// Refuse the request outright when its header does not validate.
    	headerErr := r.checkRPCHeader(rpc)
    	if headerErr != nil {
    		rpc.Respond(nil, headerErr)
    		return
    	}
    
    	// Dispatch on the concrete command type carried by the RPC.
    	switch req := rpc.Command.(type) {
    	case *RequestVoteRequest:
    		r.requestVote(rpc, req)
    	case *AppendEntriesRequest:
    		r.appendEntries(rpc, req)
    	case *TimeoutNowRequest:
    		r.timeoutNow(rpc, req)
    	case *InstallSnapshotRequest:
    		r.installSnapshot(rpc, req)
    	default:
    		r.logger.Error("got unexpected command", "command", hclog.Fmt("%#v", rpc.Command))
    		rpc.Respond(nil, fmt.Errorf("unexpected command"))
    	}
    }
    

    启动

    // run is the long-lived goroutine driving the Raft FSM. On every iteration
    // it first polls the shutdown channel without blocking, then runs the
    // sub-FSM that corresponds to the node's current state. It returns only
    // once shutdown has been signalled.
    func (r *Raft) run() {
    	for {
    		// Non-blocking shutdown check: the default branch falls through
    		// immediately when no shutdown has been requested.
    		select {
    		case <-r.shutdownCh:
    			// Clear the leader to prevent forwarding
    			r.setLeader("")
    			return
    		default:
    		}
    
    		// Hand control to the loop for the current state; each loop
    		// returns when the state changes (or on shutdown).
    		switch state := r.getState(); state {
    		case Leader:
    			r.runLeader()
    		case Candidate:
    			r.runCandidate()
    		case Follower:
    			// Per the original note: follower is the default state.
    			r.runFollower()
    		}
    	}
    }
    

    执行跟随者,转化为候选者

    // runFollower is the follower-state loop. It waits on a randomized
    // heartbeat timer; when the timer fires and no contact has been recorded
    // within the heartbeat timeout, it clears the known leader and, if this
    // node is allowed to stand for election, switches to the Candidate state
    // and returns.
    func (r *Raft) runFollower() {
    	didWarn := false
    	r.logger.Info("entering follower state", "follower", r, "leader", r.Leader())
    	metrics.IncrCounter([]string{"raft", "state", "follower"}, 1)
    	heartbeatTimer := randomTimeout(r.config().HeartbeatTimeout)
    
    	for r.getState() == Follower {
    		select {
    		// (part of the original code is omitted here)
    		case <-heartbeatTimer:
    			// Heartbeat timer fired: re-arm it first, then decide
    			// whether an election has to be started.
    			timeout := r.config().HeartbeatTimeout
    			heartbeatTimer = randomTimeout(timeout)
    
    			// Contact was recorded recently enough; keep following.
    			if time.Since(r.LastContact()) < timeout {
    				continue
    			}
    
    			// Heartbeat failed: forget the current leader.
    			prevLeader := r.Leader()
    			r.setLeader("")
    
    			switch {
    			case r.configurations.latestIndex == 0:
    				// No configuration known yet; warn only once.
    				if !didWarn {
    					r.logger.Warn("no known peers, aborting election")
    					didWarn = true
    				}
    
    			case r.configurations.latestIndex == r.configurations.committedIndex &&
    				!hasVote(r.configurations.latest, r.localID):
    				// The latest configuration is committed and this node
    				// has no vote in it; warn only once.
    				if !didWarn {
    					r.logger.Warn("not part of stable configuration, aborting election")
    					didWarn = true
    				}
    
    			default:
    				r.logger.Warn("heartbeat timeout reached, starting election", "last-leader", prevLeader)
    				metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timeout"}, 1)
    
    				// Become a candidate.
    				r.setState(Candidate)
    				return
    			}
    
    		case <-r.shutdownCh:
    			return
    		}
    	}
    }
    

    执行候选者:先为自己投票,得到的票数达到法定票数(quorum,即大于等于所需票数)后,则变成 leader

    // runCandidate runs the FSM for a candidate. It starts an election by
    // voting for itself and requesting votes from peers, then tallies the
    // replies until one of the following happens:
    //   - a reply carries a newer term: step down to Follower;
    //   - enough votes are granted (quorum): become Leader;
    //   - the election timer fires: return so that a new election round starts;
    //   - shutdown is signalled: return.
    func (r *Raft) runCandidate() {
    	r.logger.Info("entering candidate state", "node", r, "term", r.getCurrentTerm()+1)
    	metrics.IncrCounter([]string{"raft", "state", "candidate"}, 1)
    
    	// Start vote for us, and set a timeout
    	voteCh := r.electSelf()
    
    	// Make sure the leadership transfer flag is reset after each run. Having this
    	// flag will set the field LeadershipTransfer in a RequestVoteRequest to true,
    	// which will make other servers vote even though they have a leader already.
    	// It is important to reset that flag, because this privilege could be abused
    	// otherwise.
    	defer func() { r.candidateFromLeadershipTransfer = false }()
    
    	electionTimer := randomTimeout(r.config().ElectionTimeout)
    
    	// Tally the votes, need a simple majority
    	grantedVotes := 0
    	votesNeeded := r.quorumSize()
    	r.logger.Debug("votes", "needed", votesNeeded)
    
    	for r.getState() == Candidate {
    		select {
    		case rpc := <-r.rpcCh:
    			r.processRPC(rpc)
    
    		case vote := <-voteCh:
    			// Check if the term is greater than ours, bail
    			if vote.Term > r.getCurrentTerm() {
    				r.logger.Debug("newer term discovered, fallback to follower")
    				r.setState(Follower)
    				r.setCurrentTerm(vote.Term)
    				return
    			}
    
    			// Check if the vote is granted
    			if vote.Granted {
    				grantedVotes++
    				r.logger.Debug("vote granted", "from", vote.voterID, "term", vote.Term, "tally", grantedVotes)
    			}
    
    			// Check if we've become the leader
    			if grantedVotes >= votesNeeded {
    				r.logger.Info("election won", "tally", grantedVotes)
    				r.setState(Leader)
    				r.setLeader(r.localAddr)
    				return
    			}
    
    		case <-electionTimer:
    			// Election failed (e.g. split vote, or electSelf returned a nil
    			// channel, which never delivers). Returning while the state is
    			// still Candidate lets the caller start a fresh election round
    			// instead of blocking here forever.
    			r.logger.Warn("Election timeout reached, restarting election")
    			return
    
    		case <-r.shutdownCh:
    			// Stop tallying as soon as shutdown is requested.
    			return
    		}
    	}
    }
    

    发起投票

    	// Start vote for us, and set a timeout
    	voteCh := r.electSelf()
    
        // electSelf starts an election round: it bumps the current term, records
    // a vote for this node, and asks every other voting server for its vote.
    // Replies (including our own vote) arrive on the returned channel, which is
    // buffered with one slot per configured server. If the self-vote cannot be
    // persisted, nil is returned instead of a channel. This must only be called
    // from the main thread.
    func (r *Raft) electSelf() <-chan *voteResult {
    	// One buffered slot per server, so senders never block.
    	results := make(chan *voteResult, len(r.configurations.latest.Servers))
    
    	// Move to the next term before building the request.
    	r.setCurrentTerm(r.getCurrentTerm() + 1)
    
    	// The vote request advertises our last log position.
    	lastIdx, lastTerm := r.getLastEntry()
    	req := &RequestVoteRequest{
    		RPCHeader:          r.getRPCHeader(),
    		Term:               r.getCurrentTerm(),
    		Candidate:          r.trans.EncodePeer(r.localID, r.localAddr),
    		LastLogIndex:       lastIdx,
    		LastLogTerm:        lastTerm,
    		LeadershipTransfer: r.candidateFromLeadershipTransfer,
    	}
    
    	// requestVoteFrom asks a single peer for its vote in a background
    	// goroutine; a transport failure is reported as a non-granted vote.
    	requestVoteFrom := func(peer Server) {
    		r.goFunc(func() {
    			defer metrics.MeasureSince([]string{"raft", "candidate", "electSelf"}, time.Now())
    			result := &voteResult{voterID: peer.ID}
    			if err := r.trans.RequestVote(peer.ID, peer.Address, req, &result.RequestVoteResponse); err != nil {
    				r.logger.Error("failed to make requestVote RPC",
    					"target", peer,
    					"error", err)
    				result.Term = req.Term
    				result.Granted = false
    			}
    			results <- result
    		})
    	}
    
    	for _, srv := range r.configurations.latest.Servers {
    		// Only voters take part in the election.
    		if srv.Suffrage != Voter {
    			continue
    		}
    
    		// Remote voter: request its vote asynchronously.
    		if srv.ID != r.localID {
    			requestVoteFrom(srv)
    			continue
    		}
    
    		// Local server: persist our own vote, then count it.
    		if err := r.persistVote(req.Term, req.Candidate); err != nil {
    			r.logger.Error("failed to persist vote", "error", err)
    			return nil
    		}
    		results <- &voteResult{
    			RequestVoteResponse: RequestVoteResponse{
    				RPCHeader: r.getRPCHeader(),
    				Term:      req.Term,
    				Granted:   true,
    			},
    			voterID: r.localID,
    		}
    	}
    
    	return results
    }
    

    给候选者投票:只有当候选者的任期不小于自己的任期、且候选者的日志至少和自己的一样新(最后一条日志的任期更大,或任期相同且索引不小于自己)时才投票;同一任期只投一票

    // requestVote is invoked when we get a request vote RPC call. The response
    // starts out as "not granted" and is sent by a deferred call on every return
    // path. The vote is granted only when all of the following hold:
    //   - we know of no leader other than the candidate, or the request is part
    //     of a leadership transfer;
    //   - the request's term is not older than ours;
    //   - we have not already voted for a different candidate in this term;
    //   - the candidate's last log entry is at least as up to date as ours;
    //   - the vote was persisted successfully.
    func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
    	defer metrics.MeasureSince([]string{"raft", "rpc", "requestVote"}, time.Now())
    	r.observe(*req)
    
    	// Setup a response
    	resp := &RequestVoteResponse{
    		RPCHeader: r.getRPCHeader(),
    		Term:      r.getCurrentTerm(),
    		Granted:   false,
    	}
    	// rpcErr is never assigned below, so the response always carries a nil error.
    	var rpcErr error
    	defer func() {
    		rpc.Respond(resp, rpcErr)
    	}()
    
    	// Version 0 servers will panic unless the peers is present. It's only
    	// used on them to produce a warning message.
    	if r.protocolVersion < 2 {
    		resp.Peers = encodePeers(r.configurations.latest, r.trans)
    	}
    
    	// Check if we have an existing leader [who's not the candidate] and also
    	// check the LeadershipTransfer flag is set. Usually votes are rejected if
    	// there is a known leader. But if the leader initiated a leadership transfer,
    	// vote!
    	candidate := r.trans.DecodePeer(req.Candidate)
    	if leader := r.Leader(); leader != "" && leader != candidate && !req.LeadershipTransfer {
    		r.logger.Warn("rejecting vote request since we have a leader",
    			"from", candidate,
    			"leader", leader)
    		return
    	}
    
    	// Ignore an older term
    	if req.Term < r.getCurrentTerm() {
    		return
    	}
    
    	// Increase the term if we see a newer one
    	if req.Term > r.getCurrentTerm() {
    		// Ensure transition to follower
    		r.logger.Debug("lost leadership because received a requestVote with a newer term")
    		r.setState(Follower)
    		r.setCurrentTerm(req.Term)
    		resp.Term = req.Term
    	}
    
    	// Check if we have voted yet. A "not found" error from the stable store
    	// is tolerated; any other error aborts without granting.
    	lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm)
    	if err != nil && err.Error() != "not found" {
    		r.logger.Error("failed to get last vote term", "error", err)
    		return
    	}
    	lastVoteCandBytes, err := r.stable.Get(keyLastVoteCand)
    	if err != nil && err.Error() != "not found" {
    		r.logger.Error("failed to get last vote candidate", "error", err)
    		return
    	}
    
    	// Check if we've voted in this election before. A repeated request from
    	// the candidate we already voted for is granted again; a request from any
    	// other candidate in the same term is refused.
    	if lastVoteTerm == req.Term && lastVoteCandBytes != nil {
    		r.logger.Info("duplicate requestVote for same term", "term", req.Term)
    		if bytes.Equal(lastVoteCandBytes, req.Candidate) {
    			r.logger.Warn("duplicate requestVote from", "candidate", candidate)
    			resp.Granted = true
    		}
    		return
    	}
    
    	// Reject if the term of their last log entry is older than ours
    	lastIdx, lastTerm := r.getLastEntry()
    	if lastTerm > req.LastLogTerm {
    		r.logger.Warn("rejecting vote request since our last term is greater",
    			"candidate", candidate,
    			"last-term", lastTerm,
    			"last-candidate-term", req.LastLogTerm)
    		return
    	}
    
    	// Same last term: reject if our log extends past theirs
    	if lastTerm == req.LastLogTerm && lastIdx > req.LastLogIndex {
    		r.logger.Warn("rejecting vote request since our last index is greater",
    			"candidate", candidate,
    			"last-index", lastIdx,
    			"last-candidate-index", req.LastLogIndex)
    		return
    	}
    
    	// Persist a vote for safety
    	if err := r.persistVote(req.Term, req.Candidate); err != nil {
    		r.logger.Error("failed to persist vote", "error", err)
    		return
    	}
    
    	// Grant the vote and record the contact time.
    	resp.Granted = true
    	r.setLastContact()
    }
    

    当选leader 后,立即给其他成员发送命令,使其成为 跟随者

    复制过程

    跟随者 接受领导者的日志

    // runFollower runs the FSM for a follower. This excerpt shows only the
    // branch that receives RPCs: each request read from rpcCh is passed to
    // processRPC. The remaining select cases are elided.
    func (r *Raft) runFollower() {
    	didWarn := false
    	r.logger.Info("entering follower state", "follower", r, "leader", r.Leader())
    	metrics.IncrCounter([]string{"raft", "state", "follower"}, 1)
    	heartbeatTimer := randomTimeout(r.config().HeartbeatTimeout)
    
    	for r.getState() == Follower {
    		select {
    		case rpc := <-r.rpcCh:
    			// Handle the incoming RPC on this goroutine.
    			r.processRPC(rpc)
    
                // (part of the original code is omitted here)
            }
        }
    }
    

    处理日志

    
    // appendEntries is invoked when we get an append entries RPC call. This must
    // only be called from the main thread.
    //
    // The response starts out with Success=false and is sent by a deferred call
    // on every return path. The handler, in order:
    //  1. rejects requests whose term is older than ours;
    //  2. adopts a newer term and/or steps down to Follower, and records the
    //     sender as leader;
    //  3. checks that our log holds the entry preceding the new ones with the
    //     expected term;
    //  4. removes conflicting entries, skips duplicates, and stores new entries;
    //  5. advances the commit index and processes newly committed logs;
    //  6. marks the response successful and records the contact time.
    func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) {
    	defer metrics.MeasureSince([]string{"raft", "rpc", "appendEntries"}, time.Now())
    	// Setup a response
    	resp := &AppendEntriesResponse{
    		RPCHeader:      r.getRPCHeader(),
    		Term:           r.getCurrentTerm(),
    		LastLog:        r.getLastIndex(),
    		Success:        false,
    		NoRetryBackoff: false,
    	}
    	// rpcErr is never assigned in this function, so Respond always gets nil.
    	var rpcErr error
    	defer func() {
    		rpc.Respond(resp, rpcErr)
    	}()
    
    	// Ignore an older term
    	if a.Term < r.getCurrentTerm() {
    		return
    	}
    
    	// Increase the term if we see a newer one, also transition to follower
    	// if we ever get an appendEntries call
    	if a.Term > r.getCurrentTerm() || r.getState() != Follower {
    		// Ensure transition to follower
    		r.setState(Follower)
    		r.setCurrentTerm(a.Term)
    		resp.Term = a.Term
    	}
    
    	// Save the current leader
    	r.setLeader(r.trans.DecodePeer(a.Leader))
    
    	// Verify the last log entry
    	if a.PrevLogEntry > 0 {
    		lastIdx, lastTerm := r.getLastEntry()
    
    		var prevLogTerm uint64
    		if a.PrevLogEntry == lastIdx {
    			// The previous entry is our last entry; its term is already known.
    			prevLogTerm = lastTerm
    
    		} else {
    			// Otherwise the previous entry has to be read from the log store.
    			var prevLog Log
    			if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil {
    				r.logger.Warn("failed to get previous log",
    					"previous-index", a.PrevLogEntry,
    					"last-index", lastIdx,
    					"error", err)
    				resp.NoRetryBackoff = true
    				return
    			}
    			prevLogTerm = prevLog.Term
    		}
    
    		// Terms differ: our log does not match the sender's at that index.
    		if a.PrevLogTerm != prevLogTerm {
    			r.logger.Warn("previous log term mis-match",
    				"ours", prevLogTerm,
    				"remote", a.PrevLogTerm)
    			resp.NoRetryBackoff = true
    			return
    		}
    	}
    
    	// Process any new entries
    	if len(a.Entries) > 0 {
    		start := time.Now()
    
    		// Delete any conflicting entries, skip any duplicates
    		lastLogIdx, _ := r.getLastLog()
    		var newEntries []*Log
    		for i, entry := range a.Entries {
    			// Everything from the first index beyond our log is new.
    			if entry.Index > lastLogIdx {
    				newEntries = a.Entries[i:]
    				break
    			}
    			// The index already exists locally: compare terms with the stored entry.
    			var storeEntry Log
    			if err := r.logs.GetLog(entry.Index, &storeEntry); err != nil {
    				r.logger.Warn("failed to get log entry",
    					"index", entry.Index,
    					"error", err)
    				return
    			}
    			if entry.Term != storeEntry.Term {
    				// Conflict: drop our entries from this index through the end
    				// of the log, then take the incoming entries from here on.
    				r.logger.Warn("clearing log suffix",
    					"from", entry.Index,
    					"to", lastLogIdx)
    				if err := r.logs.DeleteRange(entry.Index, lastLogIdx); err != nil {
    					r.logger.Error("failed to clear log suffix", "error", err)
    					return
    				}
    				// If the removed range reaches the index of the latest
    				// configuration, reset latest to the committed configuration.
    				if entry.Index <= r.configurations.latestIndex {
    					r.setLatestConfiguration(r.configurations.committed, r.configurations.committedIndex)
    				}
    				newEntries = a.Entries[i:]
    				break
    			}
    			// Same index and term: duplicate, continue with the next entry.
    		}
    
    		if n := len(newEntries); n > 0 {
    			// Append the new entries
    			if err := r.logs.StoreLogs(newEntries); err != nil {
    				r.logger.Error("failed to append to logs", "error", err)
    				// TODO: leaving r.getLastLog() in the wrong
    				// state if there was a truncation above
    				return
    			}
    
    			// Handle any new configuration changes
    			for _, newEntry := range newEntries {
    				r.processConfigurationLogEntry(newEntry)
    			}
    
    			// Update the lastLog
    			last := newEntries[n-1]
    			r.setLastLog(last.Index, last.Term)
    		}
    
    		metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "storeLogs"}, start)
    	}
    
    	// Update the commit index
    	if a.LeaderCommitIndex > 0 && a.LeaderCommitIndex > r.getCommitIndex() {
    		start := time.Now()
    		// Never commit past the end of our own log.
    		idx := min(a.LeaderCommitIndex, r.getLastIndex())
    		r.setCommitIndex(idx)
    		// The latest configuration counts as committed once the commit
    		// index has reached its index.
    		if r.configurations.latestIndex <= idx {
    			r.setCommittedConfiguration(r.configurations.latest, r.configurations.latestIndex)
    		}
    		r.processLogs(idx, nil)
    		metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "processLogs"}, start)
    	}
    
    	// Everything went well, set success
    	resp.Success = true
    	r.setLastContact()
    	return
    }
    
    
    
  • 相关阅读:
    Hadoop分布式文件系统:架构和设计
    分布式设计学习资料
    codeforces上一道贪心算法题
    优先队列实现n路归并算法O(n * lgK)
    LINUX 暂停、继续进程
    重叠(Overlapped)IO模型
    WSAEventSelect模型
    WSAEventSelect模型 应用实例,重写TCP服务器实例
    选择模型2
    第四章 数据抽象 《C++编程思想》
  • 原文地址:https://www.cnblogs.com/SLchuck/p/14781920.html
Copyright © 2020-2023  润新知