kafka 的心跳是 kafka consumer 和 broker 之间的健康检查,只有当 broker coordinator 正常时,consumer 才会发送心跳。
consumer 和 rebalance 相关的 2 个配置参数:
参数名 --> MemberMetadata 字段 session.timeout.ms --> MemberMetadata.sessionTimeoutMs max.poll.interval.ms --> MemberMetadata.rebalanceTimeoutMs
broker 端,sessionTimeoutMs 参数
broker 处理心跳的逻辑在 GroupCoordinator 类中:如果心跳超期, broker coordinator 会把消费者从 group 中移除,并触发 rebalance。
1 private def completeAndScheduleNextHeartbeatExpiration(group: GroupMetadata, member: MemberMetadata) { 2 // complete current heartbeat expectation 3 member.latestHeartbeat = time.milliseconds() 4 val memberKey = MemberKey(member.groupId, member.memberId) 5 heartbeatPurgatory.checkAndComplete(memberKey) 6 7 // reschedule the next heartbeat expiration deadline 8 // 计算心跳截止时刻 9 val newHeartbeatDeadline = member.latestHeartbeat + member.sessionTimeoutMs 10 val delayedHeartbeat = new DelayedHeartbeat(this, group, member, newHeartbeatDeadline, member.sessionTimeoutMs) 11 heartbeatPurgatory.tryCompleteElseWatch(delayedHeartbeat, Seq(memberKey)) 12 } 13 14 // 心跳过期 15 def onExpireHeartbeat(group: GroupMetadata, member: MemberMetadata, heartbeatDeadline: Long) { 16 group.inLock { 17 if (!shouldKeepMemberAlive(member, heartbeatDeadline)) { 18 info(s"Member ${member.memberId} in group ${group.groupId} has failed, removing it from the group") 19 removeMemberAndUpdateGroup(group, member) 20 } 21 } 22 } 23 24 private def shouldKeepMemberAlive(member: MemberMetadata, heartbeatDeadline: Long) = 25 member.awaitingJoinCallback != null || 26 member.awaitingSyncCallback != null || 27 member.latestHeartbeat + member.sessionTimeoutMs > heartbeatDeadline
consumer 端:sessionTimeoutMs,rebalanceTimeoutMs 参数
如果客户端发现心跳超期,客户端会标记 coordinator 为不可用,并阻塞心跳线程;如果超过了 poll 消息的间隔超过了 rebalanceTimeoutMs,则 consumer 告知 broker 主动离开消费组,也会触发 rebalance
org.apache.kafka.clients.consumer.internals.AbstractCoordinator.HeartbeatThread 代码片段:
if (coordinatorUnknown()) { if (findCoordinatorFuture != null || lookupCoordinator().failed()) // the immediate future check ensures that we backoff properly in the case that no // brokers are available to connect to. AbstractCoordinator.this.wait(retryBackoffMs); } else if (heartbeat.sessionTimeoutExpired(now)) { // the session timeout has expired without seeing a successful heartbeat, so we should // probably make sure the coordinator is still healthy. markCoordinatorUnknown(); } else if (heartbeat.pollTimeoutExpired(now)) { // the poll timeout has expired, which means that the foreground thread has stalled // in between calls to poll(), so we explicitly leave the group. maybeLeaveGroup(); } else if (!heartbeat.shouldHeartbeat(now)) { // poll again after waiting for the retry backoff in case the heartbeat failed or the // coordinator disconnected AbstractCoordinator.this.wait(retryBackoffMs); } else { heartbeat.sentHeartbeat(now); sendHeartbeatRequest().addListener(new RequestFutureListener<Void>() { @Override public void onSuccess(Void value) { synchronized (AbstractCoordinator.this) { heartbeat.receiveHeartbeat(time.milliseconds()); } } @Override public void onFailure(RuntimeException e) { synchronized (AbstractCoordinator.this) { if (e instanceof RebalanceInProgressException) { // it is valid to continue heartbeating while the group is rebalancing. This // ensures that the coordinator keeps the member in the group for as long // as the duration of the rebalance timeout. If we stop sending heartbeats, // however, then the session timeout may expire before we can rejoin. heartbeat.receiveHeartbeat(time.milliseconds()); } else { heartbeat.failHeartbeat(); // wake up the thread if it's sleeping to reschedule the heartbeat AbstractCoordinator.this.notify(); } } } }); }
1 /** 2 * A helper class for managing the heartbeat to the coordinator 3 */ 4 public final class Heartbeat { 5 private final long sessionTimeout; 6 private final long heartbeatInterval; 7 private final long maxPollInterval; 8 private final long retryBackoffMs; 9 10 private volatile long lastHeartbeatSend; // volatile since it is read by metrics 11 private long lastHeartbeatReceive; 12 private long lastSessionReset; 13 private long lastPoll; 14 private boolean heartbeatFailed; 15 16 public Heartbeat(long sessionTimeout, 17 long heartbeatInterval, 18 long maxPollInterval, 19 long retryBackoffMs) { 20 if (heartbeatInterval >= sessionTimeout) 21 throw new IllegalArgumentException("Heartbeat must be set lower than the session timeout"); 22 23 this.sessionTimeout = sessionTimeout; 24 this.heartbeatInterval = heartbeatInterval; 25 this.maxPollInterval = maxPollInterval; 26 this.retryBackoffMs = retryBackoffMs; 27 } 28 29 public void poll(long now) { 30 this.lastPoll = now; 31 } 32 33 public void sentHeartbeat(long now) { 34 this.lastHeartbeatSend = now; 35 this.heartbeatFailed = false; 36 } 37 38 public void failHeartbeat() { 39 this.heartbeatFailed = true; 40 } 41 42 public void receiveHeartbeat(long now) { 43 this.lastHeartbeatReceive = now; 44 } 45 46 public boolean shouldHeartbeat(long now) { 47 return timeToNextHeartbeat(now) == 0; 48 } 49 50 public long lastHeartbeatSend() { 51 return this.lastHeartbeatSend; 52 } 53 54 public long timeToNextHeartbeat(long now) { 55 long timeSinceLastHeartbeat = now - Math.max(lastHeartbeatSend, lastSessionReset); 56 final long delayToNextHeartbeat; 57 if (heartbeatFailed) 58 delayToNextHeartbeat = retryBackoffMs; 59 else 60 delayToNextHeartbeat = heartbeatInterval; 61 62 if (timeSinceLastHeartbeat > delayToNextHeartbeat) 63 return 0; 64 else 65 return delayToNextHeartbeat - timeSinceLastHeartbeat; 66 } 67 68 public boolean sessionTimeoutExpired(long now) { 69 return now - Math.max(lastSessionReset, lastHeartbeatReceive) > sessionTimeout; 70 } 71 72 public long interval() { 73 return heartbeatInterval; 74 } 75 76 public void resetTimeouts(long now) { 77 this.lastSessionReset = now; 78 this.lastPoll = now; 79 this.heartbeatFailed = false; 80 } 81 82 public boolean pollTimeoutExpired(long now) { 83 return now - lastPoll > maxPollInterval; 84 } 85 86 }
join group 的处理逻辑:kafka.coordinator.group.GroupCoordinator#onCompleteJoin