HBase MemStore Flush由类org.apache.hadoop.hbase.regionserver.MemStoreFlusher实现,具体表现为HRegionServer中的一个实例变量cacheFlusher,类结构如下:
class MemStoreFlusher extends HasThread implements FlushRequester { ...... }
/**
 * Abstract class which contains a Thread and delegates the common Thread
 * methods to that instance.
 *
 * The purpose of this class is to workaround Sun JVM bug #6915621, in which
 * something internal to the JDK uses Thread.currentThread() as a monitor lock.
 * This can produce deadlocks like HBASE-4367, HBASE-4101, etc.
 */
public abstract class HasThread implements Runnable {
  /** The wrapped thread; it is constructed with this object as its Runnable. */
  private final Thread thread;

  /** Creates the wrapped thread with a default name. */
  public HasThread() {
    this.thread = new Thread(this);
  }

  /**
   * Creates the wrapped thread with the given name.
   *
   * @param name name passed to the wrapped thread
   */
  public HasThread(String name) {
    this.thread = new Thread(this, name);
  }

  /** @return the wrapped thread */
  public Thread getThread() {
    return thread;
  }

  /** Work executed by the wrapped thread once {@link #start()} is called. */
  public abstract void run();

  //
  // Begin delegation to Thread

  public final String getName() {
    return thread.getName();
  }

  public void interrupt() {
    thread.interrupt();
  }

  public final boolean isAlive() {
    return thread.isAlive();
  }

  public boolean isInterrupted() {
    return thread.isInterrupted();
  }

  public final void setDaemon(boolean on) {
    thread.setDaemon(on);
  }

  public final void setName(String name) {
    thread.setName(name);
  }

  public final void setPriority(int newPriority) {
    thread.setPriority(newPriority);
  }

  public void setUncaughtExceptionHandler(Thread.UncaughtExceptionHandler eh) {
    thread.setUncaughtExceptionHandler(eh);
  }

  public void start() {
    thread.start();
  }

  public final void join() throws InterruptedException {
    thread.join();
  }

  public final void join(long millis, int nanos) throws InterruptedException {
    thread.join(millis, nanos);
  }

  public final void join(long millis) throws InterruptedException {
    thread.join(millis);
  }

  //
  // End delegation to Thread
}
/**
 * Contract for components that accept flush requests.
 */
public interface FlushRequester {

  /**
   * Tells the implementation that the cache needs to be flushed.
   *
   * @param region the HRegion requesting the cache flush
   */
  void requestFlush(HRegion region);
}
// These two data members go together. Any entry in the one must have
// a corresponding entry in the other.
private final BlockingQueue<FlushQueueEntry> flushQueue = new DelayQueue<FlushQueueEntry>();
private final Map<HRegion, FlushRegionEntry> regionsInQueue = new HashMap<HRegion, FlushRegionEntry>();

// Reset to false by the flush loop right before it polls the queue, so that
// someone can wake the flusher up again.
private AtomicBoolean wakeupPending = new AtomicBoolean();

// Timeout, in milliseconds, used when the flush loop polls flushQueue.
private final long threadWakeFrequency;

// Owning region server; consulted for the stop flag and memstore accounting.
private final HRegionServer server;

// Guards flushOccurred, the condition that blocked writers wait on.
private final ReentrantLock lock = new ReentrantLock();
private final Condition flushOccurred = lock.newCondition();

// High water mark: writers are blocked while the global memstore size is at or above it.
protected final long globalMemStoreLimit;
// Low water mark: the flusher is woken up while the global memstore size is at or above it.
protected final long globalMemStoreLimitLowMark;

// Default heap fractions and the configuration keys that override them.
private static final float DEFAULT_UPPER = 0.4f;
private static final float DEFAULT_LOWER = 0.35f;
private static final String UPPER_KEY = "hbase.regionserver.global.memstore.upperLimit";
private static final String LOWER_KEY = "hbase.regionserver.global.memstore.lowerLimit";

// A store with more store files than this makes its region "too many store files".
private long blockingStoreFilesNumber;
// Upper bound, in milliseconds, on how long a flush is put off for such a region.
private long blockingWaitTime;
所有的Region Flush请求会被放到一个DelayQueue中,因此放入该队列的元素必须实现Delayed接口:
/**
 * Common type of the entries held by the flush queue. It declares no methods
 * of its own; it only requires entries to be {@link Delayed}.
 */
interface FlushQueueEntry extends Delayed {
}
Flush请求会被分为两种类型:“空”请求与实质请求,“空”请求主要用于唤醒线程,实质请求即为Region Flush请求。
/**
 * Queue entry that carries no region. Inserting it into the flush queue
 * ensures that the flusher does not sleep.
 */
static class WakeupFlushThread implements FlushQueueEntry {

  /**
   * @return always -1, whatever the other entry is
   */
  @Override
  public int compareTo(Delayed o) {
    return -1;
  }

  /**
   * @return always 0, i.e. the token never has any remaining delay
   */
  @Override
  public long getDelay(TimeUnit unit) {
    return 0;
  }
}
/** * Datastructure used in the flush queue. Holds region and retry count. * Keeps tabs on how old this object is. Implements {@link Delayed}. On * construction, the delay is zero. When added to a delay queue, we'll come * out near immediately. Call {@link #requeue(long)} passing delay in * milliseconds before readding to delay queue if you want it to stay there * a while. */ static class FlushRegionEntry implements FlushQueueEntry { private final HRegion region; private final long createTime; private long whenToExpire; private int requeueCount = 0; FlushRegionEntry(final HRegion r) { this.region = r; this.createTime = System.currentTimeMillis(); this.whenToExpire = this.createTime; } /** * @param maximumWait * @return True if we have been delayed > <code>maximumWait</code> * milliseconds. */ public boolean isMaximumWait(final long maximumWait) { return (System.currentTimeMillis() - this.createTime) > maximumWait; } /** * @return Count of times {@link #resetDelay()} was called; i.e this is * number of times we've been requeued. */ public int getRequeueCount() { return this.requeueCount; } /** * @param when * When to expire, when to come up out of the queue. Specify * in milliseconds. This method adds * System.currentTimeMillis() to whatever you pass. * @return This. */ public FlushRegionEntry requeue(final long when) { this.whenToExpire = System.currentTimeMillis() + when; this.requeueCount++; return this; } @Override public long getDelay(TimeUnit unit) { return unit.convert(this.whenToExpire - System.currentTimeMillis(), TimeUnit.MILLISECONDS); } @Override public int compareTo(Delayed other) { return Long.valueOf( getDelay(TimeUnit.MILLISECONDS) - other.getDelay(TimeUnit.MILLISECONDS)).intValue(); } @Override public String toString() { return "[flush region " + Bytes.toStringBinary(region.getRegionName()) + "]"; } }
// Maximum heap size as reported by the JVM's memory MXBean.
long max = ManagementFactory.getMemoryMXBean().getHeapMemoryUsage()
    .getMax();
// High water mark: the configured (or default) fraction of the maximum heap.
this.globalMemStoreLimit = globalMemStoreLimit(max, DEFAULT_UPPER,
    UPPER_KEY, conf);
// Low water mark, computed the same way from the lower-limit setting.
long lower = globalMemStoreLimit(max, DEFAULT_LOWER, LOWER_KEY, conf);
// The low mark may never exceed the high mark; clamp it if it does.
if (lower > this.globalMemStoreLimit) {
  lower = this.globalMemStoreLimit;
  LOG.info("Setting globalMemStoreLimitLowMark == globalMemStoreLimit "
      + "because supplied " + LOWER_KEY + " was > " + UPPER_KEY);
}
this.globalMemStoreLimitLowMark = lower;
/**
 * Reads the fraction stored under <code>key</code> and turns it into a byte
 * limit relative to <code>max</code>.
 *
 * @param max
 *            size the fraction is applied to
 * @param defaultLimit
 *            fraction used when the key is absent or its value is rejected
 * @param key
 *            configuration key holding the fraction
 * @param c
 *            configuration to read from
 * @return Limit.
 */
static long globalMemStoreLimit(final long max, final float defaultLimit,
    final String key, final Configuration c) {
  return getMemStoreLimit(max, c.getFloat(key, defaultLimit), defaultLimit);
}

/**
 * Applies <code>limit</code> to <code>max</code>, substituting
 * <code>defaultLimit</code> when the supplied value is >= 0.9 or < 0.1.
 *
 * @param max
 *            size the fraction is applied to
 * @param limit
 *            requested fraction
 * @param defaultLimit
 *            fraction used when <code>limit</code> is rejected
 * @return <code>max</code> multiplied by the fraction actually used
 */
static long getMemStoreLimit(final long max, final float limit,
    final float defaultLimit) {
  if (limit >= 0.9f || limit < 0.1f) {
    LOG.warn("Setting global memstore limit to default of "
        + defaultLimit
        + " because supplied value outside allowed range of 0.1 -> 0.9");
    return (long) (max * defaultLimit);
  }
  return (long) (max * limit);
}
/**
 * Body of the flusher thread. Loops until the server reports that it is
 * stopped (or a failed file-system check ends the loop), then empties both
 * flush bookkeeping structures and signals every thread waiting on
 * {@code flushOccurred}.
 */
@Override
public void run() {
  while (!this.server.isStopped()) {
    FlushQueueEntry fqe = null;
    try {
      ......
    } catch (InterruptedException ex) {
      continue;
    } catch (ConcurrentModificationException ex) {
      continue;
    } catch (Exception ex) {
      LOG.error("Cache flusher failed for entry " + fqe, ex);
      // Keep looping unless the file system check fails.
      if (!server.checkFileSystem()) {
        break;
      }
    }
  }
  this.regionsInQueue.clear();
  this.flushQueue.clear();

  // Signal anyone waiting, so they see the close flag
  lock.lock();
  try {
    flushOccurred.signalAll();
  } finally {
    lock.unlock();
  }
  LOG.info(getName() + " exiting");
}
wakeupPending.set(false); // allow someone to wake us up again
// Wait for the next entry, but no longer than threadWakeFrequency milliseconds.
fqe = flushQueue.poll(threadWakeFrequency, TimeUnit.MILLISECONDS);
// Nothing to flush for a specific region: either the poll timed out or we
// were handed a wake-up token. Check the global memstore pressure instead.
if (fqe == null || fqe instanceof WakeupFlushThread) {
  if (isAboveLowWaterMark()) {
    LOG.debug("Flush thread woke up because memory above low water="
        + StringUtils
            .humanReadableInt(this.globalMemStoreLimitLowMark));
    if (!flushOneForGlobalPressure()) {
      // Wasn't able to flush any region, but we're above low water mark.
      // This is unlikely to happen, but might happen when closing the
      // entire server - another thread is flushing regions. We'll just
      // sleep a little bit to avoid spinning, and then pretend that
      // we flushed one, so anyone blocked will check again.
      // The sleep is done before taking the lock so that other threads
      // needing the lock are not held up for the whole second.
      Thread.sleep(1000);
      lock.lock();
      try {
        flushOccurred.signalAll();
      } finally {
        lock.unlock();
      }
    }
    // Enqueue another one of these tokens so we'll wake up again
    wakeupFlushThread();
  }
  continue;
}
/**
 * Checks the global memstore size against the low water mark.
 *
 * @return true if the global memstore size is at or above
 *         {@code globalMemStoreLimitLowMark}
 */
private boolean isAboveLowWaterMark() {
  return globalMemStoreLimitLowMark <= server.getRegionServerAccounting()
      .getGlobalMemstoreSize();
}
/**
 * The memstore across all regions has exceeded the low water mark. Pick one
 * region to flush and flush it synchronously (this is called from the flush
 * thread)
 *
 * @return true if successful
 */
private boolean flushOneForGlobalPressure() {
  // Copy of the online regions obtained from the server; presumably keyed
  // and ordered by memstore size - confirm against HRegionServer.
  SortedMap<Long, HRegion> regionsBySize = server
      .getCopyOfOnlineRegionsSortedBySize();
  // Regions to skip when picking the next candidate.
  Set<HRegion> excludedRegions = new HashSet<HRegion>();
  boolean flushedOne = false;
  // Repeat until one flush is reported as done.
  while (!flushedOne) {
    ......
  }
  return true;
}
// Find the biggest region that doesn't have too many storefiles // (might be null!) HRegion bestFlushableRegion = getBiggestMemstoreRegion( regionsBySize, excludedRegions, true);
private HRegion getBiggestMemstoreRegion( SortedMap<Long, HRegion> regionsBySize, Set<HRegion> excludedRegions, boolean checkStoreFileCount) { synchronized (regionsInQueue) { for (HRegion region : regionsBySize.values()) { //如果Region出现在excludedRegions中,则表示该Region是unflushable的。 if (excludedRegions.contains(region)) { continue; } if (checkStoreFileCount && isTooManyStoreFiles(region)) { continue; } return region; } } return null; } private boolean isTooManyStoreFiles(HRegion region) { for (Store hstore : region.stores.values()) { if (hstore.getStorefilesCount() > this.blockingStoreFilesNumber) { return true; } } return false; }
// Find the biggest region, total, even if it might have too many // flushes. HRegion bestAnyRegion = getBiggestMemstoreRegion(regionsBySize, excludedRegions, false); if (bestAnyRegion == null) { LOG.error("Above memory mark but there are no flushable regions!"); return false; }
HRegion regionToFlush;
if (bestFlushableRegion != null
    && bestAnyRegion.memstoreSize.get() > 2 * bestFlushableRegion.memstoreSize
        .get()) {
  // Even if it's not supposed to be flushed, pick a region if it's more
  // than twice as big as the best flushable one - otherwise when we're
  // under pressure we make lots of little flushes and cause lots of
  // compactions, etc, which just makes life worse!
  if (LOG.isDebugEnabled()) {
    LOG.debug("Under global heap pressure: "
        + "Region "
        + bestAnyRegion.getRegionNameAsString()
        + " has too many "
        + "store files, but is "
        + StringUtils
            .humanReadableInt(bestAnyRegion.memstoreSize
                .get())
        + " vs best flushable region's "
        + StringUtils
            .humanReadableInt(bestFlushableRegion.memstoreSize
                .get()) + ". Choosing the bigger.");
  }
  regionToFlush = bestAnyRegion;
} else {
  // Otherwise prefer the flushable candidate, falling back to the
  // unrestricted one when there is no flushable candidate.
  if (bestFlushableRegion == null) {
    regionToFlush = bestAnyRegion;
  } else {
    regionToFlush = bestFlushableRegion;
  }
}
(1)虽然bestFlushableRegion不为null,但bestAnyRegion的MemStore大小比bestFlushableRegion的MemStore大小的两倍还要大,此时regionToFlush = bestAnyRegion;
(2)否则,如果bestFlushableRegion为null,则regionToFlush = bestAnyRegion,否则regionToFlush = bestFlushableRegion。
// The chosen region is expected to hold memstore data.
Preconditions.checkState(regionToFlush.memstoreSize.get() > 0);
LOG.info("Flush of region " + regionToFlush
    + " due to global heap pressure");
flushedOne = flushRegion(regionToFlush, true);
if (!flushedOne) {
  LOG.info("Excluding unflushable region " + regionToFlush
      + " - trying to find a different region to flush.");
  // Remember the failure so this region is not picked again.
  excludedRegions.add(regionToFlush);
}
// Treat the entry as a region flush request and flush it.
FlushRegionEntry fre = (FlushRegionEntry) fqe;
// A false result leaves the enclosing loop (presumably the main loop of
// run() - confirm against the full method).
if (!flushRegion(fre)) {
  break;
}
该方法是MemStoreFlusher的实例方法,在执行具体的Region batchMutate操作(完成写入操作)之前被调用,
HRegion region = getRegion(regionName);
// Regions of the meta table skip the memstore check below.
if (!region.getRegionInfo().isMetaTable()) {
  /*
   * This method blocks callers until we're down to a safe
   * amount of memstore consumption.
   *
   * ******************************************************
   */
  this.cacheFlusher.reclaimMemStoreMemory();
}
// Above the high water mark: block the caller until the global memstore
// size drops below it or the server stops.
if (isAboveHighWaterMark()) {
  lock.lock();
  try {
    boolean blocked = false;
    long startTime = 0;
    while (isAboveHighWaterMark() && !server.isStopped()) {
      // Log and record the start time only on the first pass.
      if (!blocked) {
        startTime = EnvironmentEdgeManager.currentTimeMillis();
        LOG.info("Blocking updates on "
            + server.toString()
            + ": the global memstore size "
            + StringUtils.humanReadableInt(server
                .getRegionServerAccounting()
                .getGlobalMemstoreSize())
            + " is >= than blocking "
            + StringUtils
                .humanReadableInt(globalMemStoreLimit)
            + " size");
      }
      blocked = true;
      wakeupFlushThread();
      try {
        // we should be able to wait forever, but we've seen a bug where
        // we miss a notify, so put a 5 second bound on it at least.
        flushOccurred.await(5, TimeUnit.SECONDS);
      } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
      }
    }
    if (blocked) {
      // Account for the time spent blocked.
      final long totalTime = EnvironmentEdgeManager
          .currentTimeMillis() - startTime;
      if (totalTime > 0) {
        this.updatesBlockedMsHighWater.add(totalTime);
      }
      LOG.info("Unblocking updates for server "
          + server.toString());
    }
  } finally {
    lock.unlock();
  }
}
/**
 * Checks the global memstore size against the high water mark.
 *
 * @return true if the global memstore size is at or above
 *         {@code globalMemStoreLimit}
 */
private boolean isAboveHighWaterMark() {
  return globalMemStoreLimit <= server.getRegionServerAccounting()
      .getGlobalMemstoreSize();
}
else if (isAboveLowWaterMark()) {
  // Above the low water mark only: wake the flush thread, without
  // blocking the caller.
  wakeupFlushThread();
}
/** * Perform a batch of mutations. It supports only Put and Delete mutations * and will ignore other types passed. * * @param mutationsAndLocks * the list of mutations paired with their requested lock IDs. * @return an array of OperationStatus which internally contains the * OperationStatusCode and the exceptionMessage if any. * @throws IOException */ public OperationStatus[] batchMutate( Pair<Mutation, Integer>[] mutationsAndLocks) throws IOException { BatchOperationInProgress<Pair<Mutation, Integer>> batchOp = new BatchOperationInProgress<Pair<Mutation, Integer>>( mutationsAndLocks); boolean initialized = false; while (!batchOp.isDone()) { checkReadOnly(); // Check if resources to support an update, may be blocked. checkResources(); ...... } return batchOp.retCodeDetails; }
在Region batchMutate中,每次循环写入数据之前都会进行checkResources的操作,该操作可能会导致本次写入操作被阻塞。
/* * Check if resources to support an update. * * Here we synchronize on HRegion, a broad scoped lock. Its appropriate * given we're figuring in here whether this region is able to take on * writes. This is only method with a synchronize (at time of writing), this * and the synchronize on 'this' inside in internalFlushCache to send the * notify. */ private void checkResources() throws RegionTooBusyException, InterruptedIOException { // If catalog region, do not impose resource constraints or block // updates. if (this.getRegionInfo().isMetaRegion()) { return; } boolean blocked = false; long startTime = 0; while (this.memstoreSize.get() > this.blockingMemStoreSize) { requestFlush(); if (!blocked) { startTime = EnvironmentEdgeManager.currentTimeMillis(); LOG.info("Blocking updates for '" + Thread.currentThread().getName() + "' on region " + Bytes.toStringBinary(getRegionName()) + ": memstore size " + StringUtils.humanReadableInt(this.memstoreSize.get()) + " is >= than blocking " + StringUtils .humanReadableInt(this.blockingMemStoreSize) + " size"); } long now = EnvironmentEdgeManager.currentTimeMillis(); long timeToWait = startTime + busyWaitDuration - now; if (timeToWait <= 0L) { final long totalTime = now - startTime; this.updatesBlockedMs.add(totalTime); LOG.info("Failed to unblock updates for region " + this + " '" + Thread.currentThread().getName() + "' in " + totalTime + "ms. 
The region is still busy."); throw new RegionTooBusyException("region is flushing"); } blocked = true; synchronized (this) { try { wait(Math.min(timeToWait, threadWakeFrequency)); } catch (InterruptedException ie) { final long totalTime = EnvironmentEdgeManager .currentTimeMillis() - startTime; if (totalTime > 0) { this.updatesBlockedMs.add(totalTime); } LOG.info("Interrupted while waiting to unblock updates for region " + this + " '" + Thread.currentThread().getName() + "'"); InterruptedIOException iie = new InterruptedIOException(); iie.initCause(ie); throw iie; } } } if (blocked) { // Add in the blocked time if appropriate final long totalTime = EnvironmentEdgeManager.currentTimeMillis() - startTime; if (totalTime > 0) { this.updatesBlockedMs.add(totalTime); } LOG.info("Unblocking updates for region " + this + " '" + Thread.currentThread().getName() + "'"); } }
this.memstoreSize.get() > this.blockingMemStoreSize
如果上述条件成立,本次写入操作会被阻塞,直到该Region MemStore的内存消耗值不再超过blockingMemStoreSize为止;如果阻塞时间超过busyWaitDuration,则抛出RegionTooBusyException。
// Flush size taken from the table descriptor first.
long flushSize = this.htableDescriptor.getMemStoreFlushSize();
// A non-positive value falls back to the configuration value (or its default).
if (flushSize <= 0) {
  flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
      HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
}
this.memstoreFlushSize = flushSize;
// Blocking threshold: flush size times the block multiplier (2 when not configured).
this.blockingMemStoreSize = this.memstoreFlushSize
    * conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
MemStoreFlusher flushRegion
/* * A flushRegion that checks store file count. If too many, puts the flush * on delay queue to retry later. * * @param fqe * * @return true if the region was successfully flushed, false otherwise. If * false, there will be accompanying log messages explaining why the log was * not flushed. */ private boolean flushRegion(final FlushRegionEntry fqe) { HRegion region = fqe.region; if (!fqe.region.getRegionInfo().isMetaRegion() && isTooManyStoreFiles(region)) { if (fqe.isMaximumWait(this.blockingWaitTime)) { LOG.info("Waited " + (System.currentTimeMillis() - fqe.createTime) + "ms on a compaction to clean up 'too many store files'; waited " + "long enough... proceeding with flush of " + region.getRegionNameAsString()); } else { // If this is first time we've been put off, then emit a log // message. if (fqe.getRequeueCount() <= 0) { // Note: We don't impose blockingStoreFiles constraint on // meta regions LOG.warn("Region " + region.getRegionNameAsString() + " has too many " + "store files; delaying flush up to " + this.blockingWaitTime + "ms"); if (!this.server.compactSplitThread.requestSplit(region)) { try { this.server.compactSplitThread.requestCompaction( region, getName()); } catch (IOException e) { LOG.error( "Cache flush failed" + (region != null ? (" for region " + Bytes .toStringBinary(region .getRegionName())) : ""), RemoteExceptionHandler.checkIOException(e)); } } } // Put back on the queue. Have it come back out of the queue // after a delay of this.blockingWaitTime / 100 ms. this.flushQueue.add(fqe.requeue(this.blockingWaitTime / 100)); // Tell a lie, it's not flushed but it's ok return true; } } return flushRegion(region, false); }