PostgreSQL 在何处真正开始写数据

基本关系是：

在 BackgroundWriterMain 的主循环中会调用 BgBufferSync()，完整的调用链为：BgBufferSync() --> SyncOneBuffer() --> FlushBuffer() --> smgrwrite()

看代码：

/*
 * Main entry point for bgwriter process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 *
 * NOTE: this listing is an excerpt; "……" marks code that has been elided.
 * Only the part relevant to buffer writing is shown: an endless loop that
 * calls BgBufferSync() once per iteration.
 */
void
BackgroundWriterMain(void)
{
    ……
    /*
     * Loop forever
     */
    for (;;)
    {
        ……

        /*
         * Do one cycle of dirty-buffer writing.
         *
         * BgBufferSync() returns true when it is appropriate for the
         * bgwriter to go into low-power hibernation; the result is kept in
         * can_hibernate.  How that flag is consumed is not shown in this
         * excerpt.
         */
        can_hibernate = BgBufferSync();
        ……
    }
}

再看：

/*
 * BgBufferSync -- Write out some dirty buffers in the pool.
 *
 * This is called periodically by the background writer process.
 *
 * Returns true if it's appropriate for the bgwriter process to go into
 * low-power hibernation mode.    (This happens if the strategy clock sweep
 * has been "lapped" and no buffer allocations have occurred recently,
 * or if the bgwriter has been effectively disabled by setting
 * bgwriter_lru_maxpages to 0.)
 *
 * NOTE: this listing is an excerpt; "……" marks code that has been elided.
 * The declarations and setup of num_to_scan, reusable_buffers,
 * upcoming_alloc_est, next_to_clean, next_passes and num_written, as well
 * as the computation of the return value, are in the elided parts.
 */
bool
BgBufferSync(void)
{
    ……
    /*
     * Execute the LRU scan
     *
     * Keep going while there are buffers left to scan and the number of
     * reusable buffers found so far is below the estimated upcoming demand.
     * Each iteration handles exactly one buffer.
     */
    while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
    {
        /* second argument is skip_recently_used: do not write such buffers */
        int    buffer_state = SyncOneBuffer(next_to_clean, true);

        /* advance the cleaning position, wrapping around at NBuffers */
        if (++next_to_clean >= NBuffers)
        {
            next_to_clean = 0;
            next_passes++;        /* one more complete pass over the pool */
        }
        num_to_scan--;

        if (buffer_state & BUF_WRITTEN)
        {
            /* a buffer we just wrote is counted as reusable */
            reusable_buffers++;
            /* stop once this cycle has written bgwriter_lru_maxpages buffers */
            if (++num_written >= bgwriter_lru_maxpages)
            {
                /* record that the scan was cut short by the write limit */
                BgWriterStats.m_maxwritten_clean++;
                break;
            }
        }
        else if (buffer_state & BUF_REUSABLE)
            reusable_buffers++;    /* nothing written, but already reusable */
    }
    ……
}

再看：

/*
 * SyncOneBuffer -- process a single buffer during syncing.
 *
 * buf_id is the index of the buffer in BufferDescriptors[].
 *
 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
 * buffers marked recently used, as these are not replacement candidates.
 *
 * Returns a bitmask containing the following flag bits:
 *    BUF_WRITTEN: we wrote the buffer.
 *    BUF_REUSABLE: buffer is available for replacement, ie, it has
 *        pin count 0 and usage count 0.
 *
 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
 * after locking it, but we don't care all that much.)
 *
 * Note: caller must have done ResourceOwnerEnlargeBuffers.
 */
static int
SyncOneBuffer(int buf_id, bool skip_recently_used)
{
    volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
    int            result = 0;

    /*
     * Check whether buffer needs writing.
     *
     * We can make this check without taking the buffer content lock so long
     * as we mark pages dirty in access methods *before* logging changes with
     * XLogInsert(): if someone marks the buffer dirty just after our check we
     * don't worry because our checkpoint.redo points before log record for
     * upcoming changes and so we are not required to write such dirty buffer.
     */
    LockBufHdr(bufHdr);

    /*
     * Unpinned and not recently used: report it as a replacement candidate.
     * Otherwise the buffer is in use; it is only considered for writing when
     * the caller did not ask us to skip such buffers.
     */
    if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
        result |= BUF_REUSABLE;
    else if (skip_recently_used)
    {
        /* Caller told us not to write recently-used buffers */
        UnlockBufHdr(bufHdr);
        return result;
    }

    /* Only buffers that are both valid and dirty need to be written */
    if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
    {
        /* It's clean, so nothing to do */
        UnlockBufHdr(bufHdr);
        return result;
    }

    /*
     * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
     * buffer is clean by the time we've locked it.)
     *
     * NOTE(review): no UnlockBufHdr() follows on this path, so
     * PinBuffer_Locked() presumably releases the header lock taken above --
     * confirm against its definition.
     */
    PinBuffer_Locked(bufHdr);
    LWLockAcquire(bufHdr->content_lock, LW_SHARED);

    /* NULL: we have no smgr reference for the buffer's relation */
    FlushBuffer(bufHdr, NULL);

    /* Undo in reverse order: drop the content lock, then the pin */
    LWLockRelease(bufHdr->content_lock);
    UnpinBuffer(bufHdr, true);

    /* BUF_REUSABLE, if set above, is preserved alongside BUF_WRITTEN */
    return result | BUF_WRITTEN;
}

再看：

/*
 * FlushBuffer
 *        Physically write out a shared buffer.
 *
 * buf is the descriptor of the buffer to write; reln is the caller's smgr
 * reference for the buffer's relation, or NULL (see below).
 *
 * NOTE: this actually just passes the buffer contents to the kernel; the
 * real write to disk won't happen until the kernel feels like it.  This
 * is okay from our point of view since we can redo the changes from WAL.
 * However, we will need to force the changes to disk via fsync before
 * we can checkpoint WAL.
 *
 * The caller must hold a pin on the buffer and have share-locked the
 * buffer contents.  (Note: a share-lock does not prevent updates of
 * hint bits in the buffer, so the page could change while the write
 * is in progress, but we assume that that will not invalidate the data
 * written.)
 *
 * If the caller has an smgr reference for the buffer's relation, pass it
 * as the second parameter.  If not, pass NULL.  In the latter case, the
 * relation will be marked as "transient" so that the corresponding
 * kernel-level file descriptors are closed when the current transaction ends,
 * if any.
 */
static void
FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
{
    XLogRecPtr    recptr;
    ErrorContextCallback errcontext;
    instr_time    io_start,
                io_time;

    /*
     * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
     * false, then someone else flushed the buffer before we could, so we need
     * not do anything.
     */
    if (!StartBufferIO(buf, false))
        return;

    /*
     * Setup error traceback support for ereport()
     *
     * The callback is pushed onto error_context_stack here and popped again
     * at the end of this function.
     */
    errcontext.callback = shared_buffer_write_error_callback;
    errcontext.arg = (void *) buf;
    errcontext.previous = error_context_stack;
    error_context_stack = &errcontext;

    /* Find smgr relation for buffer, and mark it as transient */
    if (reln == NULL)
    {
        reln = smgropen(buf->tag.rnode, InvalidBackendId);
        smgrsettransient(reln);
    }

    TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
                    buf->tag.blockNum,
                    reln->smgr_rnode.node.spcNode,
                    reln->smgr_rnode.node.dbNode,
                    reln->smgr_rnode.node.relNode);

    /*
     * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
     * rule that log updates must hit disk before any of the data-file changes
     * they describe do.
     */
    recptr = BufferGetLSN(buf);
    XLogFlush(recptr);

    /*
     * Now it's safe to write buffer to disk. Note that no one else should
     * have been able to write it while we were busy with log flushing because
     * we have the io_in_progress lock.
     */

    /*
     * To check if block content changes while flushing. - vadim 01/17/97
     *
     * BM_JUST_DIRTIED is cleared under the header lock before the write; if
     * it has become set again by the time we call TerminateBufferIO below,
     * the buffer is not marked clean.
     */
    LockBufHdr(buf);
    buf->flags &= ~BM_JUST_DIRTIED;
    UnlockBufHdr(buf);

    /* Take a start timestamp only when I/O timing is enabled */
    if (track_io_timing)
        INSTR_TIME_SET_CURRENT(io_start);

    /*
     * Hand the block to the storage manager.
     *
     * NOTE(review): the meaning of the final "false" argument is not visible
     * in this listing -- confirm against smgrwrite()'s declaration.
     */
    smgrwrite(reln,
              buf->tag.forkNum,
              buf->tag.blockNum,
              (char *) BufHdrGetBlock(buf),
              false);

    if (track_io_timing)
    {
        /* io_time = now - io_start, i.e. the elapsed time of the write */
        INSTR_TIME_SET_CURRENT(io_time);
        INSTR_TIME_SUBTRACT(io_time, io_start);
        /* report it in microseconds and add it to the running total */
        pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
        INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
    }

    /* Counted whether or not I/O timing is enabled */
    pgBufferUsage.shared_blks_written++;

    /*
     * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
     * end the io_in_progress state.
     */
    TerminateBufferIO(buf, true, 0);

    TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
                           buf->tag.blockNum,
                           reln->smgr_rnode.node.spcNode,
                           reln->smgr_rnode.node.dbNode,
                           reln->smgr_rnode.node.relNode);

    /* Pop the error context stack */
    error_context_stack = errcontext.previous;
}

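可以看到，BgBufferSync 的循环里每次只通过 SyncOneBuffer 写出一个 buffer，乍看有些奇怪。不过这应该是设计者有意为之：bgwriter 的思路就是有一点脏页就写一点，并且每一轮最多只写 bgwriter_lru_maxpages 个；另外正如 FlushBuffer 的注释所说，smgrwrite 只是把数据交给内核，真正写入磁盘的时机由内核决定。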

相关阅读:
设计模式系列
 Python3 系列之可变参数和关键字参数
 设计模式系列
 【HANA系列】SAP HANA ODBC error due to mismatch of version
【FICO系列】SAP FICO FS00修改科目为未清项目管理
 【FIORI系列】SAP OpenUI5 (SAPUI5) js框架简单介绍
 【HANA系列】SAP HANA SQL获取当前日期加若干天后的日期
 【HANA系列】SAP HANA SQL获取本周的周一
 【HANA系列】SAP HANA SQL获取当前日期
 【HANA系列】SAP HANA SQL获取当前日期最后一天
原文地址：https://www.cnblogs.com/gaojian/p/2737470.html