From bd7dcdafa752802566d0fef3ac9c126c41f276e4 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Mon, 21 May 2018 15:43:30 -0700
Subject: [PATCH] WIP: Optimize register_dirty_segment() to not repeatedly
 queue fsync requests.

Author: Andres Freund
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/postmaster/checkpointer.c | 36 ++++++++++++-------
 src/backend/storage/smgr/md.c         | 50 +++++++++++++++++++--------
 src/include/postmaster/bgwriter.h     |  3 ++
 3 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 0950ada601..333eb91c9d 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -46,6 +46,7 @@
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/atomics.h"
 #include "postmaster/bgwriter.h"
 #include "replication/syncrep.h"
 #include "storage/bufmgr.h"
@@ -126,8 +127,9 @@ typedef struct
 
 	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
 
-	uint32		num_backend_writes; /* counts user backend buffer writes */
-	uint32		num_backend_fsync;	/* counts user backend fsync calls */
+	pg_atomic_uint32 num_backend_writes;	/* counts user backend buffer writes */
+	pg_atomic_uint32 num_backend_fsync; /* counts user backend fsync calls */
+	pg_atomic_uint32 ckpt_cycle;	/* cycle */
 
 	int			num_requests;	/* current # of requests */
 	int			max_requests;	/* allocated array size */
@@ -943,6 +945,9 @@ CheckpointerShmemInit(void)
 		MemSet(CheckpointerShmem, 0, size);
 		SpinLockInit(&CheckpointerShmem->ckpt_lck);
 		CheckpointerShmem->max_requests = NBuffers;
+		pg_atomic_init_u32(&CheckpointerShmem->ckpt_cycle, 0);
+		pg_atomic_init_u32(&CheckpointerShmem->num_backend_writes, 0);
+		pg_atomic_init_u32(&CheckpointerShmem->num_backend_fsync, 0);
 	}
 }
 
@@ -1133,10 +1138,6 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 
 	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
 
-	/* Count all backend writes regardless of if they fit in the queue */
-	if (!AmBackgroundWriterProcess())
-		CheckpointerShmem->num_backend_writes++;
-
 	/*
 	 * If the checkpointer isn't running or the request queue is full, the
 	 * backend will have to perform its own fsync request.  But before forcing
@@ -1151,7 +1152,7 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 		 * fsync
 		 */
 		if (!AmBackgroundWriterProcess())
-			CheckpointerShmem->num_backend_fsync++;
+			pg_atomic_fetch_add_u32(&CheckpointerShmem->num_backend_fsync, 1);
 		LWLockRelease(CheckpointerCommLock);
 		return false;
 	}
@@ -1312,11 +1313,10 @@ AbsorbFsyncRequests(void)
 	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
 
 	/* Transfer stats counts into pending pgstats message */
-	BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes;
-	BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync;
-
-	CheckpointerShmem->num_backend_writes = 0;
-	CheckpointerShmem->num_backend_fsync = 0;
+	BgWriterStats.m_buf_written_backend +=
+		pg_atomic_exchange_u32(&CheckpointerShmem->num_backend_writes, 0);
+	BgWriterStats.m_buf_fsync_backend +=
+		pg_atomic_exchange_u32(&CheckpointerShmem->num_backend_fsync, 0);
 
 	/*
 	 * We try to avoid holding the lock for a long time by copying the request
@@ -1390,3 +1390,15 @@ FirstCallSinceLastCheckpoint(void)
 
 	return FirstCall;
 }
+
+uint32
+GetCheckpointSyncCycle(void)
+{
+	return pg_atomic_read_u32(&CheckpointerShmem->ckpt_cycle);
+}
+
+uint32
+IncCheckpointSyncCycle(void)
+{
+	return pg_atomic_fetch_add_u32(&CheckpointerShmem->ckpt_cycle, 1);
+}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2ec103e604..555774320b 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -109,6 +109,7 @@ typedef struct _MdfdVec
 {
 	File		mdfd_vfd;		/* fd number in fd.c's pool */
 	BlockNumber mdfd_segno;		/* segment number, from 0 */
+	uint32		mdfd_dirtied_cycle;
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
@@ -133,12 +134,12 @@ static MemoryContext MdCxt;		/* context for all MdfdVec objects */
  * (Regular backends do not track pending operations locally, but forward
  * them to the checkpointer.)
  */
-typedef uint16 CycleCtr;		/* can be any convenient integer size */
+typedef uint32 CycleCtr;		/* can be any convenient integer size */
 
 typedef struct
 {
 	RelFileNode rnode;			/* hash table key (must be first!) */
-	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
+	CycleCtr	cycle_ctr;		/* sync cycle of oldest request */
 	/* requests[f] has bit n set if we need to fsync segment n of fork f */
 	Bitmapset  *requests[MAX_FORKNUM + 1];
 	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
@@ -155,7 +156,6 @@ static HTAB *pendingOpsTable = NULL;
 static List *pendingUnlinks = NIL;
 static MemoryContext pendingOpsCxt; /* context for the above */
 
-static CycleCtr mdsync_cycle_ctr = 0;
 static CycleCtr mdckpt_cycle_ctr = 0;
 
 
@@ -333,6 +333,7 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	mdfd = &reln->md_seg_fds[forkNum][0];
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
+	mdfd->mdfd_dirtied_cycle = GetCheckpointSyncCycle() - 1;
 }
 
 /*
@@ -614,6 +615,7 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 	mdfd = &reln->md_seg_fds[forknum][0];
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
+	mdfd->mdfd_dirtied_cycle = GetCheckpointSyncCycle() - 1;
 
 	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 
@@ -1089,9 +1091,9 @@ mdsync(void)
 	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
 	 * checkpoint), we want to ignore fsync requests that are entered into the
 	 * hashtable after this point --- they should be processed next time,
-	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
-	 * ones: new ones will have cycle_ctr equal to the incremented value of
-	 * mdsync_cycle_ctr.
+	 * instead.  We use GetCheckpointSyncCycle() to tell old entries apart
+	 * from new ones: new ones will have cycle_ctr equal to
+	 * IncCheckpointSyncCycle().
 	 *
 	 * In normal circumstances, all entries present in the table at this point
 	 * will have cycle_ctr exactly equal to the current (about to be old)
@@ -1115,16 +1117,16 @@ mdsync(void)
 		hash_seq_init(&hstat, pendingOpsTable);
 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
+			entry->cycle_ctr = GetCheckpointSyncCycle();
 		}
 	}
 
-	/* Advance counter so that new hashtable entries are distinguishable */
-	mdsync_cycle_ctr++;
-
 	/* Set flag to detect failure if we don't reach the end of the loop */
 	mdsync_in_progress = true;
 
+	/* Advance counter so that new hashtable entries are distinguishable */
+	IncCheckpointSyncCycle();
+
 	/* Now scan the hashtable for fsync requests to process */
 	absorb_counter = FSYNCS_PER_ABSORB;
 	hash_seq_init(&hstat, pendingOpsTable);
@@ -1137,11 +1139,11 @@ mdsync(void)
 		 * contain multiple fsync-request bits, but they are all new.  Note
 		 * "continue" bypasses the hash-remove call at the bottom of the loop.
 		 */
-		if (entry->cycle_ctr == mdsync_cycle_ctr)
+		if (entry->cycle_ctr == GetCheckpointSyncCycle())
 			continue;
 
 		/* Else assert we haven't missed it */
-		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
+		Assert((CycleCtr) (entry->cycle_ctr + 1) == GetCheckpointSyncCycle());
 
 		/*
 		 * Scan over the forks and segments represented by the entry.
@@ -1308,7 +1310,7 @@ mdsync(void)
 				break;
 		}
 		if (forknum <= MAX_FORKNUM)
-			entry->cycle_ctr = mdsync_cycle_ctr;
+			entry->cycle_ctr = GetCheckpointSyncCycle();
 		else
 		{
 			/* Okay to remove it */
@@ -1427,18 +1429,37 @@ mdpostckpt(void)
 static void
 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 {
+	uint32		cycle;
+
 	/* Temp relations should never be fsync'd */
 	Assert(!SmgrIsTemp(reln));
 
+	pg_memory_barrier();
+	cycle = GetCheckpointSyncCycle();
+
+	/*
+	 * Don't repeatedly register the same segment as dirty.
+	 *
+	 * FIXME: This doesn't correctly deal with overflows yet! We could
+	 * e.g. emit an smgr invalidation every now and then, or use a 64bit
+	 * counter. Or just error out if the cycle reaches UINT32_MAX.
+	 */
+	if (seg->mdfd_dirtied_cycle == cycle)
+		return;
+
 	if (pendingOpsTable)
 	{
 		/* push it into local pending-ops table */
 		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
+		seg->mdfd_dirtied_cycle = cycle;
 	}
 	else
 	{
 		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
+		{
+			seg->mdfd_dirtied_cycle = cycle;
 			return;				/* passed it off successfully */
+		}
 
 		ereport(DEBUG1,
 				(errmsg("could not forward fsync request because request queue is full")));
@@ -1623,7 +1644,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 		/* if new entry, initialize it */
 		if (!found)
 		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
+			entry->cycle_ctr = GetCheckpointSyncCycle();
 			MemSet(entry->requests, 0, sizeof(entry->requests));
 			MemSet(entry->canceled, 0, sizeof(entry->canceled));
 		}
@@ -1793,6 +1814,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	v = &reln->md_seg_fds[forknum][segno];
 	v->mdfd_vfd = fd;
 	v->mdfd_segno = segno;
+	v->mdfd_dirtied_cycle = GetCheckpointSyncCycle() - 1;
 
 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 941c6aba7d..87a5cfad41 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -38,6 +38,9 @@ extern void AbsorbFsyncRequests(void);
 extern Size CheckpointerShmemSize(void);
 extern void CheckpointerShmemInit(void);
 
+extern uint32 GetCheckpointSyncCycle(void);
+extern uint32 IncCheckpointSyncCycle(void);
+
 extern bool FirstCallSinceLastCheckpoint(void);
 
 #endif							/* _BGWRITER_H */
-- 
2.39.5
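
Stripped of the shared-memory plumbing, the core idea of the patch is small: each open
segment remembers the sync cycle in which it last queued an fsync request, and
register_dirty_segment() becomes a no-op when the same segment is dirtied again within
the same cycle. The standalone sketch below illustrates only that deduplication; it is
not PostgreSQL code, and the names (fsync_cycle, Segment, queue_fsync_request,
mark_segment_dirty) are invented for illustration. It also sidesteps the atomics and
the uint32 overflow question flagged in the FIXME above.

/*
 * Minimal sketch of the "dirtied cycle" deduplication. Hypothetical names,
 * plain globals instead of checkpointer shared memory.
 */
#include <stdint.h>
#include <stdio.h>

/* Global cycle counter; bumped once per checkpoint sync pass. */
static uint32_t fsync_cycle = 0;

typedef struct Segment
{
	int			segno;
	uint32_t	dirtied_cycle;	/* last cycle this segment queued a request in */
} Segment;

/* Stand-in for forwarding/remembering an fsync request. */
static void
queue_fsync_request(Segment *seg)
{
	printf("queued fsync request for segment %d\n", seg->segno);
}

/*
 * Analogue of register_dirty_segment(): only queue a request the first time
 * a segment is dirtied within the current cycle.
 */
static void
mark_segment_dirty(Segment *seg)
{
	uint32_t	cycle = fsync_cycle;

	if (seg->dirtied_cycle == cycle)
		return;					/* already queued this cycle */

	queue_fsync_request(seg);
	seg->dirtied_cycle = cycle;
}

int
main(void)
{
	/* Initialize to "current cycle - 1" so the segment counts as not yet dirtied. */
	Segment		seg = {.segno = 0, .dirtied_cycle = fsync_cycle - 1};

	mark_segment_dirty(&seg);	/* queues a request */
	mark_segment_dirty(&seg);	/* duplicate within the same cycle: skipped */

	fsync_cycle++;				/* next sync cycle begins */
	mark_segment_dirty(&seg);	/* queues again in the new cycle */

	return 0;
}

In the patch itself the cycle counter lives in checkpointer shared memory as a
pg_atomic_uint32 (read via GetCheckpointSyncCycle(), advanced by mdsync() through
IncCheckpointSyncCycle()), and the per-segment tag is MdfdVec.mdfd_dirtied_cycle,
initialized to GetCheckpointSyncCycle() - 1 so a freshly opened segment is never
mistaken for one that has already been registered.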