Before PostgreSQL 8.1, all operations of the shared buffer manager itself
were protected by a single system-wide lock, the BufMgrLock, which
-unsurprisingly proved to be a source of contention. The new locking scheme
-avoids grabbing system-wide exclusive locks in common code paths. It works
-like this:
-
-* There is a system-wide LWLock, the BufMappingLock, that notionally
-protects the mapping from buffer tags (page identifiers) to buffers.
-(Physically, it can be thought of as protecting the hash table maintained
-by buf_table.c.) To look up whether a buffer exists for a tag, it is
-sufficient to obtain share lock on the BufMappingLock. Note that one
-must pin the found buffer, if any, before releasing the BufMappingLock.
-To alter the page assignment of any buffer, one must hold exclusive lock
-on the BufMappingLock. This lock must be held across adjusting the buffer's
-header fields and changing the buf_table hash table. The only common
-operation that needs exclusive lock is reading in a page that was not
-in shared buffers already, which will require at least a kernel call
-and usually a wait for I/O, so it will be slow anyway.
-
-* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
-separate locks, each guarding a portion of the buffer tag space. This allows
-further reduction of contention in the normal code paths. The partition
-that a particular buffer tag belongs to is determined from the low-order
-bits of the tag's hash value. The rules stated above apply to each partition
-independently. If it is necessary to lock more than one partition at a time,
-they must be locked in partition-number order to avoid risk of deadlock.
+unsurprisingly proved to be a source of contention. In subsequent releases,
+this lock was split into NUM_BUFFER_PARTITIONS locks, each guarding a portion
+of the buffer tag space. Even this proved to cause too much contention, so
+now we use a highly concurrent hashtable (see chash.c and chash.h).
* A separate system-wide spinlock, buffer_strategy_lock, provides mutual
exclusion for operations that access the buffer free list or select
+++ /dev/null
-/*-------------------------------------------------------------------------
- *
- * buf_table.c
- * routines for mapping BufferTags to buffer indexes.
- *
- * Note: the routines in this file do no locking of their own. The caller
- * must hold a suitable lock on the appropriate BufMappingLock, as specified
- * in the comments. We can't do the locking inside these functions because
- * in most cases the caller needs to adjust the buffer header contents
- * before the lock is released (see notes in README).
- *
- *
- * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * src/backend/storage/buffer/buf_table.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include "miscadmin.h"
-#include "storage/bufmgr.h"
-#include "storage/buf_internals.h"
-#include "utils/chash.h"
-
-
-/* entry for buffer lookup hashtable */
-typedef struct
-{
- BufferTag key; /* Tag of a disk page */
- int id; /* Associated buffer ID */
-} BufferLookupEnt;
-
-static CHashDescriptor SharedBufDescriptor = {
- "buffer lookup table",
- 0,
- sizeof(BufferLookupEnt),
- sizeof(BufferTag)
-};
-static CHashTable SharedBufHash;
-
-/*
- * Estimate space needed for mapping hashtable
- * size is the desired hash table size (possibly more than NBuffers)
- */
-Size
-BufTableShmemSize(int size)
-{
- if (SharedBufHash == NULL)
- {
- SharedBufDescriptor.capacity = size;
- SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
- }
-
- return CHashEstimateSize(SharedBufHash);
-}
-
-/*
- * Initialize shmem hash table for mapping buffers
- * size is the desired hash table size (possibly more than NBuffers)
- */
-void
-InitBufTable(int size)
-{
- if (SharedBufHash == NULL || !IsUnderPostmaster)
- {
- Assert(SharedBufDescriptor.capacity == 0 ||
- SharedBufDescriptor.capacity == size);
- SharedBufDescriptor.capacity = size;
- SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
- }
-}
-
-/*
- * BufTableLookup
- * Lookup the given BufferTag; return buffer ID, or -1 if not found
- */
-int
-BufTableLookup(BufferTag *tagPtr)
-{
- BufferLookupEnt ent;
-
- ent.key = *tagPtr;
- if (!CHashSearch(SharedBufHash, &ent))
- return -1;
-
- return ent.id;
-}
-
-/*
- * BufTableInsert
- * Insert a hashtable entry for given tag and buffer ID,
- * unless an entry already exists for that tag
- *
- * Returns -1 on successful insertion. If a conflicting entry exists
- * already, returns the buffer ID in that entry.
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-int
-BufTableInsert(BufferTag *tagPtr, int buf_id)
-{
- BufferLookupEnt ent;
-
- ent.key = *tagPtr;
- ent.id = buf_id;
-
- Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
- Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
-
- if (CHashInsert(SharedBufHash, &ent))
- return -1;
-
- return ent.id;
-}
-
-/*
- * BufTableDelete
- * Delete the hashtable entry for given tag (which must exist)
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-void
-BufTableDelete(BufferTag *tagPtr)
-{
- if (!CHashDelete(SharedBufHash, tagPtr))
- elog(ERROR, "shared buffer hash table corrupted");
-}
* MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
* The disk write is delayed until buffer replacement or checkpoint.
*
- * See also these files:
- * freelist.c -- chooses victim for buffer replacement
- * buf_table.c -- manages the buffer lookup table
+ * See also freelist.c, which chooses victim for buffer replacement
*/
#include "postgres.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
+#include "utils/chash.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
+/* entry for buffer lookup hashtable
+ *
+ * Each entry pairs the tag identifying a disk page with the ID of the
+ * buffer associated with it; the ID is used in this file as an index into
+ * BufferDescriptors.
+ *
+ * NOTE(review): CHashDelete() is called in this file with the address of
+ * the key in one place and the address of the whole entry in others, so
+ * "key" presumably has to remain the first member -- confirm against
+ * chash.h.
+ */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated buffer ID */
+} BufferLookupEnt;
+
+static CHashDescriptor SharedBufDescriptor = {
+ "buffer lookup table", /* presumably the table's name; confirm field order in chash.h */
+ 0, /* presumably capacity: .capacity is overwritten with the requested size before use */
+ sizeof(BufferLookupEnt), /* size of a whole entry (key plus buffer ID) */
+ sizeof(BufferTag) /* size of the key portion of an entry */
+};
+static CHashTable SharedBufHash; /* NULL until BufMgrShmemSize or BufMgrInitShmem sets it */
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
static inline int32 GetPrivateRefCount(Buffer buffer);
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
+/*
+ * BufMgrShmemSize
+ * Estimate space needed for mapping hashtable
+ *
+ * size is the desired hash table size (possibly more than NBuffers)
+ *
+ * On the first call (SharedBufHash still NULL) this records size as the
+ * descriptor's capacity and obtains a table handle from CHashBootstrap().
+ * Later calls ignore size and reuse the existing handle. The return value
+ * is whatever CHashEstimateSize() reports for that handle.
+ */
+Size
+BufMgrShmemSize(int size)
+{
+ if (SharedBufHash == NULL)
+ {
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+ }
+
+ return CHashEstimateSize(SharedBufHash);
+}
+
+/*
+ * BufMgrInitShmem
+ * Initialize shmem hash table for mapping buffers
+ *
+ * size is the desired hash table size (possibly more than NBuffers)
+ *
+ * The work is done when no table handle exists yet, or whenever
+ * IsUnderPostmaster is false; otherwise the function returns without doing
+ * anything. NOTE(review): presumably a process running under the
+ * postmaster that already has a handle inherited a fully initialized
+ * table -- confirm.
+ *
+ * The Assert checks that a capacity recorded earlier (for example by
+ * BufMgrShmemSize) agrees with size. The current SharedBufHash value,
+ * which may be NULL, is handed to CHashInitialize() and replaced by its
+ * result.
+ */
+void
+BufMgrInitShmem(int size)
+{
+ if (SharedBufHash == NULL || !IsUnderPostmaster)
+ {
+ Assert(SharedBufDescriptor.capacity == 0 ||
+ SharedBufDescriptor.capacity == size);
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+ }
+}
+
/*
* Ensure that the PrivateRefCountArray has sufficient space to store one
* more entry. This has to be called before using NewPrivateRefCountEntry() to
}
else
{
- BufferTag newTag; /* identity of requested block */
- int buf_id;
+ BufferLookupEnt ent; /* identity of requested block */
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
+ INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
- /* see if the block is in the buffer pool already */
- buf_id = BufTableLookup(&newTag);
-
/* If not in buffers, initiate prefetch */
- if (buf_id < 0)
+ if (!CHashSearch(SharedBufHash, &ent))
smgrprefetch(reln->rd_smgr, forkNum, blockNum);
/*
BufferAccessStrategy strategy,
bool *foundPtr)
{
- BufferTag newTag; /* identity of requested block */
- BufferTag oldTag; /* previous identity of selected buffer */
+ BufferLookupEnt newEnt; /* identity of requested block */
+ BufferLookupEnt oldEnt; /* previous identity of selected buffer */
BufFlags oldFlags;
- int buf_id;
volatile BufferDesc *buf;
bool valid;
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
+ INIT_BUFFERTAG(newEnt.key, smgr->smgr_rnode.node, forkNum, blockNum);
/* see if the block is in the buffer pool already */
start:
- buf_id = BufTableLookup(&newTag);
- if (buf_id >= 0)
+ if (CHashSearch(SharedBufHash, &newEnt))
{
BufferDesc *foundbuf;
* Found it. Now, pin the buffer so no one can steal it from the
* buffer pool.
*/
- foundbuf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[newEnt.id];
valid = PinBuffer(foundbuf, strategy);
/* Check whether someone recycled the buffer before we pinned it. */
- if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
{
UnpinBuffer(foundbuf, true);
goto start;
if (oldFlags & BM_TAG_VALID)
{
/* Save old tag. */
- oldTag = buf->tag;
+ oldEnt.key = buf->tag;
}
/*
* tag.
*/
enter:
- buf_id = BufTableInsert(&newTag, buf->buf_id);
-
- if (buf_id >= 0)
+ newEnt.id = buf->buf_id;
+ if (!CHashInsert(SharedBufHash, &newEnt))
{
BufferDesc *foundbuf;
* recheck the buffer tag after pinning it, because it could still
* get renamed under us.
*/
- foundbuf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[newEnt.id];
valid = PinBuffer(foundbuf, strategy);
- if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
{
UnpinBuffer(foundbuf, true);
goto enter;
break;
UnlockBufHdr(buf);
- BufTableDelete(&newTag);
+ if (!CHashDelete(SharedBufHash, &newEnt.key))
+ elog(ERROR, "shared buffer hash table corrupted");
UnpinBuffer(buf, true);
}
* the old content is no longer relevant. (The usage_count starts out at
* 1 so that the buffer can survive one clock-sweep pass.)
*/
- buf->tag = newTag;
+ buf->tag = newEnt.key;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
UnlockBufHdr(buf);
- if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
static void
InvalidateBuffer(volatile BufferDesc *buf)
{
- BufferTag oldTag;
+ BufferLookupEnt oldEnt;
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
- oldTag = buf->tag;
+ oldEnt.key = buf->tag;
/*
* We assume the only reason for it to be pinned is that someone else is
LockBufHdr(buf);
/* If it's changed while we were waiting for lock, do nothing */
- if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldEnt.key))
{
UnlockBufHdr(buf);
return;
/*
* Remove the buffer from the lookup hashtable, if it was in there.
*/
- if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Insert the buffer at the head of the list of free buffers.