Update buffer README; get rid of buf_table.c. chash2014
author Robert Haas <rhaas@postgresql.org>
Tue, 27 Jan 2015 03:19:02 +0000 (03:19 +0000)
committer Robert Haas <rhaas@postgresql.org>
Tue, 27 Jan 2015 03:19:02 +0000 (03:19 +0000)
src/backend/storage/buffer/Makefile
src/backend/storage/buffer/README
src/backend/storage/buffer/buf_table.c [deleted file]
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/freelist.c
src/include/storage/buf_internals.h

index 2c10fba9cd1b8c24e320b4cf7c0697951c775684..b30a0dac4173e0446ee8254d4baccf51d784dc60 100644 (file)
@@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o
+OBJS = buf_init.o bufmgr.o freelist.o localbuf.o
 
 include $(top_srcdir)/src/backend/common.mk
index a4ebbccd4895c86be320827598864b17fd8af87a..86697e98251e5c272ed230e0054ca12e1b26a6ad 100644 (file)
@@ -100,30 +100,10 @@ Buffer Manager's Internal Locking
 
 Before PostgreSQL 8.1, all operations of the shared buffer manager itself
 were protected by a single system-wide lock, the BufMgrLock, which
-unsurprisingly proved to be a source of contention.  The new locking scheme
-avoids grabbing system-wide exclusive locks in common code paths.  It works
-like this:
-
-* There is a system-wide LWLock, the BufMappingLock, that notionally
-protects the mapping from buffer tags (page identifiers) to buffers.
-(Physically, it can be thought of as protecting the hash table maintained
-by buf_table.c.)  To look up whether a buffer exists for a tag, it is
-sufficient to obtain share lock on the BufMappingLock.  Note that one
-must pin the found buffer, if any, before releasing the BufMappingLock.
-To alter the page assignment of any buffer, one must hold exclusive lock
-on the BufMappingLock.  This lock must be held across adjusting the buffer's
-header fields and changing the buf_table hash table.  The only common
-operation that needs exclusive lock is reading in a page that was not
-in shared buffers already, which will require at least a kernel call
-and usually a wait for I/O, so it will be slow anyway.
-
-* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
-separate locks, each guarding a portion of the buffer tag space.  This allows
-further reduction of contention in the normal code paths.  The partition
-that a particular buffer tag belongs to is determined from the low-order
-bits of the tag's hash value.  The rules stated above apply to each partition
-independently.  If it is necessary to lock more than one partition at a time,
-they must be locked in partition-number order to avoid risk of deadlock.
+unsurprisingly proved to be a source of contention.  In subsequent releases,
+this lock was split into NUM_BUFFER_PARTITIONS locks, each guarding a portion
+of the buffer tag space.  Even that scheme caused too much contention, so we
+now use a highly concurrent hashtable instead (see chash.c and chash.h).
 
 * A separate system-wide spinlock, buffer_strategy_lock, provides mutual
 exclusion for operations that access the buffer free list or select
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
deleted file mode 100644 (file)
index 0840afa..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * buf_table.c
- *       routines for mapping BufferTags to buffer indexes.
- *
- * Note: the routines in this file do no locking of their own.  The caller
- * must hold a suitable lock on the appropriate BufMappingLock, as specified
- * in the comments.  We can't do the locking inside these functions because
- * in most cases the caller needs to adjust the buffer header contents
- * before the lock is released (see notes in README).
- *
- *
- * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- *       src/backend/storage/buffer/buf_table.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include "miscadmin.h"
-#include "storage/bufmgr.h"
-#include "storage/buf_internals.h"
-#include "utils/chash.h"
-
-
-/* entry for buffer lookup hashtable */
-typedef struct
-{
-       BufferTag       key;                    /* Tag of a disk page */
-       int                     id;                             /* Associated buffer ID */
-} BufferLookupEnt;
-
-static CHashDescriptor SharedBufDescriptor = {
-       "buffer lookup table",
-       0,
-       sizeof(BufferLookupEnt),
-       sizeof(BufferTag)
-};
-static CHashTable SharedBufHash;
-
-/*
- * Estimate space needed for mapping hashtable
- *             size is the desired hash table size (possibly more than NBuffers)
- */
-Size
-BufTableShmemSize(int size)
-{
-       if (SharedBufHash == NULL)
-       {
-               SharedBufDescriptor.capacity = size;
-               SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
-       }
-
-       return CHashEstimateSize(SharedBufHash);
-}
-
-/*
- * Initialize shmem hash table for mapping buffers
- *             size is the desired hash table size (possibly more than NBuffers)
- */
-void
-InitBufTable(int size)
-{
-       if (SharedBufHash == NULL || !IsUnderPostmaster)
-       {
-               Assert(SharedBufDescriptor.capacity == 0 ||
-                       SharedBufDescriptor.capacity == size);
-               SharedBufDescriptor.capacity = size;
-               SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
-       }
-}
-
-/*
- * BufTableLookup
- *             Lookup the given BufferTag; return buffer ID, or -1 if not found
- */
-int
-BufTableLookup(BufferTag *tagPtr)
-{
-       BufferLookupEnt ent;
-
-       ent.key = *tagPtr;
-       if (!CHashSearch(SharedBufHash, &ent))
-               return -1;
-
-       return ent.id;
-}
-
-/*
- * BufTableInsert
- *             Insert a hashtable entry for given tag and buffer ID,
- *             unless an entry already exists for that tag
- *
- * Returns -1 on successful insertion.  If a conflicting entry exists
- * already, returns the buffer ID in that entry.
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-int
-BufTableInsert(BufferTag *tagPtr, int buf_id)
-{
-       BufferLookupEnt ent;
-
-       ent.key = *tagPtr;
-       ent.id = buf_id;
-
-       Assert(buf_id >= 0);            /* -1 is reserved for not-in-table */
-       Assert(tagPtr->blockNum != P_NEW);      /* invalid tag */
-
-       if (CHashInsert(SharedBufHash, &ent))
-               return -1;
-
-       return ent.id;
-}
-
-/*
- * BufTableDelete
- *             Delete the hashtable entry for given tag (which must exist)
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-void
-BufTableDelete(BufferTag *tagPtr)
-{
-       if (!CHashDelete(SharedBufHash, tagPtr))
-               elog(ERROR, "shared buffer hash table corrupted");
-}
index cbc82bf932133808b7497decc9df6f904f9bba07..4435b3ebf9866fe124bac734c835f88f6df5fcd6 100644 (file)
@@ -24,9 +24,7 @@
  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
  *             The disk write is delayed until buffer replacement or checkpoint.
  *
- * See also these files:
- *             freelist.c -- chooses victim for buffer replacement
- *             buf_table.c -- manages the buffer lookup table
+ * See also freelist.c, which chooses the victim for buffer replacement.
  */
 #include "postgres.h"
 
 #include "storage/proc.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
+#include "utils/chash.h"
 #include "utils/rel.h"
 #include "utils/resowner_private.h"
 #include "utils/timestamp.h"
 
+/* Entry in the buffer lookup hashtable: pairs a page's BufferTag with a buffer ID */
+typedef struct
+{
+       BufferTag       key;                    /* Tag of a disk page (the lookup key) */
+       int                     id;                             /* Associated buffer ID */
+} BufferLookupEnt;
+
+static CHashDescriptor SharedBufDescriptor = {
+       "buffer lookup table",
+       0,                      /* presumably capacity, assigned later -- confirm in chash.h */
+       sizeof(BufferLookupEnt),
+       sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;       /* assigned from CHashBootstrap/CHashInitialize */
 
 /* Note: these two macros only work on shared buffers, not local ones! */
 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
@@ -137,6 +150,38 @@ static PrivateRefCountEntry* GetPrivateRefCountEntry(Buffer buffer, bool do_move
 static inline int32 GetPrivateRefCount(Buffer buffer);
 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
 
+/*
+ * BufMgrShmemSize -- return CHashEstimateSize() for the buffer mapping hashtable
+ *             size is the desired capacity; used only if the table is not yet bootstrapped
+ */
+Size
+BufMgrShmemSize(int size)
+{
+       if (SharedBufHash == NULL)      /* not bootstrapped yet */
+       {
+               SharedBufDescriptor.capacity = size;
+               SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+       }
+
+       return CHashEstimateSize(SharedBufHash);
+}
+
+/*
+ * BufMgrInitShmem -- set up SharedBufHash, the hashtable for mapping buffers
+ *             size is the desired capacity; it must match any capacity recorded earlier
+ */
+void
+BufMgrInitShmem(int size)
+{
+       if (SharedBufHash == NULL || !IsUnderPostmaster)
+       {
+               Assert(SharedBufDescriptor.capacity == 0 ||
+                       SharedBufDescriptor.capacity == size);  /* unset, or same as before */
+               SharedBufDescriptor.capacity = size;
+               SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+       }
+}
+
 /*
  * Ensure that the PrivateRefCountArray has sufficient space to store one
  * more entry. This has to be called before using NewPrivateRefCountEntry() to
@@ -444,18 +489,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
        }
        else
        {
-               BufferTag       newTag;         /* identity of requested block */
-               int                     buf_id;
+               BufferLookupEnt ent;    /* identity of requested block */
 
                /* create a tag so we can lookup the buffer */
-               INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
+               INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node,
                                           forkNum, blockNum);
 
-               /* see if the block is in the buffer pool already */
-               buf_id = BufTableLookup(&newTag);
-
                /* If not in buffers, initiate prefetch */
-               if (buf_id < 0)
+               if (!CHashSearch(SharedBufHash, &ent))
                        smgrprefetch(reln->rd_smgr, forkNum, blockNum);
 
                /*
@@ -862,20 +903,18 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                        BufferAccessStrategy strategy,
                        bool *foundPtr)
 {
-       BufferTag       newTag;                 /* identity of requested block */
-       BufferTag       oldTag;                 /* previous identity of selected buffer */
+       BufferLookupEnt newEnt;         /* identity of requested block */
+       BufferLookupEnt oldEnt;         /* previous identity of selected buffer */
        BufFlags        oldFlags;
-       int                     buf_id;
        volatile BufferDesc *buf;
        bool            valid;
 
        /* create a tag so we can lookup the buffer */
-       INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
+       INIT_BUFFERTAG(newEnt.key, smgr->smgr_rnode.node, forkNum, blockNum);
 
        /* see if the block is in the buffer pool already */
 start:
-       buf_id = BufTableLookup(&newTag);
-       if (buf_id >= 0)
+       if (CHashSearch(SharedBufHash, &newEnt))
        {
                BufferDesc *foundbuf;
 
@@ -883,12 +922,12 @@ start:
                 * Found it.  Now, pin the buffer so no one can steal it from the
                 * buffer pool.
                 */
-               foundbuf = &BufferDescriptors[buf_id];
+               foundbuf = &BufferDescriptors[newEnt.id];
 
                valid = PinBuffer(foundbuf, strategy);
 
                /* Check whether someone recycled the buffer before we pinned it. */
-               if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+               if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
                {
                        UnpinBuffer(foundbuf, true);
                        goto start;
@@ -1026,7 +1065,7 @@ start:
                if (oldFlags & BM_TAG_VALID)
                {
                        /* Save old tag. */
-                       oldTag = buf->tag;
+                       oldEnt.key = buf->tag;
                }
 
                /*
@@ -1037,9 +1076,8 @@ start:
                 * tag.
                 */
 enter:
-               buf_id = BufTableInsert(&newTag, buf->buf_id);
-
-               if (buf_id >= 0)
+               newEnt.id = buf->buf_id;
+               if (!CHashInsert(SharedBufHash, &newEnt))
                {
                        BufferDesc *foundbuf;
 
@@ -1050,9 +1088,9 @@ enter:
                         * recheck the buffer tag after pinning it, because it could still
                         * get renamed under us.
                         */
-                       foundbuf = &BufferDescriptors[buf_id];
+                       foundbuf = &BufferDescriptors[newEnt.id];
                        valid = PinBuffer(foundbuf, strategy);
-                       if (!BUFFERTAGS_EQUAL(newTag, foundbuf->tag))
+                       if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
                        {
                                UnpinBuffer(foundbuf, true);
                                goto enter;
@@ -1104,7 +1142,8 @@ enter:
                        break;
 
                UnlockBufHdr(buf);
-               BufTableDelete(&newTag);
+               if (!CHashDelete(SharedBufHash, &newEnt.key))
+                       elog(ERROR, "shared buffer hash table corrupted");
                UnpinBuffer(buf, true);
        }
 
@@ -1116,7 +1155,7 @@ enter:
         * the old content is no longer relevant.  (The usage_count starts out at
         * 1 so that the buffer can survive one clock-sweep pass.)
         */
-       buf->tag = newTag;
+       buf->tag = newEnt.key;
        buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
        if (relpersistence == RELPERSISTENCE_PERMANENT)
                buf->flags |= BM_TAG_VALID | BM_PERMANENT;
@@ -1126,8 +1165,9 @@ enter:
 
        UnlockBufHdr(buf);
 
-       if (oldFlags & BM_TAG_VALID)
-               BufTableDelete(&oldTag);
+       if ((oldFlags & BM_TAG_VALID) != 0 &&
+               !CHashDelete(SharedBufHash, &oldEnt))
+               elog(ERROR, "shared buffer hash table corrupted");
 
        /*
         * Buffer contents are currently invalid.  Try to get the io_in_progress
@@ -1162,11 +1202,11 @@ enter:
 static void
 InvalidateBuffer(volatile BufferDesc *buf)
 {
-       BufferTag       oldTag;
+       BufferLookupEnt oldEnt;
        BufFlags        oldFlags;
 
        /* Save the original buffer tag before dropping the spinlock */
-       oldTag = buf->tag;
+       oldEnt.key = buf->tag;
 
        /*
         * We assume the only reason for it to be pinned is that someone else is
@@ -1187,7 +1227,7 @@ InvalidateBuffer(volatile BufferDesc *buf)
                LockBufHdr(buf);
 
                /* If it's changed while we were waiting for lock, do nothing */
-               if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
+               if (!BUFFERTAGS_EQUAL(buf->tag, oldEnt.key))
                {
                        UnlockBufHdr(buf);
                        return;
@@ -1208,8 +1248,9 @@ InvalidateBuffer(volatile BufferDesc *buf)
        /*
         * Remove the buffer from the lookup hashtable, if it was in there.
         */
-       if (oldFlags & BM_TAG_VALID)
-               BufTableDelete(&oldTag);
+       if ((oldFlags & BM_TAG_VALID) != 0 &&
+               !CHashDelete(SharedBufHash, &oldEnt))
+               elog(ERROR, "shared buffer hash table corrupted");
 
        /*
         * Insert the buffer at the head of the list of free buffers.
index 3add619b5da7117afee16f8c5e421cb1de78bb49..2410dfc27225da248d6d0c8dc7dd43572c656315 100644 (file)
@@ -432,7 +432,7 @@ StrategyShmemSize(void)
        Size            size = 0;
 
        /* size of lookup hash table ... see comment in StrategyInitialize */
-       size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+       size = add_size(size, BufMgrShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
 
        /* size of the shared replacement strategy control block */
        size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
@@ -462,7 +462,7 @@ StrategyInitialize(bool init)
         * happening in each partition concurrently, so we could need as many as
         * NBuffers + NUM_BUFFER_PARTITIONS entries.
         */
-       InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+       BufMgrInitShmem(NBuffers + NUM_BUFFER_PARTITIONS);
 
        /*
         * Get or create the shared strategy control block
index 4b1696cf778444f26316e844beb1f75a24bc91ad..b58af8845128a2f12adc106a76da725813714858 100644 (file)
@@ -182,14 +182,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
 
-/* buf_table.c */
-extern Size BufTableShmemSize(int size);
-extern void InitBufTable(int size);
-extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int     BufTableLookup(BufferTag *tagPtr);
-extern int     BufTableInsert(BufferTag *tagPtr, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr);
-
 /* localbuf.c */
 extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
                                        BlockNumber blockNum);
@@ -201,4 +193,8 @@ extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
 extern void AtEOXact_LocalBuffers(bool isCommit);
 
+/* bufmgr.c */
+extern Size BufMgrShmemSize(int size);
+extern void BufMgrInitShmem(int size);
+
 #endif   /* BUFMGR_INTERNALS_H */