WIP: Conveyor belt storage.
author Robert Haas <rhaas@postgresql.org>
Sun, 19 Sep 2021 23:58:12 +0000 (19:58 -0400)
committer Robert Haas <rhaas@postgresql.org>
Tue, 14 Dec 2021 20:12:19 +0000 (15:12 -0500)
There are still things missing here and probably quite a few bugs yet,
but the basics are mostly here and mostly working now.

For an overview of the ideas I have in mind, please check out
src/backend/access/conveyor/README

29 files changed:
src/backend/access/Makefile
src/backend/access/conveyor/Makefile [new file with mode: 0644]
src/backend/access/conveyor/README [new file with mode: 0644]
src/backend/access/conveyor/cbcache.c [new file with mode: 0644]
src/backend/access/conveyor/cbfsmpage.c [new file with mode: 0644]
src/backend/access/conveyor/cbindexpage.c [new file with mode: 0644]
src/backend/access/conveyor/cbmetapage.c [new file with mode: 0644]
src/backend/access/conveyor/cbmodify.c [new file with mode: 0644]
src/backend/access/conveyor/cbxlog.c [new file with mode: 0644]
src/backend/access/conveyor/conveyor.c [new file with mode: 0644]
src/backend/access/rmgrdesc/Makefile
src/backend/access/rmgrdesc/cbdesc.c [new file with mode: 0644]
src/backend/access/transam/rmgr.c
src/bin/pg_waldump/.gitignore
src/bin/pg_waldump/rmgrdesc.c
src/include/access/cbcache.h [new file with mode: 0644]
src/include/access/cbdefs.h [new file with mode: 0644]
src/include/access/cbfsmpage.h [new file with mode: 0644]
src/include/access/cbfsmpage_format.h [new file with mode: 0644]
src/include/access/cbindexpage.h [new file with mode: 0644]
src/include/access/cbindexpage_format.h [new file with mode: 0644]
src/include/access/cbmetapage.h [new file with mode: 0644]
src/include/access/cbmetapage_format.h [new file with mode: 0644]
src/include/access/cbmodify.h [new file with mode: 0644]
src/include/access/cbstorage.h [new file with mode: 0644]
src/include/access/cbxlog.h [new file with mode: 0644]
src/include/access/conveyor.h [new file with mode: 0644]
src/include/access/rmgrlist.h
src/tools/pgindent/typedefs.list

diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile
index 0880e0a8bbb63901164aef4a2de577db59c98b25..a840ae2e916b9955ebb474d4fb8cf16bd3fada17 100644 (file)
@@ -8,7 +8,7 @@ subdir = src/backend/access
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-SUBDIRS            = brin common gin gist hash heap index nbtree rmgrdesc spgist \
-                         table tablesample transam
+SUBDIRS            = brin common conveyor gin gist hash heap index nbtree rmgrdesc \
+                         spgist table tablesample transam
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/conveyor/Makefile b/src/backend/access/conveyor/Makefile
new file mode 100644 (file)
index 0000000..798f463
--- /dev/null
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/conveyor
+#
+# IDENTIFICATION
+#    src/backend/access/conveyor/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/conveyor
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+       cbcache.o \
+       cbfsmpage.o \
+       cbindexpage.o \
+       cbmetapage.o \
+       cbmodify.o \
+       cbxlog.o \
+       conveyor.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/conveyor/README b/src/backend/access/conveyor/README
new file mode 100644 (file)
index 0000000..109a758
--- /dev/null
@@ -0,0 +1,187 @@
+Conveyor Belt Storage
+=====================
+
+It's pretty common to want an append-only data store, but it's not usually
+practical to keep accumulating data forever without ever discarding any of
+it. What people most often want to do is periodically discard the oldest
+data, keeping all the newer stuff. Hence, imagine a conveyor belt. New data
+is continually added at one end of the conveyor belt, and eventually falls
+off the other end. Unlike a real conveyor belt, however, our conveyor belt
+is of variable length, and can grow and shrink to accommodate the amount of
+data that needs to be stored at any given time.
+
+Some other parts of PostgreSQL, for example xlog.c and slru.c, handle
+similar needs by using a series of files at the operating system level,
+adding new ones at the end and removing old ones. We want to use a standard
+relation fork, and so instead maintain a logical-to-physical page mapping.
+Callers allocate new logical page numbers (which are just consecutive 64-bit
+integers) and this module takes care of figuring out where to place them
+physically. When the oldest logical pages are thrown away, the blocks
+allocated to them can be reused for new logical pages.
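+
+To make that concrete, here is a sketch of how a caller might interact
+with this module. ConveyorBeltNewPage, ConveyorBeltReadBuffer, and
+ConveyorBeltLogicalTruncate are discussed later in this README; the exact
+signatures shown here are illustrative, not definitive:
+
+    /* Append: allocate the next logical page number and fill in the page. */
+    buffer = ConveyorBeltNewPage(cb, &pageno);
+    ... initialize the page, then mark the buffer dirty and log it ...
+
+    /* Read back later, using only the logical page number. */
+    buffer = ConveyorBeltReadBuffer(cb, pageno, BUFFER_LOCK_SHARE);
+
+    /* Eventually, discard all pages preceding some threshold. */
+    ConveyorBeltLogicalTruncate(cb, threshold_pageno);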
+
+Conceptually, a relation fork organized as a conveyor belt has three parts:
+
+- Payload. The payload is whatever data the user of this module wishes
+  to store. The conveyor belt doesn't care what you store in a payload page,
+  but it does require that you store something: each time a payload page is
+  initialized, it must end up with either pd_lower > SizeOfPageHeaderData,
+  or pd_upper < BLCKSZ.
+
+- Index. The index translates logical page numbers to physical block
+  numbers.  The intention is that pages might be physically relocated - e.g.
+  they could be moved to lower-numbered blocks to allow the relation to be
+  physically compacted - so external references to the data should use only
+  logical page numbers. The index is used to figure out which block
+  currently contains a given logical page number.
+
+- Freespace Map. The freespace map is used to decide where payload and
+  index data should be stored.
+
+Segments and the Freespace Map
+------------------------------
+
+Every page in a conveyor belt is either the metapage, or a freespace map
+page, or part of a segment. Segments can store either payload data or
+index data. Every segment in any particular conveyor belt contains
+the same number of pages. As old data is removed from the conveyor belt,
+segments get recycled.
+
+The allocation status of every segment is tracked by a single bit in the
+freespace map: 1 = allocated, 0 = free. The initial portion of the freespace
+map is stored in the metapage. When the relation has been extended far
+enough that there are no more bits available in the metapage to track the
+allocation status of further segments, a single freespace map page is
+inserted. That single freespace map page is then followed by additional
+segments. If we extend the relation far enough that the previous freespace
+map page has no more bits available to track the allocation status of
+further segments, then it's time to insert another freespace map page.
+
+This scheme allows us to compute the location of every freespace map page
+based on (1) the number of freespace bits in the metapage, (2) the
+number of freespace map bits in each freespace map page, and (3) the
+number of pages per segment. (1) and (2) are compile time constants, and
+(3) can't be changed after a conveyor belt is created, so once we have
+read the metapage for a conveyor belt once, we can compute the location
+of freespace map pages at will.
+
+It's also straightforward to compute the starting and ending block numbers
+for any given segment. We just have to be careful to account for the number
+of freespace map pages that precede that segment.
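+
+Concretely, the arithmetic looks roughly like this sketch, where
+"metapage fsm bits" and "fsm page fsm bits" stand for the compile-time
+constants described above (the real formulas live in cb_first_fsm_block,
+cb_fsm_block_spacing, and cb_segment_to_block):
+
+    /* Block 0 is the metapage. */
+    cb_first_fsm_block = 1 + (metapage fsm bits) * pages_per_segment
+    cb_fsm_block_spacing = 1 + (fsm page fsm bits) * pages_per_segment
+    block of Nth fsm page (N >= 1)
+        = cb_first_fsm_block + (N - 1) * cb_fsm_block_spacing
+
+    /* A segment's first block is offset by one block for the metapage
+       plus one for each freespace map page that precedes the segment. */
+    first block of segment S
+        = 1 + S * pages_per_segment + (number of fsm pages preceding S)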
+
+Payload Segments and the Index
+------------------------------
+
+As payload data is added to a conveyor belt, payload segments are allocated
+to store it. As old payload data is removed from a conveyor belt, the
+payload segments that contain it can be deallocated and eventually reused.
+
+Because space for payload data is allocated a segment at a time, the
+index just needs to keep track of (1) the segment numbers that contain
+payload data, in order of allocation and (2) the starting logical page
+number for the first such segment. If there's not much data on the conveyor
+belt, all of these segment numbers - which we call index entries - can be
+stored in the metapage itself.
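+
+Because each index entry covers exactly pages_per_segment consecutive
+logical pages, translating a logical page number is simple arithmetic.
+A sketch of what cb_metapage_find_logical_page and
+cb_indexpage_find_logical_page do:
+
+    entry = (pageno - first page covered by this chunk of the index)
+                / pages_per_segment
+    segno = index[entry]
+    block = (first block of segment segno) + (pageno % pages_per_segment)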
+
+If the metapage runs out of space to store index entries, then we move the
+oldest index entries that it presently contains into an index segment, and
+continue to insert new index entries into the metapage. The index segments
+themselves are organized into a singly linked list.
+
+As data is removed from a conveyor belt, we'll eventually reach a point
+where none of the index entries in a given index segment are needed any
+more. At that point, we can deallocate the index segment and reuse it.
+
+Note that nothing prevents a deallocated index segment from being reused
+as a payload segment, or the other way around.
+
+Removing Old Data
+-----------------
+
+From time to time, we expect to receive a request to discard old data,
+which will come in the form of a call to ConveyorBeltLogicalTruncate stating
+that all logical pages with page numbers less than some threshold value are
+no longer required. Thereafter, a subsequent call to ConveyorBeltVacuum
+may be used to free up any segments that are no longer required as a result
+of the increased logical truncation point. Finally, a call to
+ConveyorBeltPhysicalTruncate may be used to discard unused pages from the
+end of the relation.
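+
+Putting those together, a typical maintenance cycle is expected to look
+something like this (the signatures are illustrative):
+
+    ConveyorBeltLogicalTruncate(cb, oldest_keeper);
+    ConveyorBeltVacuum(cb);
+    ConveyorBeltPhysicalTruncate(cb);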
+
+ConveyorBeltLogicalTruncate just updates the metapage with a new value for
+the oldest logical page in existence. Once this has been done, attempts to
+access logical pages prior to the new threshold will be categorically
+refused.  We require a cleanup lock on the metapage for this step.  This
+allows a reader which has determined the location of a payload page to
+release all buffer locks, retaining just a pin on the metapage, before
+reading and locking the target page. Since the page can't be logically
+truncated away while the pin on the metapage is held, it also can't be
+recycled.
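+
+Stated as a reader-side protocol, the expected sequence is:
+
+1. Lock the metapage, translate the logical page number to a block number,
+   then release the lock while retaining the pin.
+2. Read and lock the target payload page and extract what is needed.
+3. Release the payload page lock and, finally, the metapage pin.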
+
+ConveyorBeltVacuum performs a multi-step process to recycle index and
+payload segments whose contents are no longer needed:
+
+1. If there are now one or more payload segments that contain no accessible
+   pages, it frees them up. To do this, it must first reinitialize each page of
+   each such segment, taking a cleanup lock on each page as it does so.
+   This guarantees that no other backend retains a pin on any such pages.
+   It should be impossible for any new locks or pins on these pages to be
+   taken at this point, because pages that have been logically truncated
+   can no longer be looked up via ConveyorBeltReadBuffer. It then clears
+   the index entry that points to the segment in question and simultaneously
+   marks it free in the freespace map.
+
+2. When all of the index entries in an index segment have been cleared,
+   the index segment itself can be freed. To do this, we first reinitialize
+   all the pages, and then update the metapage. The metapage update involves
+   changing metapage's notion of the oldest index segment and of the
+   logical page number where the index begins. Simultaneously, the segment
+   must be marked as free in the freespace map.
+
+   These metapage updates must be performed with a cleanup lock on the
+   metapage.  This allows a concurrent reader to lock the metapage, note the
+   location of the first index segment, release the lock on the metapage while
+   retaining the pin, and then go lock the pages in that index segment, or
+   successor index segments, without fearing that they will vanish.
+
+3. If index entries are cleared in the metapage itself, then any
+   remaining entries can be shifted to the start of the array of index
+   entries stored in the metapage, provided that we simultaneously
+   adjust the metapage's notion of the logical page number where the
+   index begins.
+
+Note that there's no correctness issue if ConveyorBeltVacuum is skipped or
+if it is interrupted before all the work that it could perform actually
+gets done. It doesn't do anything that can't be deferred until a later time;
+the only problem is that you might end up with a lot of bloat. That could
+be bad, but it's not a functional breakage.
+
+Note also that running multiple copies of ConveyorBeltVacuum on the same
+conveyor belt at the same time is a bad plan. They'll contend with each
+other trying to do the same work. Consider preventing this by some means
+(e.g. a self-exclusive heavyweight lock).
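+
+For example, something like the following would suffice; this is only a
+sketch, and ShareUpdateExclusiveLock is merely one convenient
+self-conflicting lock level:
+
+    LockRelation(rel, ShareUpdateExclusiveLock);
+    ConveyorBeltVacuum(cb);
+    UnlockRelation(rel, ShareUpdateExclusiveLock);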
+
+ConveyorBeltPhysicalTruncate can be used to return disk space to the
+operating system. This only works if the highest-numbered segments happen
+to be free, and it requires both a relation extension lock (since it would
+be bad if someone is trying to make the relation longer while we're trying
+to make it shorter) and a cleanup lock on the metapage (since
+ConveyorBeltNewPage can't cope with a concurrent truncation; it expects
+that the relation will only grow concurrently, not shrink).
+
+Buffer Lock Ordering
+--------------------
+
+Various parts of the code need to acquire multiple buffer locks
+simultaneously, and must do so in a consistent order to avoid deadlock. We
+use this ordering:
+
+1. Any new page that we're about to allocate.
+2. Any payload page.
+3. Any index page.
+4. Any FSM page.
+5. The metapage.
+
+It might seem strange to lock the metapage last, because we typically need
+to examine it before doing much of anything. However, after we examine it,
+we typically want to read other pages, and we don't want to retain the
+buffer lock while doing I/O.  Instead, we release the buffer lock and then
+reacquire it at the end, after we've got all the other locks we need.
diff --git a/src/backend/access/conveyor/cbcache.c b/src/backend/access/conveyor/cbcache.c
new file mode 100644 (file)
index 0000000..789fe40
--- /dev/null
@@ -0,0 +1,186 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbcache.c
+ *       Conveyor belt index segment location cache.
+ *
+ * The conveyor belt metapage stores the segment numbers of the oldest and
+ * newest index segments that currently exist, but the location of any
+ * other index segment can only be discovered by reading the first page
+ * of some index segment whose position is known and extracting from it
+ * the segment number of the next index segment. That's potentially
+ * expensive, especially if we have to traverse a whole bunch of index
+ * segments to figure out the location of the one we need, so we maintain
+ * a cache.
+ *
+ * The cache maps the oldest logical page number covered by an index
+ * segment to the segment number where that index segment is located.
+ * If older index segments are removed, the corresponding mappings become
+ * obsolete, but nobody should be accessing those pages anyway. Still,
+ * we're careful to purge such mappings to avoid wasting memory.
+ *
+ * If an index segment is moved, we invalidate the entire cache. This
+ * is expected to be fairly rare, as it should only happen if someone is
+ * trying to reduce the on-disk footprint of the conveyor belt. Moreover,
+ * if someone is doing that, it is likely that multiple index segments
+ * will be moved in relatively quick succession, so it's not clear that
+ * a more granular invalidation strategy would help anything.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/backend/access/conveyor/cbcache.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/cbcache.h"
+#include "common/hashfn.h"
+
+typedef struct cb_iseg_entry
+{
+       CBPageNo        index_segment_start;
+       CBSegNo         segno;
+       char            status;
+} cb_iseg_entry;
+
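+/*
+ * Instantiate a simplehash hash table named cb_iseg_hash, keyed by the
+ * first logical page number covered by an index segment and storing the
+ * matching segment number. SH_SCOPE of "static inline" keeps the
+ * generated functions private to this file.
+ */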
+#define SH_PREFIX cb_iseg
+#define SH_ELEMENT_TYPE cb_iseg_entry
+#define SH_KEY_TYPE CBPageNo
+#define SH_KEY index_segment_start
+#define SH_HASH_KEY(tb, key) tag_hash(&key, sizeof(CBPageNo))
+#define SH_EQUAL(tb, a, b) ((a) == (b))
+#define SH_SCOPE static inline
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+struct CBCache
+{
+       uint64          index_segments_moved;
+       CBPageNo        oldest_possible_start;
+       cb_iseg_hash *iseg;
+};
+
+/*
+ * Create a new cache.
+ */
+CBCache *
+cb_cache_create(MemoryContext mcxt, uint64 index_segments_moved)
+{
+       CBCache    *cache = MemoryContextAlloc(mcxt, sizeof(CBCache));
+
+       cache->index_segments_moved = index_segments_moved;
+       cache->oldest_possible_start = 0;
+       cache->iseg = cb_iseg_create(mcxt, 128, NULL);
+
+       return cache;
+}
+
+/*
+ * Invalidate cache entries as required.
+ *
+ * If index_segments_moved has changed, throw away everything we think we
+ * know. Otherwise, if index_start has advanced, throw away any entries that
+ * precede the new start.
+ */
+void
+cb_cache_invalidate(CBCache *cache, CBPageNo index_start,
+                                       uint64 index_segments_moved)
+{
+       if (index_segments_moved != cache->index_segments_moved)
+       {
+               cb_iseg_reset(cache->iseg);
+               cache->index_segments_moved = index_segments_moved;
+       }
+       else if (index_start > cache->oldest_possible_start)
+       {
+               cb_iseg_iterator it;
+               cb_iseg_entry *entry;
+
+               cb_iseg_start_iterate(cache->iseg, &it);
+               while ((entry = cb_iseg_iterate(cache->iseg, &it)) != NULL)
+                       if (entry->index_segment_start < index_start)
+                               cb_iseg_delete_item(cache->iseg, entry);
+       }
+}
+
+/*
+ * Search the cache for an index segment number, given the first logical page
+ * number covered by that index segment.
+ *
+ * It is the caller's responsibility to make sure that pageno is the first
+ * logical page number covered by some index segment, rather than any random
+ * page number whose index entry might be anywhere in the segment. We don't
+ * have enough information here to verify this, and just trust that the caller
+ * knows what they are doing.
+ */
+CBSegNo
+cb_cache_lookup(CBCache *cache, CBPageNo pageno)
+{
+       cb_iseg_entry *entry;
+
+       entry = cb_iseg_lookup(cache->iseg, pageno);
+       return entry != NULL ? entry->segno : CB_INVALID_SEGMENT;
+}
+
+/*
+ * Search the cache for an index segment that precedes the one for which we
+ * are searching by as little as possible.
+ *
+ * As with cb_cache_lookup, pageno should be the first logical page of the
+ * index segment in which the caller is interested, although unlike that
+ * function, this function would still work correctly if it were an arbitrary
+ * page number, at least as presently implemented.
+ *
+ * If no segment with a starting segment number preceding pageno is found
+ * in cache, the return value is CB_INVALID_SEGMENT and *index_segment_start
+ * is set to CB_INVALID_LOGICAL_PAGE. Otherwise, the return value is the
+ * segment number we found and *index_segment_start is set to the starting
+ * logical page number of that segment.
+ */
+CBSegNo
+cb_cache_fuzzy_lookup(CBCache *cache, CBPageNo pageno,
+                                         CBPageNo *index_segment_start)
+{
+       cb_iseg_iterator it;
+       cb_iseg_entry *current;
+       cb_iseg_entry *best = NULL;
+
+       cb_iseg_start_iterate(cache->iseg, &it);
+       while ((current = cb_iseg_iterate(cache->iseg, &it)) != NULL)
+       {
+               if (current->index_segment_start > pageno)
+                       continue;
+               if (best == NULL ||
+                       best->index_segment_start < current->index_segment_start)
+                       best = current;
+       }
+
+       if (best == NULL)
+       {
+               *index_segment_start = CB_INVALID_LOGICAL_PAGE;
+               return CB_INVALID_SEGMENT;
+       }
+
+       *index_segment_start = best->index_segment_start;
+       return best->segno;
+}
+
+/*
+ * Insert a cache entry.
+ *
+ * As in cb_cache_lookup, it's critical that index_segment_start is the first
+ * logical page number covered by the index segment.
+ */
+void
+cb_cache_insert(CBCache *cache, CBSegNo segno, CBPageNo index_segment_start)
+{
+       cb_iseg_entry *entry;
+       bool            found;
+
+       entry = cb_iseg_insert(cache->iseg, index_segment_start, &found);
+       Assert(!found);
+       entry->segno = segno;
+
+       Assert(index_segment_start >= cache->oldest_possible_start);
+}
diff --git a/src/backend/access/conveyor/cbfsmpage.c b/src/backend/access/conveyor/cbfsmpage.c
new file mode 100644 (file)
index 0000000..65b8f25
--- /dev/null
@@ -0,0 +1,152 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbfsmpage.c
+ *       APIs for accessing conveyor belt FSM pages.
+ *
+ * Similar to cbmetapage.c, this file abstracts accesses to conveyor
+ * belt FSM pages, and should be the only backend code that understands
+ * their internal structure.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/backend/access/conveyor/cbfsmpage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/cbfsmpage.h"
+#include "access/cbfsmpage_format.h"
+#include "access/cbmetapage.h"
+
+static CBFSMPageData *cb_fsmpage_get_special(Page page);
+
+/*
+ * Initialize FSM page.
+ *
+ * Returns the first segment number that will be covered by the new page.
+ */
+CBSegNo
+cb_fsmpage_initialize(Page page, BlockNumber blkno, uint16 pages_per_segment)
+{
+       CBFSMPageData *fsmp;
+       BlockNumber     first_fsm_block = cb_first_fsm_block(pages_per_segment);
+       unsigned        fsm_block_spacing = cb_fsm_block_spacing(pages_per_segment);
+
+       /* Sanity checks. */
+       Assert(blkno >= first_fsm_block);
+       Assert((blkno - first_fsm_block) % fsm_block_spacing == 0);
+
+       /* Initialize page. PageInit will zero the payload bits for us. */
+       PageInit(page, BLCKSZ, sizeof(CBFSMPageData));
+       fsmp = (CBFSMPageData *) PageGetSpecialPointer(page);
+       fsmp->cbfsm_magic = CB_FSMPAGE_MAGIC;
+       fsmp->cbfsm_start = cb_first_segment_for_fsm_page(blkno, pages_per_segment);
+
+       return fsmp->cbfsm_start;
+}
+
+/*
+ * Get the allocation status of a segment from an FSM page.
+ */
+bool
+cb_fsmpage_get_fsm_bit(Page page, CBSegNo segno)
+{
+       CBFSMPageData *fsmp = cb_fsmpage_get_special(page);
+       uint8           byte;
+       uint8           mask;
+       uint32          bitno;
+
+       if (segno < fsmp->cbfsm_start ||
+               segno >= fsmp->cbfsm_start + CB_FSM_SEGMENTS_PER_FSMPAGE)
+               elog(ERROR, "segment %u out of range for fsm page starting at segment %u",
+                        segno, fsmp->cbfsm_start);
+
+       bitno = segno - fsmp->cbfsm_start;
+       byte = fsmp->cbfsm_state[bitno / BITS_PER_BYTE];
+       mask = 1 << (bitno % BITS_PER_BYTE);
+       return (byte & mask) != 0;
+}
+
+/*
+ * Set the allocation status of a segment in an FSM page.
+ *
+ * new_state should be true if the bit is currently clear and should be set,
+ * and false if the bit is currently set and should be cleared. Don't call
+ * this unless you know that the bit actually needs to be changed.
+ */
+void
+cb_fsmpage_set_fsm_bit(Page page, CBSegNo segno, bool new_state)
+{
+       CBFSMPageData *fsmp = cb_fsmpage_get_special(page);
+       uint8      *byte;
+       uint8           mask;
+       uint8           old_state;
+       uint32          bitno;
+
+       if (segno < fsmp->cbfsm_start ||
+               segno >= fsmp->cbfsm_start + CB_FSM_SEGMENTS_PER_FSMPAGE)
+               elog(ERROR, "segment %u out of range for fsm page starting at segment %u",
+                        segno, fsmp->cbfsm_start);
+
+       bitno = segno - fsmp->cbfsm_start;
+       byte = &fsmp->cbfsm_state[bitno / BITS_PER_BYTE];
+       mask = 1 << (bitno % BITS_PER_BYTE);
+       old_state = (*byte & mask) != 0;
+
+       if (old_state == new_state)
+               elog(ERROR, "fsm bit for segment %u already has value %d",
+                        segno, old_state ? 1 : 0);
+
+       if (new_state)
+               *byte |= mask;
+       else
+               *byte &= ~mask;
+}
+
+/*
+ * Returns the lowest unused segment number covered by the supplied FSM page,
+ * or CB_INVALID_SEGMENT if none.
+ */
+CBSegNo
+cbfsmpage_find_free_segment(Page page)
+{
+       CBFSMPageData *fsmp = cb_fsmpage_get_special(page);
+       unsigned        i;
+       unsigned        j;
+
+       StaticAssertStmt(CB_FSMPAGE_FREESPACE_BYTES % sizeof(uint64) == 0,
+                                        "CB_FSMPAGE_FREESPACE_BYTES should be a multiple of 8");
+
+       for (i = 0; i < CB_FSMPAGE_FREESPACE_BYTES; ++i)
+       {
+               uint8   b = fsmp->cbfsm_state[i];
+
+               if (b == 0xFF)
+                       continue;
+
+               for (j = 0; j < BITS_PER_BYTE; ++j)
+               {
+                       if ((b & (1 << j)) == 0)
+                               return fsmp->cbfsm_start + (i * BITS_PER_BYTE) + j;
+               }
+       }
+
+       return CB_INVALID_SEGMENT;
+}
+
+/*
+ * Given a page that is known to be a conveyor belt free space map page,
+ * return a pointer to the CBFSMPageData, after checking the magic number.
+ */
+static CBFSMPageData *
+cb_fsmpage_get_special(Page page)
+{
+       CBFSMPageData *fsmp = (CBFSMPageData *) PageGetSpecialPointer(page);
+
+       if (fsmp->cbfsm_magic != CB_FSMPAGE_MAGIC)
+               elog(ERROR, "bad magic number in conveyor belt fsm page: %08X",
+                        fsmp->cbfsm_magic);
+
+       return fsmp;
+}
diff --git a/src/backend/access/conveyor/cbindexpage.c b/src/backend/access/conveyor/cbindexpage.c
new file mode 100644 (file)
index 0000000..99ad141
--- /dev/null
@@ -0,0 +1,189 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbindexpage.c
+ *       APIs for accessing conveyor belt index pages.
+ *
+ * Similar to cbmetapage.c, this file abstracts accesses to conveyor
+ * belt index pages, and should be the only backend code that understands
+ * their internal structure.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/backend/access/conveyor/cbindexpage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/cbfsmpage.h"
+#include "access/cbindexpage.h"
+#include "access/cbindexpage_format.h"
+
+static CBIndexPageData *cb_indexpage_get_special(Page page);
+
+/*
+ * Initialize an index page.
+ *
+ * If this is the first page in a new index segment, it has to be the newest
+ * segment, so there's no next segment yet. And there's never a next segment
+ * for a page that is not the first one in the segment.
+ */
+void
+cb_indexpage_initialize(Page page, CBPageNo pageno)
+{
+       CBIndexPageData *ipd;
+       int             i;
+
+       PageInit(page, BLCKSZ, sizeof(CBIndexPageData));
+       ipd = (CBIndexPageData *) PageGetSpecialPointer(page);
+       ipd->cbidx_magic = CB_INDEXPAGE_MAGIC;
+       ipd->cbidx_next_segment = CB_INVALID_SEGMENT;
+       ipd->cbidx_first_page = pageno;
+
+       for (i = 0; i < CB_INDEXPAGE_INDEX_ENTRIES; ++i)
+               ipd->cbidx_entry[i] = CB_INVALID_SEGMENT;
+}
+
+/*
+ * Figure out where a certain logical page is physically located.
+ *
+ * It is the caller's responsibility to supply the correct index page.
+ */
+BlockNumber
+cb_indexpage_find_logical_page(Page page, CBPageNo pageno,
+                                                          uint16 pages_per_segment)
+{
+       CBIndexPageData *ipd = cb_indexpage_get_special(page);
+       unsigned        offset;
+       CBSegNo         segno;
+
+       if (pageno < ipd->cbidx_first_page)
+               elog(ERROR, "can't find index entry for page " UINT64_FORMAT " on an index page that starts at page " UINT64_FORMAT,
+                        pageno, ipd->cbidx_first_page);
+       offset = (pageno - ipd->cbidx_first_page) / pages_per_segment;
+       if (offset >= CB_INDEXPAGE_INDEX_ENTRIES)
+               elog(ERROR, "can't find index entry for page " UINT64_FORMAT " on an index page that starts at page " UINT64_FORMAT,
+                        pageno, ipd->cbidx_first_page);
+       segno = ipd->cbidx_entry[offset];
+       if (segno == CB_INVALID_SEGMENT)
+               elog(ERROR, "no index entry for page " INT64_FORMAT, pageno);
+
+       return cb_segment_to_block(pages_per_segment, segno,
+                                                          pageno % pages_per_segment);
+}
+
+/*
+ * Add index entries for logical pages beginning at 'pageno'.
+ *
+ * It is the caller's responsibility to supply the correct index page, and
+ * to make sure that there is enough room for the entries to be added.
+ */
+void
+cb_indexpage_add_index_entries(Page page,
+                                                          unsigned pageoffset,
+                                                          unsigned num_index_entries,
+                                                          CBSegNo *index_entries)
+{
+       CBIndexPageData *ipd = cb_indexpage_get_special(page);
+
+       if (num_index_entries < 1 || num_index_entries > CB_INDEXPAGE_INDEX_ENTRIES)
+               elog(ERROR, "can't add %u index entries to an index page",
+                        num_index_entries);
+       if (pageoffset + num_index_entries > CB_INDEXPAGE_INDEX_ENTRIES)
+               elog(ERROR, "can't place %u index entries starting at offset %u",
+                        num_index_entries, pageoffset);
+
+       memcpy(&ipd->cbidx_entry[pageoffset], index_entries,
+                  num_index_entries * sizeof(CBSegNo));
+}
+
+/*
+ * Get an obsolete index entry from the given page.
+ *
+ * Starts searching for an index entry at the offset given by *pageoffset,
+ * and updates *pageoffset to the offset at which an entry was found, or to
+ * CB_INDEXPAGE_INDEX_ENTRIES if no entry is found.
+ *
+ * Sets *first_pageno to the first logical page covered by this index page.
+ *
+ * Returns the segment number to which the obsolete index entry points,
+ * or CB_INVALID_SEGMENT if no such entry is found.
+ */
+CBSegNo
+cb_indexpage_get_obsolete_entry(Page page, unsigned *pageoffset,
+                                                               CBPageNo *first_pageno)
+{
+       CBIndexPageData *ipd = cb_indexpage_get_special(page);
+
+       *first_pageno = ipd->cbidx_first_page;
+
+       while (*pageoffset < CB_INDEXPAGE_INDEX_ENTRIES &&
+                  ipd->cbidx_entry[*pageoffset] == CB_INVALID_SEGMENT)
+               ++*pageoffset;
+
+       if (*pageoffset >= CB_INDEXPAGE_INDEX_ENTRIES)
+               return CB_INVALID_SEGMENT;
+
+       return ipd->cbidx_entry[*pageoffset];
+}
+
+/*
+ * Clear the obsolete index entry for the given segment from the given page
+ * offset.
+ */
+void
+cb_indexpage_clear_obsolete_entry(Page page,
+                                                                 CBSegNo segno,
+                                                                 unsigned pageoffset)
+{
+       CBIndexPageData *ipd = cb_indexpage_get_special(page);
+
+       if (pageoffset >= CB_INDEXPAGE_INDEX_ENTRIES)
+               elog(ERROR, "page offset %u out of range", pageoffset);
+       if (ipd->cbidx_entry[pageoffset] != segno)
+               elog(ERROR, "while clearing index entry %u, found %u where %u was expected",
+                        pageoffset, ipd->cbidx_entry[pageoffset], segno);
+
+       ipd->cbidx_entry[pageoffset] = CB_INVALID_SEGMENT;
+}
+
+/*
+ * Set the next index segment.
+ *
+ * This should only be used on the first page of an index segment, since
+ * that's where the next segment number is stored.
+ */
+void
+cb_indexpage_set_next_segment(Page page, CBSegNo segno)
+{
+       CBIndexPageData *ipd = cb_indexpage_get_special(page);
+
+       ipd->cbidx_next_segment = segno;
+}
+
+/*
+ * Get the next index segment.
+ *
+ * This should only be used on the first page of an index segment, since
+ * that's where the next segment number is stored.
+ */
+CBSegNo
+cb_indexpage_get_next_segment(Page page)
+{
+       CBIndexPageData *ipd = cb_indexpage_get_special(page);
+
+       return ipd->cbidx_next_segment;
+}
+
+/*
+ * Given a page that is known to be a conveyor belt index page, return a
+ * pointer to the CBIndexPageData, after checking the magic number.
+ */
+static CBIndexPageData *
+cb_indexpage_get_special(Page page)
+{
+       CBIndexPageData *ipd = (CBIndexPageData *) PageGetSpecialPointer(page);
+
+       if (ipd->cbidx_magic != CB_INDEXPAGE_MAGIC)
+               elog(ERROR, "bad magic number in conveyor belt index page: %08X",
+                        ipd->cbidx_magic);
+
+       return ipd;
+}
diff --git a/src/backend/access/conveyor/cbmetapage.c b/src/backend/access/conveyor/cbmetapage.c
new file mode 100644 (file)
index 0000000..15021e0
--- /dev/null
@@ -0,0 +1,721 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbmetapage.c
+ *       APIs for accessing conveyor belt metapages.
+ *
+ * The goal of this file is to provide a set of functions that can be
+ * used to perform all necessary access to or modification of a conveyor
+ * belt metapage. The functions in this file should be the only backend
+ * code that knows about the actual organization of CBMetapageData,
+ * but they shouldn't know about the internals of other types of pages
+ * (like index segment or freespace map pages) nor should they know
+ * about buffers or locking.
+ *
+ * Much - but not all - of the work done here is sanity checking. We
+ * do this partly to catch bugs, and partly as a defense against the
+ * possibility that the metapage is corrupted on disk. Because of the
+ * latter possibility, most of these checks use an elog(ERROR) rather
+ * than just Assert.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/backend/access/conveyor/cbmetapage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/cbfsmpage.h"
+#include "access/cbindexpage.h"
+#include "access/cbmetapage.h"
+#include "access/cbmetapage_format.h"
+
+/*
+ * Initialize metapage.
+ */
+void
+cb_metapage_initialize(Page page, uint16 pages_per_segment)
+{
+       CBMetapageData *meta;
+       int                     i;
+
+       PageInit(page, BLCKSZ, sizeof(CBMetapageData));
+       meta = (CBMetapageData *) PageGetSpecialPointer(page);
+       meta->cbm_magic = CB_METAPAGE_MAGIC;
+       meta->cbm_version = CBM_VERSION;
+       meta->cbm_pages_per_segment = pages_per_segment;
+
+       /*
+        * PageInit has already zeroed the page, so we only need to initialize any
+        * fields that need to be non-zero. Everything of type CBPageNo and all of
+        * the freespace map should start out as 0, but most of the fields of
+        * CBSegNo fields need to be set to CB_INVALID_SEGMENT.
+        */
+       meta->cbm_oldest_index_segment = CB_INVALID_SEGMENT;
+       meta->cbm_newest_index_segment = CB_INVALID_SEGMENT;
+       for (i = 0; i < CB_METAPAGE_INDEX_ENTRIES; ++i)
+               meta->cbm_index[i] = CB_INVALID_SEGMENT;
+}
+
+/*
+ * Given a page that is known to be a conveyor belt metapage, return a
+ * pointer to the CBMetapageData.
+ *
+ * We take the opportunity to perform some basic sanity checks here.
+ */
+CBMetapageData *
+cb_metapage_get_special(Page page)
+{
+       CBMetapageData *meta = (CBMetapageData *) PageGetSpecialPointer(page);
+
+       if (meta->cbm_magic != CB_METAPAGE_MAGIC)
+               elog(ERROR, "bad magic number in conveyor belt metapage: %08X",
+                        meta->cbm_magic);
+       if (meta->cbm_version != CBM_VERSION)
+               elog(ERROR, "bad version in conveyor belt metapage: %08X",
+                        meta->cbm_version);
+       if (meta->cbm_pages_per_segment == 0)
+               elog(ERROR, "conveyor belt may not have zero pages per segment");
+
+       return meta;
+}
+
+/*
+ * Deduce what we can about the physical location of a logical page.
+ *
+ * If the logical page precedes the logical truncation point, or has not
+ * yet been allocated, returns false. Otherwise, returns true.
+ *
+ * If the physical location of the block can be computed based on the data
+ * in the metapage, sets *blkno to the appropriate block number. Otherwise,
+ * sets *blkno to InvalidBlockNumber.
+ */
+bool
+cb_metapage_find_logical_page(CBMetapageData *meta,
+                                                         CBPageNo pageno,
+                                                         BlockNumber *blkno)
+{
+       CBPageNo        relp;
+       CBSegNo         segno;
+       unsigned        segoff;
+
+       /* Physical location unknown, unless we later discover otherwise. */
+       *blkno = InvalidBlockNumber;
+
+       /* Is it too old to be accessible? */
+       if (pageno < meta->cbm_oldest_logical_page)
+               return false;
+
+       /* Is it too old to have an index entry in the metapage? */
+       if (pageno < meta->cbm_index_metapage_start)
+       {
+               /* Index entry exists, but not on metapage. */
+               return true;
+       }
+
+       /* Is it too new to have an index entry? */
+       relp = pageno - meta->cbm_index_metapage_start;
+       if (relp >= CB_METAPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment)
+               return false;
+
+       /* Index entry must be in the metapage, if it exists at all. */
+       segno = meta->cbm_index[relp / meta->cbm_pages_per_segment];
+       segoff = relp % meta->cbm_pages_per_segment;
+       if (segno == CB_INVALID_SEGMENT)
+               return false;
+
+       /* Location identified! */
+       *blkno = cb_segment_to_block(meta->cbm_pages_per_segment, segno, segoff);
+       return true;
+}
+
+/*
+ * Tell the caller what needs to be done to insert a page.
+ *
+ * Regardless of the return value, *next_pageno and *next_segno will be
+ * set to the lowest-numbered logical page that is not allocated and the
+ * lowest segment number that is not allocated, respectively. In addition,
+ * *index_start will be set to the first logical page number covered by the
+ * index, *index_metapage_start to the first logical page number covered by
+ * the metapage portion of the index, and *newest_index_segment to the segment
+ * number of the newest index segment, or CB_INVALID_SEGMENT if there is none.
+ *
+ * If the return value is CBM_INSERT_OK, *blkno will be set to the block number
+ * of the first unused page in the unfilled payload segment.
+ *
+ * If the return value is CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED, *blkno
+ * will be set to the first not-entirely-filled page in the newest index
+ * segment.
+ */
+CBMInsertState
+cb_metapage_get_insert_state(CBMetapageData *meta,
+                                                        BlockNumber *blkno,
+                                                        CBPageNo *next_pageno,
+                                                        CBSegNo *next_segno,
+                                                        CBPageNo *index_start,
+                                                        CBPageNo *index_metapage_start,
+                                                        CBSegNo *newest_index_segment)
+{
+       CBPageNo        relp;
+       CBSegNo         segno;
+       unsigned        segoff;
+
+       /* Set the values that we return unconditionally. */
+       *next_pageno = meta->cbm_next_logical_page;
+       *next_segno = meta->cbm_next_segment;
+       *index_start = meta->cbm_index_start;
+       *index_metapage_start = meta->cbm_index_metapage_start;
+       *newest_index_segment = meta->cbm_newest_index_segment;
+
+       /* Compute next logical page number relative to start of metapage. */
+       relp = meta->cbm_next_logical_page - meta->cbm_index_metapage_start;
+
+       /*
+        * If the next logical page number doesn't fit on the metapage, we need to
+        * make space by relocating some index entries to an index segment.
+        *
+        * Potentially, we could instead clean out some index entries from the
+        * metapage that now precede the logical truncation point, but that would
+        * require a cleanup lock on the metapage, and it normally isn't going to
+        * be possible, because typically the last truncate operation will have
+        * afterwards done any such work that is possible. We might miss an
+        * opportunity in the case where the last truncate operation didn't clean
+        * up fully, but hopefully that's rare enough that we don't need to stress
+        * about it.
+        *
+        * If the newest index segment is already full, then a new index segment
+        * will need to be created. Otherwise, some entries can be copied into the
+        * existing index segment.
+        */
+       if (relp >= CB_METAPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment)
+       {
+               unsigned        entries;
+               unsigned        maxentries;
+
+               entries = meta->cbm_entries_in_newest_index_segment;
+               maxentries = CB_INDEXPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment;
+
+               if (entries > maxentries)
+                       elog(ERROR,
+                                "newest index segment listed as using %u of %u entries",
+                                entries, maxentries);
+               else if (entries == maxentries ||
+                                meta->cbm_newest_index_segment == CB_INVALID_SEGMENT)
+                       return CBM_INSERT_NEEDS_INDEX_SEGMENT;
+               else
+               {
+                       /* Figure out which block should be targeted. */
+                       *blkno = cb_segment_to_block(meta->cbm_pages_per_segment,
+                                                                                meta->cbm_newest_index_segment,
+                                                                                entries / CB_INDEXPAGE_INDEX_ENTRIES);
+
+                       return CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED;
+               }
+       }
+
+       /* Compute current insertion segment and offset. */
+       segno = meta->cbm_index[relp / meta->cbm_pages_per_segment];
+       segoff = meta->cbm_next_logical_page % meta->cbm_pages_per_segment;
+
+       /*
+        * If the next logical page number would be covered by an index entry that
+        * does not yet exist, we need a new payload segment.
+        */
+       if (segno == CB_INVALID_SEGMENT)
+               return CBM_INSERT_NEEDS_PAYLOAD_SEGMENT;
+
+       /* Looks like we can go ahead and insert a page. Hooray! */
+       *blkno = cb_segment_to_block(meta->cbm_pages_per_segment, segno, segoff);
+       return CBM_INSERT_OK;
+}
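+
+/*
+ * A hypothetical caller-side sketch (the real insertion loop belongs in
+ * conveyor.c): examine the returned state, perform the indicated work, and
+ * then recheck, since the metapage can change whenever it is unlocked.
+ *
+ *             switch (cb_metapage_get_insert_state(meta, &blkno, ...))
+ *             {
+ *                     case CBM_INSERT_OK:
+ *                             ... place the new payload page at blkno ...
+ *                     case CBM_INSERT_NEEDS_PAYLOAD_SEGMENT:
+ *                             ... allocate a payload segment, add an index entry ...
+ *                     case CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED:
+ *                             ... copy old metapage index entries to the page at blkno ...
+ *                     case CBM_INSERT_NEEDS_INDEX_SEGMENT:
+ *                             ... allocate and link a new index segment first ...
+ *             }
+ */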
+
+/*
+ * Advance the next logical page number for this conveyor belt by one.
+ *
+ * We require the caller to specify the physical block number where the new
+ * block was placed. This allows us to perform some sanity-checking.
+ */
+void
+cb_metapage_advance_next_logical_page(CBMetapageData *meta,
+                                                                         BlockNumber blkno)
+{
+       BlockNumber expected_blkno;
+       CBPageNo        dummy_pageno;
+       CBSegNo         dummy_segno;
+
+       /* Perform sanity checks. */
+       if (cb_metapage_get_insert_state(meta, &expected_blkno, &dummy_pageno,
+                                                                        &dummy_segno, &dummy_pageno,
+                                                                        &dummy_pageno, &dummy_segno)
+               != CBM_INSERT_OK)
+               elog(ERROR, "no active insertion segment");
+       if (blkno != expected_blkno)
+               elog(ERROR, "new page is at block %u but expected block %u",
+                        blkno, expected_blkno);
+
+       /* Do the real work. */
+       meta->cbm_next_logical_page++;
+}
+
+/*
+ * Advance the oldest logical page number for this conveyor belt.
+ */
+void
+cb_metapage_advance_oldest_logical_page(CBMetapageData *meta,
+                                                                               CBPageNo oldest_logical_page)
+{
+       /*
+        * Something must be desperately wrong if an effort is ever made to set
+        * the value backwards or even to the existing value. Higher-level code
+        * can choose to do nothing in such cases rather than rejecting them, but
+        * this function should only get called when we're committed to dirtying
+        * the page and (if required) writing WAL.
+        */
+       if (meta->cbm_oldest_logical_page >= oldest_logical_page)
+               elog(ERROR, "oldest logical page is already " UINT64_FORMAT " so can't be set to " UINT64_FORMAT,
+                        meta->cbm_oldest_logical_page, oldest_logical_page);
+
+       /* Do the real work. */
+       meta->cbm_oldest_logical_page = oldest_logical_page;
+}
+
+/*
+ * Get the oldest and next logical page numbers for this conveyor belt.
+ */
+void
+cb_metapage_get_bounds(CBMetapageData *meta, CBPageNo *oldest_logical_page,
+                                          CBPageNo *next_logical_page)
+{
+       *oldest_logical_page = meta->cbm_oldest_logical_page;
+       *next_logical_page = meta->cbm_next_logical_page;
+}
+
+/*
+ * Compute the number of index entries that are used in the metapage.
+ *
+ * For our purposes here, an index entry isn't used unless there are some
+ * logical pages associated with it. It's possible that the real number
+ * of index entries is one higher than the value we return, but if so,
+ * no pages have been allocated from the final segment just yet.
+ *
+ * The reason this is OK is that the intended purpose of this function is
+ * to figure out where a new index entry ought to be put, and we shouldn't
+ * be putting a new index entry into the page at all unless all of the
+ * existing entries point to segments that are completely full. If we
+ * needed to know how many entries had been filled in, whether or not any
+ * of the associated storage was in use, we could do that by adding 1 to
+ * the value computed here if the entry at that offset is already
+ * initialized.
+ */
+int
+cb_metapage_get_index_entries_used(CBMetapageData *meta)
+{
+       CBPageNo        relp;
+
+       /*
+        * Compute next logical page number relative to start of metapage.
+        *
+        * NB: The number of index entries could be equal to the number that will
+        * fit on the page, but it cannot be more.
+        */
+       relp = meta->cbm_next_logical_page - meta->cbm_index_metapage_start;
+       if (relp > CB_METAPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment)
+               elog(ERROR,
+                        "next logical page " UINT64_FORMAT " not in metapage index starting at " UINT64_FORMAT,
+                        meta->cbm_next_logical_page, meta->cbm_index_metapage_start);
+
+       /* Now we can calculate the answer. */
+       return relp / meta->cbm_pages_per_segment;
+}
+
+/*
+ * Add a new index entry to the metapage.
+ */
+void
+cb_metapage_add_index_entry(CBMetapageData *meta, CBSegNo segno)
+{
+       int                     offset = cb_metapage_get_index_entries_used(meta);
+
+       /* Sanity checks. */
+       if (offset >= CB_METAPAGE_INDEX_ENTRIES)
+               elog(ERROR, "no space for index entries remains on metapage");
+       if (meta->cbm_index[offset] != CB_INVALID_SEGMENT)
+               elog(ERROR, "index entry at offset %d unexpectedly in use for segment %u",
+                        offset, meta->cbm_index[offset]);
+
+       /* Add the entry. */
+       meta->cbm_index[offset] = segno;
+}
+
+/*
+ * Remove index entries from the metapage.
+ *
+ * This needs to be done in two cases. First, it might be that the whole
+ * index is in the metapage and that we're just trimming away some unused
+ * entries. In that case, pass relocating = false. Second, it might be that
+ * we're relocating index entries from the metapage to an index segment to
+ * make more space in the metapage. In that case, pass relocating = true.
+ */
+void
+cb_metapage_remove_index_entries(CBMetapageData *meta, unsigned count,
+                                                                bool relocating)
+{
+       int                     used = cb_metapage_get_index_entries_used(meta);
+       int                     offset;
+
+       /* This shouldn't be called unless there is some real work to do. */
+       Assert(count > 0);
+
+       /* Sanity checks. */
+       if (count > (unsigned) used)
+               elog(ERROR,
+                        "can't remove %u entries from a page containing only %d entries",
+                        count, used);
+       if (!relocating &&
+               (meta->cbm_oldest_index_segment != CB_INVALID_SEGMENT ||
+                meta->cbm_newest_index_segment != CB_INVALID_SEGMENT ||
+                meta->cbm_index_start != meta->cbm_index_metapage_start))
+               elog(ERROR, "removed index entries should be relocated if index segments exist");
+       if (relocating &&
+               (meta->cbm_oldest_index_segment == CB_INVALID_SEGMENT ||
+                meta->cbm_newest_index_segment == CB_INVALID_SEGMENT))
+               elog(ERROR, "removed index entries can't be relocated if no index segments exist");
+
+       /* Move any entries that we are keeping. */
+       if (count < used)
+               memmove(&meta->cbm_index[0], &meta->cbm_index[count],
+                               sizeof(CBSegNo) * (used - count));
+
+       /* Zap the entries that were formerly in use and are no longer. */
+       for (offset = used - count; offset < used; ++offset)
+               meta->cbm_index[offset] = CB_INVALID_SEGMENT;
+
+       /*
+        * Adjust meta->cbm_index_metapage_start to compensate for the index
+        * entries that we just removed.
+        */
+       meta->cbm_index_metapage_start +=
+               count * meta->cbm_pages_per_segment;
+       if (relocating)
+               meta->cbm_entries_in_newest_index_segment += count;
+       else
+               meta->cbm_index_start = meta->cbm_index_metapage_start;
+}
+
+/*
+ * Copy the indicated number of index entries out of the metapage.
+ */
+void
+cb_metapage_get_index_entries(CBMetapageData *meta, unsigned num_index_entries,
+                                                         CBSegNo *index_entries)
+{
+       Assert(num_index_entries <= cb_metapage_get_index_entries_used(meta));
+
+       memcpy(index_entries, meta->cbm_index,
+                  num_index_entries * sizeof(CBSegNo));
+}
+
+/*
+ * Return various pieces of information that are needed to initialize for
+ * access to a conveyor belt.
+ */
+void
+cb_metapage_get_critical_info(CBMetapageData *meta,
+                                                         uint16 *pages_per_segment,
+                                                         uint64 *index_segments_moved)
+{
+       *pages_per_segment = meta->cbm_pages_per_segment;
+       *index_segments_moved = meta->cbm_index_segments_moved;
+}
+
+/*
+ * Return various pieces of information that are needed to access index
+ * segments.
+ */
+void
+cb_metapage_get_index_info(CBMetapageData *meta,
+                                                  CBPageNo *index_start,
+                                                  CBPageNo *index_metapage_start,
+                                                  CBSegNo *oldest_index_segment,
+                                                  CBSegNo *newest_index_segment,
+                                                  uint64 *index_segments_moved)
+{
+       *index_start = meta->cbm_index_start;
+       *index_metapage_start = meta->cbm_index_metapage_start;
+       *oldest_index_segment = meta->cbm_oldest_index_segment;
+       *newest_index_segment = meta->cbm_newest_index_segment;
+       *index_segments_moved = meta->cbm_index_segments_moved;
+}
+
+/*
+ * Update the metapage to reflect the addition of an index segment.
+ */
+void
+cb_metapage_add_index_segment(CBMetapageData *meta, CBSegNo segno)
+{
+       meta->cbm_newest_index_segment = segno;
+       meta->cbm_entries_in_newest_index_segment = 0;
+       if (meta->cbm_oldest_index_segment == CB_INVALID_SEGMENT)
+               meta->cbm_oldest_index_segment = segno;
+}
+
+/*
+ * Update the metapage to reflect the removal of an index segment.
+ *
+ * 'segno' should be the successor of the index segment being removed,
+ * or CB_INVALID_SEGMENT if, at present, only one index segment exists.
+ */
+void
+cb_metapage_remove_index_segment(CBMetapageData *meta, CBSegNo segno)
+{
+       if (meta->cbm_oldest_index_segment == CB_INVALID_SEGMENT ||
+               meta->cbm_newest_index_segment == CB_INVALID_SEGMENT)
+               elog(ERROR, "can't remove index segment when none remain");
+
+       if (segno == CB_INVALID_SEGMENT)
+       {
+               if (meta->cbm_oldest_index_segment !=
+                       meta->cbm_newest_index_segment)
+                       elog(ERROR, "can't remove last index segment when >1 remain");
+               meta->cbm_oldest_index_segment = CB_INVALID_SEGMENT;
+               meta->cbm_newest_index_segment = CB_INVALID_SEGMENT;
+       }
+       else
+       {
+               if (meta->cbm_oldest_index_segment ==
+                       meta->cbm_newest_index_segment)
+                       elog(ERROR, "must remove last index segment when only one remains");
+               meta->cbm_oldest_index_segment = segno;
+       }
+}
+
+/*
+ * Examine the metapage state to determine how to go about recycling space.
+ *
+ * If the return value is CBM_OBSOLETE_SEGMENT_ENTRIES, then
+ * *oldest_index_segment will be set to the segment number of the oldest index
+ * segment, and *index_vacuum_stop_point will be set to the oldest page number
+ * for which any index entry in the index pages should not be removed. The
+ * caller should remove index entries that precede that point from index
+ * segments, and if possible the segments themselves.
+ *
+ * If the return value is CBM_OBSOLETE_METAPAGE_ENTRIES, then *metapage_segno
+ * will be set to a payload segment that can be deallocated, and
+ * *metapage_offset to the location in the metapage where the index entry
+ * referencing that segment is stored.
+ *
+ * If the return value is CBM_OBSOLETE_METAPAGE_START, then there are
+ * no index segments and no uncleared index entries in the metapage that
+ * are obsolete, but some cleared index entries can be discarded.
+ * *metapage_offset will be set to the smallest metapage offset that cannot
+ * be cleared (either because it is still in use, or because it is not yet
+ * allocated).
+ *
+ * If the return value is CBM_OBSOLETE_NOTHING, there's nothing to do.
+ */
+CBMObsoleteState
+cb_metapage_get_obsolete_state(CBMetapageData *meta,
+                                                          CBSegNo *oldest_index_segment,
+                                                          CBPageNo *index_vacuum_stop_point,
+                                                          CBSegNo *metapage_segno,
+                                                          unsigned *metapage_offset)
+{
+       CBPageNo        istart = meta->cbm_index_start;
+       CBPageNo        imstart = meta->cbm_index_metapage_start;
+       CBPageNo        olpage = meta->cbm_oldest_logical_page;
+       uint16          pps = meta->cbm_pages_per_segment;
+       unsigned        keep_offset;
+       unsigned        offset;
+
+       /* Sanity checks. */
+       if (olpage < istart)
+               elog(ERROR,
+                        "index starts at " UINT64_FORMAT " but oldest logical page is " UINT64_FORMAT,
+                       istart, olpage);
+       if (imstart < istart)
+               elog(ERROR,
+                        "metapage index starts at " UINT64_FORMAT " but index starts at " UINT64_FORMAT,
+                       imstart, istart);
+       if (istart % pps != 0)
+               elog(ERROR,
+                        "index start " UINT64_FORMAT " is not a multiple of pages per segment",
+                        istart);
+       if (imstart % pps != 0)
+               elog(ERROR,
+                        "index metapage start " UINT64_FORMAT " is not a multiple of pages per segment",
+                        imstart);
+
+       /*
+        * Detect the case where there is no obsolete data in the index.
+        *
+        * This happens if the oldest logical page is either equal to the start
+        * of the index, or follows it by less than the number of pages per
+        * segment. In the latter case, some but not all of the pages in the
+        * oldest payload segment are obsolete. We can only clean up entire
+        * payload segments, so in such cases there is nothing to do.
+        */
+       if (istart + pps > olpage)
+               return CBM_OBSOLETE_NOTHING;
+
+       /*
+        * If there are any index segments, then the first step is to remove
+        * index entries from those segments, and the second step is to remove
+        * the segments themselves if they end up containing no useful entries.
+        * We need not consider doing anything in the metapage itself until no
+        * index segments remain.
+        */
+       if (meta->cbm_oldest_index_segment != CB_INVALID_SEGMENT)
+       {
+               *oldest_index_segment = meta->cbm_oldest_index_segment;
+               *index_vacuum_stop_point =
+                       Min(meta->cbm_index_metapage_start,
+                               meta->cbm_oldest_logical_page);
+               return CBM_OBSOLETE_SEGMENT_ENTRIES;
+       }
+
+       /*
+        * Since there are no index pages, the whole index is in the metapage,
+        * and therefore the logical page number should be somewhere in the range
+        * of pages covered by the metapage.
+        */
+       if (olpage < imstart)
+               elog(ERROR,
+                        "oldest logical page " UINT64_FORMAT " precedes metapage start " UINT64_FORMAT " but there are no index segments",
+                        olpage, imstart);
+
+       /* Search for obsolete index entries that have not yet been cleared. */
+       keep_offset = (olpage - imstart) / pps;
+       for (offset = 0; offset < keep_offset; ++offset)
+       {
+               if (meta->cbm_index[offset] != CB_INVALID_SEGMENT)
+               {
+                       *metapage_segno = meta->cbm_index[offset];
+                       *metapage_offset = offset;
+                       return CBM_OBSOLETE_METAPAGE_ENTRIES;
+               }
+       }
+
+       /*
+        * Apparently, there's nothing left to do but discard already-cleared
+        * index entries.
+        */
+       *metapage_offset = keep_offset;
+       return CBM_OBSOLETE_METAPAGE_START;
+}
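+
+/*
+ * Illustrative only: a hedged sketch of how a caller might drive the state
+ * machine above. The helper name handle_index_segment_vacuum is
+ * hypothetical, and the elided arguments depend on which pages the caller
+ * has locked; the real consumer of this interface is expected to live in
+ * conveyor.c. The shape of the loop follows from the contract described
+ * in the function header:
+ *
+ * for (;;)
+ * {
+ *     switch (cb_metapage_get_obsolete_state(meta, &oldest_index_segment,
+ *                                            &index_vacuum_stop_point,
+ *                                            &metapage_segno,
+ *                                            &metapage_offset))
+ *     {
+ *         case CBM_OBSOLETE_SEGMENT_ENTRIES:
+ *             handle_index_segment_vacuum(oldest_index_segment,
+ *                                         index_vacuum_stop_point);
+ *             break;
+ *         case CBM_OBSOLETE_METAPAGE_ENTRIES:
+ *             cb_recycle_payload_segment(..., metapage_segno,
+ *                                        metapage_offset, ...);
+ *             break;
+ *         case CBM_OBSOLETE_METAPAGE_START:
+ *             cb_shift_metapage_index(..., metapage_offset, ...);
+ *             break;
+ *         case CBM_OBSOLETE_NOTHING:
+ *             return;
+ *     }
+ * }
+ */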
+
+/*
+ * Clear a single index entry from the metapage.
+ *
+ * We require that the caller provide not only the offset but the segment
+ * number that is expected to be found at that offset. That lets us check
+ * that nothing unexpected has occurred.
+ */
+void
+cb_metapage_clear_obsolete_index_entry(CBMetapageData *meta,
+                                                                          CBSegNo segno,
+                                                                          unsigned offset)
+{
+       if (meta->cbm_index[offset] != segno)
+               elog(ERROR,
+                        "index entry at offset %u was expected to be %u but found %u",
+                        offset, segno, meta->cbm_index[offset]);
+
+       meta->cbm_index[offset] = CB_INVALID_SEGMENT;
+}
+
+/*
+ * Returns the lowest unused segment number covered by the metapage,
+ * or CB_INVALID_SEGMENT if none.
+ */
+CBSegNo
+cb_metapage_find_free_segment(CBMetapageData *meta)
+{
+       unsigned        i;
+       unsigned        j;
+
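+       /*
+        * The freespace map is a plain bitmap: byte i covers segments
+        * i * BITS_PER_BYTE through i * BITS_PER_BYTE + 7, with the
+        * lowest-numbered segment in the least-significant bit, so e.g.
+        * segment 13 is bit 5 of byte 1. A byte of 0xFF means that all
+        * eight segments it covers are allocated.
+        */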
+       for (i = 0; i < CB_METAPAGE_FREESPACE_BYTES; ++i)
+       {
+               uint8   b = meta->cbm_freespace_map[i];
+
+               if (b == 0xFF)
+                       continue;
+
+               for (j = 0; j < BITS_PER_BYTE; ++j)
+               {
+                       if ((b & (1 << j)) == 0)
+                               return (i * BITS_PER_BYTE) + j;
+               }
+       }
+
+       return CB_INVALID_SEGMENT;
+}
+
+/*
+ * Get the allocation status of a segment from the metapage fsm.
+ */
+bool
+cb_metapage_get_fsm_bit(CBMetapageData *meta, CBSegNo segno)
+{
+       uint8           byte;
+       uint8           mask;
+
+       if (segno >= CB_FSM_SEGMENTS_FOR_METAPAGE)
+               elog(ERROR, "segment %u out of range for metapage fsm", segno);
+
+       byte = meta->cbm_freespace_map[segno / BITS_PER_BYTE];
+       mask = 1 << (segno % BITS_PER_BYTE);
+       return (byte & mask) != 0;
+}
+
+/*
+ * Set the allocation status of a segment in the metapage fsm.
+ *
+ * new_state should be true if the bit is currently clear and should be set,
+ * and false if the bit is currently set and should be cleared. Don't call
+ * this unless you know that the bit actually needs to be changed.
+ */
+void
+cb_metapage_set_fsm_bit(CBMetapageData *meta, CBSegNo segno, bool new_state)
+{
+       uint8      *byte;
+       uint8           mask;
+       uint8           old_state;
+
+       if (segno >= CB_FSM_SEGMENTS_FOR_METAPAGE)
+               elog(ERROR, "segment %u out of range for metapage fsm", segno);
+
+       byte = &meta->cbm_freespace_map[segno / BITS_PER_BYTE];
+       mask = 1 << (segno % BITS_PER_BYTE);
+       old_state = (*byte & mask) != 0;
+
+       if (old_state == new_state)
+               elog(ERROR, "metapage fsm bit for segment %u already has value %d",
+                        segno, old_state ? 1 : 0);
+
+       if (new_state)
+               *byte |= mask;
+       else
+               *byte &= ~mask;
+}
+
+/*
+ * Increment the count of segments allocated.
+ */
+void
+cb_metapage_increment_next_segment(CBMetapageData *meta, CBSegNo segno)
+{
+       if (segno != meta->cbm_next_segment)
+               elog(ERROR, "extending to create segment %u but next segment is %u",
+                        segno, meta->cbm_next_segment);
+
+       meta->cbm_next_segment++;
+}
+
+/*
+ * Increment the count of index segments moved.
+ */
+void
+cb_metapage_increment_index_segments_moved(CBMetapageData *meta)
+{
+       meta->cbm_index_segments_moved++;
+}
diff --git a/src/backend/access/conveyor/cbmodify.c b/src/backend/access/conveyor/cbmodify.c
new file mode 100644 (file)
index 0000000..7d74bce
--- /dev/null
@@ -0,0 +1,686 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbmodify.c
+ *       Routines to make a change to a conveyor belt and XLOG it if needed.
+ *
+ * Each function in this file implements one type of conveyor-belt write
+ * operation. The pages to be modified are assumed to already have been
+ * identified and locked.
+ *
+ * Each function in this file has a corresponding REDO function in
+ * cbxlog.c, except where log_newpage is used.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/backend/access/conveyor/cbmodify.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/cbfsmpage.h"
+#include "access/cbindexpage.h"
+#include "access/cbmetapage.h"
+#include "access/cbmodify.h"
+#include "access/cbxlog.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+
+/*
+ * Create a metapage, and optionally write XLOG for the change.
+ */
+void
+cb_create_metapage(RelFileNode *rnode,
+                                  ForkNumber fork,
+                                  Buffer metabuffer,
+                                  uint16 pages_per_segment,
+                                  bool needs_xlog)
+{
+       Page            metapage;
+
+       metapage = BufferGetPage(metabuffer);
+       cb_metapage_initialize(metapage, pages_per_segment);
+
+       if (needs_xlog)
+       {
+               XLogRecPtr      lsn;
+
+               lsn = log_newpage(rnode, fork, CONVEYOR_METAPAGE, metapage, true);
+               PageSetLSN(metapage, lsn);
+       }
+
+       MarkBufferDirty(metabuffer);
+}
+
+/*
+ * Create a new FSM page, and optionally write XLOG for the change.
+ */
+CBSegNo
+cb_create_fsmpage(RelFileNode *rnode,
+                                 ForkNumber fork,
+                                 BlockNumber blkno,
+                                 Buffer buffer,
+                                 uint16 pages_per_segment,
+                                 bool needs_xlog)
+{
+       Page            page;
+       CBSegNo         segno;
+
+       START_CRIT_SECTION();
+
+       page = BufferGetPage(buffer);
+       segno = cb_fsmpage_initialize(page, blkno, pages_per_segment);
+       MarkBufferDirty(buffer);
+
+       if (needs_xlog)
+       {
+               XLogRecPtr      lsn;
+
+               lsn = log_newpage(rnode, fork, blkno, page, true);
+               PageSetLSN(page, lsn);
+       }
+
+       END_CRIT_SECTION();
+
+       return segno;
+}
+
+/*
+ * Insert a payload page, and optionally write XLOG for the change.
+ *
+ * Since we have no idea what the contents of the payload page ought to be,
+ * it's up to the caller to initialize it before calling this function.
+ * That means that the caller is also responsible for starting and ending
+ * the required critical section.
+ */
+void
+cb_insert_payload_page(RelFileNode *rnode, ForkNumber fork, Buffer metabuffer,
+                                          BlockNumber payloadblock, Buffer payloadbuffer,
+                                          bool needs_xlog)
+{
+       Page            metapage;
+       Page            payloadpage;
+       CBMetapageData *meta;
+
+       Assert(CritSectionCount > 0);
+
+       payloadpage = BufferGetPage(payloadbuffer);
+       MarkBufferDirty(payloadbuffer);
+
+       metapage = BufferGetPage(metabuffer);
+       meta = cb_metapage_get_special(metapage);
+       cb_metapage_advance_next_logical_page(meta, payloadblock);
+       MarkBufferDirty(metabuffer);
+
+       if (needs_xlog)
+       {
+               XLogRecPtr      lsn;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage,
+                                                 REGBUF_STANDARD);
+               XLogRegisterBlock(1, rnode, fork, payloadblock,
+                                                 payloadpage, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE);
+
+               PageSetLSN(payloadpage, lsn);
+               PageSetLSN(metapage, lsn);
+       }
+}
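+
+/*
+ * Illustrative only: per the contract above, a call site is expected to
+ * look roughly like the sketch below (buffer lookup and page
+ * initialization elided); compare the insertion path in conveyor.c:
+ *
+ * START_CRIT_SECTION();
+ * ... fill in the contents of the payload page ...
+ * cb_insert_payload_page(rnode, fork, metabuffer,
+ *                        payloadblock, payloadbuffer, needs_xlog);
+ * END_CRIT_SECTION();
+ */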
+
+/*
+ * Allocate a new payload segment, and optionally write XLOG for the change.
+ *
+ * If the allocation status of the segment is tracked in the metapage,
+ * 'fsmblock' should be InvalidBlockNumber and 'fsmbuffer' should be
+ * InvalidBuffer. Otherwise, 'fsmblock' should be the block number of the
+ * relevant freespace map block and 'fsmbuffer' the corresponding buffer.
+ *
+ * 'is_extend' should be true when we're allocating a segment that hasn't
+ * existed before, necessitating an adjustment to the metapage's
+ * next-segment counter.
+ *
+ * See cb_xlog_allocate_payload_segment for the corresponding REDO routine.
+ */
+void
+cb_allocate_payload_segment(RelFileNode *rnode,
+                                                       ForkNumber fork,
+                                                       Buffer metabuffer,
+                                                       BlockNumber fsmblock,
+                                                       Buffer fsmbuffer,
+                                                       CBSegNo segno,
+                                                       bool is_extend,
+                                                       bool needs_xlog)
+{
+       Page            metapage;
+       CBMetapageData *meta;
+
+       metapage = BufferGetPage(metabuffer);
+       meta = cb_metapage_get_special(metapage);
+
+       START_CRIT_SECTION();
+
+       cb_metapage_add_index_entry(meta, segno);
+       MarkBufferDirty(metabuffer);
+
+       if (is_extend)
+               cb_metapage_increment_next_segment(meta, segno);
+
+       if (fsmblock != InvalidBlockNumber)
+       {
+               cb_fsmpage_set_fsm_bit(BufferGetPage(fsmbuffer), segno, true);
+               MarkBufferDirty(fsmbuffer);
+       }
+       else
+               cb_metapage_set_fsm_bit(meta, segno, true);
+
+       if (needs_xlog)
+       {
+               xl_cb_allocate_payload_segment xlrec;
+               XLogRecPtr      lsn;
+
+               xlrec.segno = segno;
+               xlrec.is_extend = is_extend;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage,
+                                                 REGBUF_STANDARD);
+               if (fsmblock != InvalidBlockNumber)
+                       XLogRegisterBlock(1, rnode, fork, fsmblock,
+                                                         BufferGetPage(fsmbuffer), REGBUF_STANDARD);
+               XLogRegisterData((char *) &xlrec, SizeOfCBAllocatePayloadSegment);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_ALLOCATE_PAYLOAD_SEGMENT);
+
+               PageSetLSN(metapage, lsn);
+               if (fsmblock != InvalidBlockNumber)
+                       PageSetLSN(BufferGetPage(fsmbuffer), lsn);
+       }
+
+       END_CRIT_SECTION();
+}
+
+/*
+ * Allocate a new index segment, and optionally write XLOG for the change.
+ *
+ * 'metabuffer' should be the buffer containing the metapage.
+ *
+ * 'indexblock' and 'indexbuffer' should be the block number and buffer for
+ * the first page of the index segment.
+ *
+ * If any index segments already exist, then 'prevblock' should be the
+ * block number of the first page of the last index segment that already
+ * exists, and 'prevbuffer' the corresponding buffer; otherwise, use
+ * InvalidBlockNumber and InvalidBuffer, respectively.
+ *
+ * Similarly, if the allocation status of the segment is tracked in an
+ * FSM page, 'fsmblock' and 'fsmbuffer' should reference that page; if that
+ * information is tracked in the metapage, use InvalidBlockNumber and
+ * InvalidBuffer, respectively.
+ *
+ * 'segno' is the segment number of the new index segment, and 'pageno'
+ * is the first logical page for which it will store index information.
+ *
+ * 'is_extend' should be true when we're allocating a segment that hasn't
+ * existed before, necessitating an adjustment to the metapage's
+ * next-segment counter.
+ *
+ * See cb_xlog_allocate_index_segment for the corresponding REDO routine.
+ */
+void
+cb_allocate_index_segment(RelFileNode *rnode,
+                                                 ForkNumber fork,
+                                                 Buffer metabuffer,
+                                                 BlockNumber indexblock,
+                                                 Buffer indexbuffer,
+                                                 BlockNumber prevblock,
+                                                 Buffer prevbuffer,
+                                                 BlockNumber fsmblock,
+                                                 Buffer fsmbuffer,
+                                                 CBSegNo segno,
+                                                 CBPageNo pageno,
+                                                 bool is_extend,
+                                                 bool needs_xlog)
+{
+       Page            metapage;
+       Page            indexpage;
+       CBMetapageData *meta;
+
+       metapage = BufferGetPage(metabuffer);
+       indexpage = BufferGetPage(indexbuffer);
+
+       meta = cb_metapage_get_special(metapage);
+
+       START_CRIT_SECTION();
+
+       cb_metapage_add_index_segment(meta, segno);
+       MarkBufferDirty(metabuffer);
+
+       if (is_extend)
+               cb_metapage_increment_next_segment(meta, segno);
+
+       cb_indexpage_initialize(indexpage, pageno);
+       MarkBufferDirty(indexbuffer);
+
+       if (prevblock != InvalidBlockNumber)
+       {
+               cb_indexpage_set_next_segment(BufferGetPage(prevbuffer), segno);
+               MarkBufferDirty(prevbuffer);
+       }
+
+       if (fsmblock != InvalidBlockNumber)
+       {
+               cb_fsmpage_set_fsm_bit(BufferGetPage(fsmbuffer), segno, true);
+               MarkBufferDirty(fsmbuffer);
+       }
+       else
+               cb_metapage_set_fsm_bit(meta, segno, true);
+
+       if (needs_xlog)
+       {
+               xl_cb_allocate_index_segment xlrec;
+               XLogRecPtr      lsn;
+
+               xlrec.segno = segno;
+               xlrec.pageno = pageno;
+               xlrec.is_extend = is_extend;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage,
+                                                 REGBUF_STANDARD);
+               XLogRegisterBlock(1, rnode, fork, indexblock, indexpage,
+                                                 REGBUF_STANDARD | REGBUF_WILL_INIT);
+               if (prevblock != InvalidBlockNumber)
+                       XLogRegisterBlock(2, rnode, fork, prevblock,
+                                                         BufferGetPage(prevbuffer), REGBUF_STANDARD);
+               if (fsmblock != InvalidBlockNumber)
+                       XLogRegisterBlock(3, rnode, fork, fsmblock,
+                                                         BufferGetPage(fsmbuffer), REGBUF_STANDARD);
+               XLogRegisterData((char *) &xlrec, SizeOfCBAllocateIndexSegment);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_ALLOCATE_INDEX_SEGMENT);
+
+               PageSetLSN(metapage, lsn);
+               PageSetLSN(indexpage, lsn);
+               if (prevblock != InvalidBlockNumber)
+                       PageSetLSN(BufferGetPage(prevbuffer), lsn);
+               if (fsmblock != InvalidBlockNumber)
+                       PageSetLSN(BufferGetPage(fsmbuffer), lsn);
+       }
+
+       END_CRIT_SECTION();
+}
+
+/*
+ * Allocate a new index page in an existing index segment, and optionally
+ * write XLOG for the change.
+ *
+ * 'indexblock' and 'indexbuffer' should be the block number and buffer for
+ * the new page. 'firstindexblock' and 'firstindexbuffer' are the block
+ * number and buffer for the first page of the index segment.
+ *
+ * 'pageno' is the first logical page for which the new index page will
+ * store index information.
+ *
+ * See cb_xlog_allocate_index_page for the corresponding REDO routine.
+ */
+void
+cb_allocate_index_page(RelFileNode *rnode,
+                                          ForkNumber fork,
+                                          BlockNumber indexblock,
+                                          Buffer indexbuffer,
+                                          CBPageNo pageno,
+                                          bool needs_xlog)
+{
+       Page            indexpage;
+
+       indexpage = BufferGetPage(indexbuffer);
+
+       START_CRIT_SECTION();
+
+       cb_indexpage_initialize(indexpage, pageno);
+       MarkBufferDirty(indexbuffer);
+
+       if (needs_xlog)
+       {
+               xl_cb_allocate_index_page xlrec;
+               XLogRecPtr      lsn;
+
+               xlrec.pageno = pageno;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, indexblock, indexpage,
+                                                 REGBUF_STANDARD | REGBUF_WILL_INIT);
+               XLogRegisterData((char *) &xlrec, SizeOfCBAllocateIndexPage);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_ALLOCATE_INDEX_PAGE);
+
+               PageSetLSN(indexpage, lsn);
+       }
+
+       END_CRIT_SECTION();
+}
+
+/*
+ * Relocate index entries from the metapage to a page in an index segment,
+ * and optionally write XLOG for the change.
+ *
+ * 'pageoffset' is the offset within the index page where the new entries
+ * should be placed.
+ *
+ * 'index_page_start' is the first logical page number covered by the index
+ * page being modified.
+ *
+ * See cb_xlog_relocate_index_entries for the corresponding REDO routine.
+ */
+void
+cb_relocate_index_entries(RelFileNode *rnode,
+                                                 ForkNumber fork,
+                                                 Buffer metabuffer,
+                                                 BlockNumber indexblock,
+                                                 Buffer indexbuffer,
+                                                 unsigned pageoffset,
+                                                 unsigned num_index_entries,
+                                                 CBSegNo *index_entries,
+                                                 CBPageNo index_page_start,
+                                                 bool needs_xlog)
+{
+       Page            metapage;
+       Page            indexpage;
+       CBMetapageData *meta;
+
+       metapage = BufferGetPage(metabuffer);
+       indexpage = BufferGetPage(indexbuffer);
+
+       meta = cb_metapage_get_special(metapage);
+
+       START_CRIT_SECTION();
+
+       /* If these are the first entries on the page, initialize it. */
+       if (pageoffset == 0)
+               cb_indexpage_initialize(indexpage, index_page_start);
+
+       cb_indexpage_add_index_entries(indexpage, pageoffset, num_index_entries,
+                                                                  index_entries);
+       cb_metapage_remove_index_entries(meta, num_index_entries, true);
+
+       MarkBufferDirty(metabuffer);
+       MarkBufferDirty(indexbuffer);
+
+       if (needs_xlog)
+       {
+               xl_cb_relocate_index_entries xlrec;
+               XLogRecPtr      lsn;
+               uint8           flags = REGBUF_STANDARD;
+
+               xlrec.pageoffset = pageoffset;
+               xlrec.num_index_entries = num_index_entries;
+               xlrec.index_page_start = index_page_start;
+
+               if (pageoffset == 0)
+                       flags |= REGBUF_WILL_INIT;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage,
+                                                 REGBUF_STANDARD);
+               XLogRegisterBlock(1, rnode, fork, indexblock, indexpage, flags);
+               XLogRegisterData((char *) &xlrec, SizeOfCBRelocateIndexEntries);
+               XLogRegisterData((char *) index_entries,
+                                                num_index_entries * sizeof(CBSegNo));
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES);
+
+               PageSetLSN(metapage, lsn);
+               PageSetLSN(indexpage, lsn);
+       }
+
+       END_CRIT_SECTION();
+}
+
+/*
+ * Logically truncate a conveyor belt by updating its notion of the oldest
+ * logical page.
+ */
+void
+cb_logical_truncate(RelFileNode *rnode,
+                                       ForkNumber fork,
+                                       Buffer metabuffer,
+                                       CBPageNo oldest_keeper,
+                                       bool needs_xlog)
+{
+       Page            metapage;
+       CBMetapageData *meta;
+
+       metapage = BufferGetPage(metabuffer);
+       meta = cb_metapage_get_special(metapage);
+
+       START_CRIT_SECTION();
+
+       cb_metapage_advance_oldest_logical_page(meta, oldest_keeper);
+
+       MarkBufferDirty(metabuffer);
+
+       if (needs_xlog)
+       {
+               xl_cb_logical_truncate  xlrec;
+               XLogRecPtr      lsn;
+
+               xlrec.oldest_keeper = oldest_keeper;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage,
+                                                 REGBUF_STANDARD);
+               XLogRegisterData((char *) &xlrec, SizeOfCBLogicalTruncate);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_LOGICAL_TRUNCATE);
+
+               PageSetLSN(metapage, lsn);
+       }
+
+       END_CRIT_SECTION();
+}
+
+/*
+ * Clear a block in preparation for deallocating the segment that contains it.
+ *
+ * The block needs to appear unused to ConveyorBeltPageIsUnused(); a simple
+ * call to PageInit() is the easiest way to accomplish that.
+ *
+ * We could use log_newpage() here but it would generate more WAL.
+ */
+void
+cb_clear_block(RelFileNode *rnode,
+                          ForkNumber fork,
+                          BlockNumber blkno,
+                          Buffer buffer,
+                          bool needs_xlog)
+{
+       Page    page = BufferGetPage(buffer);
+
+       START_CRIT_SECTION();
+
+       PageInit(page, BLCKSZ, 0);
+
+       MarkBufferDirty(buffer);
+
+       if (needs_xlog)
+       {
+               XLogRecPtr      lsn;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, blkno, page,
+                                                 REGBUF_STANDARD | REGBUF_WILL_INIT);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_CLEAR_BLOCK);
+
+               PageSetLSN(page, lsn);
+       }
+
+       END_CRIT_SECTION();
+}
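+
+/*
+ * Illustrative only: ConveyorBeltPageIsUnused is defined in conveyor.c,
+ * but given the rule documented there (an in-use page must have pd_lower
+ * greater than SizeOfPageHeaderData or pd_upper less than BLCKSZ), a
+ * plausible sketch of the test that the PageInit() call above must
+ * satisfy is:
+ *
+ * static bool
+ * ConveyorBeltPageIsUnused(Page page)
+ * {
+ *     PageHeader ph = (PageHeader) page;
+ *
+ *     return PageIsNew(page) ||
+ *         (ph->pd_lower <= SizeOfPageHeaderData && ph->pd_upper == BLCKSZ);
+ * }
+ */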
+
+/*
+ * Deallocate a payload segment.
+ *
+ * This is a bit tricky. We need to clear the index entry pointing to the
+ * payload segment, and we also need to clear the FSM bit for the segment.
+ * Either, both, or neither of those could be in the metapage.
+ *
+ * If neither is in the metapage, metabuffer should be InvalidBuffer;
+ * otherwise it should be the buffer containing the metapage.
+ *
+ * If the index entry pointing to the payload segment is in the metapage,
+ * then indexblock should be InvalidBlockNumber and indexbuffer should be
+ * InvalidBuffer; otherwise, they should reference the index page containing
+ * the index entry.
+ *
+ * If the freespace map bit for the segment is in the metapage, then
+ * fsmblock should be InvalidBlockNumber and fsmbuffer should be InvalidBuffer;
+ * otherwise, they should reference the FSM page containing the relevant
+ * freespace map bit.
+ */
+void
+cb_recycle_payload_segment(RelFileNode *rnode,
+                                                  ForkNumber fork,
+                                                  Buffer metabuffer,
+                                                  BlockNumber indexblock,
+                                                  Buffer indexbuffer,
+                                                  BlockNumber fsmblock,
+                                                  Buffer fsmbuffer,
+                                                  CBSegNo segno,
+                                                  unsigned pageoffset,
+                                                  bool needs_xlog)
+{
+       START_CRIT_SECTION();
+
+       if (BufferIsValid(metabuffer))
+       {
+               CBMetapageData *meta;
+
+               Assert(indexblock == InvalidBlockNumber ||
+                          fsmblock == InvalidBlockNumber);
+               meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+               if (indexblock == InvalidBlockNumber)
+                       cb_metapage_clear_obsolete_index_entry(meta, segno, pageoffset);
+               if (fsmblock == InvalidBlockNumber)
+                       cb_metapage_set_fsm_bit(meta, segno, false);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (indexblock != InvalidBlockNumber)
+       {
+               cb_indexpage_clear_obsolete_entry(BufferGetPage(indexbuffer),
+                                                                                 segno, pageoffset);
+               MarkBufferDirty(indexbuffer);
+       }
+
+       if (fsmblock != InvalidBlockNumber)
+       {
+               cb_fsmpage_set_fsm_bit(BufferGetPage(fsmbuffer), segno, false);
+               MarkBufferDirty(fsmbuffer);
+       }
+
+       if (needs_xlog)
+       {
+               xl_cb_recycle_payload_segment xlrec;
+               XLogRecPtr      lsn;
+
+               xlrec.segno = segno;
+               xlrec.pageoffset = pageoffset;
+
+               XLogBeginInsert();
+               if (BufferIsValid(metabuffer))
+                       XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE,
+                                                         BufferGetPage(metabuffer), REGBUF_STANDARD);
+               if (indexblock != InvalidBlockNumber)
+                       XLogRegisterBlock(1, rnode, fork, indexblock,
+                                                         BufferGetPage(indexbuffer), REGBUF_STANDARD);
+               if (fsmblock != InvalidBlockNumber)
+                       XLogRegisterBlock(2, rnode, fork, fsmblock,
+                                                         BufferGetPage(fsmbuffer), REGBUF_STANDARD);
+               XLogRegisterData((char *) &xlrec, SizeOfCBRecyclePayloadSegment);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_RECYCLE_PAYLOAD_SEGMENT);
+
+               if (BufferIsValid(metabuffer))
+                       PageSetLSN(BufferGetPage(metabuffer), lsn);
+               if (indexblock != InvalidBlockNumber)
+                       PageSetLSN(BufferGetPage(indexbuffer), lsn);
+               if (fsmblock != InvalidBlockNumber)
+                       PageSetLSN(BufferGetPage(fsmbuffer), lsn);
+       }
+
+       END_CRIT_SECTION();
+}
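+
+/*
+ * Illustrative only: the two extreme shapes of a call to the function
+ * above, as implied by the comment preceding it. The variable names are
+ * placeholders:
+ *
+ * -- index entry and fsm bit both tracked in the metapage:
+ * cb_recycle_payload_segment(rnode, fork, metabuffer,
+ *                            InvalidBlockNumber, InvalidBuffer,
+ *                            InvalidBlockNumber, InvalidBuffer,
+ *                            segno, pageoffset, needs_xlog);
+ *
+ * -- index entry in an index page, fsm bit in an FSM page:
+ * cb_recycle_payload_segment(rnode, fork, InvalidBuffer,
+ *                            indexblock, indexbuffer,
+ *                            fsmblock, fsmbuffer,
+ *                            segno, pageoffset, needs_xlog);
+ */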
+
+/*
+ * Deallocate an index segment.
+ *
+ * indexblock and indexbuffer should refer to the first block of the segment
+ * to be deallocated. It's the oldest index segment, so we can't clear it
+ * in advance, else we'd lose track of what other index segments exist.
+ *
+ * fsmblock and fsmbuffer should refer to the FSM page that contains the
+ * FSM bit for the segment to be freed. If the segment is covered by the
+ * metapage, pass InvalidBlockNumber and InvalidBuffer, respectively.
+ *
+ * The return value is the segment number of the oldest index segment that
+ * remains after the operation, or CB_INVALID_SEGMENT if none.
+ */
+CBSegNo
+cb_recycle_index_segment(RelFileNode *rnode,
+                                                ForkNumber fork,
+                                                Buffer metabuffer,
+                                                BlockNumber indexblock,
+                                                Buffer indexbuffer,
+                                                BlockNumber fsmblock,
+                                                Buffer fsmbuffer,
+                                                CBSegNo segno,
+                                                bool needs_xlog)
+{
+       elog(ERROR, "XXX cb_recycle_index_segment not implemented yet");
+}
+
+/*
+ * Shift the start of the metapage index by discarding a given number
+ * of already-cleared index entries.
+ */
+void
+cb_shift_metapage_index(RelFileNode *rnode,
+                                               ForkNumber fork,
+                                               Buffer metabuffer,
+                                               unsigned num_entries,
+                                               bool needs_xlog)
+{
+       Page            metapage;
+       CBMetapageData *meta;
+
+       metapage = BufferGetPage(metabuffer);
+       meta = cb_metapage_get_special(metapage);
+
+       START_CRIT_SECTION();
+
+       cb_metapage_remove_index_entries(meta, num_entries, false);
+
+       MarkBufferDirty(metabuffer);
+
+       if (needs_xlog)
+       {
+               xl_cb_shift_metapage_index xlrec;
+               XLogRecPtr      lsn;
+
+               xlrec.num_entries = num_entries;
+
+               XLogBeginInsert();
+               XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage,
+                                                 REGBUF_STANDARD);
+               XLogRegisterData((char *) &xlrec, SizeOfCBShiftMetapageIndex);
+               lsn = XLogInsert(RM_CONVEYOR_ID,
+                                                XLOG_CONVEYOR_SHIFT_METAPAGE_INDEX);
+
+               PageSetLSN(metapage, lsn);
+       }
+
+       END_CRIT_SECTION();
+}
diff --git a/src/backend/access/conveyor/cbxlog.c b/src/backend/access/conveyor/cbxlog.c
new file mode 100644 (file)
index 0000000..2dd030c
--- /dev/null
@@ -0,0 +1,442 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbxlog.c
+ *       XLOG support for conveyor belts.
+ *
+ * For each REDO function in this file, see cbmodify.c for the
+ * corresponding function that performs the modification during normal
+ * running and logs the record that we REDO here.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/backend/access/conveyor/cbxlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/cbfsmpage.h"
+#include "access/cbindexpage.h"
+#include "access/cbmetapage.h"
+#include "access/cbxlog.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "storage/bufmgr.h"
+
+/*
+ * REDO function for cb_insert_payload_page.
+ *
+ * Note that the handling of block 1 is very similar to XLOG_FPI.
+ */
+static void
+cb_xlog_insert_payload_page(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       Buffer          metabuffer;
+       Buffer          payloadbuffer;
+
+       if (!XLogRecHasBlockImage(record, 1))
+               elog(ERROR, "XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE record did not contain full page image of payload block");
+       if (XLogReadBufferForRedo(record, 1, &payloadbuffer) != BLK_RESTORED)
+               elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
+
+       /* last due to lock ordering rules; see README */
+       if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+       {
+               Page    metapage = BufferGetPage(metabuffer);
+               CBMetapageData *meta;
+               BlockNumber     payloadblock;
+
+               meta = cb_metapage_get_special(metapage);
+               XLogRecGetBlockTag(record, 1, NULL, NULL, &payloadblock);
+               cb_metapage_advance_next_logical_page(meta, payloadblock);
+               PageSetLSN(metapage, lsn);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (BufferIsValid(metabuffer))
+               UnlockReleaseBuffer(metabuffer);
+       UnlockReleaseBuffer(payloadbuffer);
+}
+
+/*
+ * REDO function for cb_allocate_payload_segment.
+ */
+static void
+cb_xlog_allocate_payload_segment(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       xl_cb_allocate_payload_segment *xlrec;
+       Buffer          metabuffer;
+       bool            have_fsm_page = XLogRecGetBlockTag(record, 1, NULL, NULL, NULL);
+       Buffer          fsmbuffer = InvalidBuffer;
+
+       xlrec = (xl_cb_allocate_payload_segment *) XLogRecGetData(record);
+
+       if (have_fsm_page &&
+               XLogReadBufferForRedo(record, 1, &fsmbuffer) == BLK_NEEDS_REDO)
+       {
+               Page    fsmpage = BufferGetPage(fsmbuffer);
+
+               cb_fsmpage_set_fsm_bit(fsmpage, xlrec->segno, true);
+               PageSetLSN(fsmpage, lsn);
+               MarkBufferDirty(fsmbuffer);
+       }
+
+       /* last due to lock ordering rules; see README */
+       if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+       {
+               Page    metapage = BufferGetPage(metabuffer);
+               CBMetapageData *meta;
+
+               meta = cb_metapage_get_special(metapage);
+               cb_metapage_add_index_entry(meta, xlrec->segno);
+               if (xlrec->is_extend)
+                       cb_metapage_increment_next_segment(meta, xlrec->segno);
+               if (!have_fsm_page)
+                       cb_metapage_set_fsm_bit(meta, xlrec->segno, true);
+               PageSetLSN(metapage, lsn);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (BufferIsValid(metabuffer))
+               UnlockReleaseBuffer(metabuffer);
+       if (BufferIsValid(fsmbuffer))
+               UnlockReleaseBuffer(fsmbuffer);
+}
+
+/*
+ * REDO function for cb_allocate_index_segment.
+ */
+static void
+cb_xlog_allocate_index_segment(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       xl_cb_allocate_index_segment *xlrec;
+       bool            have_prev_page;
+       bool            have_fsm_page;
+       Buffer          metabuffer;
+       Buffer          indexbuffer;
+       Buffer          prevbuffer = InvalidBuffer;
+       Buffer          fsmbuffer = InvalidBuffer;
+       Page            indexpage;
+
+       have_prev_page = XLogRecGetBlockTag(record, 2, NULL, NULL, NULL);
+       have_fsm_page = XLogRecGetBlockTag(record, 3, NULL, NULL, NULL);
+
+       xlrec = (xl_cb_allocate_index_segment *) XLogRecGetData(record);
+
+       indexbuffer = XLogInitBufferForRedo(record, 1);
+       indexpage = BufferGetPage(indexbuffer);
+       cb_indexpage_initialize(indexpage, xlrec->pageno);
+       PageSetLSN(indexpage, lsn);
+       MarkBufferDirty(indexbuffer);
+
+       if (have_prev_page &&
+               XLogReadBufferForRedo(record, 2, &prevbuffer) == BLK_NEEDS_REDO)
+       {
+               Page    prevpage = BufferGetPage(prevbuffer);
+
+               cb_indexpage_set_next_segment(prevpage, xlrec->segno);
+               PageSetLSN(prevpage, lsn);
+               MarkBufferDirty(prevbuffer);
+       }
+
+       if (have_fsm_page &&
+               XLogReadBufferForRedo(record, 3, &fsmbuffer) == BLK_NEEDS_REDO)
+       {
+               Page    fsmpage = BufferGetPage(fsmbuffer);
+
+               cb_fsmpage_set_fsm_bit(fsmpage, xlrec->segno, true);
+               PageSetLSN(fsmpage, lsn);
+               MarkBufferDirty(fsmbuffer);
+       }
+
+       /* last due to lock ordering rules; see README */
+       if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+       {
+               Page    metapage = BufferGetPage(metabuffer);
+               CBMetapageData *meta;
+
+               meta = cb_metapage_get_special(metapage);
+               cb_metapage_add_index_segment(meta, xlrec->segno);
+               if (xlrec->is_extend)
+                       cb_metapage_increment_next_segment(meta, xlrec->segno);
+               if (!have_fsm_page)
+                       cb_metapage_set_fsm_bit(meta, xlrec->segno, true);
+               PageSetLSN(metapage, lsn);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (BufferIsValid(metabuffer))
+               UnlockReleaseBuffer(metabuffer);
+       if (BufferIsValid(indexbuffer))
+               UnlockReleaseBuffer(indexbuffer);
+       if (BufferIsValid(prevbuffer))
+               UnlockReleaseBuffer(prevbuffer);
+       if (BufferIsValid(fsmbuffer))
+               UnlockReleaseBuffer(fsmbuffer);
+}
+
+/*
+ * REDO function for cb_allocate_index_page.
+ */
+static void
+cb_xlog_allocate_index_page(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       xl_cb_allocate_index_page *xlrec;
+       Buffer          indexbuffer;
+       Page            indexpage;
+
+       xlrec = (xl_cb_allocate_index_page *) XLogRecGetData(record);
+
+       indexbuffer = XLogInitBufferForRedo(record, 0);
+       indexpage = BufferGetPage(indexbuffer);
+       cb_indexpage_initialize(indexpage, xlrec->pageno);
+       PageSetLSN(indexpage, lsn);
+       MarkBufferDirty(indexbuffer);
+
+       UnlockReleaseBuffer(indexbuffer);
+}
+
+/*
+ * REDO function for cb_relocate_index_entries.
+ */
+static void
+cb_xlog_relocate_index_entries(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       xl_cb_relocate_index_entries *xlrec;
+       Buffer          metabuffer;
+       Buffer          indexbuffer;
+       ReadBufferMode  mode;
+
+       xlrec = (xl_cb_relocate_index_entries *) XLogRecGetData(record);
+
+       mode = xlrec->pageoffset == 0 ? RBM_ZERO_AND_LOCK : RBM_NORMAL;
+       if (XLogReadBufferForRedoExtended(record, 1, mode, false,
+                                                                         &indexbuffer) == BLK_NEEDS_REDO)
+       {
+               Page    indexpage = BufferGetPage(indexbuffer);
+
+               if (xlrec->pageoffset == 0)
+                       cb_indexpage_initialize(indexpage, xlrec->index_page_start);
+
+               cb_indexpage_add_index_entries(indexpage, xlrec->pageoffset,
+                                                                          xlrec->num_index_entries,
+                                                                          xlrec->index_entries);
+               PageSetLSN(indexpage, lsn);
+               MarkBufferDirty(indexbuffer);
+       }
+
+       /* NB: metapage must be last due to lock ordering rules */
+       if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+       {
+               Page    metapage = BufferGetPage(metabuffer);
+               CBMetapageData *meta;
+
+               meta = cb_metapage_get_special(metapage);
+               cb_metapage_remove_index_entries(meta, xlrec->num_index_entries, true);
+               PageSetLSN(metapage, lsn);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (BufferIsValid(metabuffer))
+               UnlockReleaseBuffer(metabuffer);
+       if (BufferIsValid(indexbuffer))
+               UnlockReleaseBuffer(indexbuffer);
+}
+
+/*
+ * REDO function for cb_logical_truncate.
+ */
+static void
+cb_xlog_logical_truncate(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       xl_cb_logical_truncate *xlrec;
+       Buffer          metabuffer;
+
+       xlrec = (xl_cb_logical_truncate *) XLogRecGetData(record);
+
+       if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+       {
+               Page    metapage = BufferGetPage(metabuffer);
+               CBMetapageData *meta;
+
+               meta = cb_metapage_get_special(metapage);
+               cb_metapage_advance_oldest_logical_page(meta, xlrec->oldest_keeper);
+               PageSetLSN(metapage, lsn);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (BufferIsValid(metabuffer))
+               UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * REDO function for cb_clear_block.
+ */
+static void
+cb_xlog_clear_block(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       Buffer          buffer;
+       Page            page;
+
+       buffer = XLogInitBufferForRedo(record, 0);
+       page = BufferGetPage(buffer);
+       PageInit(page, BLCKSZ, 0);
+       PageSetLSN(page, lsn);
+       MarkBufferDirty(buffer);
+
+       UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * REDO function for cb_recycle_payload_segment.
+ */
+static void
+cb_xlog_recycle_payload_segment(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       xl_cb_recycle_payload_segment *xlrec;
+       bool            have_metapage;
+       bool            have_index_page;
+       bool            have_fsm_page;
+       Buffer          fsmbuffer = InvalidBuffer;
+       Buffer          indexbuffer = InvalidBuffer;
+       Buffer          metabuffer = InvalidBuffer;
+
+       have_metapage = XLogRecGetBlockTag(record, 0, NULL, NULL, NULL);
+       have_index_page = XLogRecGetBlockTag(record, 1, NULL, NULL, NULL);
+       have_fsm_page = XLogRecGetBlockTag(record, 2, NULL, NULL, NULL);
+
+       xlrec = (xl_cb_recycle_payload_segment *) XLogRecGetData(record);
+
+       if (have_index_page &&
+               XLogReadBufferForRedo(record, 1, &indexbuffer) == BLK_NEEDS_REDO)
+       {
+               Page    indexpage = BufferGetPage(indexbuffer);
+
+               cb_indexpage_clear_obsolete_entry(indexpage, xlrec->segno,
+                                                                                 xlrec->pageoffset);
+               PageSetLSN(indexpage, lsn);
+               MarkBufferDirty(indexbuffer);
+       }
+
+       if (have_fsm_page &&
+               XLogReadBufferForRedo(record, 2, &fsmbuffer) == BLK_NEEDS_REDO)
+       {
+               Page    fsmpage = BufferGetPage(fsmbuffer);
+
+               cb_fsmpage_set_fsm_bit(fsmpage, xlrec->segno, false);
+               PageSetLSN(fsmpage, lsn);
+               MarkBufferDirty(fsmbuffer);
+       }
+
+       /* last due to lock ordering rules; see README */
+       if (have_metapage &&
+               XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+       {
+               Page    metapage = BufferGetPage(metabuffer);
+               CBMetapageData *meta;
+
+               meta = cb_metapage_get_special(metapage);
+               if (!have_index_page)
+                       cb_metapage_clear_obsolete_index_entry(meta, xlrec->segno,
+                                                                                                  xlrec->pageoffset);
+               if (!have_fsm_page)
+                       cb_metapage_set_fsm_bit(meta, xlrec->segno, false);
+               PageSetLSN(metapage, lsn);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (BufferIsValid(fsmbuffer))
+               UnlockReleaseBuffer(fsmbuffer);
+       if (BufferIsValid(indexbuffer))
+               UnlockReleaseBuffer(indexbuffer);
+       if (BufferIsValid(metabuffer))
+               UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * REDO function for cb_recycle_index_segment.
+ */
+static void
+cb_xlog_recycle_index_segment(XLogReaderState *record)
+{
+       elog(ERROR, "XXX cb_xlog_recycle_index_segment not implemented yet");
+}
+
+/*
+ * REDO function for cb_shift_metapage_index.
+ */
+static void
+cb_xlog_shift_metapage_index(XLogReaderState *record)
+{
+       XLogRecPtr      lsn = record->EndRecPtr;
+       xl_cb_shift_metapage_index *xlrec;
+       Buffer          metabuffer;
+
+       xlrec = (xl_cb_shift_metapage_index *) XLogRecGetData(record);
+
+       if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+       {
+               Page    metapage = BufferGetPage(metabuffer);
+               CBMetapageData *meta;
+
+               meta = cb_metapage_get_special(metapage);
+               cb_metapage_remove_index_entries(meta, xlrec->num_entries, false);
+               PageSetLSN(metapage, lsn);
+               MarkBufferDirty(metabuffer);
+       }
+
+       if (BufferIsValid(metabuffer))
+               UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * Main entrypoint for conveyor belt REDO.
+ */
+void
+conveyor_redo(XLogReaderState *record)
+{
+       uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+       switch (info)
+       {
+               case XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE:
+                       cb_xlog_insert_payload_page(record);
+                       break;
+               case XLOG_CONVEYOR_ALLOCATE_PAYLOAD_SEGMENT:
+                       cb_xlog_allocate_payload_segment(record);
+                       break;
+               case XLOG_CONVEYOR_ALLOCATE_INDEX_SEGMENT:
+                       cb_xlog_allocate_index_segment(record);
+                       break;
+               case XLOG_CONVEYOR_ALLOCATE_INDEX_PAGE:
+                       cb_xlog_allocate_index_page(record);
+                       break;
+               case XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES:
+                       cb_xlog_relocate_index_entries(record);
+                       break;
+               case XLOG_CONVEYOR_LOGICAL_TRUNCATE:
+                       cb_xlog_logical_truncate(record);
+                       break;
+               case XLOG_CONVEYOR_CLEAR_BLOCK:
+                       cb_xlog_clear_block(record);
+                       break;
+               case XLOG_CONVEYOR_RECYCLE_PAYLOAD_SEGMENT:
+                       cb_xlog_recycle_payload_segment(record);
+                       break;
+               case XLOG_CONVEYOR_RECYCLE_INDEX_SEGMENT:
+                       cb_xlog_recycle_index_segment(record);
+                       break;
+               case XLOG_CONVEYOR_SHIFT_METAPAGE_INDEX:
+                       cb_xlog_shift_metapage_index(record);
+                       break;
+               default:
+                       elog(PANIC, "conveyor_redo: unknown op code %u", info);
+       }
+}
diff --git a/src/backend/access/conveyor/conveyor.c b/src/backend/access/conveyor/conveyor.c
new file mode 100644 (file)
index 0000000..12a1888
--- /dev/null
@@ -0,0 +1,1978 @@
+/*-------------------------------------------------------------------------
+ *
+ * conveyor.c
+ *       Conveyor belt storage.
+ *
+ * See src/backend/access/conveyor/README for a general overview of
+ * conveyor belt storage.
+ *
+ * src/backend/access/conveyor/conveyor.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/cbcache.h"
+#include "access/cbfsmpage.h"
+#include "access/cbindexpage.h"
+#include "access/cbmetapage.h"
+#include "access/cbmodify.h"
+#include "access/conveyor.h"
+#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "utils/rel.h"
+
+static CBSegNo ConveyorSearchFSMPages(ConveyorBelt *cb,
+                                                                         CBSegNo next_segment,
+                                                                         BlockNumber *fsmblock,
+                                                                         Buffer *fsmbuffer);
+static void ConveyorBeltClearSegment(ConveyorBelt *cb, CBSegNo segno,
+                                                                        bool include_first_page);
+static bool ConveyorBeltClearIndexSegmentEntries(ConveyorBelt *cb,
+                                                                                                Buffer metabuffer,
+                                                                                                CBSegNo index_segment,
+                                                                                                CBPageNo index_vacuum_stop_point,
+                                                                                                CBSegNo *next_index_segment);
+static CBSegNo ConveyorBeltFreeOldestIndexSegment(ConveyorBelt *cb,
+                                                                                                 Buffer metabuffer,
+                                                                                                 CBSegNo oldest_index_segment,
+                                                                                                 CBPageNo index_vacuum_stop_point);
+static Buffer ConveyorBeltExtend(ConveyorBelt *cb, BlockNumber blkno,
+                                                                BlockNumber *possibly_not_on_disk_blkno);
+static BlockNumber ConveyorBeltFSMBlockNumber(ConveyorBelt *cb,
+                                                                                         CBSegNo segno);
+static Buffer ConveyorBeltRead(ConveyorBelt *cb, BlockNumber blkno, int mode);
+static bool ConveyorBeltPageIsUnused(Page page);
+
+/*
+ * Handle used to mediate access to a conveyor belt.
+ */
+struct ConveyorBelt
+{
+       Relation        cb_rel;
+       ForkNumber      cb_fork;
+       uint16          cb_pages_per_segment;
+       CBCache    *cb_cache;
+
+       /*
+        * These fields are used for communication between ConveyorBeltGetNewPage,
+        * ConveyorBeltPerformInsert, and ConveyorBeltCleanupInsert.
+        */
+       RelFileNode *cb_insert_relfilenode;
+       Buffer          cb_insert_metabuffer;
+       BlockNumber cb_insert_block;
+       Buffer          cb_insert_buffer;
+};
+
+/*
+ * Create a new conveyor belt.
+ */
+ConveyorBelt *
+ConveyorBeltInitialize(Relation rel,
+                                          ForkNumber fork,
+                                          uint16 pages_per_segment,
+                                          MemoryContext mcxt)
+{
+       ConveyorBelt *cb;
+       Buffer          metabuffer;
+       bool            needs_xlog;
+
+       /* Write a metapage for the new conveyor belt, and XLOG if needed. */
+       needs_xlog = RelationNeedsWAL(rel) || fork == INIT_FORKNUM;
+       metabuffer = ReadBufferExtended(rel, fork, P_NEW, RBM_NORMAL, NULL);
+       if (BufferGetBlockNumber(metabuffer) != CONVEYOR_METAPAGE)
+               elog(ERROR, "can't initialize non-empty fork as conveyor belt");
+       LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+       cb_create_metapage(&RelationGetSmgr(rel)->smgr_rnode.node, fork,
+                                          metabuffer, pages_per_segment, needs_xlog);
+       UnlockReleaseBuffer(metabuffer);
+
+       /*
+        * Initialize a ConveyorBelt object so that the caller can do something
+        * with the new conveyor belt if they wish.
+        */
+       cb = MemoryContextAlloc(mcxt, sizeof(ConveyorBelt));
+       cb->cb_rel = rel;
+       cb->cb_fork = fork;
+       cb->cb_pages_per_segment = pages_per_segment;
+       cb->cb_cache = cb_cache_create(mcxt, 0);
+       cb->cb_insert_relfilenode = NULL;
+       cb->cb_insert_metabuffer = InvalidBuffer;
+       cb->cb_insert_block = InvalidBlockNumber;
+       cb->cb_insert_buffer = InvalidBuffer;
+       return cb;
+}
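+
+/*
+ * Illustrative only: creating a conveyor belt in a relation's main fork
+ * with 16-page segments. The relation and the choice of segment size are
+ * assumptions of the example, not requirements:
+ *
+ * ConveyorBelt *cb;
+ *
+ * cb = ConveyorBeltInitialize(rel, MAIN_FORKNUM, 16,
+ *                             CurrentMemoryContext);
+ */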
+
+/*
+ * Prepare for access to an existing conveyor belt.
+ */
+ConveyorBelt *
+ConveyorBeltOpen(Relation rel, ForkNumber fork, MemoryContext mcxt)
+{
+       Buffer          metabuffer;
+       CBMetapageData *meta;
+       ConveyorBelt *cb;
+       uint16          pages_per_segment;
+       uint64          index_segments_moved;
+
+       /* Read a few critical details from the metapage. */
+       metabuffer = ReadBufferExtended(rel, fork, CONVEYOR_METAPAGE,
+                                                                       RBM_NORMAL, NULL);
+       LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
+       meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+       cb_metapage_get_critical_info(meta,
+                                                                 &pages_per_segment,
+                                                                 &index_segments_moved);
+       UnlockReleaseBuffer(metabuffer);
+
+       /* Initialize and return the ConveyorBelt object. */
+       cb = MemoryContextAlloc(mcxt, sizeof(ConveyorBelt));
+       cb->cb_rel = rel;
+       cb->cb_fork = fork;
+       cb->cb_pages_per_segment = pages_per_segment;
+       cb->cb_cache = cb_cache_create(mcxt, index_segments_moved);
+       cb->cb_insert_relfilenode = NULL;
+       cb->cb_insert_metabuffer = InvalidBuffer;
+       cb->cb_insert_block = InvalidBlockNumber;
+       cb->cb_insert_buffer = InvalidBuffer;
+       return cb;
+}
+
+/*
+ * Get a new page to be added to a conveyor belt.
+ *
+ * On return, *pageno is set to the logical page number of the newly-added
+ * page, and both the metapage and the returned buffer are exclusively locked.
+ *
+ * The intended use of this function is:
+ *
+ * buffer = ConveyorBeltGetNewPage(cb, &pageno);
+ * page = BufferGetPage(buffer);
+ * START_CRIT_SECTION();
+ * // set page contents
+ * ConveyorBeltPerformInsert(cb, buffer);
+ * END_CRIT_SECTION();
+ * ConveyorBeltCleanupInsert(cb, buffer);
+ *
+ * Note that because this function returns with buffer locks held, it's
+ * important to do as little work as possible after this function returns
+ * and before calling ConveyorBeltPerformInsert(). In particular, it's
+ * completely unsafe to do anything complicated like SearchSysCacheN. Doing
+ * so could result in undetected deadlock on the buffer LWLocks, or cause
+ * a relcache flush that would break ConveyorBeltPerformInsert().
+ *
+ * Also note that the "set page contents" step must put some data in the
+ * page, so that either pd_lower is greater than the minimum value
+ * (SizeOfPageHeaderData) or pd_upper is less than the maximum value
+ * (BLCKSZ).
+ *
+ * In future, we might want to provide the caller with an alternative to
+ * calling ConveyorBeltPerformInsert, because that just logs an FPI for
+ * the new page, and some callers might prefer to manage their own xlog
+ * needs.
+ */
+Buffer
+ConveyorBeltGetNewPage(ConveyorBelt *cb, CBPageNo *pageno)
+{
+       BlockNumber indexblock = InvalidBlockNumber;
+       BlockNumber prevblock = InvalidBlockNumber;
+       BlockNumber fsmblock = InvalidBlockNumber;
+       BlockNumber     possibly_not_on_disk_blkno = CONVEYOR_METAPAGE + 1;
+       Buffer          metabuffer;
+       Buffer          indexbuffer = InvalidBuffer;
+       Buffer          prevbuffer = InvalidBuffer;
+       Buffer          fsmbuffer = InvalidBuffer;
+       Buffer          buffer;
+       CBPageNo        next_pageno;
+       CBPageNo        previous_next_pageno = 0;
+       CBSegNo         free_segno = CB_INVALID_SEGMENT;
+       bool            needs_xlog;
+       int                     mode = BUFFER_LOCK_SHARE;
+       int                     iterations_without_next_pageno_change = 0;
+
+       /*
+        * It would be really bad if someone called this function a second time
+        * while the buffer locks from a previous call were still held. So let's
+        * try to make sure that's not the case.
+        */
+       Assert(!BufferIsValid(cb->cb_insert_metabuffer));
+       Assert(!BufferIsValid(cb->cb_insert_buffer));
+
+       /* Do any changes we make here need to be WAL-logged? */
+       needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+
+       /*
+        * We don't do anything in this function that involves catalog access or
+        * accepts invalidation messages, so it's safe to cache this for the
+        * lifetime of this function. Since we'll return with buffer locks held,
+        * the caller had better not do anything like that either, so this should
+        * also still be valid when ConveyorBeltPerformInsert is called.
+        *
+        * XXX. This seems totally bogus, because we should really be doing
+        * CHECK_FOR_INTERRUPTS(), and that might accept invalidation messages.
+        */
+       cb->cb_insert_relfilenode =
+               &RelationGetSmgr(cb->cb_rel)->smgr_rnode.node;
+
+       /*
+        * Read and pin the metapage.
+        *
+        * Among other things, this prevents concurrent truncations, as per the
+        * discussion in src/backend/access/conveyor/README.
+        */
+       metabuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, CONVEYOR_METAPAGE,
+                                                                       RBM_NORMAL, NULL);
+
+       /*
+        * In the easy case where at least one payload segment exists, the newest
+        * payload segment is not full, and nobody else is trying to insert
+        * concurrently, this loop should only iterate once. However, we might not
+        * be that lucky.
+        *
+        * Since we don't want to hold the lock on the metapage while we go
+        * perform necessary preparatory work (e.g. searching through the FSM
+        * pages for a segment that can be allocated), we may find that after
+        * doing some amount of preparatory work and re-locking the metapage, the
+        * situation has changed under us. So we have to be prepared to keep going
+        * around until we get to a state where there's a non-full payload segment
+        * whose first unused page we can lock before someone else grabs it.
+        */
+       while (1)
+       {
+               CBMetapageData *meta;
+               CBMInsertState insert_state;
+               BlockNumber next_blkno;
+               CBPageNo        index_start;
+               CBPageNo        index_metapage_start;
+               CBSegNo         newest_index_segment;
+               CBSegNo         next_segno;
+               bool            can_allocate_segment;
+
+               /*
+                * Examine the metapage to find out what we think we need to do in
+                * order to complete this operation.
+                *
+                * Initially, mode will be BUFFER_LOCK_SHARE. But if a previous pass
+                * through the loop found that we needed to allocate a new payload or
+                * index segment or move index entries out of the metapage, it will
+                * be BUFFER_LOCK_EXCLUSIVE. That's so that if nothing has changed
+                * concurrently, we can complete the operation before releasing the
+                * lock on the metapage.
+                *
+                * NB: Our rule is that the lock on the metapage is acquired last,
+                * after all other buffer locks. If any of indexbuffer, prevbuffer,
+                * and fsmbuffer are valid, they are also exclusively locked at this
+                * point.
+                */
+               LockBuffer(metabuffer, mode);
+               meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+               insert_state = cb_metapage_get_insert_state(meta, &next_blkno,
+                                                                                                       &next_pageno, &next_segno,
+                                                                                                       &index_start,
+                                                                                                       &index_metapage_start,
+                                                                                                       &newest_index_segment);
+
+               /*
+                * There's no fixed upper bound on how many times this loop could
+                * iterate, because some other backend could be currently allocating
+                * pages, and that could prevent us from succeeding in allocating a
+                * page.
+                *
+                * However, if that's happening, the next logical page number should
+                * keep increasing. In the absence of any increase in the next logical
+                * page number, we might still need to iterate a few times, but
+                * not very many. For example, we might read the page the first time
+                * and realize that a new index segment is needed, create it on the
+                * second pass, move index entries into it on the third pass, and
+                * create a payload segment on the fourth pass, but then, barring
+                * concurrent activity, we should succeed in allocating a page on the
+                * next pass.
+                *
+                * Hence, if we loop a large number of times without a change in
+                * the next_pageno value, there's probably a bug. Error out instead
+                * of looping forever.
+                */
+               if (next_pageno > previous_next_pageno)
+               {
+                       previous_next_pageno = next_pageno;
+                       iterations_without_next_pageno_change = 0;
+               }
+               else if (++iterations_without_next_pageno_change >= 10)
+                       elog(ERROR,
+                                "unable to make progress allocating page "
+                                UINT64_FORMAT " (state = %d)",
+                                next_pageno, (int) insert_state);
+
+               /*
+                * next_segno need not exist on disk, but at least the first block
+                * of the previous segment should be there.
+                */
+               if (next_segno > 0)
+               {
+                       BlockNumber last_segno_first_blkno;
+
+                       last_segno_first_blkno =
+                               cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                       next_segno - 1, 0);
+                       if (last_segno_first_blkno > possibly_not_on_disk_blkno)
+                               possibly_not_on_disk_blkno = last_segno_first_blkno + 1;
+               }
+
+               /*
+                * If we need to allocate a payload or index segment, and we don't
+                * currently have a candidate, check whether the metapage knows of a
+                * free segment.
+                */
+               if ((insert_state == CBM_INSERT_NEEDS_PAYLOAD_SEGMENT ||
+                        insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT)
+                       && free_segno == CB_INVALID_SEGMENT)
+                       free_segno = cb_metapage_find_free_segment(meta);
+
+               /*
+                * If we need a new payload or index segment, see whether it's
+                * possible to complete that operation on this trip through the loop.
+                *
+                * This will only be possible if we've got an exclusive lock on the
+                * metapage.
+                *
+                * Furthermore, by rule, we cannot allocate a segment unless at least
+                * the first page of that segment is guaranteed to be on disk. This is
+                * certain to be true for any segment that's been allocated
+                * previously, but otherwise it's only true if we've verified that the
+                * size of the relation on disk is large enough.
+                */
+               if (mode != BUFFER_LOCK_EXCLUSIVE ||
+                       free_segno == CB_INVALID_SEGMENT ||
+                       (insert_state != CBM_INSERT_NEEDS_PAYLOAD_SEGMENT
+                        && insert_state != CBM_INSERT_NEEDS_INDEX_SEGMENT))
+                       can_allocate_segment = false;
+               else
+               {
+                       BlockNumber     free_segno_first_blkno;
+
+                       free_segno_first_blkno =
+                               cb_segment_to_block(cb->cb_pages_per_segment, free_segno, 0);
+                       can_allocate_segment =
+                               (free_segno_first_blkno < possibly_not_on_disk_blkno);
+               }
+
+               /*
+                * If it still looks like we can allocate, check for the case where we
+                * need a new index segment but don't have the other required buffer
+                * locks.
+                */
+               if (can_allocate_segment &&
+                       insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT &&
+                       (!BufferIsValid(indexbuffer) || (!BufferIsValid(prevbuffer)
+                       && newest_index_segment != CB_INVALID_SEGMENT)))
+                       can_allocate_segment = false;
+
+               /*
+                * If it still looks like we can allocate, check for the case where
+                * the segment we planned to allocate is no longer free.
+                */
+               if (can_allocate_segment)
+               {
+                       /* fsmbuffer, if valid, is already exclusively locked. */
+                       if (BufferIsValid(fsmbuffer))
+                               can_allocate_segment =
+                                       !cb_fsmpage_get_fsm_bit(BufferGetPage(fsmbuffer),
+                                                                                       free_segno);
+                       else
+                               can_allocate_segment =
+                                       !cb_metapage_get_fsm_bit(meta, free_segno);
+
+                       /*
+                        * If this segment turned out not to be free, we need a new
+                        * candidate. Check the metapage here, and if that doesn't work
+                        * out, free_segno will end up as CB_INVALID_SEGMENT, and we'll
+                        * search the FSM pages further down.
+                        */
+                       if (!can_allocate_segment)
+                               free_segno = cb_metapage_find_free_segment(meta);
+               }
+
+               /* If it STILL looks like we can allocate, do it! */
+               if (can_allocate_segment)
+               {
+                       if (insert_state == CBM_INSERT_NEEDS_PAYLOAD_SEGMENT)
+                       {
+                               cb_allocate_payload_segment(cb->cb_insert_relfilenode,
+                                                                                       cb->cb_fork, metabuffer,
+                                                                                       fsmblock, fsmbuffer, free_segno,
+                                                                                       free_segno >= next_segno,
+                                                                                       needs_xlog);
+
+                               /*
+                                * We know for sure that there's now a payload segment that
+                                * isn't full - and we know exactly where it's located.
+                                */
+                               insert_state = CBM_INSERT_OK;
+                               next_blkno = cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                                                free_segno, 0);
+                       }
+                       else
+                       {
+                               Assert(insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT);
+
+                               cb_allocate_index_segment(cb->cb_insert_relfilenode,
+                                                                                 cb->cb_fork, metabuffer,
+                                                                                 indexblock, indexbuffer,
+                                                                                 prevblock, prevbuffer,
+                                                                                 fsmblock, fsmbuffer, free_segno,
+                                                                                 index_metapage_start,
+                                                                                 free_segno >= next_segno,
+                                                                                 needs_xlog);
+
+                               /*
+                                * We know for sure that there's now an index segment that
+                                * isn't full, and our next move must be to relocate some
+                                * index entries to that index segment.
+                                */
+                               insert_state = CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED;
+                               next_blkno = indexblock;
+                       }
+
+                       /*
+                        * Either way, the segment we just allocated is no longer free.
+                        */
+                       free_segno = CB_INVALID_SEGMENT;
+               }
+
+               /*
+                * If we need to relocate index entries and if we have a lock on the
+                * correct index block, then go ahead and do it.
+                */
+               if (insert_state == CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED &&
+                       next_blkno == indexblock)
+               {
+                       unsigned        pageoffset;
+                       unsigned        num_index_entries;
+                       CBSegNo     index_entries[CB_METAPAGE_INDEX_ENTRIES];
+                       CBPageNo        index_page_start;
+                       unsigned        logical_pages_in_index_segments;
+                       unsigned        index_entries_in_index_segments;
+
+                       logical_pages_in_index_segments =
+                               index_metapage_start - index_start;
+                       if (logical_pages_in_index_segments % cb->cb_pages_per_segment != 0)
+                               elog(ERROR, "index starts at " UINT64_FORMAT ", metapage index at " UINT64_FORMAT ", but there are %u pages per segment",
+                                        index_start, index_metapage_start,
+                                        cb->cb_pages_per_segment);
+                       index_entries_in_index_segments =
+                               logical_pages_in_index_segments / cb->cb_pages_per_segment;
+                       pageoffset =
+                               index_entries_in_index_segments % CB_INDEXPAGE_INDEX_ENTRIES;
+
+                       num_index_entries = Min(CB_METAPAGE_INDEX_ENTRIES,
+                                                                       CB_INDEXPAGE_INDEX_ENTRIES - pageoffset);
+                       cb_metapage_get_index_entries(meta, num_index_entries,
+                                                                                 index_entries);
+                       index_page_start = index_metapage_start -
+                               pageoffset * cb->cb_pages_per_segment;
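+                       /*
+                        * For illustration, with hypothetical numbers: if there are 10
+                        * pages per segment, index_start is 0, and index_metapage_start
+                        * is 250, then the on-disk index segments hold 25 index entries.
+                        * If an index page could hold 20 entries, pageoffset would be
+                        * 25 % 20 = 5, the current index page could absorb up to 15 more
+                        * entries, and index_page_start would be 250 - 5 * 10 = 200.
+                        */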
+                       cb_relocate_index_entries(cb->cb_insert_relfilenode, cb->cb_fork,
+                                                                         metabuffer, indexblock, indexbuffer,
+                                                                         pageoffset, num_index_entries,
+                                                                         index_entries, index_page_start,
+                                                                         needs_xlog);
+               }
+
+               /* Release buffer locks and, except for the metapage, also pins. */
+               LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK);
+               if (BufferIsValid(indexbuffer))
+               {
+                       UnlockReleaseBuffer(indexbuffer);
+                       indexblock = InvalidBlockNumber;
+                       indexbuffer = InvalidBuffer;
+               }
+               if (BufferIsValid(prevbuffer))
+               {
+                       UnlockReleaseBuffer(prevbuffer);
+                       prevblock = InvalidBlockNumber;
+                       prevbuffer = InvalidBuffer;
+               }
+               if (BufferIsValid(fsmbuffer))
+               {
+                       UnlockReleaseBuffer(fsmbuffer);
+                       fsmblock = InvalidBlockNumber;
+                       fsmbuffer = InvalidBuffer;
+               }
+
+               if (insert_state != CBM_INSERT_OK)
+               {
+                       /*
+                        * Some sort of preparatory work will be needed in order to insert
+                        * a new page, which will require modifying the metapage.
+                        * Therefore, next time we lock it, we had better grab an
+                        * exclusive lock.
+                        */
+                       mode = BUFFER_LOCK_EXCLUSIVE;
+               }
+               else
+               {
+                       /* Extend the relation if needed. */
+                       buffer = ConveyorBeltExtend(cb, next_blkno,
+                                                                               &possibly_not_on_disk_blkno);
+                       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+                       /*
+                        * If the target buffer is still unused, we're done. Otherwise,
+                        * someone else grabbed that page before we did, so we must fall
+                        * through and retry.
+                        */
+                       if (ConveyorBeltPageIsUnused(BufferGetPage(buffer)))
+                       {
+                               /*
+                                * Remember things that we'll need to know when the caller
+                                * invokes ConveyorBeltPerformInsert and
+                                * ConveyorBeltCleanupInsert.
+                                */
+                               cb->cb_insert_block = next_blkno;
+                               cb->cb_insert_buffer = buffer;
+                               cb->cb_insert_metabuffer = metabuffer;
+
+                               /* Success, so escape toplevel retry loop. */
+                               break;
+                       }
+
+                       /* We'll have to retry with a different buffer. */
+                       UnlockReleaseBuffer(buffer);
+               }
+
+               /*
+                * If the metapage has no more space for index entries, but there's
+                * an index segment into which some of the existing ones could be
+                * moved, then cb_metapage_get_insert_state will have set next_blkno
+                * to point to the block to which index entries should be moved.
+                *
+                * If the target index segment is the very last one in the conveyor
+                * belt and we're using the pages of that segment for the very first
+                * time, the target page may not exist yet, so be prepared to extend
+                * the relation.
+                */
+               if (insert_state == CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED)
+               {
+                       indexblock = next_blkno;
+                       indexbuffer = ConveyorBeltExtend(cb, indexblock,
+                                                                                        &possibly_not_on_disk_blkno);
+               }
+
+               /*
+                * If we need to add a new index segment and it's not the very first
+                * one, we'll have to update the newest index page with a pointer to
+                * the index page we're going to add, so we must read and pin that
+                * page.
+                *
+                * The names "prevblock" and "prevbuffer" are intended to signify that
+                * what is currently the newest index segment will become the previous
+                * segment relative to the one we're going to add.
+                */
+               if (insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT &&
+                       newest_index_segment != CB_INVALID_SEGMENT)
+               {
+                       prevblock = cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                                       newest_index_segment, 0);
+                       prevbuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+                                                                                       prevblock, RBM_NORMAL, NULL);
+               }
+
+               /*
+                * If we need to add a new segment of either type, make provisions to
+                * do so.
+                */
+               if (insert_state == CBM_INSERT_NEEDS_PAYLOAD_SEGMENT ||
+                       insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT)
+               {
+                       /*
+                        * Search the FSM pages (and create a new one if needed) for a
+                        * free segment, unless we already have a candidate.
+                        */
+                       if (free_segno == CB_INVALID_SEGMENT)
+                               free_segno = ConveyorSearchFSMPages(cb, next_segno, &fsmblock,
+                                                                                                       &fsmbuffer);
+
+                       if (free_segno > next_segno)
+                       {
+                               /*
+                                * If the FSM thinks that we ought to allocate a segment
+                                * beyond what we think to be the very next one, then someone
+                                * else must have concurrently added a segment, so we'll need
+                                * to loop around, retake the metapage lock, refresh our
+                                * knowledge of next_segno, and then find a new segment to
+                                * allocate.
+                                */
+                               free_segno = CB_INVALID_SEGMENT;
+                       }
+                       else if (free_segno == next_segno)
+                       {
+                               BlockNumber free_block;
+                               Buffer          free_buffer;
+
+                               /*
+                                * We're allocating a new segment. At least the first page must
+                                * exist on disk before we perform the allocation, which means
+                                * we may need to add blocks to the relation fork.
+                                */
+                               free_block = cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                                                free_segno, 0);
+                               free_buffer = ConveyorBeltExtend(cb, free_block,
+                                                                                                &possibly_not_on_disk_blkno);
+                               if (insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT)
+                               {
+                                       indexblock = free_block;
+                                       indexbuffer = free_buffer;
+                               }
+                               else
+                                       ReleaseBuffer(free_buffer);
+                       }
+               }
+
+               /*
+                * Prepare for next attempt by reacquiring all relevant buffer locks,
+                * except for the one on the metapage, which is acquired at the top of
+                * the loop.
+                */
+               if (BufferIsValid(indexbuffer))
+                       LockBuffer(indexbuffer, BUFFER_LOCK_EXCLUSIVE);
+               if (BufferIsValid(prevbuffer))
+                       LockBuffer(prevbuffer, BUFFER_LOCK_EXCLUSIVE);
+               if (BufferIsValid(fsmbuffer))
+                       LockBuffer(fsmbuffer, BUFFER_LOCK_EXCLUSIVE);
+       }
+
+       /*
+        * Relock the metapage. Caller should immediately start a critical section
+        * and populate the buffer.
+        */
+       LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+
+       /* All done. */
+       *pageno = next_pageno;
+       return buffer;
+}
+
+/*
+ * Actually insert a new page into the conveyor belt.
+ *
+ * See ConveyorBeltGetNewPage for the intended usage of this function.
+ */
+void
+ConveyorBeltPerformInsert(ConveyorBelt *cb, Buffer buffer)
+{
+       bool            needs_xlog;
+
+       /*
+        * We don't really need the caller to tell us which buffer is involved,
+        * because we already have that information. We insist on it anyway as a
+        * debugging cross-check.
+        */
+       if (cb->cb_insert_buffer != buffer)
+       {
+               if (BufferIsValid(cb->cb_insert_buffer))
+                       elog(ERROR,
+                                "pending insert expected for buffer %u but got buffer %u",
+                                cb->cb_insert_buffer, buffer);
+               else
+                       elog(ERROR, "there is no pending insert");
+       }
+
+       /*
+        * ConveyorBeltPageIsUnused is used by ConveyorBeltGetNewPage to figure
+        * out whether a concurrent inserter got there first. Here, we're the
+        * concurrent inserter, and must have initialized the page in a way that
+        * makes that function return false for the newly-inserted page, so that
+        * other backends can tell we got here first.
+        */
+       if (ConveyorBeltPageIsUnused(BufferGetPage(buffer)))
+               elog(ERROR, "can't insert an unused page");
+
+       /* Caller should be doing this inside a critical section. */
+       Assert(CritSectionCount > 0);
+
+       /* We should have the details stashed by ConveyorBeltGetNewPage. */
+       Assert(cb->cb_insert_relfilenode != NULL);
+       Assert(BufferIsValid(cb->cb_insert_metabuffer));
+       Assert(BufferIsValid(cb->cb_insert_buffer));
+       Assert(BlockNumberIsValid(cb->cb_insert_block));
+
+       /* Update metapage, mark buffers dirty, and write XLOG if required. */
+       needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+       cb_insert_payload_page(cb->cb_insert_relfilenode, cb->cb_fork,
+                                                  cb->cb_insert_metabuffer,
+                                                  cb->cb_insert_block, buffer,
+                                                  needs_xlog);
+
+       /*
+        * Buffer locks will be released by ConveyorBeltCleanupInsert, but we can
+        * invalidate some other fields now.
+        */
+       cb->cb_insert_relfilenode = NULL;
+       cb->cb_insert_block = InvalidBlockNumber;
+}
+
+/*
+ * Clean up following the insertion of a new page into the conveyor belt.
+ *
+ * See ConveyorBeltGetNewPage for the intended usage of this function.
+ */
+void
+ConveyorBeltCleanupInsert(ConveyorBelt *cb, Buffer buffer)
+{
+       /* Debugging cross-check, like ConveyorBeltPerformInsert. */
+       if (cb->cb_insert_buffer != buffer)
+       {
+               if (BufferIsValid(cb->cb_insert_buffer))
+                       elog(ERROR,
+                                "pending insert expected for buffer %u but got buffer %u",
+                                cb->cb_insert_buffer, buffer);
+               else
+                       elog(ERROR, "there is no pending insert");
+       }
+
+       /* Release buffer locks and pins. */
+       Assert(BufferIsValid(cb->cb_insert_buffer));
+       Assert(BufferIsValid(cb->cb_insert_metabuffer));
+       UnlockReleaseBuffer(cb->cb_insert_buffer);
+       UnlockReleaseBuffer(cb->cb_insert_metabuffer);
+       cb->cb_insert_buffer = InvalidBuffer;
+       cb->cb_insert_metabuffer = InvalidBuffer;
+}
+
+/*
+ * Read a logical page from a conveyor belt. If the page has already been
+ * truncated away or has not yet been created, returns InvalidBuffer.
+ * Otherwise, reads the page using the given strategy and locks it using
+ * the given buffer lock mode.
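+ *
+ * A typical read might look like this (a sketch; error handling and the
+ * interpretation of the payload page are up to the caller):
+ *
+ * buffer = ConveyorBeltReadBuffer(cb, pageno, BUFFER_LOCK_SHARE, NULL);
+ * if (BufferIsValid(buffer))
+ * {
+ *     page = BufferGetPage(buffer);
+ *     // examine page contents
+ *     UnlockReleaseBuffer(buffer);
+ * }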
+ */
+Buffer
+ConveyorBeltReadBuffer(ConveyorBelt *cb, CBPageNo pageno, int mode,
+                                          BufferAccessStrategy strategy)
+{
+       BlockNumber index_blkno,
+                               payload_blkno;
+       Buffer          metabuffer,
+                               index_buffer,
+                               payload_buffer;
+       CBMetapageData *meta;
+       CBPageNo        index_start,
+                               index_metapage_start,
+                               target_index_segment_start;
+       CBSegNo         oldest_index_segment,
+                               newest_index_segment,
+                               index_segno;
+       unsigned        lppis,
+                               segoff;
+       uint64          index_segments_moved;
+
+       Assert(mode == BUFFER_LOCK_EXCLUSIVE || mode == BUFFER_LOCK_SHARE);
+
+       /*
+        * Lock the metapage and get all the information we need from it. Then
+        * drop the lock on the metapage, but retain the pin, so that neither the
+        * target payload page nor any index page we might need to access can be
+        * concurrently truncated away. See the README for further details.
+        */
+       metabuffer = ConveyorBeltRead(cb, CONVEYOR_METAPAGE, BUFFER_LOCK_SHARE);
+       meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+       if (!cb_metapage_find_logical_page(meta, pageno, &payload_blkno))
+       {
+               /* Page number too old or too new. */
+               UnlockReleaseBuffer(metabuffer);
+               return InvalidBuffer;
+       }
+       if (payload_blkno != InvalidBlockNumber)
+       {
+               /* Index entry for payload page found on metapage. */
+               LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK);
+               payload_buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+                                                                                       payload_blkno, RBM_NORMAL,
+                                                                                       strategy);
+               LockBuffer(payload_buffer, mode);
+               ReleaseBuffer(metabuffer);
+               return payload_buffer;
+       }
+       cb_metapage_get_index_info(meta, &index_start, &index_metapage_start,
+                                                          &oldest_index_segment, &newest_index_segment,
+                                                          &index_segments_moved);
+       LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK);
+
+       /* Invalidate any obsolete cache entries. */
+       cb_cache_invalidate(cb->cb_cache, index_start, index_segments_moved);
+
+       /*
+        * It's convenient to identify index segments in terms of the first
+        * logical page for which that index segment contains the necessary index
+        * entry. So, take the page number that we were given, and back it up to
+        * the previous index-segment boundary.
+        */
+       lppis = cb_logical_pages_per_index_segment(cb->cb_pages_per_segment);
+       target_index_segment_start = pageno - (pageno - index_start) % lppis;
+
+       /* Search the cache first. Try other strategies if that does not work. */
+       index_segno = cb_cache_lookup(cb->cb_cache, target_index_segment_start);
+       if (index_segno == CB_INVALID_SEGMENT)
+       {
+               if (index_start == target_index_segment_start)
+               {
+                       /* Looks like it's the oldest index segment. */
+                       index_segno = oldest_index_segment;
+               }
+               else if (index_metapage_start - lppis == target_index_segment_start)
+               {
+                       /*
+                        * Looks like it's the newest index segment.
+                        *
+                        * It's worth adding a cache entry for this, because we might end
+                        * up needing it again later, when it's no longer the newest
+                        * entry.
+                        */
+                       index_segno = newest_index_segment;
+                       cb_cache_insert(cb->cb_cache, index_segno,
+                                                       target_index_segment_start);
+               }
+               else
+               {
+                       CBPageNo        index_segment_start;
+
+                       /*
+                        * We don't know where it is and it's not the first or last index
+                        * segment, so we have to walk the chain of index segments to find
+                        * it.
+                        *
+                        * That's possibly going to be slow, especially if there are a lot
+                        * of index segments. However, maybe we can make it a bit faster.
+                        * Instead of starting with the oldest segment and moving forward
+                        * one segment at a time until we find the one we want, search the
+                        * cache for the index segment that most nearly precedes the one
+                        * we want.
+                        */
+                       index_segno = cb_cache_fuzzy_lookup(cb->cb_cache,
+                                                                                               target_index_segment_start,
+                                                                                               &index_segment_start);
+                       if (index_segno == CB_INVALID_SEGMENT)
+                       {
+                               /*
+                                * Sadly, the cache is either entirely empty or at least has
+                                * no entries for any segments older than the one we want, so
+                                * we have to start our search from the oldest segment.
+                                */
+                               index_segno = oldest_index_segment;
+                               index_segment_start = index_start;
+                       }
+
+                       /*
+                        * Here's where we actually search. Make sure to cache the
+                        * results, in case there are more lookups later.
+                        */
+                       while (index_segment_start < target_index_segment_start)
+                       {
+                               CHECK_FOR_INTERRUPTS();
+
+                               index_blkno = cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                                                 index_segno, 0);
+                               index_buffer = ConveyorBeltRead(cb, index_blkno,
+                                                                                               BUFFER_LOCK_SHARE);
+                               index_segno =
+                                       cb_indexpage_get_next_segment(BufferGetPage(index_buffer));
+                               UnlockReleaseBuffer(index_buffer);
+                               index_segment_start += lppis;
+                               cb_cache_insert(cb->cb_cache, index_segno, index_segment_start);
+                       }
+               }
+       }
+
+       /*
+        * We know which index segment we need to read, so now figure out which
+        * page we need from that segment, and then which physical block we need.
+        */
+       segoff = (pageno - target_index_segment_start) /
+               cb_logical_pages_per_index_page(cb->cb_pages_per_segment);
+       index_blkno = cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                         index_segno, segoff);
+
+       /* Read the required index entry. */
+       index_buffer = ConveyorBeltRead(cb, index_blkno, BUFFER_LOCK_SHARE);
+       payload_blkno = cb_indexpage_find_logical_page(BufferGetPage(index_buffer),
+                                                                                                  pageno,
+                                                                                                  cb->cb_pages_per_segment);
+       UnlockReleaseBuffer(index_buffer);
+
+       /* Now we can read and lock the actual payload block. */
+       payload_buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+                                                                               payload_blkno, RBM_NORMAL,
+                                                                               strategy);
+       LockBuffer(payload_buffer, mode);
+
+       /*
+        * Since we've now got the payload block locked, we can release the pin on
+        * the metapage.
+        */
+       ReleaseBuffer(metabuffer);
+       return payload_buffer;
+}
+
+/*
+ * Find out which logical page numbers are currently valid.
+ *
+ * On return, *oldest_logical_page will be set to the smallest page number
+ * that has not yet been removed by truncation, and *next_logical_page will
+ * be set to the smallest page number that does not yet exist.
+ *
+ * Note that, unless the caller knows that there cannot be concurrent
+ * truncations or insertions in progress, either value might be out of
+ * date by the time it is used.
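+ *
+ * For example, a caller could visit every currently-valid page like this
+ * (a sketch that ignores concurrent truncations and insertions):
+ *
+ * ConveyorBeltGetBounds(cb, &oldest, &next);
+ * for (pageno = oldest; pageno < next; ++pageno)
+ * {
+ *     buffer = ConveyorBeltReadBuffer(cb, pageno, BUFFER_LOCK_SHARE, NULL);
+ *     if (BufferIsValid(buffer))
+ *         UnlockReleaseBuffer(buffer);
+ * }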
+ */
+void
+ConveyorBeltGetBounds(ConveyorBelt *cb, CBPageNo *oldest_logical_page,
+                                         CBPageNo *next_logical_page)
+{
+       Buffer          metabuffer;
+       CBMetapageData *meta;
+
+       metabuffer = ConveyorBeltRead(cb, CONVEYOR_METAPAGE, BUFFER_LOCK_SHARE);
+       meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+       cb_metapage_get_bounds(meta, oldest_logical_page, next_logical_page);
+       UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * Update the conveyor belt's notion of the oldest logical page to be kept.
+ *
+ * This doesn't physically shrink the relation, nor does it even make space
+ * available for reuse by future insertions. It just makes pages prior to
+ * 'oldest_keeper' unavailable, thus potentially allowing the segments
+ * containing those pages to be freed by a future call to ConveyorBeltVacuum.
+ *
+ * A call to this function shouldn't try to move the logical truncation point
+ * backwards. That is, the value of 'oldest_keeper' should always be greater
+ * than or equal to the value passed on the previous call for this conveyor
+ * belt. It also shouldn't try to move the logical truncation point beyond
+ * the current insertion point: don't try to throw away data that hasn't been
+ * inserted yet!
+ *
+ * For routine cleanup of a conveyor belt, the recommended sequence of calls
+ * is ConveyorBeltLogicalTruncate then ConveyorBeltVacuum then
+ * ConveyorBeltPhysicalTruncate. For more aggressive cleanup options, see
+ * ConveyorBeltCompact or ConveyorBeltRewrite.
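+ *
+ * So a routine cleanup pass might look like this (a sketch; the caller must
+ * choose oldest_keeper subject to the rules above):
+ *
+ * ConveyorBeltLogicalTruncate(cb, oldest_keeper);
+ * ConveyorBeltVacuum(cb);
+ * ConveyorBeltPhysicalTruncate(cb);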
+ */
+void
+ConveyorBeltLogicalTruncate(ConveyorBelt *cb, CBPageNo oldest_keeper)
+{
+       Buffer          metabuffer;
+       CBMetapageData *meta;
+       CBPageNo        oldest_logical_page;
+       CBPageNo        next_logical_page;
+       RelFileNode *rnode;
+       bool            needs_xlog;
+
+       /*
+        * We must take a cleanup lock to adjust the logical truncation point,
+        * as per the locking protocols in src/backend/access/conveyor/README.
+        */
+       metabuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, CONVEYOR_METAPAGE,
+                                                                       RBM_NORMAL, NULL);
+       LockBufferForCleanup(metabuffer);
+
+       /* Sanity checks. */
+       meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+       cb_metapage_get_bounds(meta, &oldest_logical_page, &next_logical_page);
+       if (oldest_keeper < oldest_logical_page)
+               elog(ERROR,
+                        "can't move truncation point backwards from " UINT64_FORMAT " to " UINT64_FORMAT,
+                        oldest_logical_page, oldest_keeper);
+       if (oldest_keeper > next_logical_page)
+               elog(ERROR,
+                        "can't move truncation point to " UINT64_FORMAT " beyond insert point " UINT64_FORMAT,
+                        oldest_keeper, next_logical_page);
+
+       /* Do the real work. */
+       rnode = &RelationGetSmgr(cb->cb_rel)->smgr_rnode.node;
+       needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+       cb_logical_truncate(rnode, cb->cb_fork, metabuffer, oldest_keeper,
+                                               needs_xlog);
+
+       /* Release buffer lock. */
+       UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * Recycle segments that are no longer needed.
+ *
+ * Payload segments all of whose pages precede the logical truncation point
+ * can be deallocated. Index segments can be deallocated once they no longer
+ * contain any pointers to payload segments.
+ *
+ * Only one backend should call this at a time for any given conveyor belt.
+ */
+void
+ConveyorBeltVacuum(ConveyorBelt *cb)
+{
+       Buffer          metabuffer;
+       BlockNumber     fsmblock = InvalidBlockNumber;
+       Buffer          fsmbuffer = InvalidBuffer;
+       CBSegNo         cleared_segno = CB_INVALID_SEGMENT;
+       bool            needs_xlog;
+       bool            cleaned_index_segments = false;
+
+       /* Do any changes we make here need to be WAL-logged? */
+       needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+
+       /* Read and pin the metapage. */
+       metabuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, CONVEYOR_METAPAGE,
+                                                                       RBM_NORMAL, NULL);
+       LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+
+       /*
+        * Main loop.
+        *
+        * At the top of each loop iteration, the metabuffer is pinned and
+        * exclusively locked.  The lock and even the pin may be released by code
+        * inside this loop, but they must be reacquired before beginning the next
+        * iteration.
+        */
+       while (1)
+       {
+               CBMetapageData     *meta;
+               CBMObsoleteState        obsolete_state;
+               CBSegNo         oldest_index_segment;
+               CBPageNo        index_vacuum_stop_point;
+               CBSegNo         metapage_segno;
+               unsigned        metapage_offset;
+
+               /* Assess what kind of work needs to be done. */
+               meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+               obsolete_state =
+                       cb_metapage_get_obsolete_state(meta, &oldest_index_segment,
+                                                                                  &index_vacuum_stop_point,
+                                                                                  &metapage_segno, &metapage_offset);
+
+               /*
+                * If on the previous pass through the loop we concluded that we need
+                * to free a payload segment referenced by the metapage, but that no
+                * longer seems to be the right thing to do, then release any lock and
+                * pin we may have acquired in preparation for freeing that payload
+                * segment.
+                */
+               if ((obsolete_state != CBM_OBSOLETE_METAPAGE_ENTRIES ||
+                       metapage_segno != cleared_segno) && fsmblock != InvalidBlockNumber)
+               {
+                       UnlockReleaseBuffer(fsmbuffer);
+                       fsmblock = InvalidBlockNumber;
+                       fsmbuffer = InvalidBuffer;
+               }
+
+               /*
+                * Attempt to do whatever useful work seems to be possible based on
+                * obsolete_state.
+                */
+               if (obsolete_state == CBM_OBSOLETE_NOTHING)
+               {
+                       /*
+                        * There is nothing to vacuum.
+                        */
+                       UnlockReleaseBuffer(metabuffer);
+                       return;
+               }
+               else if (obsolete_state == CBM_OBSOLETE_METAPAGE_START)
+               {
+                       /*
+                        * No real work to do, but there are some already-cleared entries
+                        * at the start of the metapage which we should remove to make more
+                        * space for new entries.
+                        */
+                       cb_shift_metapage_index(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node,
+                                                                       cb->cb_fork, metabuffer, metapage_offset, needs_xlog);
+                       UnlockReleaseBuffer(metabuffer);
+                       return;
+               }
+               else if (obsolete_state == CBM_OBSOLETE_METAPAGE_ENTRIES)
+               {
+                       /*
+                        * The metapage contains entries for one or more payload segments
+                        * which can be deallocated.
+                        */
+                       if (metapage_segno != cleared_segno)
+                       {
+                               /*
+                                * We can only recycle a payload segment after clearing the
+                                * pages in that segment. Since we have not done that yet,
+                                * do it now. First release the buffer lock on the metapage,
+                                * to avoid interfering with other use of the conveyor belt.
+                                */
+                               LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK);
+                               ConveyorBeltClearSegment(cb, metapage_segno, true);
+                               cleared_segno = metapage_segno;
+
+                               /*
+                                * Lock the relevant FSM page, if it's not the metapage.
+                                * Per src/backend/access/conveyor/README's locking rules,
+                                * we must do this before relocking the metapage.
+                                */
+                               fsmblock = ConveyorBeltFSMBlockNumber(cb, cleared_segno);
+                               if (fsmblock != InvalidBlockNumber)
+                                       fsmbuffer = ConveyorBeltRead(cb, fsmblock,
+                                                                                                BUFFER_LOCK_EXCLUSIVE);
+
+                               /*
+                                * OK, now reacquire a lock on the metapage and loop around.
+                                * Hopefully, the next pass will succeed in freeing a payload
+                                * segment.
+                                */
+                               LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+                       }
+                       else
+                       {
+                               /*
+                                * The previous pass through the loop made preparations to
+                                * free this payload segment, so now we can do it.
+                                */
+                               cb_recycle_payload_segment(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node,
+                                                                                  cb->cb_fork,
+                                                                                  metabuffer,
+                                                                                  InvalidBlockNumber, InvalidBuffer,
+                                                                                  fsmblock, fsmbuffer,
+                                                                                  cleared_segno, metapage_offset,
+                                                                                  needs_xlog);
+                       }
+               }
+               else if (obsolete_state == CBM_OBSOLETE_SEGMENT_ENTRIES)
+               {
+                       unsigned        empty_index_segments = 0;
+                       CBSegNo         index_segment = oldest_index_segment;
+
+                       /*
+                        * Do this part just once. A single pass through the logic below
+                        * should clean out the index segments as completely as possible,
+                        * so if we end up here again, either the logical truncation point
+                        * changed concurrently, or there's actually nothing to do. Even
+                        * in the former case, it's OK to return without doing anything
+                        * further, because this function only promises to clean up data
+                        * that was no longer needed as of the time it was called. It makes
+                        * no promises about cleaning up things that only became
+                        * obsolete after it started running.
+                        */
+                       if (cleaned_index_segments)
+                       {
+                               UnlockReleaseBuffer(metabuffer);
+                               break;
+                       }
+                       cleaned_index_segments = true;
+
+                       /*
+                        * Release lock on metapage before locking other pages, but keep
+                        * the pin for efficiency and so that no index segments can
+                        * disappear concurrently.
+                        */
+                       LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK);
+
+                       /*
+                        * Clear as many obsolete index entries out of index segments as
+                        * we can.
+                        */
+                       while (index_segment != CB_INVALID_SEGMENT &&
+                                  ConveyorBeltClearIndexSegmentEntries(cb, metabuffer,
+                                                                                                               index_segment,
+                                                                                                               index_vacuum_stop_point,
+                                                                                                               &index_segment))
+                               ++empty_index_segments;
+
+                       /*
+                        * Free old index segments.
+                        *
+                        * We might stop before freeing the requested number of index
+                        * segments, due to concurrent locking. If that happens,
+                        * give up on performing any further cleanup.
+                        */
+                       while (empty_index_segments > 0)
+                       {
+                               oldest_index_segment =
+                                       ConveyorBeltFreeOldestIndexSegment(cb, metabuffer,
+                                                                                                          oldest_index_segment,
+                                                                                                          index_vacuum_stop_point);
+                               --empty_index_segments;
+                               if (empty_index_segments > 0 &&
+                                       oldest_index_segment == CB_INVALID_SEGMENT)
+                               {
+                                       ReleaseBuffer(metabuffer);
+                                       return;
+                               }
+                       }
+
+                       /*
+                        * If we freed some but not all index segments, all the entries in
+                        * the metapage are still needed, so there is no point in trying to
+                        * clean it up.
+                        */
+                       if (oldest_index_segment != CB_INVALID_SEGMENT)
+                       {
+                               ReleaseBuffer(metabuffer);
+                               return;
+                       }
+
+                       /*
+                        * Relock the metapage prior to looping around. We may still be
+                        * able to clear index entries from the metapage, or adjust the
+                        * start of the metapage index.
+                        */
+                       LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+               }
+       }
+}
+
+/*
+ * Clear obsolete index entries from a segment.
+ *
+ * metabuffer should be pinned but not locked when this function is called,
+ * and will be in the same state upon return.
+ *
+ * index_segment specifies the target index segment.
+ *
+ * index_vacuum_stop_point defines the point beyond which no index entries
+ * may be removed. If an index entry is found that covers, in whole or in
+ * part, any page greater than or equal to this value, this function does
+ * nothing further and returns false. If this limit is never reached, this
+ * function returns true.
+ *
+ * *next_index_segment is set to the segment number of the index segment
+ * that follows the one specified by index_segment, or CB_INVALID_SEGMENT
+ * if none.
+ */
+static bool
+ConveyorBeltClearIndexSegmentEntries(ConveyorBelt *cb, Buffer metabuffer,
+                                                                        CBSegNo index_segment,
+                                                                        CBPageNo index_vacuum_stop_point,
+                                                                        CBSegNo *next_index_segment)
+{
+       bool            needs_xlog;
+       bool            need_next_segment = true;
+       unsigned        segoff;
+       BlockNumber     fsmblock = InvalidBlockNumber;
+       Buffer          fsmbuffer = InvalidBuffer;
+
+       /* Do we need to write XLOG for operations on this conveyor belt? */
+       needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+
+       for (segoff = 0; segoff < cb->cb_pages_per_segment; ++segoff)
+       {
+               BlockNumber     indexblock;
+               Buffer          indexbuffer;
+               Page            indexpage;
+               unsigned        pageoffset = 0;
+               CBSegNo         cleared_segno = CB_INVALID_SEGMENT;
+
+               indexblock = cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                                index_segment, segoff);
+               indexbuffer = ConveyorBeltRead(cb, indexblock, BUFFER_LOCK_EXCLUSIVE);
+               indexpage = BufferGetPage(indexbuffer);
+
+               /*
+                * If an index segment page is not initialized, treat it the same
+                * way as if it were initialized but contained no entries.
+                */
+               if (ConveyorBeltPageIsUnused(indexpage))
+               {
+                       if (segoff == 0)
+                               elog(ERROR,
+                                        "conveyor belt index page at segno %u offset 0 should be initialized",
+                                        index_segment);
+                       if (*next_index_segment != CB_INVALID_SEGMENT)
+                               elog(ERROR,
+                                        "non-final index segment page at segno %u offset %u should be initialized",
+                                        index_segment, segoff);
+                       return true;
+               }
+
+               /*
+                * If this is the very first time we've locked an index page in this
+                * segment, it should be the first page, and it will tell us where to
+                * find the next segment once we finish with this one. Grab that
+                * information while we have the page lock.
+                */
+               if (need_next_segment)
+               {
+                       Assert(segoff == 0);
+                       *next_index_segment = cb_indexpage_get_next_segment(indexpage);
+                       need_next_segment = false;
+               }
+
+               /*
+                * Loop over the index entries in this page.
+                *
+                * At the top of each iteration of the loop, the index page is
+                * exclusively locked. The lock may be released and reacquired before
+                * beginning the next iteration.
+                */
+               while (pageoffset < CB_INDEXPAGE_INDEX_ENTRIES)
+               {
+                       CBSegNo         segno;
+                       CBPageNo        first_page;
+
+                       /* Find, or reconfirm, the location of the next obsolete entry. */
+                       segno = cb_indexpage_get_obsolete_entry(indexpage, &pageoffset,
+                                                                                                       &first_page);
+                       if (segno == CB_INVALID_SEGMENT)
+                       {
+                               /* No items remain in this page. */
+                               break;
+                       }
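+                       /*
+                        * The check below treats the entry at pageoffset as covering
+                        * the cb_pages_per_segment logical pages starting at
+                        * first_page + pageoffset * cb_pages_per_segment; the entry
+                        * must be kept if any page in that range reaches the stop
+                        * point.
+                        */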
+                       if (first_page + (cb->cb_pages_per_segment * pageoffset) +
+                               cb->cb_pages_per_segment > index_vacuum_stop_point)
+                       {
+                               /*
+                                * At least one entry from this page is still needed, so no
+                                * point in visiting future pages in this index segment, and
+                                * no point in visiting any more index segments.
+                                */
+                               UnlockReleaseBuffer(indexbuffer);
+                               if (BufferIsValid(fsmbuffer))
+                                       ReleaseBuffer(fsmbuffer);
+                               return false;
+                       }
+
+                       /*
+                        * If this is the first time we've considered clearing this
+                        * particular payload segment, we'll need to release the buffer
+                        * lock, do some necessary prep work, reacquire the buffer lock,
+                        * and recheck to make sure nothing has changed.
+                        */
+                       if (segno != cleared_segno)
+                       {
+                               BlockNumber     newfsmblock;
+
+                               /* Release lock on index page. */
+                               LockBuffer(indexbuffer, BUFFER_LOCK_UNLOCK);
+
+                               /*
+                                * Clear the segment that we want to recycle.
+                                *
+                                * Note that we could crash or error out while or after doing
+                                * this and before we actually recycle the segment. If so,
+                                * we'll do it again the next time someone tries to vacuum
+                                * this conveyor belt.  All of that is fine, because nobody
+                                * can be looking at the data any more, and clearing the pages
+                                * is idempotent.
+                                */
+                               ConveyorBeltClearSegment(cb, segno, true);
+
+                               /*
+                                * Make sure that we have the correct FSM buffer pinned.
+                                *
+                                * Often, any FSM buffer that we have pinned previously will
+                                * still be the correct one, either because segment numbers
+                                * allocated around the same time are likely to be close
+                                * together numerically, or just because the conveyor belt may
+                                * not be big enough to need lots of FSM pages.
+                                *
+                                * However, in the worst case, this can change every time.
+                                */
+                               newfsmblock = cb_segment_to_fsm_block(cb->cb_pages_per_segment,
+                                                                                                         segno);
+                               if (fsmblock != newfsmblock)
+                               {
+                                       if (BufferIsValid(fsmbuffer))
+                                               ReleaseBuffer(fsmbuffer);
+                                       fsmblock = newfsmblock;
+                                       if (fsmblock == InvalidBlockNumber)
+                                               fsmbuffer = InvalidBuffer;
+                                       else
+                                               fsmbuffer =
+                                                       ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+                                                                                          fsmblock, RBM_NORMAL, NULL);
+                               }
+
+                               /* Relock the index page and go around. */
+                               LockBuffer(indexbuffer, BUFFER_LOCK_EXCLUSIVE);
+                               cleared_segno = segno;
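+                               /*
+                                * Deliberately do not advance pageoffset: the next pass
+                                * through the loop re-fetches this entry and reconfirms it
+                                * under the reacquired lock before clearing it.
+                                */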
+                               continue;
+                       }
+
+                       /*
+                        * Clear the index entry referring to the payload segment, and
+                        * mark the segment free. To do this, we have to grab the lock
+                        * on whatever page contains the free/busy state, which could be
+                        * either an FSM page or the metapage.
+                        */
+                       if (fsmblock == InvalidBlockNumber)
+                       {
+                               LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+                               cb_recycle_payload_segment(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node,
+                                                                                  cb->cb_fork,
+                                                                                  metabuffer,
+                                                                                  indexblock, indexbuffer,
+                                                                                  InvalidBlockNumber, InvalidBuffer,
+                                                                                  segno, pageoffset, needs_xlog);
+                               LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK);
+                       }
+                       else
+                       {
+                               LockBuffer(fsmbuffer, BUFFER_LOCK_EXCLUSIVE);
+                               cb_recycle_payload_segment(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node,
+                                                                                  cb->cb_fork,
+                                                                                  InvalidBuffer,
+                                                                                  indexblock, indexbuffer,
+                                                                                  fsmblock, fsmbuffer,
+                                                                                  segno, pageoffset, needs_xlog);
+                               LockBuffer(fsmbuffer, BUFFER_LOCK_UNLOCK);
+                       }
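+
+                       /*
+                        * The free/busy page lock is held only across
+                        * cb_recycle_payload_segment, which is expected to update the
+                        * index entry and the segment's allocation state (writing WAL
+                        * if needs_xlog) as a single step.
+                        */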
+
+                       /* No need to consider this page offset again. */
+                       ++pageoffset;
+
+                       /* Now we're no longer prepared to clear any segment. */
+                       cleared_segno = CB_INVALID_SEGMENT;
+               }
+
+               /* Done with this index page; release our lock and pin. */
+               UnlockReleaseBuffer(indexbuffer);
+       }
+
+       /* Drop any FSM-page pin we may still hold. */
+       if (BufferIsValid(fsmbuffer))
+               ReleaseBuffer(fsmbuffer);
+
+       return true;
+}
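+
+/*
+ * A hypothetical sketch of how a caller might drive the function above:
+ * with the metapage pinned but not locked, walk the chain of index
+ * segments until an entry that must be kept is found (segno and
+ * stop_point are illustrative locals):
+ *
+ *             segno = oldest_index_segment;
+ *             while (segno != CB_INVALID_SEGMENT &&
+ *                        ConveyorBeltClearIndexSegmentEntries(cb, metabuffer, segno,
+ *                                                                                             stop_point, &segno))
+ *                     ;
+ */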
+
+/*
+ * Attempt to remove the oldest index segment.
+ *
+ * The return value is the segment number of the oldest index segment that
+ * remains once the operation has completed, or CB_INVALID_SEGMENT if no
+ * index segments remain afterward or the operation could not be completed.
+ */
+static CBSegNo
+ConveyorBeltFreeOldestIndexSegment(ConveyorBelt *cb, Buffer metabuffer,
+                                                                  CBSegNo oldest_index_segment,
+                                                                  CBPageNo index_vacuum_stop_point)
+{
+       BlockNumber     firstindexblock;
+       Buffer          firstindexbuffer;
+       BlockNumber     fsmblock;
+       Buffer          fsmbuffer;
+       bool            needs_xlog;
+       CBSegNo         oldest_remaining_index_segment = CB_INVALID_SEGMENT;
+
+       /*
+        * Clear all the blocks in the oldest index segment except for the first.
+        * We must keep the first one until the bitter end, so that it remains
+        * possible to walk the chain of index segments.
+        */
+       ConveyorBeltClearSegment(cb, oldest_index_segment, false);
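+       /*
+        * (The third argument, false, evidently preserves page 0; contrast the
+        * payload-segment case in ConveyorBeltClearIndexSegmentEntries, which
+        * passes true to clear the whole segment.)
+        */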
+
+       /*
+        * Read and pin the first block of the index segment.
+        */
+       needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+       firstindexblock = cb_segment_to_block(cb->cb_pages_per_segment,
+                                                                                 oldest_index_segment, 0);
+       firstindexbuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+                                                                                 firstindexblock, RBM_NORMAL, NULL);
+
+       /*
+        * Also read and pin the appropriate FSM page, unless the busy/free status
+        * of this segment is stored in the metapage.
+        */
+       fsmblock = cb_segment_to_fsm_block(cb->cb_pages_per_segment,
+                                                                          oldest_index_segment);
+       if (fsmblock == InvalidBlockNumber)
+               fsmbuffer = InvalidBuffer;
+       else
+               fsmbuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+                                                                          fsmblock, RBM_NORMAL, NULL);
+
+       /*
+        * The lock ordering described in the README requires the metapage lock
+        * to be taken last, but it also requires that freeing an index segment
+        * take a cleanup lock on the metapage. Since a concurrent reader will
+        * hold a pin on the metapage when trying to lock the first index page,
+        * we can't lock the first index page and then wait for a cleanup lock
+        * on the metapage, because that might deadlock.
+        *
+        * To get around that problem, we take the cleanup lock on the metabuffer
+        * conditionally. If we can't get it, we just skip freeing the oldest
+        * index segment. That's not great, but it's not obvious how we can do
+        * any better.
+        */
+       LockBuffer(firstindexbuffer, BUFFER_LOCK_EXCLUSIVE);
+       if (BufferIsValid(fsmbuffer))
+               LockBuffer(fsmbuffer, BUFFER_LOCK_EXCLUSIVE);
+       if (ConditionalLockBufferForCleanup(metabuffer))
+       {
+               oldest_remaining_index_segment =