From ade66f7ffcd434e2d37650f8802eda6eb65dae53 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Sun, 19 Sep 2021 19:58:12 -0400 Subject: [PATCH] WIP: Conveyor belt storage. There are still things missing here and probably quite a few bugs yet, but the basics are mostly here and mostly working now. For an overview of the ideas I have in mind, please check out src/backend/access/conveyor/README --- src/backend/access/Makefile | 4 +- src/backend/access/conveyor/Makefile | 24 + src/backend/access/conveyor/README | 187 ++ src/backend/access/conveyor/cbcache.c | 186 ++ src/backend/access/conveyor/cbfsmpage.c | 152 ++ src/backend/access/conveyor/cbindexpage.c | 189 ++ src/backend/access/conveyor/cbmetapage.c | 721 ++++++++ src/backend/access/conveyor/cbmodify.c | 686 +++++++ src/backend/access/conveyor/cbxlog.c | 442 +++++ src/backend/access/conveyor/conveyor.c | 1978 +++++++++++++++++++++ src/backend/access/rmgrdesc/Makefile | 1 + src/backend/access/rmgrdesc/cbdesc.c | 168 ++ src/backend/access/transam/rmgr.c | 1 + src/bin/pg_waldump/.gitignore | 1 + src/bin/pg_waldump/rmgrdesc.c | 1 + src/include/access/cbcache.h | 33 + src/include/access/cbdefs.h | 29 + src/include/access/cbfsmpage.h | 127 ++ src/include/access/cbfsmpage_format.h | 38 + src/include/access/cbindexpage.h | 84 + src/include/access/cbindexpage_format.h | 54 + src/include/access/cbmetapage.h | 179 ++ src/include/access/cbmetapage_format.h | 110 ++ src/include/access/cbmodify.h | 129 ++ src/include/access/cbstorage.h | 165 ++ src/include/access/cbxlog.h | 108 ++ src/include/access/conveyor.h | 59 + src/include/access/rmgrlist.h | 1 + src/tools/pgindent/typedefs.list | 8 + 29 files changed, 5863 insertions(+), 2 deletions(-) create mode 100644 src/backend/access/conveyor/Makefile create mode 100644 src/backend/access/conveyor/README create mode 100644 src/backend/access/conveyor/cbcache.c create mode 100644 src/backend/access/conveyor/cbfsmpage.c create mode 100644 src/backend/access/conveyor/cbindexpage.c create mode 100644 src/backend/access/conveyor/cbmetapage.c create mode 100644 src/backend/access/conveyor/cbmodify.c create mode 100644 src/backend/access/conveyor/cbxlog.c create mode 100644 src/backend/access/conveyor/conveyor.c create mode 100644 src/backend/access/rmgrdesc/cbdesc.c create mode 100644 src/include/access/cbcache.h create mode 100644 src/include/access/cbdefs.h create mode 100644 src/include/access/cbfsmpage.h create mode 100644 src/include/access/cbfsmpage_format.h create mode 100644 src/include/access/cbindexpage.h create mode 100644 src/include/access/cbindexpage_format.h create mode 100644 src/include/access/cbmetapage.h create mode 100644 src/include/access/cbmetapage_format.h create mode 100644 src/include/access/cbmodify.h create mode 100644 src/include/access/cbstorage.h create mode 100644 src/include/access/cbxlog.h create mode 100644 src/include/access/conveyor.h diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0880e0a8bb..a840ae2e91 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,7 +8,7 @@ subdir = src/backend/access top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - table tablesample transam +SUBDIRS = brin common conveyor gin gist hash heap index nbtree rmgrdesc \ + spgist table tablesample transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/conveyor/Makefile b/src/backend/access/conveyor/Makefile new file mode 100644 index 0000000000..798f46362a --- /dev/null +++ b/src/backend/access/conveyor/Makefile @@ -0,0 +1,24 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/conveyor +# +# IDENTIFICATION +# src/backend/access/conveyor/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/conveyor +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + cbcache.o \ + cbfsmpage.o \ + cbindexpage.o \ + cbmetapage.o \ + cbmodify.o \ + cbxlog.o \ + conveyor.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/conveyor/README b/src/backend/access/conveyor/README new file mode 100644 index 0000000000..109a7588d1 --- /dev/null +++ b/src/backend/access/conveyor/README @@ -0,0 +1,187 @@ +Conveyor Belt Storage +===================== + +It's pretty common to want an append-only data store, but it's not usually +practical to keep accumulating data forever without ever discarding any of +it. What people most often want to do is periodically discard the oldest +data, keeping all the newer stuff. Hence, imagine a conveyor belt. New data +is continually added at one end of the conveyor belt, and eventually falls +off the other end. Unlike a real conveyor belt, however, our conveyor belt +is of variable length, and can grow and shrink to accommodate the amount of +data that needs to be stored at any given time. + +Some other parts of PostgreSQL, for example xlog.c and slru.c, handle +similar needs by using a series of files at the operating system level, +adding new ones at the end and removing old ones. We want to use a standard +relation fork, and so instead maintain a logical-to-physical page mapping. +Callers allocate new logical page numbers (which are just consecutive 64-bit +integers) and this module takes care of figuring out where to place them +physically. When the oldest logical pages are thrown away, the blocks +allocated to them can be reused for new logical pages. + +Conceptually, a relation fork organized as a conveyor belt has three parts: + +- Payload. The payload is whatever data the user of this module wishes + to store. The conveyor belt doesn't care what you store in a payload page, + but it does require that you store something: each time a payload page is + initialized, it must end up with either pd_lower > SizeOfPageHeaderData, + or pd_upper < BLCKSZ. + +- Index. The index translates logical page numbers to physical block + numbers. The intention is that pages might be physically relocated - e.g. + they could be moved to lower-numbered blocks to allow the relation to be + physically compacted - so external references to the data should use only + logical page numbers. The index is used to figure out which block + currently contains a given logical page number (see the sketch following this list). + +- Freespace Map. The freespace map is used to decide where payload and + index data should be stored.
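+
+As a rough illustration of how these parts fit together, translating a
+logical page number into a physical block conceptually works like the
+sketch below. This is a simplified, hypothetical sketch rather than the
+real code: the actual lookup lives in cbmetapage.c and cbindexpage.c and
+must also skip over the freespace map pages that are interspersed among
+the segments. Still, the shape of the computation is the same: find the
+index entry covering the page, then combine the segment number it names
+with the page's offset within that segment.
+
+    /* Simplified sketch only; assumes the whole index is one in-memory
+     * array and ignores interspersed freespace map pages. */
+    static BlockNumber
+    sketch_find_logical_page(CBPageNo pageno, CBPageNo index_start,
+                             const CBSegNo *index, uint16 pages_per_segment)
+    {
+        uint64      relp = pageno - index_start;
+        CBSegNo     segno = index[relp / pages_per_segment];
+        uint16      segoff = relp % pages_per_segment;
+
+        /* The leading 1 accounts for the metapage. */
+        return 1 + (BlockNumber) segno * pages_per_segment + segoff;
+    }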
+ +Segments and the Freespace Map +------------------------------ + +Every page in a conveyor belt is either the metapage, or a freespace map +page, or part of a segment. Segments can store either payload data or +index data. Every segment in any particular conveyor belt contains +the same number of pages. As old data is removed from the conveyor belt, +segments get recycled. + +The allocation status of every segment is tracked by a single bit in the +freespace map: 1 = allocated, 0 = free. The initial portion of the freespace +map is stored in the metapage. When the relation has been extended far +enough that there are no more bits available in the metapage to track the +allocation status of further segments, a single freespace map page is +inserted. That single freespace map page is then followed by additional +segments. If we extend the relation far enough that the previous freespace +map page has no more bits available to track the allocation status of +further segments, then it's time to add another freespace map page. + +This scheme allows us to compute the location of every freespace map page +based on (1) the number of freespace bits in the metapage, (2) the +number of freespace map bits in each freespace map page, and (3) the +number of pages per segment. (1) and (2) are compile time constants, and +(3) can't be changed after a conveyor belt is created, so once we have +read the metapage for a conveyor belt once, we can compute the location +of freespace map pages at will. + +It's also straightforward to compute the starting and ending block numbers +for any given segment. We just have to be careful to account for the number +of freespace map pages that precede that segment. + +Payload Segments and the Index +------------------------------ + +As payload data is added to a conveyor belt, payload segments are allocated +to store it. As old payload data is removed from a conveyor belt, the +payload segments that contain it can be deallocated and eventually reused. + +Because space for payload data is allocated a segment at a time, the +index just needs to keep track of (1) the segment numbers that contain +payload data, in order of allocation, and (2) the starting logical page +number for the first such segment. If there's not much data on the conveyor +belt, all of these segment numbers - which we call index entries - can be +stored in the metapage itself. + +If the metapage runs out of space to store index entries, then we move the +oldest index entries that it presently contains into an index segment, and +continue to insert new index entries into the metapage. The index segments +themselves are organized into a singly linked list. + +As data is removed from a conveyor belt, we'll eventually reach a point +where none of the index entries in a given index segment are needed any +more. At that point, we can deallocate the index segment and reuse it. + +Note that nothing prevents a deallocated index segment from being reused +as a payload segment, or the other way around. + +Removing Old Data +----------------- + +From time to time, we expect to receive a request to discard old data, +which will come in the form of a call to ConveyorBeltLogicalTruncate stating +that all logical pages with page numbers less than some threshold value are +no longer required. Thereafter, a subsequent call to ConveyorBeltVacuum +may be used to free up any segments that are no longer required as a result +of the increased logical truncation point.
Finally, a call to +ConveyorBeltPhysicalTruncate may be used to discard unused pages from the +end of the relation. + +ConveyorBeltLogicalTruncate just updates the metapage with a new value for +the oldest logical page in existence. Once this has been done, attempts to +access logical pages prior to the new threshold will be categorically +refused. We require a cleanup lock on the metapage for this step. This +allows a reader which has determined the location of a payload page to +release all buffer locks, retaining just a pin on the metapage, before +reading and locking the target page. Since the page can't be logically +truncated away while the pin on the metapage is held, it also can't be +recycled. + +ConveyorBeltVacuum performs a multi-step process to recycle index and +payload segments whose contents are no longer needed: + +1. If there are now one or more payload segments that contain no accessible + pages, it frees them up. To do this, it must first reinitialize each page of + each such segment, taking a cleanup lock on each page as it does so. + This guarantees that no other backend retains a pin on any such pages. + It should be impossible for any new locks or pins on these pages to be + taken at this point, because pages that have been logically truncated + can no longer be looked up via ConveyorBeltReadBuffer. It then clears + the index entry that points to the segment in question and simultaneously + marks it free in the freespace map. + +2. When all of the index entries in an index segment have been cleared, + the index segment itself can be freed. To do this, we first reinitialize + all the pages, and then update the metapage. The metapage update involves + changing the metapage's notion of the oldest index segment and of the + logical page number where the index begins. Simultaneously, the segment + must be marked as free in the freespace map. + + These metapage updates must be performed with a cleanup lock on the + metapage. This allows a concurrent reader to lock the metapage, note the + location of the first index segment, release the lock on the metapage while + retaining the pin, and then go lock the pages in that index segment, or + successor index segments, without fearing that they will vanish. + +3. If index entries are cleared in the metapage itself, then any + remaining entries can be shifted to the start of the array of index + entries stored in the metapage, provided that we simultaneously + adjust the metapage's notion of the logical page number where the + index begins. + +Note that there's no correctness issue if ConveyorBeltVacuum is skipped or +if it is interrupted before all the work that it could perform actually +gets done. It doesn't do anything that can't be deferred until a later time; +the only problem is that you might end up with a lot of bloat. That could +be bad, but it's not a functional breakage. + +Note also that running multiple copies of ConveyorBeltVacuum on the same +conveyor belt at the same time is a bad plan. They'll contend with each +other trying to do the same work. Consider preventing this by some means +(e.g. a self-exclusive heavyweight lock). + +ConveyorBeltPhysicalTruncate can be used to return disk space to the +operating system.
This only works if the highest-numbered segments happen +to be free, and it requires both a relation extension lock (since it would +be bad if someone is trying to make the relation longer while we're trying +to make it shorter) and a cleanup lock on the metapage (since +ConveyorBeltNewPage can't cope with a concurrent truncation; it expects +that the relation will only grow concurrently, not shrink). + +Buffer Lock Ordering +-------------------- + +Various parts of the code need to acquire multiple buffer locks +simultaneously, and must do so in a consistent order to avoid deadlock. We +use this ordering: + +1. Any new page that we're about to allocate. +2. Any payload page. +3. Any index pages. +4. Any FSM page. +5. The metapage. + +It might seem strange to lock the metapage last, because we typically need +to examine it before doing much of anything. However, after we examine it, +we typically want to read other pages, and we don't want to retain the +buffer lock while doing I/O. Instead, we release the buffer lock and then +reacquire it at the end, after we've got all the other locks we need. diff --git a/src/backend/access/conveyor/cbcache.c b/src/backend/access/conveyor/cbcache.c new file mode 100644 index 0000000000..789fe40f66 --- /dev/null +++ b/src/backend/access/conveyor/cbcache.c @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * cbcache.c + * Conveyor belt index segment location cache. + * + * The conveyor belt metapage stores the segment numbers of the oldest and + * newest index segments that currently exist, but the location of any + * other index segment can only be discovered by reading the first page + * of some index segment whose position is known and extracting from it + * the segment number of the next index segment. That's potentially + * expensive, especially if we have to traverse a whole bunch of index + * segments to figure out the location of the one we need, so we maintain + * a cache. + * + * The cache maps the oldest logical page number covered by an index + * segment to the segment number where that index segment is located. + * If older index segments are removed, the corresponding mappings become + * obsolete, but nobody should be accessing those pages anyway. Still, + * we're careful to purge such mappings to avoid wasting memory. + * + * If an index segment is moved, we invalidate the entire cache. This + * is expected to be fairly rare, as it should only happen if someone is + * trying to reduce the on-disk footprint of the conveyor belt. Moreover, + * if someone is doing that, it is likely that multiple index segments + * will be moved in relatively quick succession, so it's not clear that + * a more granular invalidation strategy would help anything. 
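+ *
+ * As an example of the intended behavior (the numbers here are purely
+ * illustrative): suppose the cache holds the mappings 0 -> segment 2 and
+ * 5000 -> segment 7, meaning that one index segment covers logical pages
+ * beginning at 0 and another covers logical pages beginning at 5000.  If
+ * the index start later advances to 5000, the first mapping is purged but
+ * the second survives; if index_segments_moved changes, both are thrown
+ * away.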
+ * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/backend/access/conveyor/cbcache.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/cbcache.h" +#include "common/hashfn.h" + +typedef struct cb_iseg_entry +{ + CBPageNo index_segment_start; + CBSegNo segno; + char status; +} cb_iseg_entry; + +#define SH_PREFIX cb_iseg +#define SH_ELEMENT_TYPE cb_iseg_entry +#define SH_KEY_TYPE CBPageNo +#define SH_KEY index_segment_start +#define SH_HASH_KEY(tb, key) tag_hash(&key, sizeof(CBPageNo)) +#define SH_EQUAL(tb, a, b) a == b +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +struct CBCache +{ + uint64 index_segments_moved; + CBPageNo oldest_possible_start; + cb_iseg_hash *iseg; +}; + +/* + * Create a new cache. + */ +CBCache * +cb_cache_create(MemoryContext mcxt, uint64 index_segments_moved) +{ + CBCache *cache = MemoryContextAlloc(mcxt, sizeof(CBCache)); + + cache->index_segments_moved = index_segments_moved; + cache->oldest_possible_start = 0; + cache->iseg = cb_iseg_create(mcxt, 128, NULL); + + return cache; +} + +/* + * Invalidate cache entries as required. + * + * If index_segments_moved has changed, throw away everything we think we + * know. Otherwise, if index_start has advanced, throw away any entries that + * precede the new start. + */ +void +cb_cache_invalidate(CBCache *cache, CBPageNo index_start, + uint64 index_segments_moved) +{ + if (index_segments_moved != cache->index_segments_moved) + { + cb_iseg_reset(cache->iseg); + cache->index_segments_moved = index_segments_moved; + } + else if (index_start > cache->oldest_possible_start) + { + cb_iseg_iterator it; + cb_iseg_entry *entry; + + cb_iseg_start_iterate(cache->iseg, &it); + while ((entry = cb_iseg_iterate(cache->iseg, &it)) != NULL) + if (entry->index_segment_start < index_start) + cb_iseg_delete_item(cache->iseg, entry); + } +} + +/* + * Search the cache for an index segment number, given the first logical page + * number covered by that index segment. + * + * It is the caller's responsibility to make sure that pageno is the first + * logical page number covered by some index segment, rather than any random + * page number whose index entry might be anywhere in the segment. We don't + * have enough information here to verify this, and just trust that the caller + * knows what they are doing. + */ +CBSegNo +cb_cache_lookup(CBCache *cache, CBPageNo pageno) +{ + cb_iseg_entry *entry; + + entry = cb_iseg_lookup(cache->iseg, pageno); + return entry != NULL ? entry->segno : CB_INVALID_SEGMENT; +} + +/* + * Search the cache for an index segment that precedes the one for which we + * are searching by as little as possible. + * + * As with cb_cache_lookup, pageno should be the first logical page of the + * index segment in which the caller is interested, although unlike that + * function, this function would still work correctly if it were an arbitrary + * page number, at least as presently implemented. + * + * If no segment with a starting segment number preceding pageno is found + * in cache, the return value is CB_INVALID_SEGMENT and *index_segment_start + * is set to CB_INVALID_LOGICAL_PAGE. Otherwise, the return value is the + * segment number we found and *index_segment_start is set to the starting + * logical page number of that segment. 
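+ *
+ * For example (again with purely illustrative numbers), if the cache
+ * contains entries whose starting pages are 1000 and 3000 and the caller
+ * asks about the index segment starting at page 5000, we return the entry
+ * for 3000; the caller can then walk the index segment chain forward from
+ * there rather than from the very beginning.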
+ */ +CBSegNo +cb_cache_fuzzy_lookup(CBCache *cache, CBPageNo pageno, + CBPageNo *index_segment_start) +{ + cb_iseg_iterator it; + cb_iseg_entry *current; + cb_iseg_entry *best = NULL; + + cb_iseg_start_iterate(cache->iseg, &it); + while ((current = cb_iseg_iterate(cache->iseg, &it)) != NULL) + { + if (current->index_segment_start > pageno) + continue; + if (best == NULL || + best->index_segment_start < current->index_segment_start) + best = current; + } + + if (best == NULL) + { + *index_segment_start = CB_INVALID_LOGICAL_PAGE; + return CB_INVALID_SEGMENT; + } + + *index_segment_start = best->index_segment_start; + return best->segno; +} + +/* + * Insert a cache entry. + * + * As in cb_cache_lookup, it's critical that index_segment_start is the first + * logical page number covered by the index segment. + */ +void +cb_cache_insert(CBCache *cache, CBSegNo segno, CBPageNo index_segment_start) +{ + cb_iseg_entry *entry; + bool found; + + entry = cb_iseg_insert(cache->iseg, index_segment_start, &found); + Assert(!found); + entry->segno = segno; + + Assert(index_segment_start >= cache->oldest_possible_start); +} diff --git a/src/backend/access/conveyor/cbfsmpage.c b/src/backend/access/conveyor/cbfsmpage.c new file mode 100644 index 0000000000..65b8f252b7 --- /dev/null +++ b/src/backend/access/conveyor/cbfsmpage.c @@ -0,0 +1,152 @@ +/*------------------------------------------------------------------------- + * + * cbfsmpage.c + * APIs for accessing conveyor belt FSM pages. + * + * Similar to cbmetapage.c, this file abstracts accesses to conveyor + * belt FSM pages, and should be the only backend code that understands + * their internal structure. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/backend/access/conveyor/cbfsmpage.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/cbfsmpage.h" +#include "access/cbfsmpage_format.h" +#include "access/cbmetapage.h" + +static CBFSMPageData *cb_fsmpage_get_special(Page page); + +/* + * Initialize FSM page. + * + * Returns the first segment number that will be covered by the new page. + */ +CBSegNo +cb_fsmpage_initialize(Page page, BlockNumber blkno, uint16 pages_per_segment) +{ + CBFSMPageData *fsmp; + BlockNumber first_fsm_block = cb_first_fsm_block(pages_per_segment); + unsigned fsm_block_spacing = cb_fsm_block_spacing(pages_per_segment); + + /* Sanity checks. */ + Assert(blkno >= first_fsm_block); + Assert((blkno - first_fsm_block) % fsm_block_spacing == 0); + + /* Initialize page. PageInit will zero the payload bits for us. */ + PageInit(page, BLCKSZ, sizeof(CBFSMPageData)); + fsmp = (CBFSMPageData *) PageGetSpecialPointer(page); + fsmp->cbfsm_magic = CB_FSMPAGE_MAGIC; + fsmp->cbfsm_start = cb_first_segment_for_fsm_page(blkno, pages_per_segment); + + return fsmp->cbfsm_start; +} + +/* + * Get the allocation status of a segment from an FSM page. + */ +bool +cb_fsmpage_get_fsm_bit(Page page, CBSegNo segno) +{ + CBFSMPageData *fsmp = cb_fsmpage_get_special(page); + uint8 byte; + uint8 mask; + uint32 bitno; + + if (segno < fsmp->cbfsm_start || + segno >= fsmp->cbfsm_start + CB_FSM_SEGMENTS_PER_FSMPAGE) + elog(ERROR, "segment %u out of range for fsm page starting at segment %u", + segno, fsmp->cbfsm_start); + + bitno = segno - fsmp->cbfsm_start; + byte = fsmp->cbfsm_state[bitno / BITS_PER_BYTE]; + mask = 1 << (bitno % BITS_PER_BYTE); + return (byte & mask) != 0; +} + +/* + * Set the allocation status of a segment in an FSM page. 
+ * + * new_state should be true if the bit is currently clear and should be set, + * and false if the bit is currently set and should be cleared. Don't call + * this unless you know that the bit actually needs to be changed. + */ +void +cb_fsmpage_set_fsm_bit(Page page, CBSegNo segno, bool new_state) +{ + CBFSMPageData *fsmp = cb_fsmpage_get_special(page); + uint8 *byte; + uint8 mask; + uint8 old_state; + uint32 bitno; + + if (segno < fsmp->cbfsm_start || + segno >= fsmp->cbfsm_start + CB_FSM_SEGMENTS_PER_FSMPAGE) + elog(ERROR, "segment %u out of range for fsm page starting at segment %u", + segno, fsmp->cbfsm_start); + + bitno = segno - fsmp->cbfsm_start; + byte = &fsmp->cbfsm_state[bitno / BITS_PER_BYTE]; + mask = 1 << (segno % BITS_PER_BYTE); + old_state = (*byte & mask) != 0; + + if (old_state == new_state) + elog(ERROR, "fsm bit for segment %u already has value %d", + segno, old_state ? 1 : 0); + + if (new_state) + *byte |= mask; + else + *byte &= ~mask; +} + +/* + * Returns the lowest unused segment number covered by the supplied FSM page, + * or CB_INVALID_SEGMENT if none. + */ +CBSegNo +cbfsmpage_find_free_segment(Page page) +{ + CBFSMPageData *fsmp = cb_fsmpage_get_special(page); + unsigned i; + unsigned j; + + StaticAssertStmt(CB_FSMPAGE_FREESPACE_BYTES % sizeof(uint64) == 0, + "CB_FSMPAGE_FREESPACE_BYTES should be a multiple of 8"); + + for (i = 0; i < CB_FSMPAGE_FREESPACE_BYTES; ++i) + { + uint8 b = fsmp->cbfsm_state[i]; + + if (b == 0xFF) + continue; + + for (j = 0; j < BITS_PER_BYTE; ++j) + { + if ((b & (1 << j)) == 0) + return fsmp->cbfsm_start + (i * BITS_PER_BYTE) + j; + } + } + + return CB_INVALID_SEGMENT; +} + +/* + * Given a page that is known to be a conveyor belt free space map page, + * return a pointer to the CBFSMPageData, after checking the magic number. + */ +static CBFSMPageData * +cb_fsmpage_get_special(Page page) +{ + CBFSMPageData *fsmp = (CBFSMPageData *) PageGetSpecialPointer(page); + + if (fsmp->cbfsm_magic != CB_FSMPAGE_MAGIC) + elog(ERROR, "bad magic number in conveyor belt fsm page: %08X", + fsmp->cbfsm_magic); + + return fsmp; +} diff --git a/src/backend/access/conveyor/cbindexpage.c b/src/backend/access/conveyor/cbindexpage.c new file mode 100644 index 0000000000..99ad1419bc --- /dev/null +++ b/src/backend/access/conveyor/cbindexpage.c @@ -0,0 +1,189 @@ +/*------------------------------------------------------------------------- + * + * cbindexpage.c + * APIs for accessing conveyor belt index pages. + * + * Similar to cbmetapage.c, this file abstracts accesses to conveyor + * belt index pages, and should be the only backend code that understands + * their internal structure. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/backend/access/conveyor/cbindexpage.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/cbfsmpage.h" +#include "access/cbindexpage.h" +#include "access/cbindexpage_format.h" + +static CBIndexPageData *cb_indexpage_get_special(Page page); + +/* + * Initialize an index page. + * + * If this is the first page in a new index segment, it has to be the newest + * segment, so there's no next segment yet. And there's never a next segment + * for a page that is not the first one in the segment. 
+ */ +void +cb_indexpage_initialize(Page page, CBPageNo pageno) +{ + CBIndexPageData *ipd; + int i; + + PageInit(page, BLCKSZ, sizeof(CBIndexPageData)); + ipd = (CBIndexPageData *) PageGetSpecialPointer(page); + ipd->cbidx_magic = CB_INDEXPAGE_MAGIC; + ipd->cbidx_next_segment = CB_INVALID_SEGMENT; + ipd->cbidx_first_page = pageno; + + for (i = 0; i < CB_INDEXPAGE_INDEX_ENTRIES; ++i) + ipd->cbidx_entry[i] = CB_INVALID_SEGMENT; +} + +/* + * Figure out where a certain logical page is physically located. + * + * It is the caller's responsibility to supply the correct index page. + */ +BlockNumber +cb_indexpage_find_logical_page(Page page, CBPageNo pageno, + uint16 pages_per_segment) +{ + CBIndexPageData *ipd = cb_indexpage_get_special(page); + unsigned offset; + CBSegNo segno; + + if (pageno < ipd->cbidx_first_page) + elog(ERROR, "can't find index entry for page " UINT64_FORMAT " on an index page that starts at page " UINT64_FORMAT, + pageno, ipd->cbidx_first_page); + offset = (pageno - ipd->cbidx_first_page) / pages_per_segment; + if (offset > CB_INDEXPAGE_INDEX_ENTRIES) + elog(ERROR, "can't find index entry for page " UINT64_FORMAT " on an index page that starts at page " UINT64_FORMAT, + pageno, ipd->cbidx_first_page); + segno = ipd->cbidx_entry[offset]; + if (segno == CB_INVALID_SEGMENT) + elog(ERROR, "no index entry for page " INT64_FORMAT, pageno); + + return cb_segment_to_block(pages_per_segment, segno, + pageno % pages_per_segment); +} + +/* + * Add index entries for logical pages beginning at 'pageno'. + * + * It is the caller's responsibility to supply the correct index page, and + * to make sure that there is enough room for the entries to be added. + */ +void +cb_indexpage_add_index_entries(Page page, + unsigned pageoffset, + unsigned num_index_entries, + CBSegNo *index_entries) +{ + CBIndexPageData *ipd = cb_indexpage_get_special(page); + + if (num_index_entries < 1 || num_index_entries > CB_INDEXPAGE_INDEX_ENTRIES) + elog(ERROR, "can't add %u index entries to an index page", + num_index_entries); + if (pageoffset + num_index_entries > CB_INDEXPAGE_INDEX_ENTRIES) + elog(ERROR, "can't place %u index entries starting at offset %u", + num_index_entries, pageoffset); + + memcpy(&ipd->cbidx_entry[pageoffset], index_entries, + num_index_entries * sizeof(CBSegNo)); +} + +/* + * Get an obsolete index entry for the given segment. + * + * Starts searching for an index entry at the offset given by *pageoffset, + * and update *pageoffset to the offset at which an entry was found, or to + * CB_INDEXPAGE_INDEX_ENTRIES if no entry is found. + * + * Sets *pageno to the first logical page covered by this index page. + * + * Returns the segment number to which the obsolete index entry points. + */ +CBSegNo +cb_indexpage_get_obsolete_entry(Page page, unsigned *pageoffset, + CBPageNo *first_pageno) +{ + CBIndexPageData *ipd = cb_indexpage_get_special(page); + + *first_pageno = ipd->cbidx_first_page; + + while (*pageoffset < CB_INDEXPAGE_INDEX_ENTRIES && + ipd->cbidx_entry[*pageoffset] != CB_INVALID_SEGMENT) + ++*pageoffset; + + return ipd->cbidx_entry[*pageoffset]; +} + +/* + * Clear the obsolete index entry for the given segment from the given page + * offset. 
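+ *
+ * The caller supplies both the page offset and the segment number it
+ * expects to find there; if the two do not match, we raise an error
+ * rather than clear an entry we did not intend to clear.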
+ */ +void +cb_indexpage_clear_obsolete_entry(Page page, + CBSegNo segno, + unsigned pageoffset) +{ + CBIndexPageData *ipd = cb_indexpage_get_special(page); + + if (pageoffset >= CB_INDEXPAGE_INDEX_ENTRIES) + elog(ERROR, "page offset %u out of range", pageoffset); + if (ipd->cbidx_entry[pageoffset] != segno) + elog(ERROR, "while clearing index entry %u, found %u where %u was expected", + pageoffset, ipd->cbidx_entry[pageoffset], segno); + + ipd->cbidx_entry[pageoffset] = CB_INVALID_SEGMENT; +} + +/* + * Set the next index segment. + * + * This should only be used on the first page of an index segment, since + * that's where the next segment number is stored. + */ +void +cb_indexpage_set_next_segment(Page page, CBSegNo segno) +{ + CBIndexPageData *ipd = cb_indexpage_get_special(page); + + ipd->cbidx_next_segment = segno; +} + +/* + * Get the next index segment. + * + * This should only be used on the first page of an index segment, since + * that's where the next segment number is stored. + */ +CBSegNo +cb_indexpage_get_next_segment(Page page) +{ + CBIndexPageData *ipd = cb_indexpage_get_special(page); + + return ipd->cbidx_next_segment; +} + +/* + * Given a page that is known to be a conveyor belt free space map page, + * return a pointer to the CBFSMPageData, after checking the magic number. + */ +static CBIndexPageData * +cb_indexpage_get_special(Page page) +{ + CBIndexPageData *ipd = (CBIndexPageData *) PageGetSpecialPointer(page); + + if (ipd->cbidx_magic != CB_INDEXPAGE_MAGIC) + elog(ERROR, "bad magic number in conveyor belt index page: %08X", + ipd->cbidx_magic); + + return ipd; +} diff --git a/src/backend/access/conveyor/cbmetapage.c b/src/backend/access/conveyor/cbmetapage.c new file mode 100644 index 0000000000..15021e092b --- /dev/null +++ b/src/backend/access/conveyor/cbmetapage.c @@ -0,0 +1,721 @@ +/*------------------------------------------------------------------------- + * + * cbmetapage.c + * APIs for accessing conveyor belt metapages. + * + * The goal of this file is to provide a set of functions that can be + * used to perform all necessary access to or modification of a conveyor + * belt metapage. The functions in this file should be the only backend + * code that knows about the actual organization of CBMetapageData, + * but they shouldn't know about the internals of other types of pages + * (like index segment or freespace map pages) nor should they know + * about buffers or locking. + * + * Much - but not all - of the work done here is sanity checking. We + * do this partly to catch bugs, and partly as a defense against the + * possibility that the metapage is corrupted on disk. Because of the + * latter possibility, most of these checks use an elog(ERROR) rather + * than just Assert. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/backend/access/conveyor/cbmetapage.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/cbfsmpage.h" +#include "access/cbindexpage.h" +#include "access/cbmetapage.h" +#include "access/cbmetapage_format.h" + +/* + * Initialize metapage. 
+ */ +void +cb_metapage_initialize(Page page, uint16 pages_per_segment) +{ + CBMetapageData *meta; + int i; + + PageInit(page, BLCKSZ, sizeof(CBMetapageData)); + meta = (CBMetapageData *) PageGetSpecialPointer(page); + meta->cbm_magic = CB_METAPAGE_MAGIC; + meta->cbm_version = CBM_VERSION; + meta->cbm_pages_per_segment = pages_per_segment; + + /* + * PageInit has already zeroed the page, so we only need to initialize any + * fields that need to be non-zero. Everything of type CBPageNo and all of + * the freespace map should start out as 0, but most of the fields of + * CBSegNo fields need to be set to CB_INVALID_SEGMENT. + */ + meta->cbm_oldest_index_segment = CB_INVALID_SEGMENT; + meta->cbm_newest_index_segment = CB_INVALID_SEGMENT; + for (i = 0; i < CB_METAPAGE_INDEX_ENTRIES; ++i) + meta->cbm_index[i] = CB_INVALID_SEGMENT; +} + +/* + * Given a page that is known to be a conveyor belt metapage, return a + * pointer to the CBMetapageData. + * + * We take the opportunity to perform some basic sanity checks here. + */ +CBMetapageData * +cb_metapage_get_special(Page page) +{ + CBMetapageData *meta = (CBMetapageData *) PageGetSpecialPointer(page); + + if (meta->cbm_magic != CB_METAPAGE_MAGIC) + elog(ERROR, "bad magic number in conveyor belt metapage: %08X", + meta->cbm_magic); + if (meta->cbm_version != CBM_VERSION) + elog(ERROR, "bad version in conveyor belt metapage: %08X", + meta->cbm_version); + if (meta->cbm_pages_per_segment == 0) + elog(ERROR, "conveyor belt may not have zero pages per segment"); + + return meta; +} + +/* + * Deduce what we can about the physical location of a logical page. + * + * If the logical page precedes the logical truncation point, returns false. + * Otherwise, returns true. + * + * If the physical location of the block can be computed based on the data + * in the metapage, sets *blkno to the appropriate block number. Otherwise, + * sets *blkno to InvalidBlockNumber. + */ +bool +cb_metapage_find_logical_page(CBMetapageData *meta, + CBPageNo pageno, + BlockNumber *blkno) +{ + CBPageNo relp; + CBSegNo segno; + unsigned segoff; + + /* Physical location unknown, unless we later discover otherwise. */ + *blkno = InvalidBlockNumber; + + /* Is it too old to be accessible? */ + if (pageno < meta->cbm_oldest_logical_page) + return false; + + /* Is it too old to have an index entry in the metapage? */ + if (pageno < meta->cbm_index_metapage_start) + { + /* Index entry exists, but not on metapage. */ + return true; + } + + /* Is it too new to have an index entry? */ + relp = pageno - meta->cbm_index_metapage_start; + if (relp >= CB_METAPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment) + return false; + + /* Index entry must be in the metapage, if it exists at all. */ + segno = meta->cbm_index[relp / meta->cbm_pages_per_segment]; + segoff = relp % meta->cbm_pages_per_segment; + if (segno == CB_INVALID_SEGMENT) + return false; + + /* Location identified! */ + *blkno = cb_segment_to_block(meta->cbm_pages_per_segment, segno, segoff); + return true; +} + +/* + * Tell the caller what needs to be done to insert a page. + * + * Regardless of the return value, *next_pageno and *next_segno will be + * set to the lowest-numbered logical page that is not allocated and the + * lowest segment number that is not allocated, respectively. 
In addition, + * *index_start will be set to the first logical page number covered by the + * index, *index_metapage_start to the first logical page number covered by + * the metapage portion of the index, and *newest_index_segment to the segment + * number of the newest index segment, or CB_INVALID_SEGMENT if there is none. + * + * If the return value is CBM_INSERT_OK, *blkno will be set to the block number + * of the first unused page in the unfilled payload segment. + * + * If the return value is CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED, *blkno + * will be set to the first not-entirely-filled page in the newest index + * segment. + */ +CBMInsertState +cb_metapage_get_insert_state(CBMetapageData *meta, + BlockNumber *blkno, + CBPageNo *next_pageno, + CBSegNo *next_segno, + CBPageNo *index_start, + CBPageNo *index_metapage_start, + CBSegNo *newest_index_segment) +{ + CBPageNo relp; + CBSegNo segno; + unsigned segoff; + + /* Set the values that we return unconditionally. */ + *next_pageno = meta->cbm_next_logical_page; + *next_segno = meta->cbm_next_segment; + *index_start = meta->cbm_index_start; + *index_metapage_start = meta->cbm_index_metapage_start; + *newest_index_segment = meta->cbm_newest_index_segment; + + /* Compute next logical page number relative to start of metapage. */ + relp = meta->cbm_next_logical_page - meta->cbm_index_metapage_start; + + /* + * If the next logical page number doesn't fit on the metapage, we need to + * make space by relocating some index entries to an index segment. + * + * Potentially, we could instead clean out some index entries from the + * metapage that now precede the logical truncation point, but that would + * require a cleanup lock on the metapage, and it normally isn't going to + * be possible, because typically the last truncate operation will have + * afterwards done any such work that is possible. We might miss an + * opportunity in the case where the last truncate operation didn't clean + * up fully, but hopefully that's rare enough that we don't need to stress + * about it. + * + * If the newest index segment is already full, then a new index segment + * will need to be created. Otherwise, some entries can be copied into the + * existing index segment. + */ + if (relp >= CB_METAPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment) + { + unsigned entries; + unsigned maxentries; + + entries = meta->cbm_entries_in_newest_index_segment; + maxentries = CB_INDEXPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment; + + if (entries > maxentries) + elog(ERROR, + "newest index segment listed as using %u of %u entries", + entries, maxentries); + else if (entries == maxentries || + meta->cbm_newest_index_segment == CB_INVALID_SEGMENT) + return CBM_INSERT_NEEDS_INDEX_SEGMENT; + else + { + /* Figure out which block should be targeted. */ + *blkno = cb_segment_to_block(meta->cbm_pages_per_segment, + meta->cbm_newest_index_segment, + entries / CB_INDEXPAGE_INDEX_ENTRIES); + + return CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED; + } + } + + /* Compute current insertion segment and offset. */ + segno = meta->cbm_index[relp / meta->cbm_pages_per_segment]; + segoff = meta->cbm_next_logical_page % meta->cbm_pages_per_segment; + + /* + * If the next logical page number would be covered by an index entry that + * does not yet exist, we need a new payload segment. + */ + if (segno == CB_INVALID_SEGMENT) + return CBM_INSERT_NEEDS_PAYLOAD_SEGMENT; + + /* Looks like we can go ahead and insert a page. Hooray! 
*/ + *blkno = cb_segment_to_block(meta->cbm_pages_per_segment, segno, segoff); + return CBM_INSERT_OK; +} + +/* + * Advance the next logical page number for this conveyor belt by one. + * + * We require the caller to specify the physical block number where the new + * block was placed. This allows us to perform some sanity-checking. + */ +void +cb_metapage_advance_next_logical_page(CBMetapageData *meta, + BlockNumber blkno) +{ + BlockNumber expected_blkno; + CBPageNo dummy_pageno; + CBSegNo dummy_segno; + + /* Perform sanity checks. */ + if (cb_metapage_get_insert_state(meta, &expected_blkno, &dummy_pageno, + &dummy_segno, &dummy_pageno, + &dummy_pageno, &dummy_segno) + != CBM_INSERT_OK) + elog(ERROR, "no active insertion segment"); + if (blkno != expected_blkno) + elog(ERROR, "new page is at block %u but expected block %u", + blkno, expected_blkno); + + /* Do the real work. */ + meta->cbm_next_logical_page++; +} + +/* + * Advance the oldest logical page number for this conveyor belt. + */ +void +cb_metapage_advance_oldest_logical_page(CBMetapageData *meta, + CBPageNo oldest_logical_page) +{ + /* + * Something must be desperately wrong if an effort is ever made to set + * the value backwards or even to the existing value. Higher-level code + * can choose to do nothing in such cases rather than rejecting them, but + * this function should only get called when we're committed to dirtying + * the page and (if required) writing WAL. + */ + if (meta->cbm_oldest_logical_page >= oldest_logical_page) + elog(ERROR, "oldest logical page is already " UINT64_FORMAT " so can't be set to " UINT64_FORMAT, + meta->cbm_oldest_logical_page, oldest_logical_page); + + /* Do the real work. */ + meta->cbm_oldest_logical_page = oldest_logical_page; +} + +/* + * Get the oldest and next logical page numbers for this conveyor belt. + */ +void +cb_metapage_get_bounds(CBMetapageData *meta, CBPageNo *oldest_logical_page, + CBPageNo *next_logical_page) +{ + *oldest_logical_page = meta->cbm_oldest_logical_page; + *next_logical_page = meta->cbm_next_logical_page; +} + +/* + * Compute the number of index entries that are used in the metapage. + * + * For our purposes here, an index entry isn't used unless there are some + * logical pages associated with it. It's possible that the real number + * of index entries is one higher than the value we return, but if so, + * no pages have been allocated from the final segment just yet. + * + * The reason this is OK is that the intended purpose of this function is + * to figure out where a new index entry ought to be put, and we shouldn't + * be putting a new index entry into the page at all unless all of the + * existing entries point to segments that are completely full. If we + * needed to know how many entries had been filled in, whether or not any + * of the associated storage was in use, we could do that by adding 1 to + * the value computed here here if the entry at that offset is already + * initialized. + */ +int +cb_metapage_get_index_entries_used(CBMetapageData *meta) +{ + CBPageNo relp; + + /* + * Compute next logical page number relative to start of metapage. + * + * NB: The number of index entries could be equal to the number that will + * fit on the page, but it cannot be more. 
+ */ + relp = meta->cbm_next_logical_page - meta->cbm_index_metapage_start; + if (relp > CB_METAPAGE_INDEX_ENTRIES * meta->cbm_pages_per_segment) + elog(ERROR, + "next logical page " UINT64_FORMAT " not in metapage index starting at " UINT64_FORMAT, + meta->cbm_next_logical_page, meta->cbm_index_start); + + /* Now we can calculate the answer. */ + return relp / meta->cbm_pages_per_segment; +} + +/* + * Add a new index entry to the metapage. + */ +void +cb_metapage_add_index_entry(CBMetapageData *meta, CBSegNo segno) +{ + int offset = cb_metapage_get_index_entries_used(meta); + + /* Sanity checks. */ + if (offset >= CB_METAPAGE_INDEX_ENTRIES) + elog(ERROR, "no space for index entries remains on metapage"); + if (meta->cbm_index[offset] != CB_INVALID_SEGMENT) + elog(ERROR, "index entry at offset %d unexpectedly in use for segment %u", + offset, meta->cbm_index[offset]); + + /* Add the entry. */ + meta->cbm_index[offset] = segno; +} + +/* + * Remove index entries from the metapage. + * + * This needs to be done in two cases. First, it might be that the whole + * index is in the metapage and that we're just trimming away some unused + * entries. In that case, pass relocating = false. Second, it might be that + * we're relocating index entries from the metapage to an index segment to + * make more space in the metapage. In that case, pass relocating = true. + */ +void +cb_metapage_remove_index_entries(CBMetapageData *meta, unsigned count, + bool relocating) +{ + int used = cb_metapage_get_index_entries_used(meta); + int offset; + + /* This shouldn't be called unless there is some real work to do. */ + Assert(count > 0); + + /* Sanity checks. */ + if (used < count) + elog(ERROR, + "can't remove %d entries from a page containing only %d entries", + count, used); + if (!relocating && + (meta->cbm_oldest_index_segment != CB_INVALID_SEGMENT || + meta->cbm_newest_index_segment != CB_INVALID_SEGMENT || + meta->cbm_index_start != meta->cbm_index_metapage_start)) + elog(ERROR, "removed index entries should be relocated if index segments exist"); + if (relocating && + (meta->cbm_oldest_index_segment == CB_INVALID_SEGMENT || + meta->cbm_newest_index_segment == CB_INVALID_SEGMENT)) + elog(ERROR, "removed index entries can't be relocated if no index segments exist"); + + /* Move any entries that we are keeping. */ + if (count < used) + memmove(&meta->cbm_index[0], &meta->cbm_index[count], + sizeof(CBSegNo) * (used - count)); + + /* Zap the entries that were formerly in use and are no longer. */ + for (offset = used - count; offset < used; ++offset) + meta->cbm_index[offset] = CB_INVALID_SEGMENT; + + /* + * Adjust meta->cbm_index_metapage_start to compensate for the index + * entries that we just removed. + */ + meta->cbm_index_metapage_start += + count * meta->cbm_pages_per_segment; + if (relocating) + meta->cbm_entries_in_newest_index_segment += count; + else + meta->cbm_index_start = meta->cbm_index_metapage_start; +} + +/* + * Copy the indicated number of index entries out of the metapage. + */ +void +cb_metapage_get_index_entries(CBMetapageData *meta, unsigned num_index_entries, + CBSegNo *index_entries) +{ + Assert(num_index_entries <= cb_metapage_get_index_entries_used(meta)); + + memcpy(index_entries, meta->cbm_index, + num_index_entries * sizeof(CBSegNo)); +} + +/* + * Return various pieces of information that are needed to initialize for + * access to a conveyor belt. 
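+ *
+ * These are the values a backend wants to hold onto before doing anything
+ * else: pages_per_segment cannot change after the conveyor belt is
+ * created, and index_segments_moved is what cbcache.c consults to decide
+ * whether its cached index segment locations are still valid.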
+ */ +void +cb_metapage_get_critical_info(CBMetapageData *meta, + uint16 *pages_per_segment, + uint64 *index_segments_moved) +{ + *pages_per_segment = meta->cbm_pages_per_segment; + *index_segments_moved = meta->cbm_index_segments_moved; +} + +/* + * Return various pieces of information that are needed to access index + * segments. + */ +void +cb_metapage_get_index_info(CBMetapageData *meta, + CBPageNo *index_start, + CBPageNo *index_metapage_start, + CBSegNo *oldest_index_segment, + CBSegNo *newest_index_segment, + uint64 *index_segments_moved) +{ + *index_start = meta->cbm_index_start; + *index_metapage_start = meta->cbm_index_metapage_start; + *oldest_index_segment = meta->cbm_oldest_index_segment; + *newest_index_segment = meta->cbm_newest_index_segment; + *index_segments_moved = meta->cbm_index_segments_moved; +} + +/* + * Update the metapage to reflect the addition of an index segment. + */ +void +cb_metapage_add_index_segment(CBMetapageData *meta, CBSegNo segno) +{ + meta->cbm_newest_index_segment = segno; + meta->cbm_entries_in_newest_index_segment = 0; + if (meta->cbm_oldest_index_segment == CB_INVALID_SEGMENT) + meta->cbm_oldest_index_segment = segno; +} + +/* + * Update the metapage to reflect the removal of an index segment. + * + * 'segno' should be the successor of the index segment being removed, + * or CB_INVALID_SEGMENT if, at present, only one index segment exists. + */ +void +cb_metapage_remove_index_segment(CBMetapageData *meta, CBSegNo segno) +{ + if (meta->cbm_oldest_index_segment == CB_INVALID_SEGMENT || + meta->cbm_newest_index_segment == CB_INVALID_SEGMENT) + elog(ERROR, "can't remove index segment when none remain"); + + if (segno == CB_INVALID_SEGMENT) + { + if (meta->cbm_oldest_index_segment != + meta->cbm_newest_index_segment) + elog(ERROR, "can't remove last index segment when >1 remain"); + meta->cbm_oldest_index_segment = CB_INVALID_SEGMENT; + meta->cbm_newest_index_segment = CB_INVALID_SEGMENT; + } + else + { + if (meta->cbm_oldest_index_segment == + meta->cbm_newest_index_segment) + elog(ERROR, "must remove last index segment when only one remains"); + meta->cbm_oldest_index_segment = segno; + } +} + +/* + * Examine the metapage state to determine how to go about recycling space. + * + * If the return value is CBM_OBSOLETE_SEGMENT_ENTRIES, then + * *oldest_index_segment will be set to the segment number of the oldest index + * segment, and *index_vacuum_stop_point will be set to the oldest page number + * for which any index entry in the index pages should not be removed. The + * caller should remove index entries that precede that point from index + * segments, and if possible the segments themselves. + * + * If the return value is CBM_OBSOLETE_METAPAGE_ENTRIES, then *metapage_segno + * will be set to a payload segment that can be deallocated, and + * *metapage_offset to the location in the metapage where the index entry + * referencing that segment is stored. + * + * If the return value is CBM_OBSOLETE_METAPAGE_START, then there are + * no index segments and no uncleared index entries in the metapage that + * are obsolete, but some cleared index entries can be discarded. + * *metapage_offset will be set to the smallest metapage offset that cannot + * be cleared (either because it is still in use, or because it is not yet + * allocated). + * + * If the return value is CBM_OBSOLETE_NOTHING, there's nothing to do. 
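+ *
+ * In other words, cleanup proceeds from the outside in: first index
+ * entries stored in index segments (and, once emptied, those segments
+ * themselves), then obsolete entries still stored in the metapage, and
+ * finally the already-cleared slots at the front of the metapage index.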
+ */ +CBMObsoleteState +cb_metapage_get_obsolete_state(CBMetapageData *meta, + CBSegNo *oldest_index_segment, + CBPageNo *index_vacuum_stop_point, + CBSegNo *metapage_segno, + unsigned *metapage_offset) +{ + CBPageNo istart = meta->cbm_index_start; + CBPageNo imstart = meta->cbm_index_metapage_start; + CBPageNo olpage = meta->cbm_oldest_logical_page; + uint16 pps = meta->cbm_pages_per_segment; + unsigned keep_offset; + unsigned offset; + + /* Sanity checks. */ + if (olpage < istart) + elog(ERROR, + "index starts at " UINT64_FORMAT " but oldest logical page is " UINT64_FORMAT, + istart, olpage); + if (imstart < istart) + elog(ERROR, + "metapage index starts at " UINT64_FORMAT " but index starts at " UINT64_FORMAT, + imstart, istart); + if (istart % pps != 0) + elog(ERROR, + "index start " UINT64_FORMAT " is not a multiple of pages per segment", + istart); + if (imstart % pps != 0) + elog(ERROR, + "index metapage start " UINT64_FORMAT " is not a multiple of pages per segment", + imstart); + + /* + * Detect the case where there is no obsolete data in the index. + * + * This happens if the oldest logical page is either equal to the start + * of the index, or follows it by less than the number of pages per + * segment. In the latter case, some but not all of the pages in the + * oldest payload segment are obsolete. We can only clean up entire + * payload semgents, so in such cases there is nothing to do. + */ + if (istart + pps > olpage) + return CBM_OBSOLETE_NOTHING; + + /* + * If there are any index segments, then the first step is to remove + * index entries from those segments, and the second step is to remove + * the segments themselves if they end up containing no useful entries. + * We need not consider doing anything in the metapage itself until no + * index segments remain. + */ + if (meta->cbm_oldest_index_segment != CB_INVALID_SEGMENT) + { + *oldest_index_segment = meta->cbm_oldest_index_segment; + *index_vacuum_stop_point = + Min(meta->cbm_index_metapage_start, + meta->cbm_oldest_logical_page); + return CBM_OBSOLETE_SEGMENT_ENTRIES; + } + + /* + * Since there are no index pages, the whole index is in the metapage, + * and therefore the logical page number should be somewhere in the range + * of pages covered by the metapage. + */ + if (olpage < imstart) + elog(ERROR, + "oldest logical page " UINT64_FORMAT " precedes metapage start " UINT64_FORMAT " but there are no index segments", + olpage, imstart); + + /* Search for obsolete index entries that have not yet been cleared. */ + keep_offset = (olpage - imstart) / pps; + for (offset = 0; offset < keep_offset; ++offset) + { + if (meta->cbm_index[offset] != CB_INVALID_SEGMENT) + { + *metapage_segno = meta->cbm_index[offset]; + *metapage_offset = offset; + return CBM_OBSOLETE_METAPAGE_ENTRIES; + } + } + + /* + * Apparently, there's nothing left to do but discard already-cleared + * index entries. + */ + *metapage_offset = keep_offset; + return CBM_OBSOLETE_METAPAGE_START; +} + +/* + * Clear a single index entry from the metapage. + * + * We require that the caller provide not only the offset but the segment + * number that is expected to be found at that offset. That lets us check + * that nothing unexpected has occurred. 
*/ +void +cb_metapage_clear_obsolete_index_entry(CBMetapageData *meta, + CBSegNo segno, + unsigned offset) +{ + if (meta->cbm_index[offset] != segno) + elog(ERROR, + "index entry at offset %u was expected to be %u but found %u", + offset, segno, meta->cbm_index[offset]); + + meta->cbm_index[offset] = CB_INVALID_SEGMENT; +} + +/* + * Returns the lowest unused segment number covered by the metapage, + * or CB_INVALID_SEGMENT if none. + */ +CBSegNo +cb_metapage_find_free_segment(CBMetapageData *meta) +{ + unsigned i; + unsigned j; + + for (i = 0; i < CB_METAPAGE_FREESPACE_BYTES; ++i) + { + uint8 b = meta->cbm_freespace_map[i]; + + if (b == 0xFF) + continue; + + for (j = 0; j < BITS_PER_BYTE; ++j) + { + if ((b & (1 << j)) == 0) + return (i * BITS_PER_BYTE) + j; + } + } + + return CB_INVALID_SEGMENT; +} + +/* + * Get the allocation status of a segment from the metapage fsm. + */ +bool +cb_metapage_get_fsm_bit(CBMetapageData *meta, CBSegNo segno) +{ + uint8 byte; + uint8 mask; + + if (segno >= CB_METAPAGE_FREESPACE_BYTES * BITS_PER_BYTE) + elog(ERROR, "segment %u out of range for metapage fsm", segno); + + byte = meta->cbm_freespace_map[segno / BITS_PER_BYTE]; + mask = 1 << (segno % BITS_PER_BYTE); + return (byte & mask) != 0; +} + +/* + * Set the allocation status of a segment in the metapage fsm. + * + * new_state should be true if the bit is currently clear and should be set, + * and false if the bit is currently set and should be cleared. Don't call + * this unless you know that the bit actually needs to be changed. + */ +void +cb_metapage_set_fsm_bit(CBMetapageData *meta, CBSegNo segno, bool new_state) +{ + uint8 *byte; + uint8 mask; + uint8 old_state; + + if (segno >= CB_FSM_SEGMENTS_FOR_METAPAGE) + elog(ERROR, "segment %u out of range for metapage fsm", segno); + + byte = &meta->cbm_freespace_map[segno / BITS_PER_BYTE]; + mask = 1 << (segno % BITS_PER_BYTE); + old_state = (*byte & mask) != 0; + + if (old_state == new_state) + elog(ERROR, "metapage fsm bit for segment %u already has value %d", + segno, old_state ? 1 : 0); + + if (new_state) + *byte |= mask; + else + *byte &= ~mask; +} + +/* + * Increment the count of segments allocated. + */ +void +cb_metapage_increment_next_segment(CBMetapageData *meta, CBSegNo segno) +{ + if (segno != meta->cbm_next_segment) + elog(ERROR, "extending to create segment %u but next segment is %u", + segno, meta->cbm_next_segment); + + meta->cbm_next_segment++; +} + +/* + * Increment the count of index segments moved. + */ +void +cb_metapage_increment_index_segments_moved(CBMetapageData *meta) +{ + meta->cbm_index_segments_moved++; +} diff --git a/src/backend/access/conveyor/cbmodify.c b/src/backend/access/conveyor/cbmodify.c new file mode 100644 index 0000000000..7d74bce027 --- /dev/null +++ b/src/backend/access/conveyor/cbmodify.c @@ -0,0 +1,686 @@ +/*------------------------------------------------------------------------- + * + * cbmodify.c + * Routines to make a change to a conveyor belt and XLOG it if needed. + * + * Each function in this file implements one type of conveyor-belt write + * operation. The pages to be modified are assumed to already have been + * identified and locked. + * + * Each function in this file has a corresponding REDO function in + * cbxlog.c, except where log_newpage is used.
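+ *
+ * Most functions here follow a common pattern: the caller passes in
+ * buffers that are already pinned and locked, we enter a critical
+ * section, apply the page changes and mark the buffers dirty, and then,
+ * when needs_xlog is true, register the buffers and any record data,
+ * insert the WAL record, and stamp each modified page with the resulting
+ * LSN before leaving the critical section.  (cb_insert_payload_page is
+ * a partial exception: the caller initializes the payload page and
+ * therefore also owns the critical section.)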
+ * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/backend/access/conveyor/cbmodify.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/cbfsmpage.h" +#include "access/cbindexpage.h" +#include "access/cbmetapage.h" +#include "access/cbmodify.h" +#include "access/cbxlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" + +/* + * Create a metapage, and optionally write XLOG for the change. + */ +void +cb_create_metapage(RelFileNode *rnode, + ForkNumber fork, + Buffer metabuffer, + uint16 pages_per_segment, + bool needs_xlog) +{ + Page metapage; + + metapage = BufferGetPage(metabuffer); + cb_metapage_initialize(metapage, pages_per_segment); + + if (needs_xlog) + { + XLogRecPtr lsn; + + lsn = log_newpage(rnode, fork, CONVEYOR_METAPAGE, metapage, true); + PageSetLSN(metapage, lsn); + } + + MarkBufferDirty(metabuffer); +} + +/* + * Create a new FSM page, and optionally write XLOG for the change. + */ +CBSegNo +cb_create_fsmpage(RelFileNode *rnode, + ForkNumber fork, + BlockNumber blkno, + Buffer buffer, + uint16 pages_per_segment, + bool needs_xlog) +{ + Page page; + CBSegNo segno; + + START_CRIT_SECTION(); + + page = BufferGetPage(buffer); + segno = cb_fsmpage_initialize(page, blkno, pages_per_segment); + MarkBufferDirty(buffer); + + if (needs_xlog) + { + XLogRecPtr lsn; + + lsn = log_newpage(rnode, fork, blkno, page, true); + PageSetLSN(page, lsn); + } + + END_CRIT_SECTION(); + + return segno; +} + +/* + * Insert a payload page, and optionally write XLOG for the change. + * + * Since we have no idea what the contents of the payload page ought to be, + * it's up to the caller to initialize it before calling this function. + * That means that the caller is also responsible for starting and ending + * the required critical section. + */ +void +cb_insert_payload_page(RelFileNode *rnode, ForkNumber fork, Buffer metabuffer, + BlockNumber payloadblock, Buffer payloadbuffer, + bool needs_xlog) +{ + Page metapage; + Page payloadpage; + CBMetapageData *meta; + + Assert(CritSectionCount > 0); + + payloadpage = BufferGetPage(payloadbuffer); + MarkBufferDirty(payloadbuffer); + + metapage = BufferGetPage(metabuffer); + meta = cb_metapage_get_special(metapage); + cb_metapage_advance_next_logical_page(meta, payloadblock); + MarkBufferDirty(metabuffer); + + if (needs_xlog) + { + XLogRecPtr lsn; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage, + REGBUF_STANDARD); + XLogRegisterBlock(1, rnode, fork, payloadblock, + payloadpage, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + lsn = XLogInsert(RM_CONVEYOR_ID, + XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE); + + PageSetLSN(payloadpage, lsn); + PageSetLSN(metapage, lsn); + } +} + +/* + * Allocate a new payload segment, and optionally write XLOG for the change. + * + * If the allocation status of the segment is tracked in the metapage, + * 'fsmblock' should be InvalidBlockNumber and 'fsmbuffer' should be + * InvalidBuffer. Otherwise, 'fsmblock' should be the block number of the + * relevant freespace map block and 'fsmbuffer' the corresponding buffer. + * + * 'is_extend' should be true when we're allocating a segment that hasn't + * existed before, necessitating an adjustment to the metapage's + * next-segment counter. + * + * See cb_xlog_allocate_payload_segment for the corresponding REDO routine. 
+ */ +void +cb_allocate_payload_segment(RelFileNode *rnode, + ForkNumber fork, + Buffer metabuffer, + BlockNumber fsmblock, + Buffer fsmbuffer, + CBSegNo segno, + bool is_extend, + bool needs_xlog) +{ + Page metapage; + CBMetapageData *meta; + + metapage = BufferGetPage(metabuffer); + meta = cb_metapage_get_special(metapage); + + START_CRIT_SECTION(); + + cb_metapage_add_index_entry(meta, segno); + MarkBufferDirty(metabuffer); + + if (is_extend) + cb_metapage_increment_next_segment(meta, segno); + + if (fsmblock != InvalidBlockNumber) + { + cb_fsmpage_set_fsm_bit(BufferGetPage(fsmbuffer), segno, true); + MarkBufferDirty(fsmbuffer); + } + else + cb_metapage_set_fsm_bit(meta, segno, true); + + if (needs_xlog) + { + xl_cb_allocate_payload_segment xlrec; + XLogRecPtr lsn; + + xlrec.segno = segno; + xlrec.is_extend = is_extend; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage, + REGBUF_STANDARD); + if (fsmblock != InvalidBlockNumber) + XLogRegisterBlock(1, rnode, fork, fsmblock, + BufferGetPage(fsmbuffer), REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec, SizeOfCBAllocatePayloadSegment); + lsn = XLogInsert(RM_CONVEYOR_ID, + XLOG_CONVEYOR_ALLOCATE_PAYLOAD_SEGMENT); + + PageSetLSN(metapage, lsn); + if (fsmblock != InvalidBlockNumber) + PageSetLSN(BufferGetPage(fsmbuffer), lsn); + } + + END_CRIT_SECTION(); +} + +/* + * Allocate a new index segment, and optionally write XLOG for the change. + * + * 'metabuffer' should be the buffer containing the metapage. + * + * 'indexblock' and 'indexbuffer' should be the block number and buffer for + * the first page of the index segment. + * + * If any index segments already exist, then 'prevblock' should be the + * block number of the first page of the last index segment that already + * exists, and 'prevbuffer' the corresponding buffer; otherwise, use + * InvalidBlockNumber and InvalidBuffer, respectively. + * + * Similarly, if the allocation status of the segment is tracked in an + * FSM page, 'fsmblock' and 'fsmbuffer' should reference that page; if that + * information is tracked in the metpaage, the InvalidBlockNumber and + * InvalidBuffer. + * + * 'segno' is the segment number of the new index segment, and 'pageno' + * is the first logical page for which it will store index information. + * + * 'is_extend' should be true when we're allocating a segment that hasn't + * existed before, necessitating an adjustment to the metapage's + * next-segment counter. + * + * See cb_xlog_allocate_index_segment for the corresponding REDO routine. 
+ */ +void +cb_allocate_index_segment(RelFileNode *rnode, + ForkNumber fork, + Buffer metabuffer, + BlockNumber indexblock, + Buffer indexbuffer, + BlockNumber prevblock, + Buffer prevbuffer, + BlockNumber fsmblock, + Buffer fsmbuffer, + CBSegNo segno, + CBPageNo pageno, + bool is_extend, + bool needs_xlog) +{ + Page metapage; + Page indexpage; + CBMetapageData *meta; + + metapage = BufferGetPage(metabuffer); + indexpage = BufferGetPage(indexbuffer); + + meta = cb_metapage_get_special(metapage); + + START_CRIT_SECTION(); + + cb_metapage_add_index_segment(meta, segno); + MarkBufferDirty(metabuffer); + + if (is_extend) + cb_metapage_increment_next_segment(meta, segno); + + cb_indexpage_initialize(indexpage, pageno); + MarkBufferDirty(indexbuffer); + + if (prevblock != InvalidBlockNumber) + { + cb_indexpage_set_next_segment(BufferGetPage(prevbuffer), segno); + MarkBufferDirty(prevbuffer); + } + + if (fsmblock != InvalidBlockNumber) + { + cb_fsmpage_set_fsm_bit(BufferGetPage(fsmbuffer), segno, true); + MarkBufferDirty(fsmbuffer); + } + else + cb_metapage_set_fsm_bit(meta, segno, true); + + if (needs_xlog) + { + xl_cb_allocate_index_segment xlrec; + XLogRecPtr lsn; + + xlrec.segno = segno; + xlrec.pageno = pageno; + xlrec.is_extend = is_extend; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage, + REGBUF_STANDARD); + XLogRegisterBlock(1, rnode, fork, indexblock, indexpage, + REGBUF_STANDARD | REGBUF_WILL_INIT); + if (prevblock != InvalidBlockNumber) + XLogRegisterBlock(2, rnode, fork, prevblock, + BufferGetPage(prevbuffer), REGBUF_STANDARD); + if (fsmblock != InvalidBlockNumber) + XLogRegisterBlock(3, rnode, fork, fsmblock, + BufferGetPage(fsmbuffer), REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec, SizeOfCBAllocateIndexSegment); + lsn = XLogInsert(RM_CONVEYOR_ID, + XLOG_CONVEYOR_ALLOCATE_INDEX_SEGMENT); + + PageSetLSN(metapage, lsn); + PageSetLSN(indexpage, lsn); + if (prevblock != InvalidBlockNumber) + PageSetLSN(BufferGetPage(prevbuffer), lsn); + if (fsmblock != InvalidBlockNumber) + PageSetLSN(BufferGetPage(fsmbuffer), lsn); + } + + END_CRIT_SECTION(); +} + +/* + * Allocate a new index page in an existing index segment, and optionally + * write XLOG for the change. + * + * 'indexblock' and 'indexbuffer' should be the block number and buffer for + * the new page. 'firstindexblock' and 'firstindexbuffer' are the block + * number and buffer for the first page of the index segment. + * + * 'pageno' is the first logical page for which the new index page will + * store index information. + * + * See cb_xlog_allocate_index_page for the corresponding REDO routine. + */ +void +cb_allocate_index_page(RelFileNode *rnode, + ForkNumber fork, + BlockNumber indexblock, + Buffer indexbuffer, + CBPageNo pageno, + bool needs_xlog) +{ + Page indexpage; + + indexpage = BufferGetPage(indexbuffer); + + START_CRIT_SECTION(); + + cb_indexpage_initialize(indexpage, pageno); + MarkBufferDirty(indexbuffer); + + if (needs_xlog) + { + xl_cb_allocate_index_page xlrec; + XLogRecPtr lsn; + + xlrec.pageno = pageno; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, fork, indexblock, indexpage, + REGBUF_STANDARD | REGBUF_WILL_INIT); + XLogRegisterData((char *) &xlrec, SizeOfCBAllocateIndexPage); + lsn = XLogInsert(RM_CONVEYOR_ID, + XLOG_CONVEYOR_ALLOCATE_INDEX_PAGE); + + PageSetLSN(indexpage, lsn); + } + + END_CRIT_SECTION(); +} + +/* + * Relocate index entries from the metapage to a page in an index segment, + * and optionally write XLOG for the change. 
+ * + * 'pageoffset' is the offset within the index page where the new entries + * should be placed. + * + * 'index_page_start' is the first logical page number covered by the index + * page being modified. + * + * See cb_xlog_allocate_index_segment for the corresponding REDO routine. + */ +void +cb_relocate_index_entries(RelFileNode *rnode, + ForkNumber fork, + Buffer metabuffer, + BlockNumber indexblock, + Buffer indexbuffer, + unsigned pageoffset, + unsigned num_index_entries, + CBSegNo *index_entries, + CBPageNo index_page_start, + bool needs_xlog) +{ + Page metapage; + Page indexpage; + CBMetapageData *meta; + + metapage = BufferGetPage(metabuffer); + indexpage = BufferGetPage(indexbuffer); + + meta = cb_metapage_get_special(metapage); + + START_CRIT_SECTION(); + + /* If these are the first entries on the page, initialize it. */ + if (pageoffset == 0) + cb_indexpage_initialize(indexpage, index_page_start); + + cb_indexpage_add_index_entries(indexpage, pageoffset, num_index_entries, + index_entries); + cb_metapage_remove_index_entries(meta, num_index_entries, true); + + MarkBufferDirty(metabuffer); + MarkBufferDirty(indexbuffer); + + if (needs_xlog) + { + xl_cb_relocate_index_entries xlrec; + XLogRecPtr lsn; + uint8 flags = REGBUF_STANDARD; + + xlrec.pageoffset = pageoffset; + xlrec.num_index_entries = num_index_entries; + xlrec.index_page_start = index_page_start; + + if (pageoffset == 0) + flags |= REGBUF_WILL_INIT; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage, + REGBUF_STANDARD); + XLogRegisterBlock(1, rnode, fork, indexblock, indexpage, flags); + XLogRegisterData((char *) &xlrec, SizeOfCBRelocateIndexEntries); + XLogRegisterData((char *) index_entries, + num_index_entries * sizeof(CBSegNo)); + lsn = XLogInsert(RM_CONVEYOR_ID, + XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES); + + PageSetLSN(metapage, lsn); + PageSetLSN(indexpage, lsn); + } + + END_CRIT_SECTION(); +} + +/* + * Logically truncate a conveyor belt by updating its notion of the oldest + * logical page. + */ +void +cb_logical_truncate(RelFileNode *rnode, + ForkNumber fork, + Buffer metabuffer, + CBPageNo oldest_keeper, + bool needs_xlog) +{ + Page metapage; + CBMetapageData *meta; + + metapage = BufferGetPage(metabuffer); + meta = cb_metapage_get_special(metapage); + + START_CRIT_SECTION(); + + cb_metapage_advance_oldest_logical_page(meta, oldest_keeper); + + MarkBufferDirty(metabuffer); + + if (needs_xlog) + { + xl_cb_logical_truncate xlrec; + XLogRecPtr lsn; + + xlrec.oldest_keeper = oldest_keeper; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage, + REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec, SizeOfCBLogicalTruncate); + lsn = XLogInsert(RM_CONVEYOR_ID, + XLOG_CONVEYOR_LOGICAL_TRUNCATE); + + PageSetLSN(metapage, lsn); + } + + END_CRIT_SECTION(); +} + +/* + * Clear a block in preparation for deallocating the segment that contains it. + * + * The block needs to appear unused to ConveyorBeltPageIsUnused(); a simple + * call to PageInit() is the easiest way to accomplish that. + * + * We could use log_newpage() here but it would generate more WAL. 
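+ * (log_newpage() forces a full-page image into the WAL, whereas the
+ * XLOG_CONVEYOR_CLEAR_BLOCK record registers the buffer with
+ * REGBUF_WILL_INIT and carries no page image; redo simply re-initializes
+ * the page from scratch.)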
+ */
+void
+cb_clear_block(RelFileNode *rnode,
+			   ForkNumber fork,
+			   BlockNumber blkno,
+			   Buffer buffer,
+			   bool needs_xlog)
+{
+	Page		page = BufferGetPage(buffer);
+
+	START_CRIT_SECTION();
+
+	PageInit(page, BLCKSZ, 0);
+
+	MarkBufferDirty(buffer);
+
+	if (needs_xlog)
+	{
+		XLogRecPtr	lsn;
+
+		XLogBeginInsert();
+		XLogRegisterBlock(0, rnode, fork, blkno, page,
+						  REGBUF_STANDARD | REGBUF_WILL_INIT);
+		lsn = XLogInsert(RM_CONVEYOR_ID,
+						 XLOG_CONVEYOR_CLEAR_BLOCK);
+
+		PageSetLSN(page, lsn);
+	}
+
+	END_CRIT_SECTION();
+}
+
+/*
+ * Deallocate a payload segment.
+ *
+ * This is a bit tricky. We need to clear the index entry pointing to the
+ * payload segment, and we also need to clear the FSM bit for the segment.
+ * Either, both, or neither of those could be in the metapage.
+ *
+ * If neither is in the metapage, metabuffer should be InvalidBuffer;
+ * otherwise it should be the buffer containing the metapage.
+ *
+ * If the index entry pointing to the payload segment is in the metapage,
+ * then indexblock should be InvalidBlockNumber and indexbuffer should be
+ * InvalidBuffer; otherwise, they should reference the index page containing
+ * the index entry.
+ *
+ * If the freespace map bit for the segment is in the metapage, then
+ * fsmblock should be InvalidBlockNumber and fsmbuffer should be InvalidBuffer;
+ * otherwise, they should reference the FSM page containing the relevant
+ * freespace map bit.
+ */
+void
+cb_recycle_payload_segment(RelFileNode *rnode,
+						   ForkNumber fork,
+						   Buffer metabuffer,
+						   BlockNumber indexblock,
+						   Buffer indexbuffer,
+						   BlockNumber fsmblock,
+						   Buffer fsmbuffer,
+						   CBSegNo segno,
+						   unsigned pageoffset,
+						   bool needs_xlog)
+{
+	START_CRIT_SECTION();
+
+	if (BufferIsValid(metabuffer))
+	{
+		CBMetapageData *meta;
+
+		Assert(indexblock == InvalidBlockNumber ||
+			   fsmblock == InvalidBlockNumber);
+		meta = cb_metapage_get_special(BufferGetPage(metabuffer));
+		if (indexblock == InvalidBlockNumber)
+			cb_metapage_clear_obsolete_index_entry(meta, segno, pageoffset);
+		if (fsmblock == InvalidBlockNumber)
+			cb_metapage_set_fsm_bit(meta, segno, false);
+		MarkBufferDirty(metabuffer);
+	}
+
+	if (indexblock != InvalidBlockNumber)
+	{
+		cb_indexpage_clear_obsolete_entry(BufferGetPage(indexbuffer),
+										  segno, pageoffset);
+		MarkBufferDirty(indexbuffer);
+	}
+
+	if (fsmblock != InvalidBlockNumber)
+	{
+		cb_fsmpage_set_fsm_bit(BufferGetPage(fsmbuffer), segno, false);
+		MarkBufferDirty(fsmbuffer);
+	}
+
+	if (needs_xlog)
+	{
+		xl_cb_recycle_payload_segment xlrec;
+		XLogRecPtr	lsn;
+
+		xlrec.segno = segno;
+		xlrec.pageoffset = pageoffset;
+
+		XLogBeginInsert();
+		if (BufferIsValid(metabuffer))
+			XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE,
+							  BufferGetPage(metabuffer), REGBUF_STANDARD);
+		if (indexblock != InvalidBlockNumber)
+			XLogRegisterBlock(1, rnode, fork, indexblock,
+							  BufferGetPage(indexbuffer), REGBUF_STANDARD);
+		if (fsmblock != InvalidBlockNumber)
+			XLogRegisterBlock(2, rnode, fork, fsmblock,
+							  BufferGetPage(fsmbuffer), REGBUF_STANDARD);
+		XLogRegisterData((char *) &xlrec, SizeOfCBRecyclePayloadSegment);
+		lsn = XLogInsert(RM_CONVEYOR_ID,
+						 XLOG_CONVEYOR_RECYCLE_PAYLOAD_SEGMENT);
+
+		if (BufferIsValid(metabuffer))
+			PageSetLSN(BufferGetPage(metabuffer), lsn);
+		if (indexblock != InvalidBlockNumber)
+			PageSetLSN(BufferGetPage(indexbuffer), lsn);
+		if (fsmblock != InvalidBlockNumber)
+			PageSetLSN(BufferGetPage(fsmbuffer), lsn);
+	}
+
+	END_CRIT_SECTION();
+}
+
+/*
+ * Deallocate an index segment.
+ *
+ * indexblock and indexbuffer should refer to the first block of the segment
+ * to be deallocated.  It's the oldest index segment, so we can't clear it
+ * in advance, else we'd lose track of what other index segments exist.
+ *
+ * fsmblock and fsmbuffer should refer to the FSM page that contains the
+ * FSM bit for the segment to be freed. If the segment is covered by the
+ * metapage, pass InvalidBlockNumber and InvalidBuffer, respectively.
+ *
+ * The return value is the segment number of the oldest index segment that
+ * remains after the operation, or CB_INVALID_SEGMENT if none.
+ */
+CBSegNo
+cb_recycle_index_segment(RelFileNode *rnode,
+						 ForkNumber fork,
+						 Buffer metabuffer,
+						 BlockNumber indexblock,
+						 Buffer indexbuffer,
+						 BlockNumber fsmblock,
+						 Buffer fsmbuffer,
+						 CBSegNo segno,
+						 bool needs_xlog)
+{
+	elog(ERROR, "XXX cb_recycle_index_segment not implemented yet");
+}
+
+/*
+ * Shift the start of the metapage index by discarding a given number
+ * of already-cleared index entries.
+ */
+void
+cb_shift_metapage_index(RelFileNode *rnode,
+						ForkNumber fork,
+						Buffer metabuffer,
+						unsigned num_entries,
+						bool needs_xlog)
+{
+	Page		metapage;
+	CBMetapageData *meta;
+
+	metapage = BufferGetPage(metabuffer);
+	meta = cb_metapage_get_special(metapage);
+
+	START_CRIT_SECTION();
+
+	cb_metapage_remove_index_entries(meta, num_entries, false);
+
+	MarkBufferDirty(metabuffer);
+
+	if (needs_xlog)
+	{
+		xl_cb_shift_metapage_index xlrec;
+		XLogRecPtr	lsn;
+
+		xlrec.num_entries = num_entries;
+
+		XLogBeginInsert();
+		XLogRegisterBlock(0, rnode, fork, CONVEYOR_METAPAGE, metapage,
+						  REGBUF_STANDARD);
+		XLogRegisterData((char *) &xlrec, SizeOfCBShiftMetapageIndex);
+		lsn = XLogInsert(RM_CONVEYOR_ID,
+						 XLOG_CONVEYOR_SHIFT_METAPAGE_INDEX);
+
+		PageSetLSN(metapage, lsn);
+	}
+
+	END_CRIT_SECTION();
+}
diff --git a/src/backend/access/conveyor/cbxlog.c b/src/backend/access/conveyor/cbxlog.c
new file mode 100644
index 0000000000..2dd030c1ac
--- /dev/null
+++ b/src/backend/access/conveyor/cbxlog.c
@@ -0,0 +1,442 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbxlog.c
+ *	  XLOG support for conveyor belts.
+ *
+ * For each REDO function in this file, see cbmodify.c for the
+ * corresponding function that performs the modification during normal
+ * running and logs the record that we REDO here.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/backend/access/conveyor/cbxlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/cbfsmpage.h"
+#include "access/cbindexpage.h"
+#include "access/cbmetapage.h"
+#include "access/cbxlog.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "storage/bufmgr.h"
+
+/*
+ * REDO function for cb_insert_payload_page.
+ *
+ * Note that the handling of block 1 is very similar to XLOG_FPI.
+ */ +static void +cb_xlog_insert_payload_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer metabuffer; + Buffer payloadbuffer; + + if (!XLogRecHasBlockImage(record, 1)) + elog(ERROR, "XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE record did not contain full page image of payload block"); + if (XLogReadBufferForRedo(record, 1, &payloadbuffer) != BLK_RESTORED) + elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); + + /* last due to lock ordering rules; see README */ + if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO) + { + Page metapage = BufferGetPage(metabuffer); + CBMetapageData *meta; + BlockNumber payloadblock; + + meta = cb_metapage_get_special(metapage); + XLogRecGetBlockTag(record, 1, NULL, NULL, &payloadblock); + cb_metapage_advance_next_logical_page(meta, payloadblock); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + } + + UnlockReleaseBuffer(metabuffer); + UnlockReleaseBuffer(payloadbuffer); +} + +/* + * REDO function for cb_allocate_payload_segment. + */ +static void +cb_xlog_allocate_payload_segment(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_cb_allocate_payload_segment *xlrec; + Buffer metabuffer; + bool have_fsm_page = XLogRecGetBlockTag(record, 1, NULL, NULL, NULL); + Buffer fsmbuffer = InvalidBuffer; + + xlrec = (xl_cb_allocate_payload_segment *) XLogRecGetData(record); + + if (have_fsm_page && + XLogReadBufferForRedo(record, 1, &fsmbuffer) == BLK_NEEDS_REDO) + { + Page fsmpage = BufferGetPage(fsmbuffer); + + cb_fsmpage_set_fsm_bit(fsmpage, xlrec->segno, true); + PageSetLSN(fsmpage, lsn); + MarkBufferDirty(fsmbuffer); + } + + /* last due to lock ordering rules; see README */ + if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO) + { + Page metapage = BufferGetPage(metabuffer); + CBMetapageData *meta; + + meta = cb_metapage_get_special(metapage); + cb_metapage_add_index_entry(meta, xlrec->segno); + if (xlrec->is_extend) + cb_metapage_increment_next_segment(meta, xlrec->segno); + if (!have_fsm_page) + cb_metapage_set_fsm_bit(meta, xlrec->segno, true); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + } + + if (BufferIsValid(metabuffer)) + UnlockReleaseBuffer(metabuffer); + if (BufferIsValid(fsmbuffer)) + UnlockReleaseBuffer(fsmbuffer); +} + +/* + * REDO function for cb_allocate_index_segment. 
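+ *
+ * Registered blocks: 0 = metapage, 1 = first page of the new index segment,
+ * 2 = first page of what was previously the newest index segment (if any),
+ * 3 = FSM page (only if the segment's free-space bit is not tracked in the
+ * metapage).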
+ */ +static void +cb_xlog_allocate_index_segment(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_cb_allocate_index_segment *xlrec; + bool have_prev_page; + bool have_fsm_page; + Buffer metabuffer; + Buffer indexbuffer; + Buffer prevbuffer = InvalidBuffer; + Buffer fsmbuffer = InvalidBuffer; + Page indexpage; + + have_prev_page = XLogRecGetBlockTag(record, 2, NULL, NULL, NULL); + have_fsm_page = XLogRecGetBlockTag(record, 3, NULL, NULL, NULL); + + xlrec = (xl_cb_allocate_index_segment *) XLogRecGetData(record); + + indexbuffer = XLogInitBufferForRedo(record, 1); + indexpage = BufferGetPage(indexbuffer); + cb_indexpage_initialize(indexpage, xlrec->pageno); + PageSetLSN(indexpage, lsn); + MarkBufferDirty(indexbuffer); + + if (have_prev_page && + XLogReadBufferForRedo(record, 2, &prevbuffer) == BLK_NEEDS_REDO) + { + Page prevpage = BufferGetPage(prevbuffer); + + cb_indexpage_set_next_segment(prevpage, xlrec->segno); + PageSetLSN(prevpage, lsn); + MarkBufferDirty(prevbuffer); + } + + if (have_fsm_page && + XLogReadBufferForRedo(record, 3, &fsmbuffer) == BLK_NEEDS_REDO) + { + Page fsmpage = BufferGetPage(fsmbuffer); + + cb_fsmpage_set_fsm_bit(fsmpage, xlrec->segno, true); + PageSetLSN(fsmpage, lsn); + MarkBufferDirty(fsmbuffer); + } + + /* last due to lock ordering rules; see README */ + if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO) + { + Page metapage = BufferGetPage(metabuffer); + CBMetapageData *meta; + + meta = cb_metapage_get_special(metapage); + cb_metapage_add_index_segment(meta, xlrec->segno); + if (xlrec->is_extend) + cb_metapage_increment_next_segment(meta, xlrec->segno); + if (!have_fsm_page) + cb_metapage_set_fsm_bit(meta, xlrec->segno, true); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + } + + if (BufferIsValid(metabuffer)) + UnlockReleaseBuffer(metabuffer); + if (BufferIsValid(indexbuffer)) + UnlockReleaseBuffer(indexbuffer); + if (BufferIsValid(prevbuffer)) + UnlockReleaseBuffer(prevbuffer); + if (BufferIsValid(fsmbuffer)) + UnlockReleaseBuffer(fsmbuffer); +} + +/* + * REDO function for cb_allocate_index_page. + */ +static void +cb_xlog_allocate_index_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_cb_allocate_index_page *xlrec; + Buffer indexbuffer; + Page indexpage; + + xlrec = (xl_cb_allocate_index_page *) XLogRecGetData(record); + + indexbuffer = XLogInitBufferForRedo(record, 0); + indexpage = BufferGetPage(indexbuffer); + cb_indexpage_initialize(indexpage, xlrec->pageno); + PageSetLSN(indexpage, lsn); + MarkBufferDirty(indexbuffer); + + UnlockReleaseBuffer(indexbuffer); +} + +/* + * REDO function for cb_relocate_index_entries. + */ +static void +cb_xlog_relocate_index_entries(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_cb_relocate_index_entries *xlrec; + Buffer metabuffer; + Buffer indexbuffer; + ReadBufferMode mode; + + xlrec = (xl_cb_relocate_index_entries *) XLogRecGetData(record); + + mode = xlrec->pageoffset == 0 ? 
		RBM_ZERO_AND_LOCK : RBM_NORMAL;
+	if (XLogReadBufferForRedoExtended(record, 1, mode, false,
+									  &indexbuffer) == BLK_NEEDS_REDO)
+	{
+		Page		indexpage = BufferGetPage(indexbuffer);
+
+		if (xlrec->pageoffset == 0)
+			cb_indexpage_initialize(indexpage, xlrec->index_page_start);
+
+		cb_indexpage_add_index_entries(indexpage, xlrec->pageoffset,
+									   xlrec->num_index_entries,
+									   xlrec->index_entries);
+		PageSetLSN(indexpage, lsn);
+		MarkBufferDirty(indexbuffer);
+	}
+
+	/* NB: metapage must be last due to lock ordering rules */
+	if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+	{
+		Page		metapage = BufferGetPage(metabuffer);
+		CBMetapageData *meta;
+
+		meta = cb_metapage_get_special(metapage);
+		cb_metapage_remove_index_entries(meta, xlrec->num_index_entries, true);
+		PageSetLSN(metapage, lsn);
+		MarkBufferDirty(metabuffer);
+	}
+
+	if (BufferIsValid(metabuffer))
+		UnlockReleaseBuffer(metabuffer);
+	if (BufferIsValid(indexbuffer))
+		UnlockReleaseBuffer(indexbuffer);
+}
+
+/*
+ * REDO function for cb_logical_truncate.
+ */
+static void
+cb_xlog_logical_truncate(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	xl_cb_logical_truncate *xlrec;
+	Buffer		metabuffer;
+
+	xlrec = (xl_cb_logical_truncate *) XLogRecGetData(record);
+
+	if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO)
+	{
+		Page		metapage = BufferGetPage(metabuffer);
+		CBMetapageData *meta;
+
+		meta = cb_metapage_get_special(metapage);
+		cb_metapage_advance_oldest_logical_page(meta, xlrec->oldest_keeper);
+		PageSetLSN(metapage, lsn);
+		MarkBufferDirty(metabuffer);
+	}
+
+	if (BufferIsValid(metabuffer))
+		UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * REDO function for cb_clear_block.
+ */
+static void
+cb_xlog_clear_block(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	Buffer		buffer;
+	Page		page;
+
+	buffer = XLogInitBufferForRedo(record, 0);
+	page = BufferGetPage(buffer);
+	PageInit(page, BLCKSZ, 0);
+	PageSetLSN(page, lsn);
+	MarkBufferDirty(buffer);
+
+	UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * REDO function for cb_recycle_payload_segment.
+ */ +static void +cb_xlog_recycle_payload_segment(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_cb_recycle_payload_segment *xlrec; + bool have_metapage; + bool have_index_page; + bool have_fsm_page; + Buffer fsmbuffer = InvalidBuffer; + Buffer indexbuffer = InvalidBuffer; + Buffer metabuffer = InvalidBuffer; + + have_metapage = XLogRecGetBlockTag(record, 0, NULL, NULL, NULL); + have_index_page = XLogRecGetBlockTag(record, 1, NULL, NULL, NULL); + have_fsm_page = XLogRecGetBlockTag(record, 2, NULL, NULL, NULL); + + xlrec = (xl_cb_recycle_payload_segment *) XLogRecGetData(record); + + if (have_index_page && + XLogReadBufferForRedo(record, 1, &indexbuffer) == BLK_NEEDS_REDO) + { + Page indexpage = BufferGetPage(indexbuffer); + + cb_indexpage_clear_obsolete_entry(indexpage, xlrec->segno, + xlrec->pageoffset); + PageSetLSN(indexpage, lsn); + MarkBufferDirty(indexbuffer); + } + + if (have_fsm_page && + XLogReadBufferForRedo(record, 2, &fsmbuffer) == BLK_NEEDS_REDO) + { + Page fsmpage = BufferGetPage(fsmbuffer); + + cb_fsmpage_set_fsm_bit(fsmpage, xlrec->segno, false); + PageSetLSN(fsmpage, lsn); + MarkBufferDirty(fsmbuffer); + } + + /* last due to lock ordering rules; see README */ + if (have_metapage && + XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO) + { + Page metapage = BufferGetPage(metabuffer); + CBMetapageData *meta; + + meta = cb_metapage_get_special(metapage); + if (!have_index_page) + cb_metapage_clear_obsolete_index_entry(meta, xlrec->segno, + xlrec->pageoffset); + if (!have_fsm_page) + cb_metapage_set_fsm_bit(meta, xlrec->segno, false); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + } + + if (BufferIsValid(fsmbuffer)) + UnlockReleaseBuffer(fsmbuffer); + if (BufferIsValid(indexbuffer)) + UnlockReleaseBuffer(indexbuffer); + if (BufferIsValid(metabuffer)) + UnlockReleaseBuffer(metabuffer); +} + +/* + * REDO function for cb_recycle_index_segment. + */ +static void +cb_xlog_recycle_index_segment(XLogReaderState *record) +{ + elog(ERROR, "XXX cb_xlog_recycle_index_segment not implemented yet"); +} + +/* + * REDO function for cb_shift_metapage_index. + */ +static void +cb_xlog_shift_metapage_index(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_cb_shift_metapage_index *xlrec; + Buffer metabuffer; + + xlrec = (xl_cb_shift_metapage_index *) XLogRecGetData(record); + + if (XLogReadBufferForRedo(record, 0, &metabuffer) == BLK_NEEDS_REDO) + { + Page metapage = BufferGetPage(metabuffer); + CBMetapageData *meta; + + meta = cb_metapage_get_special(metapage); + cb_metapage_remove_index_entries(meta, xlrec->num_entries, false); + PageSetLSN(metapage, lsn); + MarkBufferDirty(metabuffer); + } + + if (BufferIsValid(metabuffer)) + UnlockReleaseBuffer(metabuffer); +} + +/* + * Main entrypoint for conveyor belt REDO. 
+ */
+void
+conveyor_redo(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	switch (info)
+	{
+		case XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE:
+			cb_xlog_insert_payload_page(record);
+			break;
+		case XLOG_CONVEYOR_ALLOCATE_PAYLOAD_SEGMENT:
+			cb_xlog_allocate_payload_segment(record);
+			break;
+		case XLOG_CONVEYOR_ALLOCATE_INDEX_SEGMENT:
+			cb_xlog_allocate_index_segment(record);
+			break;
+		case XLOG_CONVEYOR_ALLOCATE_INDEX_PAGE:
+			cb_xlog_allocate_index_page(record);
+			break;
+		case XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES:
+			cb_xlog_relocate_index_entries(record);
+			break;
+		case XLOG_CONVEYOR_LOGICAL_TRUNCATE:
+			cb_xlog_logical_truncate(record);
+			break;
+		case XLOG_CONVEYOR_CLEAR_BLOCK:
+			cb_xlog_clear_block(record);
+			break;
+		case XLOG_CONVEYOR_RECYCLE_PAYLOAD_SEGMENT:
+			cb_xlog_recycle_payload_segment(record);
+			break;
+		case XLOG_CONVEYOR_RECYCLE_INDEX_SEGMENT:
+			cb_xlog_recycle_index_segment(record);
+			break;
+		case XLOG_CONVEYOR_SHIFT_METAPAGE_INDEX:
+			cb_xlog_shift_metapage_index(record);
+			break;
+		default:
+			elog(PANIC, "conveyor_redo: unknown op code %u", info);
+	}
+}
diff --git a/src/backend/access/conveyor/conveyor.c b/src/backend/access/conveyor/conveyor.c
new file mode 100644
index 0000000000..12a188884d
--- /dev/null
+++ b/src/backend/access/conveyor/conveyor.c
@@ -0,0 +1,1978 @@
+/*-------------------------------------------------------------------------
+ *
+ * conveyor.c
+ *	  Conveyor belt storage.
+ *
+ * See src/backend/access/conveyor/README for a general overview of
+ * conveyor belt storage.
+ *
+ * src/backend/access/conveyor/conveyor.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/cbcache.h"
+#include "access/cbfsmpage.h"
+#include "access/cbindexpage.h"
+#include "access/cbmetapage.h"
+#include "access/cbmodify.h"
+#include "access/conveyor.h"
+#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "utils/rel.h"
+
+static CBSegNo ConveyorSearchFSMPages(ConveyorBelt *cb,
+									  CBSegNo next_segment,
+									  BlockNumber *fsmblock,
+									  Buffer *fsmbuffer);
+static void ConveyorBeltClearSegment(ConveyorBelt *cb, CBSegNo segno,
+									 bool include_first_page);
+static bool ConveyorBeltClearIndexSegmentEntries(ConveyorBelt *cb,
+												 Buffer metabuffer,
+												 CBSegNo index_segment,
+												 CBPageNo index_vacuum_stop_point,
+												 CBSegNo *next_index_segment);
+static CBSegNo ConveyorBeltFreeOldestIndexSegment(ConveyorBelt *cb,
+												  Buffer metabuffer,
+												  CBSegNo oldest_index_segment,
+												  CBPageNo index_vacuum_stop_point);
+static Buffer ConveyorBeltExtend(ConveyorBelt *cb, BlockNumber blkno,
+								 BlockNumber *possibly_not_on_disk_blkno);
+static BlockNumber ConveyorBeltFSMBlockNumber(ConveyorBelt *cb,
+											  CBSegNo segno);
+static Buffer ConveyorBeltRead(ConveyorBelt *cb, BlockNumber blkno, int mode);
+static bool ConveyorBeltPageIsUnused(Page page);
+
+/*
+ * Handle used to mediate access to a conveyor belt.
+ */
+struct ConveyorBelt
+{
+	Relation	cb_rel;
+	ForkNumber	cb_fork;
+	uint16		cb_pages_per_segment;
+	CBCache    *cb_cache;
+
+	/*
+	 * These fields are used for communication between ConveyorBeltGetNewPage,
+	 * ConveyorBeltPerformInsert, and ConveyorBeltCleanupInsert.
+	 */
+	RelFileNode *cb_insert_relfilenode;
+	Buffer		cb_insert_metabuffer;
+	BlockNumber cb_insert_block;
+	Buffer		cb_insert_buffer;
+};
+
+/*
+ * Create a new conveyor belt.
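+ *
+ * For example, a caller creating a conveyor belt in a relation's main fork
+ * might do something like the following (the segment size of 16 pages is
+ * an arbitrary illustrative choice):
+ *
+ * cb = ConveyorBeltInitialize(rel, MAIN_FORKNUM, 16, CurrentMemoryContext);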
+ */ +ConveyorBelt * +ConveyorBeltInitialize(Relation rel, + ForkNumber fork, + uint16 pages_per_segment, + MemoryContext mcxt) +{ + ConveyorBelt *cb; + Buffer metabuffer; + bool needs_xlog; + + /* Write a metapage for the new conveyor belt, and XLOG if needed. */ + needs_xlog = RelationNeedsWAL(rel) || fork == INIT_FORKNUM; + metabuffer = ReadBufferExtended(rel, fork, P_NEW, RBM_NORMAL, NULL); + if (BufferGetBlockNumber(metabuffer) != CONVEYOR_METAPAGE) + elog(ERROR, "can't initialize non-empty fork as conveyor belt"); + LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE); + cb_create_metapage(&RelationGetSmgr(rel)->smgr_rnode.node, fork, + metabuffer, pages_per_segment, needs_xlog); + UnlockReleaseBuffer(metabuffer); + + /* + * Initialize a ConveyorBelt object so that the caller can do something + * with the new conveyor belt if they wish. + */ + cb = MemoryContextAlloc(mcxt, sizeof(ConveyorBelt)); + cb->cb_rel = rel; + cb->cb_fork = fork; + cb->cb_pages_per_segment = pages_per_segment; + cb->cb_cache = cb_cache_create(mcxt, 0); + cb->cb_insert_relfilenode = NULL; + cb->cb_insert_metabuffer = InvalidBuffer; + cb->cb_insert_block = InvalidBlockNumber; + cb->cb_insert_buffer = InvalidBuffer; + return cb; +} + +/* + * Prepare for access to an existing conveyor belt. + */ +ConveyorBelt * +ConveyorBeltOpen(Relation rel, ForkNumber fork, MemoryContext mcxt) +{ + Buffer metabuffer; + CBMetapageData *meta; + ConveyorBelt *cb; + uint16 pages_per_segment; + uint64 index_segments_moved; + + /* Read a few critical details from the metapage. */ + metabuffer = ReadBufferExtended(rel, fork, CONVEYOR_METAPAGE, + RBM_NORMAL, NULL); + LockBuffer(metabuffer, BUFFER_LOCK_SHARE); + meta = cb_metapage_get_special(BufferGetPage(metabuffer)); + cb_metapage_get_critical_info(meta, + &pages_per_segment, + &index_segments_moved); + UnlockReleaseBuffer(metabuffer); + + /* Initialize and return the ConveyorBelt object. */ + cb = MemoryContextAlloc(mcxt, sizeof(ConveyorBelt)); + cb->cb_rel = rel; + cb->cb_fork = fork; + cb->cb_pages_per_segment = pages_per_segment; + cb->cb_cache = cb_cache_create(mcxt, index_segments_moved); + cb->cb_insert_relfilenode = NULL; + cb->cb_insert_metabuffer = InvalidBuffer; + cb->cb_insert_block = InvalidBlockNumber; + cb->cb_insert_buffer = InvalidBuffer; + return cb; +} + +/* + * Get a new page to be added to a conveyor belt. + * + * On return, *pageno is set to the logical page number of the newly-added + * page, and both the metapage and the returned buffer are exclusively locked. + * + * The intended use of this function is: + * + * buffer = ConveyorBeltGetNewPage(cb, &pageno); + * page = BufferGetPage(buffer); + * START_CRIT_SECTION(); + * // set page contents + * ConveyorBeltPerformInsert(cb, buffer); + * END_CRIT_SECTION(); + * ConveyorBeltCleanupInsert(cb, buffer); + * + * Note that because this function returns with buffer locks held, it's + * important to do as little work as possible after this function returns + * and before calling ConveyorBeltPerformInsert(). In particular, it's + * completely unsafe to do anything complicated like SearchSysCacheN. Doing + * so could result in undetected deadlock on the buffer LWLocks, or cause + * a relcache flush that would break ConveyorBeltPerformInsert(). + * + * Also note that the "set page contents" step must put some data in the + * page, so that either pd_lower is greater than the minimum value + * (SizeOfPageHeaderData) or pd_upper is less than the maximum value + * (BLCKSZ). 
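+ *
+ * For example, a caller that simply copies an opaque chunk of data into the
+ * page could satisfy that requirement along these lines (an illustrative
+ * sketch only; "data" and "len" stand in for the caller's payload):
+ *
+ * PageInit(page, BLCKSZ, 0);
+ * memcpy((char *) page + SizeOfPageHeaderData, data, len);
+ * ((PageHeader) page)->pd_lower = SizeOfPageHeaderData + len;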
+ * + * In future, we might want to provide the caller with an alternative to + * calling ConveyorBeltPerformInsert, because that just logs an FPI for + * the new page, and some callers might prefer to manage their own xlog + * needs. + */ +Buffer +ConveyorBeltGetNewPage(ConveyorBelt *cb, CBPageNo *pageno) +{ + BlockNumber indexblock = InvalidBlockNumber; + BlockNumber prevblock = InvalidBlockNumber; + BlockNumber fsmblock = InvalidBlockNumber; + BlockNumber possibly_not_on_disk_blkno = CONVEYOR_METAPAGE + 1; + Buffer metabuffer; + Buffer indexbuffer = InvalidBuffer; + Buffer prevbuffer = InvalidBuffer; + Buffer fsmbuffer = InvalidBuffer; + Buffer buffer; + CBPageNo next_pageno; + CBPageNo previous_next_pageno = 0; + CBSegNo free_segno = CB_INVALID_SEGMENT; + bool needs_xlog; + int mode = BUFFER_LOCK_SHARE; + int iterations_without_next_pageno_change = 0; + + /* + * It would be really bad if someone called this function a second time + * while the buffer locks from a previous call were still held. So let's + * try to make sure that's not the case. + */ + Assert(!BufferIsValid(cb->cb_insert_metabuffer)); + Assert(!BufferIsValid(cb->cb_insert_buffer)); + + /* Do any changes we make here need to be WAL-logged? */ + needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM; + + /* + * We don't do anything in this function that involves catalog access or + * accepts invalidation messages, so it's safe to cache this for the + * lifetime of this function. Since we'll return with buffer locks held, + * the caller had better not do anything like that either, so this should + * also still be valid when ConveyorBeltPerformInsert is called. + * + * XXX. This seems totally bogus, because we should really be doing + * CHECK_FOR_INTERRUPTS(), and that might accept invalidation messages. + */ + cb->cb_insert_relfilenode = + &RelationGetSmgr(cb->cb_rel)->smgr_rnode.node; + + /* + * Read and pin the metapage. + * + * Among other things, this prevents concurrent truncations, as per the + * discussion in src/backend/access/conveyor/README. + */ + metabuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, CONVEYOR_METAPAGE, + RBM_NORMAL, NULL); + + /* + * In the easy case where at least one payload segment exists, the newest + * payload segment is not full, and nobody else is trying to insert + * concurrently, this loop should only iterate once. However, we might not + * be that lucky. + * + * Since we don't want to hold the lock on the metapage while we go + * perform necessary preparatory work (e.g. searching through the FSM + * pages for a segment that can be allocated), we may find that after + * doing some amount of preparatory work and re-locking the metapage, the + * situation has changed under us. So we have to be prepared to keep going + * around until we get to a state where there's a non-full payload segment + * whose first unused page we can lock before someone else grabs it. + */ + while (1) + { + CBMetapageData *meta; + CBMInsertState insert_state; + BlockNumber next_blkno; + CBPageNo index_start; + CBPageNo index_metapage_start; + CBSegNo newest_index_segment; + CBSegNo next_segno; + bool can_allocate_segment; + + /* + * Examine the metapage to find out what we think we need to do in + * order to complete this operation. + * + * Initially, mode will be BUFFER_LOCK_SHARE. But if a previous pass + * through the loop found that we needed to allocate a new payload or + * index segement or move index entries out of the metapage, it will + * be BUFFER_LOCK_EXCLUSIVE. 
That's so that if nothing has changed + * concurrently, we can complete the operation before releasing the + * lock on the metapage. + * + * NB: Our rule is that the lock on the metapage is acquired last, + * after all other buffer locks. If any of indexbuffer, prevbuffer, + * and fsmbuffer are valid, they are also exclusively locked at this + * point. + */ + LockBuffer(metabuffer, mode); + meta = cb_metapage_get_special(BufferGetPage(metabuffer)); + insert_state = cb_metapage_get_insert_state(meta, &next_blkno, + &next_pageno, &next_segno, + &index_start, + &index_metapage_start, + &newest_index_segment); + + /* + * There's no fixed upper bound on how many times this loop could + * iterate, because some other backend could be currently allocating + * pages, and that could prevent us from succeeding in allocating a + * page. + * + * However, if that's happening, the next logical page number should + * keep increasing. In the absence of any increase in the next logical + * page number, we might still need to iterate a few times, but + * not very many. For example, we might read the page the first time + * and realize that a new index segment is needed, create it on the + * second pass, move index entries into it on the third pass, and + * create a payload segment on the fourth pass, but then, barring + * concurrent activity, we should succeed in allocating a page on the + * next pass. + * + * Hence, if we loop a large number of times without a change in + * the next_pageno value, there's probably a bug. Error out instead + * of looping forever. + */ + if (next_pageno > previous_next_pageno) + { + previous_next_pageno = next_pageno; + iterations_without_next_pageno_change = 0; + } + else if (++iterations_without_next_pageno_change >= 10) + elog(ERROR, + "unable to make progress allocating page " + UINT64_FORMAT " (state = %d)", + next_pageno, (int) insert_state); + + /* + * next_segno need not exist on disk, but at least the first block + * of the previous segment should be there. + */ + if (next_segno > 0) + { + BlockNumber last_segno_first_blkno; + + last_segno_first_blkno = + cb_segment_to_block(cb->cb_pages_per_segment, + next_segno - 1, 0); + if (last_segno_first_blkno > possibly_not_on_disk_blkno) + possibly_not_on_disk_blkno = last_segno_first_blkno + 1; + } + + /* + * If we need to allocate a payload or index segment, and we don't + * currently have a candidate, check whether the metapage knows of a + * free segment. + */ + if ((insert_state == CBM_INSERT_NEEDS_PAYLOAD_SEGMENT || + insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT) + && free_segno == CB_INVALID_SEGMENT) + free_segno = cb_metapage_find_free_segment(meta); + + /* + * If we need a new payload or index segment, see whether it's + * possible to complete that operation on this trip through the loop. + * + * This will only be possible if we've got an exclusive lock on the + * metapage. + * + * Furthermore, by rule, we cannot allocate a segment unless at least + * the first page of that segment is guaranteed to be on disk. This is + * certain to be true for any segment that's been allocated + * previously, but otherwise it's only true if we've verified that the + * size of the relation on disk is large enough. 
+ */ + if (mode != BUFFER_LOCK_EXCLUSIVE || + free_segno == CB_INVALID_SEGMENT || + (insert_state != CBM_INSERT_NEEDS_PAYLOAD_SEGMENT + && insert_state != CBM_INSERT_NEEDS_INDEX_SEGMENT)) + can_allocate_segment = false; + else + { + BlockNumber free_segno_first_blkno; + + free_segno_first_blkno = + cb_segment_to_block(cb->cb_pages_per_segment, free_segno, 0); + can_allocate_segment = + (free_segno_first_blkno < possibly_not_on_disk_blkno); + } + + /* + * If it still looks like we can allocate, check for the case where we + * need a new index segment but don't have the other required buffer + * locks. + */ + if (can_allocate_segment && + insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT && + (!BufferIsValid(indexbuffer) || (!BufferIsValid(prevbuffer) + && newest_index_segment != CB_INVALID_SEGMENT))) + can_allocate_segment = false; + + /* + * If it still looks like we can allocate, check for the case where + * the segment we planned to allocate is no longer free. + */ + if (can_allocate_segment) + { + /* fsmbuffer, if valid, is already exclusively locked. */ + if (BufferIsValid(fsmbuffer)) + can_allocate_segment = + !cb_fsmpage_get_fsm_bit(BufferGetPage(fsmbuffer), + free_segno); + else + can_allocate_segment = + !cb_metapage_get_fsm_bit(meta, free_segno); + + /* + * If this segment turned out not to be free, we need a new + * candidate. Check the metapage here, and if that doesn't work + * out, free_segno will end up as CB_INVALID_SEGMENT, and we'll + * search the FSM pages further down. + */ + if (!can_allocate_segment) + free_segno = cb_metapage_find_free_segment(meta); + } + + /* If it STILL looks like we can allocate, do it! */ + if (can_allocate_segment) + { + if (insert_state == CBM_INSERT_NEEDS_PAYLOAD_SEGMENT) + { + cb_allocate_payload_segment(cb->cb_insert_relfilenode, + cb->cb_fork, metabuffer, + fsmblock, fsmbuffer, free_segno, + free_segno >= next_segno, + needs_xlog); + + /* + * We know for sure that there's now a payload segment that + * isn't full - and we know exactly where it's located. + */ + insert_state = CBM_INSERT_OK; + next_blkno = cb_segment_to_block(cb->cb_pages_per_segment, + free_segno, 0); + } + else + { + Assert(insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT); + + cb_allocate_index_segment(cb->cb_insert_relfilenode, + cb->cb_fork, metabuffer, + indexblock, indexbuffer, + prevblock, prevbuffer, + fsmblock, fsmbuffer, free_segno, + index_metapage_start, + free_segno >= next_segno, + needs_xlog); + + /* + * We know for sure that there's now an index segment that + * isn't full, and our next move must be to relocate some + * index entries to that index segment. + */ + insert_state = CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED; + next_blkno = indexblock; + } + + /* + * Whether we allocated or not, the segment we intended to + * allocate is no longer free. + */ + free_segno = CB_INVALID_SEGMENT; + } + + /* + * If we need to relocate index entries and if we have a lock on the + * correct index block, then go ahead and do it. 
+ */ + if (insert_state == CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED && + next_blkno == indexblock) + { + unsigned pageoffset; + unsigned num_index_entries; + CBSegNo index_entries[CB_METAPAGE_INDEX_ENTRIES]; + CBPageNo index_page_start; + unsigned logical_pages_in_index_segments; + unsigned index_entries_in_index_segments; + + logical_pages_in_index_segments = + index_metapage_start - index_start; + if (logical_pages_in_index_segments % cb->cb_pages_per_segment != 0) + elog(ERROR, "index starts at " UINT64_FORMAT ", metapage index at " UINT64_FORMAT ", but there are %u pages per segment", + index_start, index_metapage_start, + cb->cb_pages_per_segment); + index_entries_in_index_segments = + logical_pages_in_index_segments / cb->cb_pages_per_segment; + pageoffset = + index_entries_in_index_segments % CB_INDEXPAGE_INDEX_ENTRIES; + + num_index_entries = Min(CB_METAPAGE_INDEX_ENTRIES, + CB_INDEXPAGE_INDEX_ENTRIES - pageoffset); + cb_metapage_get_index_entries(meta, num_index_entries, + index_entries); + index_page_start = index_metapage_start - + pageoffset * cb->cb_pages_per_segment; + cb_relocate_index_entries(cb->cb_insert_relfilenode, cb->cb_fork, + metabuffer, indexblock, indexbuffer, + pageoffset, num_index_entries, + index_entries, index_page_start, + needs_xlog); + } + + /* Release buffer locks and, except for the metapage, also pins. */ + LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK); + if (BufferIsValid(indexbuffer)) + { + UnlockReleaseBuffer(indexbuffer); + indexblock = InvalidBlockNumber; + indexbuffer = InvalidBuffer; + } + if (BufferIsValid(prevbuffer)) + { + UnlockReleaseBuffer(prevbuffer); + prevblock = InvalidBlockNumber; + prevbuffer = InvalidBuffer; + } + if (BufferIsValid(fsmbuffer)) + { + UnlockReleaseBuffer(fsmbuffer); + fsmblock = InvalidBlockNumber; + fsmbuffer = InvalidBuffer; + } + + if (insert_state != CBM_INSERT_OK) + { + /* + * Some sort of preparatory work will be needed in order to insert + * a new page, which will require modifying the metapage. + * Therefore, next time we lock it, we had better grab an + * exclusive lock. + */ + mode = BUFFER_LOCK_EXCLUSIVE; + } + else + { + /* Extend the relation if needed. */ + buffer = ConveyorBeltExtend(cb, next_blkno, + &possibly_not_on_disk_blkno); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * If the target buffer is still unused, we're done. Otherwise, + * someone else grabbed that page before we did, so we must fall + * through and retry. + */ + if (ConveyorBeltPageIsUnused(BufferGetPage(buffer))) + { + /* + * Remember things that we'll need to know when the caller + * invokes ConveyorBeltPerformInsert and + * ConveyorBeltCleanupInsert. + */ + cb->cb_insert_block = next_blkno; + cb->cb_insert_buffer = buffer; + cb->cb_insert_metabuffer = metabuffer; + + /* Success, so escape toplevel retry loop. */ + break; + } + + /* We'll have to retry with a different buffer. */ + UnlockReleaseBuffer(buffer); + } + + /* + * If the metapage has no more space for index entries, but there's + * an index segment into which some of the existing ones could be + * moved, then cb_metapage_get_insert_state will have set next_blkno + * to the point to the block to which index entries should be moved. + * + * If the target index segment is the very last one in the conveyor + * belt and we're using the pages of that segment for the very first + * time, the target page may not exist yet, so be prepared to extend + * the relation. 
+ */ + if (insert_state == CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED) + { + indexblock = next_blkno; + indexbuffer = ConveyorBeltExtend(cb, indexblock, + &possibly_not_on_disk_blkno); + } + + /* + * If we need to add a new index segment and it's not the very first + * one, we'll have to update the newest index page with a pointer to + * the index page we're going to add, so we must read and pin that + * page. + * + * The names "prevblock" and "prevbuffer" are intended to signify that + * what is currently the newest index segment will become the previous + * segment relative to the one we're going to add. + */ + if (insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT && + newest_index_segment != CB_INVALID_SEGMENT) + { + prevblock = cb_segment_to_block(cb->cb_pages_per_segment, + newest_index_segment, 0); + prevbuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, + prevblock, RBM_NORMAL, NULL); + } + + /* + * If we need to add a new segment of either type, make provisions to + * do so. + */ + if (insert_state == CBM_INSERT_NEEDS_PAYLOAD_SEGMENT || + insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT) + { + /* + * Search the FSM pages (and create a new one if needed) for a + * free segment, unless we've already have a candidate. + */ + if (free_segno == CB_INVALID_SEGMENT) + free_segno = ConveyorSearchFSMPages(cb, next_segno, &fsmblock, + &fsmbuffer); + + if (free_segno > next_segno) + { + /* + * If the FSM thinks that we ought to allocate a segment + * beyond what we think to be the very next one, then someone + * else must have concurrently added a segment, so we'll need + * to loop around, retake the metapage lock, refresh our + * knowledge of next_segno, and then find a new segment to + * allocate. + */ + free_segno = CB_INVALID_SEGMENT; + } + else if (free_segno == next_segno) + { + BlockNumber free_block; + Buffer free_buffer; + + /* + * We're allocating a new segment. At least the first page must + * exist on disk before we perform the allocation, which means + * we may need to add blocks to the relation fork. + */ + free_block = cb_segment_to_block(cb->cb_pages_per_segment, + free_segno, 0); + free_buffer = ConveyorBeltExtend(cb, free_block, + &possibly_not_on_disk_blkno); + if (insert_state == CBM_INSERT_NEEDS_INDEX_SEGMENT) + { + indexblock = free_block; + indexbuffer = free_buffer; + } + else + ReleaseBuffer(free_buffer); + } + } + + /* + * Prepare for next attempt by reacquiring all relevant buffer locks, + * except for the one on the metapage, which is acquired at the top of + * the loop. + */ + if (BufferIsValid(indexbuffer)) + LockBuffer(indexbuffer, BUFFER_LOCK_EXCLUSIVE); + if (BufferIsValid(prevbuffer)) + LockBuffer(prevbuffer, BUFFER_LOCK_EXCLUSIVE); + if (BufferIsValid(fsmbuffer)) + LockBuffer(fsmbuffer, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * Relock the metapage. Caller should immediately start a critical section + * and populate the buffer. + */ + LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE); + + /* All done. */ + *pageno = next_pageno; + return buffer; +} + +/* + * Actually insert a new page into the conveyor belt. + * + * See ConveyorBeltGetNewPage for the intended usage of this fucntion. + */ +void +ConveyorBeltPerformInsert(ConveyorBelt *cb, Buffer buffer) +{ + bool needs_xlog; + + /* + * We don't really need the caller to tell us which buffer is involved, + * because we already have that information. We insist on it anyway as a + * debugging cross-check. 
+ */
+	if (cb->cb_insert_buffer != buffer)
+	{
+		if (!BufferIsValid(cb->cb_insert_buffer))
+			elog(ERROR, "there is no pending insert");
+		else
+			elog(ERROR,
+				 "pending insert expected for buffer %u but got buffer %u",
+				 cb->cb_insert_buffer, buffer);
+	}
+
+	/*
+	 * ConveyorBeltPageIsUnused is used by ConveyorBeltGetNewPage to figure
+	 * out whether a concurrent inserter got there first. Here, we're the
+	 * concurrent inserter, and must have initialized the page in a way that
+	 * makes that function return false for the newly-inserted page, so that
+	 * other backends can tell we got here first.
+	 */
+	if (ConveyorBeltPageIsUnused(BufferGetPage(buffer)))
+		elog(ERROR, "can't insert an unused page");
+
+	/* Caller should be doing this inside a critical section. */
+	Assert(CritSectionCount > 0);
+
+	/* We should have the details stashed by ConveyorBeltGetNewPage. */
+	Assert(cb->cb_insert_relfilenode != NULL);
+	Assert(BufferIsValid(cb->cb_insert_metabuffer));
+	Assert(BufferIsValid(cb->cb_insert_buffer));
+	Assert(BlockNumberIsValid(cb->cb_insert_block));
+
+	/* Update metapage, mark buffers dirty, and write XLOG if required. */
+	needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+	cb_insert_payload_page(cb->cb_insert_relfilenode, cb->cb_fork,
+						   cb->cb_insert_metabuffer,
+						   cb->cb_insert_block, buffer,
+						   needs_xlog);
+
+	/*
+	 * Buffer locks will be released by ConveyorBeltCleanupInsert, but we can
+	 * invalidate some other fields now.
+	 */
+	cb->cb_insert_relfilenode = NULL;
+	cb->cb_insert_block = InvalidBlockNumber;
+}
+
+/*
+ * Clean up following the insertion of a new page into the conveyor belt.
+ *
+ * See ConveyorBeltGetNewPage for the intended usage of this function.
+ */
+void
+ConveyorBeltCleanupInsert(ConveyorBelt *cb, Buffer buffer)
+{
+	/* Debugging cross-check, like ConveyorBeltPerformInsert. */
+	if (cb->cb_insert_buffer != buffer)
+	{
+		if (!BufferIsValid(cb->cb_insert_buffer))
+			elog(ERROR, "there is no pending insert");
+		else
+			elog(ERROR,
+				 "pending insert expected for buffer %u but got buffer %u",
+				 cb->cb_insert_buffer, buffer);
+	}
+
+	/* Release buffer locks and pins. */
+	Assert(BufferIsValid(cb->cb_insert_buffer));
+	Assert(BufferIsValid(cb->cb_insert_metabuffer));
+	UnlockReleaseBuffer(cb->cb_insert_buffer);
+	UnlockReleaseBuffer(cb->cb_insert_metabuffer);
+	cb->cb_insert_buffer = InvalidBuffer;
+	cb->cb_insert_metabuffer = InvalidBuffer;
+}
+
+/*
+ * Read a logical page from a conveyor belt. If the page has already been
+ * truncated away or has not yet been created, returns InvalidBuffer.
+ * Otherwise, reads the page using the given strategy and locks it using
+ * the given buffer lock mode.
+ */
+Buffer
+ConveyorBeltReadBuffer(ConveyorBelt *cb, CBPageNo pageno, int mode,
+					   BufferAccessStrategy strategy)
+{
+	BlockNumber index_blkno,
+				payload_blkno;
+	Buffer		metabuffer,
+				index_buffer,
+				payload_buffer;
+	CBMetapageData *meta;
+	CBPageNo	index_start,
+				index_metapage_start,
+				target_index_segment_start;
+	CBSegNo		oldest_index_segment,
+				newest_index_segment,
+				index_segno;
+	unsigned	lppis,
+				segoff;
+	uint64		index_segments_moved;
+
+	Assert(mode == BUFFER_LOCK_EXCLUSIVE || mode == BUFFER_LOCK_SHARE);
+
+	/*
+	 * Lock the metapage and get all the information we need from it. Then
+	 * drop the lock on the metapage, but retain the pin, so that neither the
+	 * target payload page nor any index page we might need to access can be
+	 * concurrently truncated away. See the README for further details.
+ */ + metabuffer = ConveyorBeltRead(cb, CONVEYOR_METAPAGE, BUFFER_LOCK_SHARE); + meta = cb_metapage_get_special(BufferGetPage(metabuffer)); + if (!cb_metapage_find_logical_page(meta, pageno, &payload_blkno)) + { + /* Page number too old or too new. */ + UnlockReleaseBuffer(metabuffer); + return InvalidBuffer; + } + if (payload_blkno != InvalidBlockNumber) + { + /* Index entry for payload page found on metapage. */ + LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK); + payload_buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, + payload_blkno, RBM_NORMAL, + strategy); + LockBuffer(payload_buffer, mode); + ReleaseBuffer(metabuffer); + return payload_buffer; + } + cb_metapage_get_index_info(meta, &index_start, &index_metapage_start, + &oldest_index_segment, &newest_index_segment, + &index_segments_moved); + LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK); + + /* Invalidate any obsolete cache entries. */ + cb_cache_invalidate(cb->cb_cache, index_start, index_segments_moved); + + /* + * It's convenient to identify index segments in terms of the first + * logical page for which that index segment contains the necessary index + * entry. So, take the page number that we were given, and back it up to + * the previous index-segment boundary. + */ + lppis = cb_logical_pages_per_index_segment(cb->cb_pages_per_segment); + target_index_segment_start = pageno - (pageno - index_start) % lppis; + + /* Search the cache first. Try other strategies if that does not work. */ + index_segno = cb_cache_lookup(cb->cb_cache, target_index_segment_start); + if (index_segno == CB_INVALID_SEGMENT) + { + if (index_start == target_index_segment_start) + { + /* Looks like it's the oldest index segment. */ + index_segno = oldest_index_segment; + } + else if (index_metapage_start - lppis == target_index_segment_start) + { + /* + * Looks like it's the newest index segment. + * + * It's worth adding a cache entry for this, because we might end + * up needing it again later, when it's no longer the newest + * entry. + */ + index_segno = newest_index_segment; + cb_cache_insert(cb->cb_cache, index_segno, + target_index_segment_start); + } + else + { + CBPageNo index_segment_start; + + /* + * We don't know where it is and it's not the first or last index + * segment, so we have to walk the chain of index segments to find + * it. + * + * That's possibly going to be slow, especially if there are a lot + * of index segments. However, maybe we can make it a bit faster. + * Instead of starting with the oldest segment and moving forward + * one segment at a time until we find the one we want, search the + * cache for the index segment that most nearly precedes the one + * we want. + */ + index_segno = cb_cache_fuzzy_lookup(cb->cb_cache, + target_index_segment_start, + &index_segment_start); + if (index_segno == CB_INVALID_SEGMENT) + { + /* + * Sadly, the cache is either entirely empty or at least has + * no entries for any segments older than the one we want, so + * we have to start our search from the oldest segment. + */ + index_segno = oldest_index_segment; + } + + /* + * Here's where we actually search. Make sure to cache the + * results, in case there are more lookups later. 
+ */ + while (index_segment_start < target_index_segment_start) + { + CHECK_FOR_INTERRUPTS(); + + index_blkno = cb_segment_to_block(cb->cb_pages_per_segment, + index_segno, 0); + index_buffer = ConveyorBeltRead(cb, index_blkno, + BUFFER_LOCK_SHARE); + index_segno = + cb_indexpage_get_next_segment(BufferGetPage(index_buffer)); + UnlockReleaseBuffer(index_buffer); + index_segment_start += lppis; + cb_cache_insert(cb->cb_cache, index_segno, index_segment_start); + } + } + } + + /* + * We know which index segment we need to read, so now figure out which + * page we need from that segment, and then which physical block we need. + */ + segoff = (pageno - target_index_segment_start) / + cb_logical_pages_per_index_page(cb->cb_pages_per_segment); + index_blkno = cb_segment_to_block(cb->cb_pages_per_segment, + index_segno, segoff); + + /* Read the required index entry. */ + index_buffer = ConveyorBeltRead(cb, index_blkno, BUFFER_LOCK_SHARE); + payload_blkno = cb_indexpage_find_logical_page(BufferGetPage(index_buffer), + pageno, + cb->cb_pages_per_segment); + UnlockReleaseBuffer(index_buffer); + + /* Now we can read and lock the actual payload block. */ + payload_buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, + payload_blkno, RBM_NORMAL, + strategy); + LockBuffer(payload_buffer, mode); + + /* + * Since we've now got the payload block locked, we can release the pin on + * the metapage. + */ + ReleaseBuffer(metabuffer); + return payload_buffer; +} + +/* + * Find out which logical page numbers are currently valid. + * + * On return, *oldest_logical_page will be set to the smallest page number + * that has not yet been removed by truncation, and *next_logical_page will + * be set to the smallest page number that does not yet exist. + * + * Note that, unless the caller knows that there cannot be concurrent + * truncations or insertions in progress, either value might be out of + * date by the time it is used. + */ +void +ConveyorBeltGetBounds(ConveyorBelt *cb, CBPageNo *oldest_logical_page, + CBPageNo *next_logical_page) +{ + Buffer metabuffer; + CBMetapageData *meta; + + metabuffer = ConveyorBeltRead(cb, CONVEYOR_METAPAGE, BUFFER_LOCK_SHARE); + meta = cb_metapage_get_special(BufferGetPage(metabuffer)); + cb_metapage_get_bounds(meta, oldest_logical_page, next_logical_page); + UnlockReleaseBuffer(metabuffer); +} + +/* + * Update the conveyor belt's notion of the oldest logical page to be kept. + * + * This doesn't physically shrink the relation, nor does it even make space + * available for reuse by future insertions. It just makes pages prior to + * 'oldest_keeper' unavailable, thus potentially allowing the segments + * containing those pages to be freed by a future call to ConveyorBeltVacuum. + * + * A call to this function shouldn't try to move the logical truncation point + * backwards. That is, the value of 'oldest_keeper' should always be greater + * than or equal to the value passed on the previous call for this conveyor + * belt. It also shouldn't try to move the logical truncation point beyond + * the current insertion point: don't try to throw away data that hasn't been + * inserted yet! + * + * For routine cleanup of a conveyor belt, the recommended sequence of calls + * is ConveyorBeltLogicalTruncate then ConveyorBeltVacuum then + * ConveyorBeltPhysicalTruncate. For more aggressive cleanup options, see + * ConveyorBeltCompact or ConveyorBeltRewrite. 
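+ *
+ * For example, a caller that no longer needs anything before logical page
+ * 'keep' would typically perform the routine sequence roughly as:
+ *
+ * ConveyorBeltLogicalTruncate(cb, keep);
+ * ConveyorBeltVacuum(cb);
+ * ConveyorBeltPhysicalTruncate(cb);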
+ */ +void +ConveyorBeltLogicalTruncate(ConveyorBelt *cb, CBPageNo oldest_keeper) +{ + Buffer metabuffer; + CBMetapageData *meta; + CBPageNo oldest_logical_page; + CBPageNo next_logical_page; + RelFileNode *rnode; + bool needs_xlog; + + /* + * We must take a cleanup lock to adjust the logical truncation point, + * as per the locking protocols in src/backend/access/conveyor/README. + */ + metabuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, CONVEYOR_METAPAGE, + RBM_NORMAL, NULL); + LockBufferForCleanup(metabuffer); + + /* Sanity checks. */ + meta = cb_metapage_get_special(BufferGetPage(metabuffer)); + cb_metapage_get_bounds(meta, &oldest_logical_page, &next_logical_page); + if (oldest_keeper < oldest_logical_page) + elog(ERROR, + "can't move truncation point backwards from " UINT64_FORMAT " to " UINT64_FORMAT, + oldest_logical_page, oldest_keeper); + if (oldest_keeper > next_logical_page) + elog(ERROR, + "can't move truncation point to " UINT64_FORMAT " beyond insert point " UINT64_FORMAT, + oldest_keeper, next_logical_page); + + + /* Do the real work. */ + rnode = &RelationGetSmgr(cb->cb_rel)->smgr_rnode.node; + needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM; + cb_logical_truncate(rnode, cb->cb_fork, metabuffer, oldest_keeper, + needs_xlog); + + /* Release buffer lock. */ + UnlockReleaseBuffer(metabuffer); +} + +/* + * Recycle segments that are no longer needed. + * + * Payload segments all of whose pages precede the logical truncation point + * can be deallocated. Index segments can be deallocated once they no longer + * contain any pointers to payload segments. + * + * Only one backend should call this at a time for any given conveyor belt. + */ +void +ConveyorBeltVacuum(ConveyorBelt *cb) +{ + Buffer metabuffer; + BlockNumber fsmblock = InvalidBlockNumber; + Buffer fsmbuffer = InvalidBuffer; + CBSegNo cleared_segno = CB_INVALID_SEGMENT; + bool needs_xlog; + bool cleaned_index_segments = false; + + /* Do any changes we make here need to be WAL-logged? */ + needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM; + + /* Read and pin the metapage. */ + metabuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, CONVEYOR_METAPAGE, + RBM_NORMAL, NULL); + LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Main loop. + * + * At the top of each loop iteration, the metabuffer is pinned and + * exclusively locked. The lock and even the pin may be released by code + * inside this loop, but they must be reacquired before beginning the next + * iteration. + */ + while (1) + { + CBMetapageData *meta; + CBMObsoleteState obsolete_state; + CBSegNo oldest_index_segment; + CBPageNo index_vacuum_stop_point; + CBSegNo metapage_segno; + unsigned metapage_offset; + + /* Assess what kind of work needs to be done. */ + meta = cb_metapage_get_special(BufferGetPage(metabuffer)); + obsolete_state = + cb_metapage_get_obsolete_state(meta, &oldest_index_segment, + &index_vacuum_stop_point, + &metapage_segno, &metapage_offset); + + /* + * If on the previous pass through the loop we concluded that we need + * to free a payload segment refrenced by the metapage and if that no + * longer seems like the thing we need to do, then release any lock and + * pin we may have acquired in preparation for freeing that payload + * segment. 
+ */ + if ((obsolete_state != CBM_OBSOLETE_METAPAGE_ENTRIES || + metapage_segno != cleared_segno) && fsmblock != InvalidBlockNumber) + { + UnlockReleaseBuffer(fsmbuffer); + fsmblock = InvalidBlockNumber; + fsmbuffer = InvalidBuffer; + } + + /* + * Attempt to do whatever useful work seems to be possible based on + * obsolete_state. + */ + if (obsolete_state == CBM_OBSOLETE_NOTHING) + { + /* + * There is nothing to vacuum. + */ + UnlockReleaseBuffer(metabuffer); + return; + } + else if (obsolete_state == CBM_OBSOLETE_METAPAGE_START) + { + /* + * No real work to do, but there are some already-cleared entries + * at the start of the metapage which we should remove to make more + * space for new entries. + */ + cb_shift_metapage_index(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node, + cb->cb_fork, metabuffer, metapage_offset, needs_xlog); + UnlockReleaseBuffer(metabuffer); + return; + } + else if (obsolete_state == CBM_OBSOLETE_METAPAGE_ENTRIES) + { + /* + * The metapage contains entries for one or more payload segments + * which can be deallocated. + */ + if (metapage_segno != cleared_segno) + { + /* + * We can only recycle a payload segment after clearing the + * pages in that segment. Since we have not done that yet, + * do it now. First release the buffer lock on the metapage, + * to avoid interefering with other use of the conveyor belt. + */ + LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK); + ConveyorBeltClearSegment(cb, metapage_segno, true); + cleared_segno = metapage_segno; + + /* + * Lock the relevant FSM page, if it's not the metapage. + * Per src/backend/access/conveyor/README's locking rules, + * we must do this before relocking the metapage. + */ + fsmblock = ConveyorBeltFSMBlockNumber(cb, cleared_segno); + if (fsmblock != InvalidBlockNumber) + fsmbuffer = ConveyorBeltRead(cb, fsmblock, + BUFFER_LOCK_EXCLUSIVE); + + + /* + * OK, now reacquire a lock on the metapage and loop around. + * Hopefully, the next pass will succeed in freeing a payload + * segment. + */ + LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE); + } + else + { + /* + * The previous pass through the loop made preparations to + * free this payload segment, so now we can do it. + */ + cb_recycle_payload_segment(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node, + cb->cb_fork, + metabuffer, + InvalidBlockNumber, InvalidBuffer, + fsmblock, fsmbuffer, + cleared_segno, metapage_offset, + needs_xlog); + } + } + else if (obsolete_state == CBM_OBSOLETE_SEGMENT_ENTRIES) + { + unsigned empty_index_segments = 0; + CBSegNo index_segment = oldest_index_segment; + + /* + * Do this part just once. A single pass through the logic below + * should clean out the index segments as completely as possible, + * so if we end up here again, either the logical truncation point + * changed concurrently, or there's actually nothing to do. Even + * in the former case, it's OK to return without doing anything + * further, because this function only promises to clean up data + * that was no longer needed as of the time it was called. It makes + * no promises about cleaning up things that became obsolete once + * this function was already running. + */ + if (cleaned_index_segments) + { + UnlockReleaseBuffer(metabuffer); + break; + } + cleaned_index_segments = true; + + /* + * Release lock on metapage before locking other pages, but keep + * the pin for efficiency and so that no index segments can + * disappear concurrently. + */ + LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK); + + /* + * Clear as many obsolete index entries out of index segments as + * we can. 
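+ *
+ * The loop below walks the chain of index segments starting from the
+ * oldest one, counting how many of them end up completely empty; the
+ * loop after it then frees up to that many segments from the front of
+ * the chain, stopping early if a cleanup lock cannot be obtained.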
+ */ + while (index_segment != CB_INVALID_SEGMENT && + ConveyorBeltClearIndexSegmentEntries(cb, metabuffer, + index_segment, + index_vacuum_stop_point, + &index_segment)) + ++empty_index_segments; + + /* + * Free old index segments. + * + * We might stop before freeing the requested number of index + * segments, due to concurrent locking. If that happens, + * give up on performing any further cleanup. + */ + while (empty_index_segments > 0) + { + oldest_index_segment = + ConveyorBeltFreeOldestIndexSegment(cb, metabuffer, + oldest_index_segment, + index_vacuum_stop_point); + --empty_index_segments; + if (empty_index_segments > 0 && + oldest_index_segment == CB_INVALID_SEGMENT) + { + ReleaseBuffer(metabuffer); + return; + } + } + + /* + * If we freed some but not all index segments, all the entries in + * the metapage are still needed, so there is no point in trying to + * clean it up. + */ + if (oldest_index_segment != CB_INVALID_SEGMENT) + { + ReleaseBuffer(metabuffer); + return; + } + + /* + * Relock the metapage prior to looping around. We may still be + * able to clear index entries from the metapage, or adjust the + * start of the metapage index. + */ + LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE); + } + } +} + +/* + * Clear obsolete index entries from a segment. + * + * metabuffer should be pinned but not locked when this function is called, + * and will be in the same state upon return. + * + * index_segment specifies the target index segment. + * + * index_vacuum_stop_point defines the point beyond which no index entries + * may be removed. If an index entry is found all or part of which would cover + * pages greater than or equal to this value, then this function does nothing + * further and returns false. If this limit is not reached, this function + * returns true. + * + * *next_index_segment is set to the segment number of the index segment + * that follows the one specified by index_segment, or CB_INVALID_SEGMENT + * if none. + */ +static bool +ConveyorBeltClearIndexSegmentEntries(ConveyorBelt *cb, Buffer metabuffer, + CBSegNo index_segment, + CBPageNo index_vacuum_stop_point, + CBSegNo *next_index_segment) +{ + bool needs_xlog; + bool need_next_segment = true; + unsigned segoff; + BlockNumber fsmblock = InvalidBlockNumber; + Buffer fsmbuffer = InvalidBuffer; + + /* Do we need to write XLOG for operations on this conveyor belt? */ + needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM; + + for (segoff = 0; segoff < cb->cb_pages_per_segment; ++segoff) + { + BlockNumber indexblock; + Buffer indexbuffer; + Page indexpage; + unsigned pageoffset = 0; + CBSegNo cleared_segno = CB_INVALID_SEGMENT; + + indexblock = cb_segment_to_block(cb->cb_pages_per_segment, + index_segment, segoff); + indexbuffer = ConveyorBeltRead(cb, indexblock, BUFFER_LOCK_EXCLUSIVE); + indexpage = BufferGetPage(indexbuffer); + + /* + * If an index segment page is not initialized, treat it the same + * way as if it is initialized but contains no entries. 
+ */
+ if (ConveyorBeltPageIsUnused(indexpage))
+ {
+ if (segoff == 0)
+ elog(ERROR,
+ "conveyor belt index page at segno %u offset 0 should be initialized",
+ index_segment);
+ if (*next_index_segment != CB_INVALID_SEGMENT)
+ elog(ERROR,
+ "non-final index segment page at segno %u offset %u should be initialized",
+ index_segment, segoff);
+ return true;
+ }
+
+ /*
+ * If this is the very first time we've locked an index page in this
+ * segment, it should be the first page, and it will tell us where to
+ * find the next segment once we finish with this one. Grab that
+ * information while we have the page lock.
+ */
+ if (need_next_segment)
+ {
+ Assert(segoff == 0);
+ *next_index_segment = cb_indexpage_get_next_segment(indexpage);
+ need_next_segment = false;
+ }
+
+ /*
+ * Loop over the index entries in this page.
+ *
+ * At the top of each iteration of the loop, the index page is
+ * exclusively locked. The lock may be released and reacquired before
+ * beginning the next iteration.
+ */
+ while (pageoffset < CB_INDEXPAGE_INDEX_ENTRIES)
+ {
+ CBSegNo segno;
+ CBPageNo first_page;
+
+ /* Find, or reconfirm, the location of the next obsolete entry. */
+ segno = cb_indexpage_get_obsolete_entry(indexpage, &pageoffset,
+ &first_page);
+ if (segno == CB_INVALID_SEGMENT)
+ {
+ /* No items remain in this page. */
+ UnlockReleaseBuffer(indexbuffer);
+ break;
+ }
+ if (first_page + (cb->cb_pages_per_segment * pageoffset) +
+ cb->cb_pages_per_segment > index_vacuum_stop_point)
+ {
+ /*
+ * At least one entry from this page is still needed, so no
+ * point in visiting future pages in this index segment, and
+ * no point in visiting any more index segments.
+ */
+ UnlockReleaseBuffer(indexbuffer);
+ return false;
+ }
+
+ /*
+ * If this is the first time we've considered clearing this
+ * particular payload segment, we'll need to release the buffer
+ * lock, do some necessary prep work, reacquire the buffer lock,
+ * and recheck to make sure nothing has changed.
+ */
+ if (segno != cleared_segno)
+ {
+ BlockNumber newfsmblock;
+
+ /* Release lock on index page. */
+ LockBuffer(indexbuffer, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Clear the segment that we want to recycle.
+ *
+ * Note that we could crash or error out while or after doing
+ * this and before we actually recycle the segment. If so,
+ * we'll do it again the next time someone tries to vacuum
+ * this conveyor belt. All of that is fine, because nobody
+ * can be looking at the data any more, and clearing the pages
+ * is idempotent.
+ */
+ ConveyorBeltClearSegment(cb, segno, true);
+
+ /*
+ * Make sure that we have the correct FSM buffer pinned.
+ *
+ * Often, any FSM buffer that we have pinned previously will
+ * still be the correct one, either because segment numbers
+ * allocated around the same time are likely to be close
+ * together numerically, or just because the conveyor belt may
+ * not be big enough to need lots of FSM pages.
+ *
+ * However, in the worst case, this can change every time.
+ */
+ newfsmblock = cb_segment_to_fsm_block(cb->cb_pages_per_segment,
+ segno);
+ if (fsmblock != newfsmblock)
+ {
+ ReleaseBuffer(fsmbuffer);
+ fsmblock = newfsmblock;
+ if (fsmblock == InvalidBlockNumber)
+ fsmbuffer = InvalidBuffer;
+ else
+ fsmbuffer =
+ ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+ fsmblock, RBM_NORMAL, NULL);
+ }
+
+ /* Relock the index page and go around.
*/
+ LockBuffer(indexbuffer, BUFFER_LOCK_EXCLUSIVE);
+ cleared_segno = segno;
+ continue;
+ }
+
+ /*
+ * Clear the index entry referring to the payload segment, and
+ * mark the segment free. To do this, we have to grab the lock
+ * on whatever page contains the free/busy state, which could be
+ * either an FSM page or the metapage.
+ */
+ if (fsmblock == InvalidBlockNumber)
+ {
+ LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
+ cb_recycle_payload_segment(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node,
+ cb->cb_fork,
+ metabuffer,
+ indexblock, indexbuffer,
+ InvalidBlockNumber, InvalidBuffer,
+ segno, pageoffset, needs_xlog);
+ LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK);
+ }
+ else
+ {
+ LockBuffer(fsmbuffer, BUFFER_LOCK_EXCLUSIVE);
+ cb_recycle_payload_segment(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node,
+ cb->cb_fork,
+ InvalidBuffer,
+ indexblock, indexbuffer,
+ fsmblock, fsmbuffer,
+ segno, pageoffset, needs_xlog);
+ LockBuffer(fsmbuffer, BUFFER_LOCK_UNLOCK);
+ }
+
+ /* No need to consider this page offset again. */
+ ++pageoffset;
+
+ /* Now we're no longer prepared to clear any segment. */
+ cleared_segno = CB_INVALID_SEGMENT;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Attempt to remove the oldest index segment.
+ *
+ * The return value is the segment number of the oldest index segment that
+ * remains after the operation has been completed. If no index segments remain
+ * after the operation or if the operation cannot be completed, the return
+ * value is CB_INVALID_SEGMENT.
+ */
+static CBSegNo
+ConveyorBeltFreeOldestIndexSegment(ConveyorBelt *cb, Buffer metabuffer,
+ CBSegNo oldest_index_segment,
+ CBPageNo index_vacuum_stop_point)
+{
+ BlockNumber firstindexblock;
+ Buffer firstindexbuffer;
+ BlockNumber fsmblock;
+ Buffer fsmbuffer;
+ bool needs_xlog;
+ CBSegNo oldest_remaining_index_segment = CB_INVALID_SEGMENT;
+
+ /*
+ * Clear all the blocks in the oldest index segment except for the first.
+ * We must keep the first one until the bitter end, so that it remains
+ * possible to walk the chain of index segments.
+ */
+ ConveyorBeltClearSegment(cb, oldest_index_segment, false);
+
+ /*
+ * Read and pin the first block of the index segment.
+ */
+ needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM;
+ firstindexblock = cb_segment_to_block(cb->cb_pages_per_segment,
+ oldest_index_segment, 0);
+ firstindexbuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+ firstindexblock, RBM_NORMAL, NULL);
+
+ /*
+ * Also read and pin the appropriate FSM page, unless the busy/free status
+ * of this segment is stored in the metapage.
+ */
+ fsmblock = cb_segment_to_fsm_block(cb->cb_pages_per_segment,
+ oldest_index_segment);
+ if (fsmblock == InvalidBlockNumber)
+ fsmbuffer = InvalidBuffer;
+ else
+ fsmbuffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+ fsmblock, RBM_NORMAL, NULL);
+
+ /*
+ * The lock ordering described in the README requires the metapage lock
+ * to be taken last, but it also requires that freeing an index segment
+ * take a cleanup lock on the metapage. Since a concurrent reader will
+ * hold a pin on the metapage when trying to lock the first index page,
+ * we can't lock the first index page and then wait for a cleanup lock
+ * on the metapage, because that might deadlock.
+ *
+ * To get around that problem, we take the cleanup lock on the metabuffer
+ * conditionally. If we can't get it, we just skip freeing the oldest
+ * index segment. That's not great, but it's not obvious how we can do
+ * any better.
+ */ + LockBuffer(firstindexbuffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(fsmbuffer, BUFFER_LOCK_EXCLUSIVE); + if (ConditionalLockBufferForCleanup(metabuffer)) + { + oldest_remaining_index_segment = + cb_recycle_index_segment(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node, + cb->cb_fork, metabuffer, + firstindexblock, firstindexbuffer, + fsmblock, fsmbuffer, + oldest_index_segment, needs_xlog); + LockBuffer(metabuffer, BUFFER_LOCK_UNLOCK); + } + UnlockReleaseBuffer(fsmbuffer); + UnlockReleaseBuffer(firstindexbuffer); + + return oldest_remaining_index_segment; +} + +/* + * Clear all pages in a segment, or alternatively all pages in a segment + * except for the first one. The segment can be a payload segment that isn't + * needed any more (in which case we should clear all the pages) or the oldest + * index segment from which all index entries have been cleared (in which + * case we should clear all pages but the first). + * + * This needs to leave each page in a state where ConveyorBeltPageIsUnused + * would return true. Otherwise, if this is reused as a payload segment, + * ConveyorBeltGetNewPage will get confused, as the pages it's trying to + * allocate will seem to have been concurrently allocated by some other + * backend. + * + * This needs to take a cleanup lock on each page to make sure that there are + * no lingering locks or pins on the page. + */ +static void +ConveyorBeltClearSegment(ConveyorBelt *cb, CBSegNo segno, + bool include_first_page) +{ + BlockNumber firstblkno; + BlockNumber stopblkno; + BlockNumber blkno; + bool needs_xlog; + + firstblkno = cb_segment_to_block(cb->cb_pages_per_segment, segno, 0); + if (!include_first_page) + firstblkno++; + stopblkno = firstblkno + cb->cb_pages_per_segment; + needs_xlog = RelationNeedsWAL(cb->cb_rel) || cb->cb_fork == INIT_FORKNUM; + + for (blkno = firstblkno; blkno < stopblkno; ++blkno) + { + Buffer buffer; + + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, blkno, + RBM_NORMAL, NULL); + LockBufferForCleanup(buffer); + cb_clear_block(&RelationGetSmgr(cb->cb_rel)->smgr_rnode.node, + cb->cb_fork, blkno, buffer, needs_xlog); + UnlockReleaseBuffer(buffer); + } +} + +/* + * Pin and return the block indicated by 'blkno', extending if needed. + * + * On entry, *possibly_not_on_disk_blkno should be the smallest block number + * not known to exist on disk. If this function discovers that later blocks + * exist on disk, or extends the relation so that they do, this value will be + * updated accordingly. + * + * If the relation would need to be extended by more than the number of pages + * in a single segment, an error will occur. This shouldn't happen unless + * something has gone wrong, because the first page of a segment is supposed to + * exist on disk before it's allocated. Therefore, if we create segment N+1, at + * least the first page of segment N should already be there, so we shouldn't + * be extending by more than one segment. There is a special case when the + * segments are separated by an FSM page, but there the FSM page should be + * created on disk before allocating the segment which follows, so the same + * rule applies. + */ +static Buffer +ConveyorBeltExtend(ConveyorBelt *cb, BlockNumber blkno, + BlockNumber *possibly_not_on_disk_blkno) +{ + BlockNumber nblocks; + Buffer buffer; + + /* If the block we need is already known to be on disk, just pin it. 
*/
+ if (blkno < *possibly_not_on_disk_blkno)
+ return ReadBufferExtended(cb->cb_rel, cb->cb_fork, blkno,
+ RBM_NORMAL, NULL);
+
+ /*
+ * We may need to extend, but can't safely do that if someone else might be
+ * doing so. Since we don't currently have a concept of separate relation
+ * extension locks per fork, we just have to take the one and only
+ * relation-level lock.
+ */
+ LockRelationForExtension(cb->cb_rel, ExclusiveLock);
+
+ /* Check relation length, now that we have the extension lock. */
+ nblocks = RelationGetNumberOfBlocksInFork(cb->cb_rel, cb->cb_fork);
+
+ /* Complain if possibly_not_on_disk_blkno was a lie. */
+ if (nblocks < *possibly_not_on_disk_blkno)
+ ereport(ERROR,
+ errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("expected at least %u blocks on disk, but found only %u blocks",
+ *possibly_not_on_disk_blkno, nblocks));
+
+ /*
+ * If the block we need turns out to be on disk after all, we have no need
+ * to extend the relation, and can just read it. We do need to take care to
+ * update *possibly_not_on_disk_blkno to reduce the likelihood of needing
+ * to take the relation extension lock again in the future.
+ */
+ if (blkno < nblocks)
+ {
+ *possibly_not_on_disk_blkno = nblocks;
+ UnlockRelationForExtension(cb->cb_rel, ExclusiveLock);
+ return ReadBufferExtended(cb->cb_rel, cb->cb_fork, blkno, RBM_NORMAL, NULL);
+ }
+
+ /*
+ * Complain if we'd have to extend the relation too far. See also the
+ * function header comments.
+ */
+ if (nblocks + cb->cb_pages_per_segment < blkno)
+ ereport(ERROR,
+ errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("should not need to extend by more than %u blocks to reach block %u, but have only %u blocks",
+ cb->cb_pages_per_segment, blkno, nblocks));
+
+ /* Add any blocks that are needed prior to the requested block. */
+ while (nblocks < blkno)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, P_NEW,
+ RBM_NORMAL, NULL);
+ Assert(BufferGetBlockNumber(buffer) == nblocks);
+ ReleaseBuffer(buffer);
+ ++nblocks;
+ }
+
+ /* Add the requested block. */
+ buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, P_NEW,
+ RBM_NORMAL, NULL);
+ Assert(BufferGetBlockNumber(buffer) == blkno);
+
+ /* Done extending relation. */
+ UnlockRelationForExtension(cb->cb_rel, ExclusiveLock);
+
+ /* Remember that the relation is now longer than it used to be. */
+ *possibly_not_on_disk_blkno = blkno + 1;
+ return buffer;
+}
+
+/*
+ * Figure out where the FSM bit for a given segment number is located.
+ *
+ * Returns InvalidBlockNumber if the segment's FSM bit is in the metapage,
+ * or otherwise the block number of the FSM page that contains that FSM bit.
+ */
+BlockNumber
+ConveyorBeltFSMBlockNumber(ConveyorBelt *cb, CBSegNo segno)
+{
+ BlockNumber firstblkno;
+ unsigned stride;
+ unsigned whichfsmpage;
+
+ if (segno < CB_FSM_SEGMENTS_FOR_METAPAGE)
+ return InvalidBlockNumber;
+
+ firstblkno = cb_first_fsm_block(cb->cb_pages_per_segment);
+ stride = cb_fsm_block_spacing(cb->cb_pages_per_segment);
+ whichfsmpage = (segno - CB_FSM_SEGMENTS_FOR_METAPAGE)
+ / CB_FSM_SEGMENTS_PER_FSMPAGE;
+
+ return firstblkno + (stride * whichfsmpage);
+}
+
+/*
+ * Convenience function to read and lock a block.
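+ *
+ * The caller is responsible for releasing the lock and pin afterwards,
+ * typically via UnlockReleaseBuffer().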
+ */
+static Buffer
+ConveyorBeltRead(ConveyorBelt *cb, BlockNumber blkno, int mode)
+{
+ Buffer buffer;
+
+ Assert(blkno != P_NEW);
+ Assert(mode == BUFFER_LOCK_SHARE || mode == BUFFER_LOCK_EXCLUSIVE);
+ buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork, blkno,
+ RBM_NORMAL, NULL);
+ LockBuffer(buffer, mode);
+ return buffer;
+}
+
+/*
+ * We consider a page unused if it's either new (i.e. all zeroes) or if
+ * neither pd_lower nor pd_upper have moved.
+ */
+static bool
+ConveyorBeltPageIsUnused(Page page)
+{
+ PageHeader ph = (PageHeader) page;
+
+ if (PageIsNew(page))
+ return true;
+
+ return (ph->pd_lower <= SizeOfPageHeaderData && ph->pd_upper == BLCKSZ);
+}
+
+/*
+ * Find a free segment by searching all of the FSM pages that currently exist,
+ * and if that doesn't turn up anything, adding a new FSM page.
+ *
+ * Note that this doesn't search the metapage. That's because the caller needs
+ * to do that before releasing the content lock on the metapage, whereas this
+ * should be called while retaining only a pin on the metapage, since it needs
+ * to read and lock other pages.
+ *
+ * 'next_segment' is the lowest-numbered segment that has not yet been
+ * created.
+ *
+ * If any FSM page covers a segment that is not currently allocated, returns
+ * the lowest such segment number. This might be equal to next_segment, but
+ * should not be greater.
+ *
+ * If *fsmbuffer is not InvalidBuffer, it is assumed to be a pinned FSM
+ * buffer and will be unpinned.
+ *
+ * On return, *fsmbuffer will be set to the buffer that contains the FSM page
+ * covering the segment whose segment number was returned, and *fsmblock
+ * will be set to the corresponding block number.
+ */
+static CBSegNo
+ConveyorSearchFSMPages(ConveyorBelt *cb, CBSegNo next_segment,
+ BlockNumber *fsmblock, Buffer *fsmbuffer)
+{
+ bool have_extension_lock = false;
+ BlockNumber firstblkno;
+ BlockNumber currentblkno;
+ BlockNumber stopblkno;
+ Buffer buffer = InvalidBuffer;
+ CBSegNo segno;
+ unsigned stride;
+
+ /*
+ * Release any previous buffer pin.
+ *
+ * We shouldn't ever return without setting *fsmblock and *fsmbuffer to
+ * some legal value, so these stores are just paranoia.
+ */
+ if (BufferIsValid(*fsmbuffer))
+ {
+ ReleaseBuffer(*fsmbuffer);
+ *fsmblock = InvalidBlockNumber;
+ *fsmbuffer = InvalidBuffer;
+ }
+
+ /*
+ * Work out the locations of the FSM blocks.
+ *
+ * stopblkno doesn't need to be perfectly accurate, just good enough that
+ * we search all of the FSM pages that are guaranteed to exist and no more.
+ * If next_segment is greater than zero, we know that the segment prior to
+ * the next segment has to exist, and so any FSM pages which would precede
+ * that must also exist. However, if next_segment is 0, or really any value
+ * less than or equal to CB_FSM_SEGMENTS_FOR_METAPAGE, then there may be
+ * no FSM pages at all.
+ *
+ * NB: When next_segment points to the first segment covered by some FSM
+ * page, that FSM page doesn't have to exist yet. We have to be careful
+ * to assume only that the previous segment exists.
+ */
+ firstblkno = cb_first_fsm_block(cb->cb_pages_per_segment);
+ if (next_segment <= CB_FSM_SEGMENTS_FOR_METAPAGE)
+ stopblkno = 0;
+ else
+ stopblkno = cb_segment_to_block(cb->cb_pages_per_segment,
+ next_segment - 1, 0);
+ stride = cb_fsm_block_spacing(cb->cb_pages_per_segment);
+
+ /*
+ * Search the FSM blocks one by one.
+ *
+ * NB: This might be too expensive for large conveyor belts.
Perhaps we
+ * should avoid searching blocks that this backend recently searched and
+ * found to be full, or perhaps the on-disk format should contain
+ * information to help us avoid useless searching.
+ */
+ for (currentblkno = firstblkno; currentblkno < stopblkno;
+ currentblkno += stride)
+ {
+ Buffer buffer;
+ CBSegNo segno;
+
+ CHECK_FOR_INTERRUPTS();
+
+ buffer = ConveyorBeltRead(cb, currentblkno, BUFFER_LOCK_SHARE);
+ segno = cbfsmpage_find_free_segment(BufferGetPage(buffer));
+
+ if (segno != CB_INVALID_SEGMENT)
+ {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ Assert(segno <= next_segment);
+ *fsmblock = currentblkno;
+ *fsmbuffer = buffer;
+ return segno;
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* Loop should have iterated to completion. */
+ Assert(currentblkno >= stopblkno);
+
+ /*
+ * We've searched every FSM page that covers an allocated segment number,
+ * so it's time to think about adding a new FSM page. However, it's
+ * possible that someone else already did that, but didn't actually
+ * allocate a segment. It's also possible that someone extended the
+ * relation with the intention of adding a new FSM page, but didn't manage
+ * to complete the operation. Figure out which it is.
+ */
+ while (1)
+ {
+ bool needs_init = false;
+ BlockNumber blkno;
+ BlockNumber nblocks;
+ Page page;
+
+ CHECK_FOR_INTERRUPTS();
+
+ nblocks = RelationGetNumberOfBlocksInFork(cb->cb_rel, cb->cb_fork);
+
+ /* If the relation needs to be physically extended, do so. */
+ if (nblocks <= currentblkno)
+ {
+ /*
+ * We don't currently have a concept of separate relation
+ * extension locks per fork, so for now we just have to take the
+ * one and only relation-level lock.
+ */
+ if (!have_extension_lock)
+ {
+ LockRelationForExtension(cb->cb_rel, ExclusiveLock);
+ have_extension_lock = true;
+
+ /*
+ * Somebody else might have extended the relation while we
+ * were waiting for the relation extension lock, so recheck
+ * the length.
+ */
+ nblocks =
+ RelationGetNumberOfBlocksInFork(cb->cb_rel, cb->cb_fork);
+ }
+
+ /*
+ * If the relation would need to be extended by more than one
+ * segment to add this FSM page, something has gone wrong. Nobody
+ * should create a segment without extending the relation far
+ * enough that at least the first page exists physically.
+ */
+ if (nblocks <= currentblkno - cb->cb_pages_per_segment)
+ ereport(ERROR,
+ errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("can't add conveyor belt FSM block at block %u with only %u blocks on disk",
+ currentblkno, nblocks));
+
+ /*
+ * If the previous segment wasn't fully allocated on disk, add
+ * empty pages to fill it out.
+ */
+ while (nblocks <= currentblkno - 1)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+ P_NEW, RBM_NORMAL, NULL);
+ ReleaseBuffer(buffer);
+ ++nblocks;
+ }
+
+ /*
+ * Now the relation should be of the correct length to add the new
+ * FSM page, unless someone already did it while we were waiting
+ * for the extension lock.
+ */
+ if (nblocks <= currentblkno)
+ {
+ buffer = ReadBufferExtended(cb->cb_rel, cb->cb_fork,
+ P_NEW, RBM_NORMAL, NULL);
+ blkno = BufferGetBlockNumber(buffer);
+ if (blkno != currentblkno)
+ elog(ERROR,
+ "expected new FSM page as block %u but got block %u",
+ currentblkno, blkno);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buffer);
+ needs_init = true;
+ }
+ }
+
+ /*
+ * If we physically extended the relation to make room for the new FSM
+ * page, then we already have a pin and a content lock on the correct
+ * page. Otherwise, we still need to read it, and also check whether
+ * it has been initialized.
+ */
+ if (!BufferIsValid(buffer))
+ {
+ buffer = ConveyorBeltRead(cb, currentblkno, BUFFER_LOCK_SHARE);
+ page = BufferGetPage(buffer);
+
+ if (PageIsNew(page))
+ {
+ /* Appears to need initialization, so get exclusive lock. */
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * It's possible that someone else initialized it after we
+ * released our share-lock and before we got the exclusive
+ * lock, so retest whether initialization is required.
+ */
+ if (PageIsNew(page))
+ needs_init = true;
+ }
+ }
+
+ /*
+ * If we found an FSM page that has already been initialized, we just
+ * need to search it. Often it will have no bits set, because it's
+ * beyond what we thought the last segment was, but if there's
+ * concurrent activity, things might have changed.
+ *
+ * If we found an empty page, or created a new empty page by
+ * physically extending the relation, then we need to initialize it.
+ */
+ if (!needs_init)
+ segno = cbfsmpage_find_free_segment(page);
+ else
+ {
+ RelFileNode *rnode;
+ bool needs_xlog;
+
+ rnode = &RelationGetSmgr(cb->cb_rel)->smgr_rnode.node;
+ needs_xlog = RelationNeedsWAL(cb->cb_rel)
+ || cb->cb_fork == INIT_FORKNUM;
+ segno = cb_create_fsmpage(rnode, cb->cb_fork, currentblkno, buffer,
+ cb->cb_pages_per_segment, needs_xlog);
+ }
+
+ /* Release our shared or exclusive buffer lock, but keep the pin. */
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ /* Hopefully we found a segment and are done. */
+ if (segno != CB_INVALID_SEGMENT)
+ break;
+
+ /*
+ * Somehow this FSM page, which at last check was beyond the last
+ * allocated segment, now has no bits free whatsoever. Either we've
+ * been asleep for an extraordinarily long time while a huge amount of
+ * other work has happened, or the data on disk is corrupted, or
+ * there's a bug.
+ */
+ elog(DEBUG1,
+ "no free segments in recently-new conveyor belt FSM page at block %u",
+ currentblkno);
+
+ /* Try the next FSM block. */
+ ReleaseBuffer(buffer);
+ currentblkno += stride;
+ }
+
+ /* Finish up.
*/ + if (have_extension_lock) + UnlockRelationForExtension(cb->cb_rel, ExclusiveLock); + Assert(BufferIsValid(buffer)); + *fsmblock = currentblkno; + *fsmbuffer = buffer; + return segno; +} diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile index f88d72fd86..3884e0129d 100644 --- a/src/backend/access/rmgrdesc/Makefile +++ b/src/backend/access/rmgrdesc/Makefile @@ -10,6 +10,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ brindesc.o \ + cbdesc.o \ clogdesc.o \ committsdesc.o \ dbasedesc.o \ diff --git a/src/backend/access/rmgrdesc/cbdesc.c b/src/backend/access/rmgrdesc/cbdesc.c new file mode 100644 index 0000000000..85019a3e13 --- /dev/null +++ b/src/backend/access/rmgrdesc/cbdesc.c @@ -0,0 +1,168 @@ +/*------------------------------------------------------------------------- + * + * cbdesc.c + * rmgr descriptor routines for access/conveyor/cbxlog.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/cbdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/cbxlog.h" + +extern void +conveyor_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE: + { + /* Nothing extra to print. */ + break; + } + + case XLOG_CONVEYOR_ALLOCATE_PAYLOAD_SEGMENT: + { + xl_cb_allocate_payload_segment *xlrec; + + xlrec = (xl_cb_allocate_payload_segment *) rec; + + appendStringInfo(buf, "segno %u is_extend %d", + xlrec->segno, xlrec->is_extend ? 1 : 0); + break; + } + + + case XLOG_CONVEYOR_ALLOCATE_INDEX_SEGMENT: + { + xl_cb_allocate_index_segment *xlrec; + + xlrec = (xl_cb_allocate_index_segment *) rec; + + appendStringInfo(buf, "segno %u pageno " UINT64_FORMAT " is_extend %d", + xlrec->segno, xlrec->pageno, + xlrec->is_extend ? 1 : 0); + break; + } + + + case XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES: + { + xl_cb_relocate_index_entries *xlrec; + unsigned i; + + xlrec = (xl_cb_relocate_index_entries *) rec; + + appendStringInfo(buf, "pageoffset %u num_index_entries %u index_page_start " UINT64_FORMAT, + xlrec->pageoffset, xlrec->num_index_entries, + xlrec->index_page_start); + for (i = 0; i < xlrec->num_index_entries; ++i) + { + if (i == 0) + appendStringInfoString(buf, " entries"); + appendStringInfo(buf, " %u", xlrec->index_entries[i]); + } + break; + } + + case XLOG_CONVEYOR_LOGICAL_TRUNCATE: + { + xl_cb_logical_truncate *xlrec; + + xlrec = (xl_cb_logical_truncate *) rec; + + appendStringInfo(buf, "oldest_keeper " UINT64_FORMAT, + xlrec->oldest_keeper); + break; + } + + case XLOG_CONVEYOR_CLEAR_BLOCK: + { + /* Nothing extra to print. 
*/ + break; + } + + case XLOG_CONVEYOR_RECYCLE_PAYLOAD_SEGMENT: + { + xl_cb_recycle_payload_segment *xlrec; + + xlrec = (xl_cb_recycle_payload_segment *) rec; + + appendStringInfo(buf, "segno %u pageoffset %u", + xlrec->segno, xlrec->pageoffset); + break; + } + + case XLOG_CONVEYOR_RECYCLE_INDEX_SEGMENT: + { + xl_cb_recycle_index_segment *xlrec; + + xlrec = (xl_cb_recycle_index_segment *) rec; + + appendStringInfo(buf, "segno %u", + xlrec->segno); + break; + } + + case XLOG_CONVEYOR_SHIFT_METAPAGE_INDEX: + { + xl_cb_shift_metapage_index *xlrec; + + xlrec = (xl_cb_shift_metapage_index *) rec; + + appendStringInfo(buf, "num_entries %u", + xlrec->num_entries); + break; + } + } +} + +extern const char * +conveyor_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE: + id = "INSERT_PAYLOAD_PAGE"; + break; + case XLOG_CONVEYOR_ALLOCATE_PAYLOAD_SEGMENT: + id = "ALLOCATE_PAYLOAD_SEGMENT"; + break; + case XLOG_CONVEYOR_ALLOCATE_INDEX_SEGMENT: + id = "ALLOCATE_INDEX_SEGMENT"; + break; + case XLOG_CONVEYOR_ALLOCATE_INDEX_PAGE: + id = "ALLOCATE_INDEX_PAGE"; + break; + case XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES: + id = "RELOCATE_INDEX_ENTRIES"; + break; + case XLOG_CONVEYOR_LOGICAL_TRUNCATE: + id = "LOGICAL_TRUNCATE"; + break; + case XLOG_CONVEYOR_CLEAR_BLOCK: + id = "CLEAR_BLOCK"; + break; + case XLOG_CONVEYOR_RECYCLE_PAYLOAD_SEGMENT: + id = "RECYCLE_PAYLOAD_SEGMENT"; + break; + case XLOG_CONVEYOR_RECYCLE_INDEX_SEGMENT: + id = "RECYCLE_INDEX_SEGMENT"; + break; + case XLOG_CONVEYOR_SHIFT_METAPAGE_INDEX: + id = "SHIFT_METAPAGE_INDEX"; + break; + } + + return id; +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 58091f6b52..432a573d0e 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -8,6 +8,7 @@ #include "postgres.h" #include "access/brin_xlog.h" +#include "access/cbxlog.h" #include "access/clog.h" #include "access/commit_ts.h" #include "access/generic_xlog.h" diff --git a/src/bin/pg_waldump/.gitignore b/src/bin/pg_waldump/.gitignore index 3be00a8b61..774869c5a4 100644 --- a/src/bin/pg_waldump/.gitignore +++ b/src/bin/pg_waldump/.gitignore @@ -2,6 +2,7 @@ # Source files copied from src/backend/access/rmgrdesc/ /brindesc.c +/cbdesc.c /clogdesc.c /committsdesc.c /dbasedesc.c diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 852d8ca4b1..7cb46bdb68 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -9,6 +9,7 @@ #include "postgres.h" #include "access/brin_xlog.h" +#include "access/cbxlog.h" #include "access/clog.h" #include "access/commit_ts.h" #include "access/generic_xlog.h" diff --git a/src/include/access/cbcache.h b/src/include/access/cbcache.h new file mode 100644 index 0000000000..79353c1897 --- /dev/null +++ b/src/include/access/cbcache.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * cbcache.h + * Conveyor belt index segment location cache. + * + * See src/backend/access/conveyor/README for a general overview of + * conveyor belt storage. 
+ * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/include/access/cbcache.h + * + *------------------------------------------------------------------------- + */ +#ifndef CBCACHE_H +#define CBCACHE_H + +#include "access/cbdefs.h" + +struct CBCache; +typedef struct CBCache CBCache; + +extern CBCache *cb_cache_create(MemoryContext mcxt, + uint64 index_segments_moved); +extern void cb_cache_invalidate(CBCache *cache, CBPageNo index_start, + uint64 index_segments_moved); +extern CBSegNo cb_cache_lookup(CBCache *cache, CBPageNo pageno); +extern CBSegNo cb_cache_fuzzy_lookup(CBCache *cache, CBPageNo pageno, + CBPageNo *index_segment_start); +extern void cb_cache_insert(CBCache *cache, CBSegNo segno, + CBPageNo index_segment_start); + +#endif /* CBCACHE_H */ diff --git a/src/include/access/cbdefs.h b/src/include/access/cbdefs.h new file mode 100644 index 0000000000..aa5e2f4993 --- /dev/null +++ b/src/include/access/cbdefs.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * cbdefs.h + * Commonly-used conveyor-belt definitions. + * + * It's a little annoying to have a separate header file for just these + * few definitions - but the alternatives all seem worse. + * + * See src/backend/access/conveyor/README for a general overview of + * conveyor belt storage. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/include/access/cbdefs.h + * + *------------------------------------------------------------------------- + */ +#ifndef CBDEFS_H +#define CBDEFS_H + +/* Logical page numbers are unsigned, 64-bit integers. */ +typedef uint64 CBPageNo; +#define CB_INVALID_LOGICAL_PAGE ((CBPageNo) -1) + +/* Segment numbers are unsigned, 32-bit integers. */ +typedef uint32 CBSegNo; +#define CB_INVALID_SEGMENT ((CBSegNo) -1) + +#endif /* CBDEFS_H */ diff --git a/src/include/access/cbfsmpage.h b/src/include/access/cbfsmpage.h new file mode 100644 index 0000000000..40e0133d65 --- /dev/null +++ b/src/include/access/cbfsmpage.h @@ -0,0 +1,127 @@ +/*------------------------------------------------------------------------- + * + * cbfsmpage.h + * APIs for accessing conveyor belt free space map pages. + * + * See src/backend/access/conveyor/README for a general overview of + * conveyor belt storage. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/include/access/cbfsmpage.h + * + *------------------------------------------------------------------------- + */ +#ifndef CBFSMPAGE_H +#define CBFSMPAGE_H + +#include "access/cbdefs.h" +#include "access/cbmetapage.h" +#include "storage/bufpage.h" + +/* + * Number of free space map bytes reserved for fixed-size data. + * + * This needs to be at least large enough to hold a PageHeader plus the + * non-array fields in CBFSMPageData. We make it comfortably larger than + * that in case we ever want to enlarge CBFSMPageData. + */ +#define CB_FSMPAGE_RESERVE_BYTES 128 + +/* + * Number of bytes left over to store segment allocation status. + */ +#define CB_FSMPAGE_FREESPACE_BYTES (BLCKSZ - CB_FSMPAGE_RESERVE_BYTES) + +/* + * Number of segments covered by one FSM page. + */ +#define CB_FSM_SEGMENTS_PER_FSMPAGE \ + (CB_FSMPAGE_FREESPACE_BYTES * BITS_PER_BYTE) + +/* + * Function prototypes. 
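+ *
+ * As a rough illustration of the layout math implemented by the inline
+ * helpers below, assuming the default BLCKSZ of 8192 and, purely as an
+ * example, 16 pages per segment: each FSM page then has 8064 freespace
+ * bytes and so covers 64512 segments, the first FSM block sits at block
+ * 1 + CB_FSM_SEGMENTS_FOR_METAPAGE * 16, and consecutive FSM blocks are
+ * 1 + 64512 * 16 blocks apart.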
+ */
+extern CBSegNo cb_fsmpage_initialize(Page page, BlockNumber blkno,
+ uint16 pages_per_segment);
+extern bool cb_fsmpage_get_fsm_bit(Page page, CBSegNo segno);
+extern void cb_fsmpage_set_fsm_bit(Page page, CBSegNo segno, bool new_state);
+extern CBSegNo cbfsmpage_find_free_segment(Page page);
+
+/*
+ * Where is the first FSM block located?
+ */
+static inline BlockNumber
+cb_first_fsm_block(uint16 pages_per_segment)
+{
+ /* Add 1 to account for the metapage. */
+ return 1 + CB_FSM_SEGMENTS_FOR_METAPAGE * (BlockNumber) pages_per_segment;
+}
+
+/*
+ * How far apart are FSM blocks?
+ */
+static inline unsigned
+cb_fsm_block_spacing(uint16 pages_per_segment)
+{
+ /* Add 1 to account for the FSM page itself. */
+ return 1 + CB_FSM_SEGMENTS_PER_FSMPAGE * (unsigned) pages_per_segment;
+}
+
+/*
+ * Figure out which block number contains a certain block within a certain
+ * segment.
+ */
+static inline BlockNumber
+cb_segment_to_block(uint16 pages_per_segment, CBSegNo segno, unsigned segoff)
+{
+ unsigned extra_pages = 1;
+
+ Assert(segoff < pages_per_segment);
+
+ if (segno >= CB_FSM_SEGMENTS_FOR_METAPAGE)
+ extra_pages += 1 + (segno - CB_FSM_SEGMENTS_FOR_METAPAGE)
+ / CB_FSM_SEGMENTS_PER_FSMPAGE;
+
+ return extra_pages + segno * pages_per_segment + segoff;
+}
+
+/*
+ * Figure out the segment number of the first segment covered by an FSM page.
+ */
+static inline CBSegNo
+cb_first_segment_for_fsm_page(BlockNumber blkno, uint16 pages_per_segment)
+{
+ BlockNumber first_fsm_block = cb_first_fsm_block(pages_per_segment);
+ unsigned fsm_block_spacing = cb_fsm_block_spacing(pages_per_segment);
+ unsigned fsm_index;
+
+ Assert(blkno >= first_fsm_block);
+ Assert((blkno - first_fsm_block) % fsm_block_spacing == 0);
+
+ fsm_index = (blkno - first_fsm_block) / fsm_block_spacing;
+ return CB_FSM_SEGMENTS_FOR_METAPAGE +
+ (fsm_index * CB_FSM_SEGMENTS_PER_FSMPAGE);
+}
+
+/*
+ * Figure out which FSM block covers a certain segment number.
+ *
+ * If the FSM entry for the indicated segment is in the metapage, the return
+ * value is InvalidBlockNumber.
+ */
+static inline BlockNumber
+cb_segment_to_fsm_block(uint16 pages_per_segment, CBSegNo segno)
+{
+ BlockNumber first_fsm_block = cb_first_fsm_block(pages_per_segment);
+ unsigned fsm_block_spacing = cb_fsm_block_spacing(pages_per_segment);
+ unsigned fsm_block_index;
+
+ if (segno < CB_FSM_SEGMENTS_FOR_METAPAGE)
+ return InvalidBlockNumber;
+ fsm_block_index =
+ (segno - CB_FSM_SEGMENTS_FOR_METAPAGE) / CB_FSM_SEGMENTS_PER_FSMPAGE;
+ return first_fsm_block + (fsm_block_index * fsm_block_spacing);
+}
+
+#endif /* CBFSMPAGE_H */
diff --git a/src/include/access/cbfsmpage_format.h b/src/include/access/cbfsmpage_format.h
new file mode 100644
index 0000000000..2c072c41a6
--- /dev/null
+++ b/src/include/access/cbfsmpage_format.h
@@ -0,0 +1,38 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbfsmpage_format.h
+ * Actual on-disk format for a conveyor-belt FSM page.
+ *
+ * Backend code should not typically include this file directly, even if
+ * it's code that is part of the conveyor belt implementation. Instead, it
+ * should use the interface routines defined in cbfsmpage.h.
+ *
+ * See src/backend/access/conveyor/README for a general overview of
+ * conveyor belt storage.
+ * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/include/access/cbfsmpage_format.h + * + *------------------------------------------------------------------------- + */ +#ifndef CBFSMPAGE_FORMAT_H +#define CBFSMPAGE_FORMAT_H + +#include "access/cbmetapage.h" + +/* Magic number for the FSM page. */ +#define CB_FSMPAGE_MAGIC 0x30263162 + +/* + * A conveyor belt FSM page will store a struct of this type in the page's + * special space. + */ +typedef struct CBFSMPageData +{ + uint32 cbfsm_magic; /* always CB_FSMPAGE_MAGIC */ + CBSegNo cbfsm_start; /* first segment this page describes */ + uint8 cbfsm_state[CB_FSMPAGE_FREESPACE_BYTES]; +} CBFSMPageData; + +#endif diff --git a/src/include/access/cbindexpage.h b/src/include/access/cbindexpage.h new file mode 100644 index 0000000000..cb99a73de0 --- /dev/null +++ b/src/include/access/cbindexpage.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * cbindexpage.h + * APIs for accessing conveyor belt index pages. + * + * See src/backend/access/conveyor/README for a general overview of + * conveyor belt storage. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/include/access/cbindexpage.h + * + *------------------------------------------------------------------------- + */ +#ifndef CBINDEXPAGE_H +#define CBINDEXPAGE_H + +#include "access/cbdefs.h" +#include "storage/bufpage.h" + +/* + * Number of index page bytes reserved for fixed-size data. + * + * This needs to be at least large enough to hold a PageHeader plus the + * non-array fields in CBIndexPageData. We make it comfortably larger than + * that in case we ever want to enlarge CBIndexPageData. + */ +#define CB_INDEXPAGE_RESERVE_BYTES 128 + +/* + * Number of index entries per index page. + */ +#define CB_INDEXPAGE_INDEX_ENTRIES \ + ((BLCKSZ - CB_INDEXPAGE_RESERVE_BYTES) / sizeof(CBSegNo)) + +/* + * Function prototypes. + */ +extern void cb_indexpage_initialize(Page page, CBPageNo pageno); +extern BlockNumber cb_indexpage_find_logical_page(Page page, + CBPageNo pageno, + uint16 pages_per_segment); +extern void cb_indexpage_add_index_entries(Page page, + unsigned pageoffset, + unsigned num_index_entries, + CBSegNo *index_entries); +extern CBSegNo cb_indexpage_get_obsolete_entry(Page page, + unsigned *pageoffset, + CBPageNo *first_pageno); +extern void cb_indexpage_clear_obsolete_entry(Page page, + CBSegNo segno, + unsigned pageoffset); +extern void cb_indexpage_set_next_segment(Page page, CBSegNo segno); +extern CBSegNo cb_indexpage_get_next_segment(Page page); + +/* + * How many index entries will fit into an index segment? + */ +static inline unsigned +cb_index_entries_per_index_segment(uint16 pages_per_segment) +{ + return CB_INDEXPAGE_INDEX_ENTRIES * (unsigned) pages_per_segment; +} + +/* + * How many logical pages can we map using a single index segment? + */ +static inline unsigned +cb_logical_pages_per_index_segment(uint16 pages_per_segment) +{ + return cb_index_entries_per_index_segment(pages_per_segment) + * (unsigned) pages_per_segment; +} + +/* + * How many logical pages can we map using a single index page? 
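+ *
+ * For example, with the default BLCKSZ of 8192, CB_INDEXPAGE_INDEX_ENTRIES
+ * works out to (8192 - 128) / 4 = 2016, so with (say) 16 pages per segment
+ * a single index page maps 2016 * 16 = 32256 logical pages, and a full
+ * index segment maps 16 times that.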
+ */ +static inline unsigned +cb_logical_pages_per_index_page(uint16 pages_per_segment) +{ + return CB_INDEXPAGE_INDEX_ENTRIES * (unsigned) pages_per_segment; +} + +#endif /* CBINDEXPAGE_H */ diff --git a/src/include/access/cbindexpage_format.h b/src/include/access/cbindexpage_format.h new file mode 100644 index 0000000000..904ad97714 --- /dev/null +++ b/src/include/access/cbindexpage_format.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * cbindexpage_format.h + * Actual on-disk format for a conveyor-belt index page. + * + * Backend code should not typically include this file directly, even if + * it's code that is part of the conveyor belt implemenation. Instead, it + * should use the interface routines defined in cbindexpage.h. + * + * See src/backend/access/conveyor/README for a general overview of + * conveyor belt storage. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/include/access/cbindexpage_format.h + * + *------------------------------------------------------------------------- + */ +#ifndef CBINDEXPAGE_FORMAT_H +#define CBINDEXPAGE_FORMAT_H + +#include "access/cbindexpage.h" + +/* Magic number for the index page. */ +#define CB_INDEXPAGE_MAGIC 0x62334c54 + +/* + * A conveyor belt index page will store a struct of this type in the page's + * special space. + */ +typedef struct CBIndexPageData +{ + /* Always CB_INDEXPAGE_MAGIC. */ + uint32 cbidx_magic; + + /* + * If this is the first page of an index segment and there is at least one + * index segment after this one, then this is the segment number of the + * next such segment. Otherwise, it's CB_INVALID_SEGMENT. + */ + CBSegNo cbidx_next_segment; + + /* + * The first logical page number of the first segment whose index entry + * is stored on this page. Technically this isn't required, but it seems + * good to have for sanity checks. + */ + CBPageNo cbidx_first_page; + + /* The actual index entries stored on this page. */ + CBSegNo cbidx_entry[CB_INDEXPAGE_INDEX_ENTRIES]; +} CBIndexPageData; + +#endif diff --git a/src/include/access/cbmetapage.h b/src/include/access/cbmetapage.h new file mode 100644 index 0000000000..b7a65187bb --- /dev/null +++ b/src/include/access/cbmetapage.h @@ -0,0 +1,179 @@ +/*------------------------------------------------------------------------- + * + * cbmetapage.h + * APIs for accessing conveyor belt metapages. + * + * See src/backend/access/conveyor/README for a general overview of + * conveyor belt storage. + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * src/include/access/cbmetapage.h + * + *------------------------------------------------------------------------- + */ +#ifndef CBMETAPAGE_H +#define CBMETAPAGE_H + +#include "access/cbdefs.h" +#include "storage/bufpage.h" + +/* Opaque struct for the actual metapage format. */ +struct CBMetapageData; +typedef struct CBMetapageData CBMetapageData; + +/* In which block number is the metapage stored? */ +#define CONVEYOR_METAPAGE 0 + +/* + * Number of metapage bytes reserved for fixed-size data. + * + * This needs to be at least large enough to hold a PageHeader plus the + * non-array fields in CBMetaPageData. We make it comfortably larger than + * that in case we ever want to enlarge CBMetaPageData. + */ +#define CB_METAPAGE_RESERVE_BYTES 256 + +/* + * Number of index entries and freespace map bytes stored in the metapage. 
+ *
+ * Somewhat arbitrarily, we allocate half the page to index entries and
+ * the remaining space to freespace map bytes. Note that the freespace map
+ * is much more compact than the index (1 bit per segment vs. 4 bytes per
+ * segment) so a little bit of space goes a long way. We could further
+ * reduce the size of the freespace map to make room for more index entries,
+ * but it doesn't seem like it would have much of an impact either way.
+ */
+#define CB_METAPAGE_INDEX_BYTES (BLCKSZ / 2)
+#define CB_METAPAGE_INDEX_ENTRIES \
+ (CB_METAPAGE_INDEX_BYTES / sizeof(CBSegNo))
+#define CB_METAPAGE_FREESPACE_BYTES \
+ (BLCKSZ - CB_METAPAGE_RESERVE_BYTES - \
+ CB_METAPAGE_INDEX_ENTRIES * sizeof(CBSegNo))
+
+/*
+ * Number of segments whose allocation status can be tracked in the metapage.
+ */
+#define CB_FSM_SEGMENTS_FOR_METAPAGE \
+ (CB_METAPAGE_FREESPACE_BYTES * BITS_PER_BYTE)
+
+/*
+ * Possible states of the metapage with regard to the proposed insertion of
+ * a payload page.
+ *
+ * CBM_INSERT_OK means that the most recent index entry is for a payload
+ * segment that is not yet full. All other values imply that this is not
+ * the case.
+ *
+ * CBM_INSERT_NEEDS_PAYLOAD_SEGMENT means that there is still room in the
+ * metapage for more index entries.
+ *
+ * CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED means that there is no more room
+ * in the metapage for additional index entries, but there is room in the
+ * newest index segment for entries to be relocated from the metapage.
+ *
+ * CBM_INSERT_NEEDS_INDEX_SEGMENT means that there is no more room in
+ * the metapage for additional index entries, and the newest index segment
+ * is full, too.
+ */
+typedef enum
+{
+ CBM_INSERT_OK,
+ CBM_INSERT_NEEDS_PAYLOAD_SEGMENT,
+ CBM_INSERT_NEEDS_INDEX_ENTRIES_RELOCATED,
+ CBM_INSERT_NEEDS_INDEX_SEGMENT
+} CBMInsertState;
+
+/*
+ * Possible states of the metapage with regard to obsoleting index entries.
+ *
+ * CBM_OBSOLETE_SEGMENT_ENTRIES means that there may be index entries which
+ * are no longer required in the oldest index segment.
+ *
+ * CBM_OBSOLETE_METAPAGE_ENTRIES means that there are no index segments in
+ * existence and that there is at least one index entry in the metapage
+ * that is no longer required.
+ *
+ * CBM_OBSOLETE_METAPAGE_START means that there are no index segments
+ * in existence and that all index entries in the metapage prior to the
+ * logical truncation point have been cleared; however, the metapage's
+ * notion of where the index begins should be advanced to free up space
+ * in the metapage.
+ *
+ * CBM_OBSOLETE_NOTHING means that there is no cleanup work of this type
+ * to be done.
+ */
+typedef enum
+{
+ CBM_OBSOLETE_SEGMENT_ENTRIES,
+ CBM_OBSOLETE_METAPAGE_ENTRIES,
+ CBM_OBSOLETE_METAPAGE_START,
+ CBM_OBSOLETE_NOTHING
+} CBMObsoleteState;
+
+/*
+ * Function prototypes.
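+ *
+ * A typical read-only caller (this is only a sketch, mirroring
+ * ConveyorBeltGetBounds in conveyor.c) locks the metapage buffer, calls
+ * cb_metapage_get_special() to obtain a CBMetapageData pointer, reads
+ * whatever it needs through the accessors below, and then drops the lock
+ * and pin:
+ *
+ * buf = ConveyorBeltRead(cb, CONVEYOR_METAPAGE, BUFFER_LOCK_SHARE);
+ * meta = cb_metapage_get_special(BufferGetPage(buf));
+ * cb_metapage_get_bounds(meta, &oldest, &next);
+ * UnlockReleaseBuffer(buf);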
+ */ +extern void cb_metapage_initialize(Page page, uint16 pages_per_segment); +extern CBMetapageData *cb_metapage_get_special(Page page); +extern bool cb_metapage_find_logical_page(CBMetapageData *meta, + CBPageNo pageno, + BlockNumber *blkno); +extern CBMInsertState cb_metapage_find_next_logical_page(CBMetapageData *meta, + CBPageNo *pageno, + BlockNumber *blkno, + CBSegNo *next_segno); +extern CBMInsertState cb_metapage_get_insert_state(CBMetapageData *meta, + BlockNumber *blkno, + CBPageNo *next_pageno, + CBSegNo *next_segno, + CBPageNo *index_start, + CBPageNo *index_metapage_start, + CBSegNo *newest_index_segment); +extern void cb_metapage_advance_next_logical_page(CBMetapageData *meta, + BlockNumber blkno); +extern void cb_metapage_advance_oldest_logical_page(CBMetapageData *meta, + CBPageNo oldest_logical_page); +extern void cb_metapage_get_bounds(CBMetapageData *meta, + CBPageNo *oldest_logical_page, + CBPageNo *next_logical_page); +extern int cb_metapage_get_index_entries_used(CBMetapageData *meta); +extern void cb_metapage_add_index_entry(CBMetapageData *meta, CBSegNo segno); +extern void cb_metapage_remove_index_entries(CBMetapageData *meta, + unsigned count, + bool relocating); +extern void cb_metapage_get_index_entries(CBMetapageData *meta, + unsigned num_index_entries, + CBSegNo *index_entries); +extern void cb_metapage_get_critical_info(CBMetapageData *meta, + uint16 *pages_per_segment, + uint64 *index_segments_moved); +extern void cb_metapage_get_index_info(CBMetapageData *meta, + CBPageNo *index_start, + CBPageNo *index_metapage_start, + CBSegNo *oldest_index_segment, + CBSegNo *newest_index_segment, + uint64 *index_segments_moved); + +extern void cb_metapage_add_index_segment(CBMetapageData *meta, + CBSegNo segno); +extern void cb_metapage_remove_index_segment(CBMetapageData *meta, + CBSegNo segno); +extern CBMObsoleteState cb_metapage_get_obsolete_state(CBMetapageData *meta, + CBSegNo *oldest_index_segment, + CBPageNo *index_vacuum_stop_point, + CBSegNo *metapage_segno, + unsigned *metapage_offset); +extern void cb_metapage_clear_obsolete_index_entry(CBMetapageData *meta, + CBSegNo segno, + unsigned offset); + +extern CBSegNo cb_metapage_find_free_segment(CBMetapageData *meta); +extern bool cb_metapage_get_fsm_bit(CBMetapageData *meta, CBSegNo segno); +extern void cb_metapage_set_fsm_bit(CBMetapageData *meta, CBSegNo segno, + bool new_state); +extern void cb_metapage_increment_next_segment(CBMetapageData *meta, + CBSegNo segno); +extern void cb_metapage_increment_index_segments_moved(CBMetapageData *meta); + +#endif /* CBMETAPAGE_H */ diff --git a/src/include/access/cbmetapage_format.h b/src/include/access/cbmetapage_format.h new file mode 100644 index 0000000000..a39ab44949 --- /dev/null +++ b/src/include/access/cbmetapage_format.h @@ -0,0 +1,110 @@ +/*------------------------------------------------------------------------- + * + * cbmetapage_format.h + * Actual on-disk format for a conveyor-belt metapage. + * + * Backend code should not typically include this file directly, even if + * it's code that is part of the conveyor belt implemenation. Instead, it + * should use the interface routines defined in cbmetapage.h. + * + * See src/backend/access/conveyor/README for a general overview of + * conveyor belt storage. 
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/cbmetapage_format.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CBMETAPAGE_FORMAT_H
+#define CBMETAPAGE_FORMAT_H
+
+#include "access/cbmetapage.h"
+
+/* Magic number for the metapage. */
+#define CB_METAPAGE_MAGIC 0x43304e76
+
+/* Conveyor belt metapage version. */
+#define CBM_VERSION 1
+
+/*
+ * Conveyor belt metapage.
+ */
+struct CBMetapageData
+{
+ /*
+ * Basic information.
+ *
+ * cbm_magic should always be CB_METAPAGE_MAGIC, and cbm_version should
+ * always be CBM_VERSION (or an older, supported version greater than or
+ * equal to 1; right now 1 is the only version).
+ *
+ * cbm_pages_per_segment is the number of pages per segment. Making this
+ * larger reduces the number of index and freespace map segments required
+ * and decreases fragmentation at the storage level, but it also increases
+ * the granularity of space reuse.
+ */
+ uint32 cbm_magic;
+ uint32 cbm_version;
+ uint16 cbm_pages_per_segment;
+
+ /*
+ * Logical start and end of the conveyor belt.
+ *
+ * cbm_oldest_logical_page is the smallest logical page number that has
+ * not yet been truncated away. The conveyor belt is free to remove older
+ * data or recycle the pages for new data, but doesn't necessarily do so
+ * immediately.
+ *
+ * cbm_next_logical_page is the smallest logical page number that has not
+ * yet been allocated.
+ */
+ CBPageNo cbm_oldest_logical_page;
+ CBPageNo cbm_next_logical_page;
+
+ /*
+ * Information for logical-to-physical indexing.
+ *
+ * cbm_index_start is the oldest logical page number for which we might
+ * still have a logical-to-physical mapping. It can be older than
+ * cbm_oldest_logical_page if we haven't thrown away all the old data yet,
+ * but it can't be newer.
+ *
+ * cbm_index_metapage_start is the oldest logical page number whose
+ * logical-to-physical mapping, if it exists, is stored in the metapage.
+ * It cannot be smaller than cbm_index_start.
+ *
+ * cbm_oldest_index_segment and cbm_newest_index_segment are the oldest
+ * and newest index segments that exist. Both values will be
+ * CB_INVALID_SEGMENT if there are no index segments. Otherwise, the
+ * mapping for cbm_index_start is stored in the first entry in the
+ * first page of cbm_oldest_index_segment.
+ *
+ * cbm_entries_in_newest_index_segment is the number of index entries
+ * in the newest index segment, or 0 if there are no index segments.
+ *
+ * cbm_index_segments_moved is the total number of times in the history
+ * of this conveyor belt that an index segment has been physically
+ * moved to a different segment number. This helps backends to know
+ * whether their cached notions of where index entries for particular
+ * logical pages are located are still valid.
+ *
+ * cbm_next_segment is the lowest-numbered segment that does not yet
+ * exist.
+ */
+ CBPageNo cbm_index_start;
+ CBPageNo cbm_index_metapage_start;
+ CBSegNo cbm_oldest_index_segment;
+ CBSegNo cbm_newest_index_segment;
+ unsigned cbm_entries_in_newest_index_segment;
+ uint64 cbm_index_segments_moved;
+ CBSegNo cbm_next_segment;
+
+ /*
+ * In-metapage portion of index and freespace map.
+ */
+ CBSegNo cbm_index[CB_METAPAGE_INDEX_ENTRIES];
+ uint8 cbm_freespace_map[CB_METAPAGE_FREESPACE_BYTES];
+};
+
+#endif /* CBMETAPAGE_FORMAT_H */
diff --git a/src/include/access/cbmodify.h b/src/include/access/cbmodify.h
new file mode 100644
index 0000000000..4d31b4b175
--- /dev/null
+++ b/src/include/access/cbmodify.h
@@ -0,0 +1,129 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbmodify.h
+ * Routines to make a change to a conveyor belt and XLOG it if needed.
+ *
+ * All of these routines assume that the required buffers have been
+ * correctly identified by the caller, that all necessary pins and
+ * locks have been acquired, and that the caller has verified that the
+ * page is in the correct starting state for the proposed modification.
+ *
+ * See src/backend/access/conveyor/README for a general overview of
+ * conveyor belt storage. See src/backend/access/conveyor/cbxlog.c for
+ * the corresponding REDO routines.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/cbmodify.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CBMODIFY_H
+#define CBMODIFY_H
+
+#include "access/cbdefs.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/relfilenode.h"
+
+extern void cb_create_metapage(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ uint16 pages_per_segment,
+ bool needs_xlog);
+
+extern CBSegNo cb_create_fsmpage(RelFileNode *rnode,
+ ForkNumber fork,
+ BlockNumber blkno,
+ Buffer buffer,
+ uint16 pages_per_segment,
+ bool needs_xlog);
+
+extern void cb_insert_payload_page(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ BlockNumber payloadblock,
+ Buffer payloadbuffer,
+ bool needs_xlog);
+
+extern void cb_allocate_payload_segment(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ BlockNumber fsmblock,
+ Buffer fsmbuffer,
+ CBSegNo segno,
+ bool is_extend,
+ bool needs_xlog);
+
+extern void cb_allocate_index_segment(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ BlockNumber indexblock,
+ Buffer indexbuffer,
+ BlockNumber prevblock,
+ Buffer prevbuffer,
+ BlockNumber fsmblock,
+ Buffer fsmbuffer,
+ CBSegNo segno,
+ CBPageNo pageno,
+ bool is_extend,
+ bool needs_xlog);
+
+extern void cb_allocate_index_page(RelFileNode *rnode,
+ ForkNumber fork,
+ BlockNumber indexblock,
+ Buffer indexbuffer,
+ CBPageNo pageno,
+ bool needs_xlog);
+
+extern void cb_relocate_index_entries(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ BlockNumber indexblock,
+ Buffer indexbuffer,
+ unsigned pageoffset,
+ unsigned num_index_entries,
+ CBSegNo *index_entries,
+ CBPageNo index_page_start,
+ bool needs_xlog);
+
+extern void cb_logical_truncate(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ CBPageNo oldest_keeper,
+ bool needs_xlog);
+
+extern void cb_clear_block(RelFileNode *rnode,
+ ForkNumber fork,
+ BlockNumber blkno,
+ Buffer buffer,
+ bool needs_xlog);
+
+extern void cb_recycle_payload_segment(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ BlockNumber indexblock,
+ Buffer indexbuffer,
+ BlockNumber fsmblock,
+ Buffer fsmbuffer,
+ CBSegNo segno,
+ unsigned pageoffset,
+ bool needs_xlog);
+
+extern CBSegNo cb_recycle_index_segment(RelFileNode *rnode,
+ ForkNumber fork,
+ Buffer metabuffer,
+ BlockNumber indexblock,
+ Buffer indexbuffer,
+ BlockNumber fsmblock,
+ Buffer fsmbuffer,
+ CBSegNo segno,
+ bool needs_xlog);
+
+extern void cb_shift_metapage_index(RelFileNode *rnode,
+
ForkNumber fork, + Buffer metabuffer, + unsigned num_entries, + bool needs_xlog); + +#endif /* CBMODIFY_H */ diff --git a/src/include/access/cbstorage.h b/src/include/access/cbstorage.h new file mode 100644 index 0000000000..b9df430f9c --- /dev/null +++ b/src/include/access/cbstorage.h @@ -0,0 +1,165 @@ +/* + * Mid-level operations know about pages, buffers, and xlog; and they + * can touch multiple kinds of things - e.g. allocating a payload segment + * touches the segment itself and the index entry that points to it. + * But, if they are write operations, they should only write one XLOG + * record, not multiple records. And if they are read operations, they + * should do only one kind of thing. So for example "look for a segment + * to allocate" could be a mid-level operation but not "look for a payload + * segment to allocate, and then try to allocate it if you find one, and if + * you don't find one then try to allocate a new freespace map segment first + * and then retry." + * + * cb_initialize: Initialize conveyor belt. + * + * cb_lock_new_page: Lock next logical page. + * cb_log_new_page: Like log_newpage, but also bump next-page counter. + * cb_append_new_page: Just bump next-page counter. + * + * cb_allocate_payload_segment: Zero, add index entry, mark used. + * cb_allocate_index_segment: Initialize, add pointers to it, mark used. + * cb_allocate_fsm_segment: Initialize, add pointer to it, mark used. + * cb_deallocate_payload_segment: Remove index entry + mark free. + * cb_deallocate_index_segment: Change incoming link(s) + mark free. + * cb_deallocate_fsm_segment: Change incoming link + mark free. + * cb_find_unused_segment: Check the freespace map for a segment to allocate. + * + * cb_relocate_index_entries: Move metapage index entries to index segment. + * cb_trim_index_entries: Zap some unused index entries. + * + * cb_lookup_page: Find the physical position of a logical page, searching + * the metapage and index pages as required. + * + * cb_truncate: Update oldest logical page. + * + * Eventually: + * cb_move_payload_segment: Overwrite index entry. + * cb_move_index_segment: Overwrite incoming link(s). + * cb_move_fsm_segment: Overwrite incoming link. + * and whatever infrastructure we need for a full rewrite. + * + * xlog operations: + * 0x00 XLOG_CONVEYOR_NEWPAGE + * 0x10 XLOG_CONVEYOR_ALLOCATE_PAYLOAD + * 0x20 XLOG_CONVEYOR_ALLOCATE_INDEX + * 0x30 XLOG_CONVEYOR_ALLOCATE_FSM + * 0x40 XLOG_CONVEYOR_DEALLOCATE_PAYLOAD + * 0x50 XLOG_CONVEYOR_DEALLOCATE_INDEX + * 0x60 XLOG_CONVEYOR_DEALLOCATE_FSM + * 0x70 XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES + * 0x80 XLOG_CONVEYOR_TRIM_INDEX_ENTRIES + * 0xA0 XLOG_CONVEYOR_MOVE_PAYLOAD_SEGMENT + * 0xB0 XLOG_CONVEYOR_MOVE_INDEX_SEGMENT + * 0xC0 XLOG_CONVEYOR_MOVE_FSM_SEGMENT + */ + +/* Log the addition of a new logical page to the conveyor belt. 
*/
+extern void cb_xlog_newpage(RelFileNode *rnode, ForkNumber fork,
+ BlockNumber blkno, Page page, bool page_std,
+ Page metapage);
+
+extern void cb_initialize(Relation rel, ForkNumber fork,
+ uint16 pages_per_segment, XLogRecPtr lsn);
+
+if not in recovery
+- check that the relation has 0 blocks
+- extend it by one page
+- write a metapage and xlog the change
+
+if in recovery
+- not reached because we would use log_newpage
+
+extern void cb_allocate_payload_segment(Relation rel, ForkNumber fork,
+ CBSegNo segno, Buffer metabuffer);
+- always: add entry to metapage and MarkBufferDirty
+- !recovery: XLogInsert to set lsn
+- PageSetLSN
+- so the idea is: caller has to give us a locked buffer that is in a known
+ good state for the operation we want to perform, and all of the details
+ needed to make and log the changes
+
+LOCKING ORDER
+ metapage
+ index segment or freespace map segment page
+ payload page
+
+PSEUDOCODE FOR INSERTING A PAGE
+(a caller-side sketch using the conveyor.h API appears after MOVING PAGES)
+
+pin metapage
+while (!done)
+{
+ lock metapage
+ decide: insert page, relocate entries, or insert index segment
+ if action = relocate entries:
+ if we've already got the target page pinned, lock it and move stuff
+ then action = insert index segment
+ if action = insert index segment:
+ if we've already got a segment ready to go, do the insert
+ then action = insert page
+ if action = insert page:
+ lock target page, extending relation if required
+ (maybe boom if not new)
+ done (keep locks on target page + metapage)
+ unlock metapage
+
+ if action = relocate entries:
+ release any pin on what used to be the last index page
+ pin and lock the current last index page
+ if it's full, we'll have to add a new one
+ otherwise, unlock and loop
+
+ if action = insert index segment:
+ identify a possible target segment - conditional cleanup lock 1st page
+ if we can't, we'll have to extend the freespace map first
+ initialize target segment, keeping a pin on the first page
+ loop
+}
+
+if we are holding onto a pin on an index page that we didn't end up
+using, release it
+
+if we are holding onto a pin on a proposed new index segment that we
+didn't end up using, release it
+
+MOVING PAGES
+
+we need to copy all of the pages to new locations, and then adjust the
+entries that point to that segment. we have trouble if, after we copy
+a page, somebody modifies it.
+
+if the client code doesn't modify payload pages after insertion, then
+this can't be a problem for anything but the current insertion segment
+
+if we don't allow concurrent moves, then it can't be a problem for
+anything but the last index segment page, which could have stuff
+added to it - but otherwise entries can't be modified.
+
+it doesn't seem like we have any big problems if moving pages just
+requires that the relation fork is read-only. nearly every read and
+write operation requires locking the metapage; off-hand, it seems like
+the only possible exception is allocating or freeing a freespace map
+segment whose used/free status is stored on some other freespace map page.
+
+that case probably needs to be made to modify the metapage, too,
+if anyone is going to cache any state.
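+
+CALLER-SIDE INSERT SKETCH (ILLUSTRATION ONLY)
+
+The sketch below is not part of the patch: it shows how the insert protocol
+above is expected to look from a caller's point of view, using the public
+API declared in access/conveyor.h. The exact contract around
+ConveyorBeltGetNewPage / ConveyorBeltPerformInsert (for example, whether a
+critical section is required in between) is an assumption here, and the
+variable names are hypothetical.
+
+    /* assumes access/conveyor.h, storage/bufpage.h, storage/bufmgr.h */
+    static CBPageNo
+    example_insert_one_page(Relation rel, Item data, Size len)
+    {
+        ConveyorBelt *cb;
+        Buffer      buffer;
+        Page        page;
+        CBPageNo    pageno;
+
+        cb = ConveyorBeltOpen(rel, MAIN_FORKNUM, CurrentMemoryContext);
+
+        /* get a locked buffer for the next logical page */
+        buffer = ConveyorBeltGetNewPage(cb, &pageno);
+        page = BufferGetPage(buffer);
+        PageInit(page, BLCKSZ, 0);
+
+        /* payload pages must advance pd_lower past the page header */
+        if (PageAddItem(page, data, len, InvalidOffsetNumber,
+                        false, false) == InvalidOffsetNumber)
+            elog(ERROR, "could not add item to conveyor belt page");
+
+        /* dirty the buffer, bump the next-page counter, and XLOG it */
+        ConveyorBeltPerformInsert(cb, buffer);
+        ConveyorBeltCleanupInsert(cb, buffer);
+
+        return pageno;
+    }
+
+The three-step split mirrors the retry loop above: GetNewPage settles where
+the page will live (possibly relocating index entries or adding segments
+first), and only then does the caller fill the page and log the insert.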
+
+/*
+ * Low-level operations for index pages
+ *
+ * INDEX PAGES
+ * cb_index_page_initialize: initialize index page
+ * cb_index_page_get_entry: get N'th entry from page if in range
+ * cb_index_page_first_free_entry: get first free entry and
+ * # of free entries
+ * cb_index_page_append_entries: add N entries to page starting at position P
+ * (must be first free)
+ * cb_index_page_first_used_entry: get first used entry offset and value
+ * cb_index_page_clear_entry: clear N'th entry
+ * cb_index_page_get_next_segment: get next segment
+ * cb_index_page_set_next_segment: set next segment
+ * (All operations should cross-check expected starting logical page.)
+ *
+ * PAYLOAD PAGES
+ * cb_clear_payload_pages: clear pages
+ */
+
diff --git a/src/include/access/cbxlog.h b/src/include/access/cbxlog.h
new file mode 100644
index 0000000000..f307ffd3be
--- /dev/null
+++ b/src/include/access/cbxlog.h
@@ -0,0 +1,108 @@
+/*-------------------------------------------------------------------------
+ *
+ * cbxlog.h
+ * XLOG support for conveyor belts.
+ *
+ * See src/backend/access/conveyor/README for a general overview of
+ * conveyor belt storage.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/cbxlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CBXLOG_H
+#define CBXLOG_H
+
+#include "access/cbdefs.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+
+#define XLOG_CONVEYOR_INSERT_PAYLOAD_PAGE 0x10
+#define XLOG_CONVEYOR_ALLOCATE_PAYLOAD_SEGMENT 0x20
+#define XLOG_CONVEYOR_ALLOCATE_INDEX_SEGMENT 0x30
+#define XLOG_CONVEYOR_ALLOCATE_INDEX_PAGE 0x40
+#define XLOG_CONVEYOR_RELOCATE_INDEX_ENTRIES 0x50
+#define XLOG_CONVEYOR_LOGICAL_TRUNCATE 0x60
+#define XLOG_CONVEYOR_CLEAR_BLOCK 0x70
+#define XLOG_CONVEYOR_RECYCLE_PAYLOAD_SEGMENT 0x80
+#define XLOG_CONVEYOR_RECYCLE_INDEX_SEGMENT 0x90
+#define XLOG_CONVEYOR_SHIFT_METAPAGE_INDEX 0xA0
+
+typedef struct xl_cb_allocate_payload_segment
+{
+ CBSegNo segno;
+ bool is_extend;
+} xl_cb_allocate_payload_segment;
+
+#define SizeOfCBAllocatePayloadSegment \
+ (offsetof(xl_cb_allocate_payload_segment, is_extend) + sizeof(bool))
+
+typedef struct xl_cb_allocate_index_segment
+{
+ CBSegNo segno;
+ CBPageNo pageno;
+ bool is_extend;
+} xl_cb_allocate_index_segment;
+
+#define SizeOfCBAllocateIndexSegment \
+ (offsetof(xl_cb_allocate_index_segment, is_extend) + sizeof(bool))
+
+typedef struct xl_cb_allocate_index_page
+{
+ CBPageNo pageno;
+} xl_cb_allocate_index_page;
+
+#define SizeOfCBAllocateIndexPage \
+ (offsetof(xl_cb_allocate_index_page, pageno) + sizeof(CBPageNo))
+
+typedef struct xl_cb_relocate_index_entries
+{
+ unsigned pageoffset;
+ unsigned num_index_entries;
+ CBPageNo index_page_start;
+ CBSegNo index_entries[FLEXIBLE_ARRAY_MEMBER];
+} xl_cb_relocate_index_entries;
+
+#define SizeOfCBRelocateIndexEntries \
+ (offsetof(xl_cb_relocate_index_entries, index_entries))
+
+typedef struct xl_cb_logical_truncate
+{
+ CBPageNo oldest_keeper;
+} xl_cb_logical_truncate;
+
+#define SizeOfCBLogicalTruncate \
+ (offsetof(xl_cb_logical_truncate, oldest_keeper) + sizeof(CBPageNo))
+
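+/*
+ * Illustration only (not part of this patch): records of the types above
+ * are presumably built by the routines in cbmodify.c using the usual WAL
+ * pattern. A minimal sketch for a logical truncate, with hypothetical
+ * variable names and buffer registrations, might look like this:
+ *
+ *     xl_cb_logical_truncate xlrec;
+ *     XLogRecPtr lsn;
+ *
+ *     xlrec.oldest_keeper = oldest_keeper;
+ *
+ *     XLogBeginInsert();
+ *     XLogRegisterData((char *) &xlrec, SizeOfCBLogicalTruncate);
+ *     XLogRegisterBuffer(0, metabuffer, REGBUF_STANDARD);
+ *     lsn = XLogInsert(RM_CONVEYOR_ID, XLOG_CONVEYOR_LOGICAL_TRUNCATE);
+ *     PageSetLSN(BufferGetPage(metabuffer), lsn);
+ */
+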
+typedef struct xl_cb_recycle_payload_segment
+{
+ CBSegNo segno;
+ unsigned pageoffset;
+} xl_cb_recycle_payload_segment;
+
+#define SizeOfCBRecyclePayloadSegment \
+ (offsetof(xl_cb_recycle_payload_segment, pageoffset) + sizeof(unsigned))
+
+typedef struct xl_cb_recycle_index_segment
+{
+ CBSegNo segno;
+} xl_cb_recycle_index_segment;
+
+#define SizeOfCBRecycleIndexSegment \
+ (offsetof(xl_cb_recycle_index_segment, segno) + sizeof(CBSegNo))
+
+typedef struct xl_cb_shift_metapage_index
+{
+ unsigned num_entries;
+} xl_cb_shift_metapage_index;
+
+#define SizeOfCBShiftMetapageIndex \
+ (offsetof(xl_cb_shift_metapage_index, num_entries) + sizeof(unsigned))
+
+extern void conveyor_desc(StringInfo buf, XLogReaderState *record);
+extern void conveyor_redo(XLogReaderState *record);
+extern const char *conveyor_identify(uint8 info);
+
+#endif /* CBXLOG_H */
diff --git a/src/include/access/conveyor.h b/src/include/access/conveyor.h
new file mode 100644
index 0000000000..1b1ce5ba22
--- /dev/null
+++ b/src/include/access/conveyor.h
@@ -0,0 +1,59 @@
+/*-------------------------------------------------------------------------
+ *
+ * conveyor.h
+ * Public API for conveyor belt storage.
+ *
+ * See src/backend/access/conveyor/README for a general overview of
+ * conveyor belt storage.
+ *
+ * Copyright (c) 2016-2021, PostgreSQL Global Development Group
+ *
+ * src/include/access/conveyor.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CONVEYOR_H
+#define CONVEYOR_H
+
+#include "access/cbdefs.h"
+#include "common/relpath.h"
+#include "storage/bufmgr.h"
+#include "utils/relcache.h"
+
+struct ConveyorBelt;
+typedef struct ConveyorBelt ConveyorBelt;
+
+/* Routines to create a new conveyor belt, or open an existing one. */
+extern ConveyorBelt *ConveyorBeltInitialize(Relation rel,
+ ForkNumber fork,
+ uint16 pages_per_segment,
+ MemoryContext mcxt);
+extern ConveyorBelt *ConveyorBeltOpen(Relation rel,
+ ForkNumber fork,
+ MemoryContext mcxt);
+
+/* Routines for inserting new data into a conveyor belt. */
+extern Buffer ConveyorBeltGetNewPage(ConveyorBelt *cb, CBPageNo *pageno);
+extern void ConveyorBeltPerformInsert(ConveyorBelt *cb, Buffer buffer);
+extern void ConveyorBeltCleanupInsert(ConveyorBelt *cb, Buffer buffer);
+
+/* Routines for reading data from a conveyor belt. */
+extern Buffer ConveyorBeltReadBuffer(ConveyorBelt *cb, CBPageNo pageno,
+ int mode,
+ BufferAccessStrategy strategy);
+extern void ConveyorBeltGetBounds(ConveyorBelt *cb,
+ CBPageNo *oldest_logical_page,
+ CBPageNo *next_logical_page);
+
+/* Routines for removing old data from a conveyor belt.
*/ +extern void ConveyorBeltLogicalTruncate(ConveyorBelt *cb, + CBPageNo oldest_keeper); +extern void ConveyorBeltVacuum(ConveyorBelt *cb); +extern void ConveyorBeltPhysicalTruncate(ConveyorBelt *cb); +extern void ConveyorBeltCompact(ConveyorBelt *cb); +extern ConveyorBelt *ConveyorBeltRewrite(ConveyorBelt *cb, + Relation rel, + ForkNumber fork, + MemoryContext mcxt); + +#endif /* CONVEYOR_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index f582cf535f..f84976bce6 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) +PG_RMGR(RM_CONVEYOR_ID, "Conveyor", conveyor_redo, conveyor_desc, conveyor_identify, NULL, NULL, NULL) diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index f41ef0d2bc..4e492fb150 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3779,3 +3779,11 @@ yyscan_t z_stream z_streamp zic_t +cb_iseg_entry +CBCache +CBMetapageData +CBPageNo +CBSegNo +ConveyorBelt +xl_cb_allocate_index_segment +xl_cb_allocate_payload_segment -- 2.39.5