From bd8d9c9bdfa0c2168bb37edca6fa88168cacbbaa Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 9 Dec 2025 13:53:03 +0200 Subject: [PATCH] Widen MultiXactOffset to 64 bits This eliminates MultiXactOffset wraparound and the 2^32 limit on the total number of multixid members. Multixids are still limited to 2^31, but this is a nice improvement because 'members' can grow much faster than the number of multixids. On such systems, you can now run longer before hitting hard limits or triggering anti-wraparound vacuums. Not having to deal with MultiXactOffset wraparound also simplifies the code and removes some gnarly corner cases. We no longer need to perform emergency anti-wraparound freezing because of running out of 'members' space, so the offset stop limit is gone. But you might still not want 'members' to consume huge amounts of disk space. For that reason, I kept the logic for lowering vacuum's multixid freezing cutoff if a large amount of 'members' space is used. The thresholds for that are roughly the same as the "safe" and "danger" thresholds used before, 2 billion transactions and 4 billion transactions. This keeps the behavior for the freeze cutoff roughly the same as before. It might make sense to make this smarter or configurable, now that the threshold is only needed to manage disk usage, but that's left for the future. Add code to pg_upgrade to convert multitransactions from the old to the new format, rewriting the pg_multixact SLRU files. Because pg_upgrade now rewrites the files, we can get rid of some hacks we had put in place to deal with old bugs and upgraded clusters. Bump catalog version for the pg_multixact/offsets format change. 
Author: Maxim Orlov Reviewed-by: Ashutosh Bapat Reviewed-by: Alexander Korotkov Reviewed-by: wenhui qiu Discussion: https://www.postgresql.org/message-id/CACG%3DezaWg7_nt-8ey4aKv2w9LcuLthHknwCawmBgEeTnJrJTcw@mail.gmail.com --- doc/src/sgml/ref/pg_resetwal.sgml | 13 +- src/backend/access/rmgrdesc/mxactdesc.c | 4 +- src/backend/access/rmgrdesc/xlogdesc.c | 2 +- src/backend/access/transam/multixact.c | 554 ++++-------------- src/backend/access/transam/xlog.c | 6 +- src/backend/access/transam/xlogrecovery.c | 2 +- src/backend/commands/vacuum.c | 6 +- src/backend/postmaster/autovacuum.c | 4 +- src/bin/pg_controldata/pg_controldata.c | 2 +- src/bin/pg_resetwal/pg_resetwal.c | 38 +- src/bin/pg_resetwal/t/001_basic.pl | 2 +- src/bin/pg_upgrade/Makefile | 3 + src/bin/pg_upgrade/meson.build | 4 + src/bin/pg_upgrade/multixact_read_v18.c | 340 +++++++++++ src/bin/pg_upgrade/multixact_read_v18.h | 37 ++ src/bin/pg_upgrade/multixact_rewrite.c | 191 ++++++ src/bin/pg_upgrade/pg_upgrade.c | 84 ++- src/bin/pg_upgrade/pg_upgrade.h | 12 +- src/bin/pg_upgrade/slru_io.c | 268 +++++++++ src/bin/pg_upgrade/slru_io.h | 52 ++ .../pg_upgrade/t/007_multixact_conversion.pl | 427 ++++++++++++++ src/include/access/multixact.h | 7 +- src/include/access/multixact_internal.h | 28 +- src/include/c.h | 2 +- src/include/catalog/catversion.h | 2 +- .../test_slru/t/002_multixact_wraparound.pl | 2 +- .../perl/PostgreSQL/Test/BackgroundPsql.pm | 15 +- src/test/perl/PostgreSQL/Test/Cluster.pm | 21 +- src/tools/pgindent/typedefs.list | 3 + 29 files changed, 1609 insertions(+), 522 deletions(-) create mode 100644 src/bin/pg_upgrade/multixact_read_v18.c create mode 100644 src/bin/pg_upgrade/multixact_read_v18.h create mode 100644 src/bin/pg_upgrade/multixact_rewrite.c create mode 100644 src/bin/pg_upgrade/slru_io.c create mode 100644 src/bin/pg_upgrade/slru_io.h create mode 100644 src/bin/pg_upgrade/t/007_multixact_conversion.pl diff --git a/doc/src/sgml/ref/pg_resetwal.sgml 
b/doc/src/sgml/ref/pg_resetwal.sgml index 2c019c2aac6..41f2b1d480c 100644 --- a/doc/src/sgml/ref/pg_resetwal.sgml +++ b/doc/src/sgml/ref/pg_resetwal.sgml @@ -267,14 +267,17 @@ PostgreSQL documentation A safe value for the next multitransaction ID (first part) can be determined by looking for the numerically largest file name in the directory pg_multixact/offsets under the data directory, - adding one, and then multiplying by 65536 (0x10000). Conversely, a safe + adding one, and then multiplying by 32768 (0x8000). Conversely, a safe value for the oldest multitransaction ID (second part of -m) can be determined by looking for the numerically smallest - file name in the same directory and multiplying by 65536. The file - names are in hexadecimal, so the easiest way to do this is to specify - the option value in hexadecimal and append four zeroes. + file name in the same directory and multiplying by 32768 (0x8000). + Note that the file names are in hexadecimal. It is usually easiest + to specify the option value in hexadecimal too. For example, if + 000F and 0007 are the greatest and + smallest entries in pg_multixact/offsets, + -m 0x80000,0x38000 will work. 
- + diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 3ca0582db36..052dd0a4ce5 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -65,7 +65,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; - appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid, + appendStringInfo(buf, "%u offset %" PRIu64 " nmembers %d: ", xlrec->mid, xlrec->moff, xlrec->nmembers); for (i = 0; i < xlrec->nmembers; i++) out_member(buf, &xlrec->members[i]); @@ -74,7 +74,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) { xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; - appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)", + appendStringInfo(buf, "offsets [%u, %u), members [%" PRIu64 ", %" PRIu64 ")", xlrec->startTruncOff, xlrec->endTruncOff, xlrec->startTruncMemb, xlrec->endTruncMemb); } diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index cd6c2a2f650..441034f5929 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -66,7 +66,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%08X; " - "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " + "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 14d46fb761b..72a4e50852a 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -89,10 +89,14 @@ #include "utils/memutils.h" -/* Multixact members wraparound thresholds. 
*/ -#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) -#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ - (MaxMultiXactOffset - MaxMultiXactOffset / 4) +/* + * Thresholds used to keep members disk usage in check when multixids have a + * lot of members. When MULTIXACT_MEMBER_LOW_THRESHOLD is reached, vacuum + * starts freezing multixids more aggressively, even if the normal multixid + * age limits haven't been reached yet. + */ +#define MULTIXACT_MEMBER_LOW_THRESHOLD UINT64CONST(2000000000) +#define MULTIXACT_MEMBER_HIGH_THRESHOLD UINT64CONST(4000000000) static inline MultiXactId PreviousMultiXactId(MultiXactId multi) @@ -137,11 +141,9 @@ typedef struct MultiXactStateData /* * Oldest multixact offset that is potentially referenced by a multixact - * referenced by a relation. We don't always know this value, so there's - * a flag here to indicate whether or not we currently do. + * referenced by a relation. */ MultiXactOffset oldestOffset; - bool oldestOffsetKnown; /* support for anti-wraparound measures */ MultiXactId multiVacLimit; @@ -149,9 +151,6 @@ typedef struct MultiXactStateData MultiXactId multiStopLimit; MultiXactId multiWrapLimit; - /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. 
Each is indexed by @@ -272,13 +271,9 @@ static void mXactCachePut(MultiXactId multi, int nmembers, /* management of SLRU infrastructure */ static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2); static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2); -static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, - MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, - MultiXactOffset start, uint32 distance); -static bool SetOffsetVacuumLimit(bool is_startup); +static void SetOldestOffset(void); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, @@ -1073,90 +1068,22 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) ExtendMultiXactOffset(result + 1); /* - * Reserve the members space, similarly to above. Also, be careful not to - * return zero as the starting offset for any multixact. See - * GetMultiXactIdMembers() for motivation. + * Reserve the members space, similarly to above. */ nextOffset = MultiXactState->nextOffset; - if (nextOffset == 0) - { - *offset = 1; - nmembers++; /* allocate member slot 0 too */ - } - else - *offset = nextOffset; - - /*---------- - * Protect against overrun of the members space as well, with the - * following rules: - * - * If we're past offsetStopLimit, refuse to generate more multis. - * If we're close to offsetStopLimit, emit a warning. - * - * Arbitrarily, we start emitting warnings when we're 20 segments or less - * from offsetStopLimit. - * - * Note we haven't updated the shared state yet, so if we fail at this - * point, the multixact ID we grabbed can still be used by the next guy. 
- * - * Note that there is no point in forcing autovacuum runs here: the - * multixact freeze settings would have to be reduced for that to have any - * effect. - *---------- - */ -#define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, - nmembers)) - { - /* see comment in the corresponding offsets wraparound case */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("multixact \"members\" limit exceeded"), - errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", - "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", - MultiXactState->offsetStopLimit - nextOffset - 1, - nmembers, - MultiXactState->offsetStopLimit - nextOffset - 1), - errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.", - MultiXactState->oldestMultiXactDB))); - } /* - * Check whether we should kick autovacuum into action, to prevent members - * wraparound. NB we use a much larger window to trigger autovacuum than - * just the warning limit. The warning is just a measure of last resort - - * this is in line with GetNewTransactionId's behaviour. + * Offsets are 64-bit integers and will never wrap around. Firstly, it + * would take an unrealistic amount of time and resources to consume 2^64 + * offsets. Secondly, multixid creation is WAL-logged, so you would run + * out of LSNs before reaching offset wraparound. Nevertheless, check for + * wraparound as a sanity check. 
*/ - if (!MultiXactState->oldestOffsetKnown || - (MultiXactState->nextOffset - MultiXactState->oldestOffset - > MULTIXACT_MEMBER_SAFE_THRESHOLD)) - { - /* - * To avoid swamping the postmaster with signals, we issue the autovac - * request only when crossing a segment boundary. With default - * compilation settings that's roughly after 50k members. This still - * gives plenty of chances before we get into real trouble. - */ - if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != - (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - } - - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, - nextOffset, - nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) - ereport(WARNING, + if (nextOffset + nmembers < nextOffset) + ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", - "database with OID %u must be vacuumed before %d more multixact members are used", - MultiXactState->offsetStopLimit - nextOffset + nmembers, - MultiXactState->oldestMultiXactDB, - MultiXactState->offsetStopLimit - nextOffset + nmembers), - errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings."))); + errmsg("MultiXact members would wrap around"))); + *offset = nextOffset; ExtendMultiXactMember(nextOffset, nmembers); @@ -1177,8 +1104,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * the next iteration. But note that nextMXact may be InvalidMultiXactId * or the first value on a segment-beginning page after this routine * exits, so anyone else looking at the variable must be prepared to deal - * with either case. 
Similarly, nextOffset may be zero, but we won't use - * that as the actual start offset of the next multixact. + * with either case. */ (MultiXactState->nextMXact)++; @@ -1186,7 +1112,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); + debug_elog4(DEBUG2, "GetNew: returning %u offset %" PRIu64, + result, *offset); return result; } @@ -1228,7 +1155,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactOffset *offptr; MultiXactOffset offset; int length; - int truelength; MultiXactId oldestMXact; MultiXactId nextMXact; MultiXactMember *ptr; @@ -1304,16 +1230,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * Find out the offset at which we need to start reading MultiXactMembers * and the number of members in the multixact. We determine the latter as * the difference between this multixact's starting offset and the next - * one's. However, there is one corner case to worry about: - * - * Because GetNewMultiXactId skips over offset zero, to reserve zero for - * to mean "unset", there is an ambiguity near the point of offset - * wraparound. If we see next multixact's offset is one, is that our - * multixact's actual endpoint, or did it end at zero with a subsequent - * increment? We handle this using the knowledge that if the zero'th - * member slot wasn't filled, it'll contain zero, and zero isn't a valid - * transaction ID so it can't be a multixact member. Therefore, if we - * read a zero from the members array, just ignore it. + * one's. 
*/ pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -1380,10 +1297,11 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, LWLockRelease(lock); lock = NULL; + /* A multixid with zero members should not happen */ + Assert(length > 0); + /* read the members */ ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); - - truelength = 0; prev_pageno = -1; for (int i = 0; i < length; i++, offset++) { @@ -1420,37 +1338,27 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, xactptr = (TransactionId *) (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - - if (!TransactionIdIsValid(*xactptr)) - { - /* Corner case: we must be looking at unused slot zero */ - Assert(offset == 0); - continue; - } + Assert(TransactionIdIsValid(*xactptr)); flagsoff = MXOffsetToFlagsOffset(offset); bshift = MXOffsetToFlagsBitShift(offset); flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); - ptr[truelength].xid = *xactptr; - ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; - truelength++; + ptr[i].xid = *xactptr; + ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; } LWLockRelease(lock); - /* A multixid with zero members should not happen */ - Assert(truelength > 0); - /* * Copy the result into the local cache. 
*/ - mXactCachePut(multi, truelength, ptr); + mXactCachePut(multi, length, ptr); debug_elog3(DEBUG2, "GetMembers: no cache for %s", - mxid_to_string(multi, truelength, ptr)); + mxid_to_string(multi, length, ptr)); *members = ptr; - return truelength; + return length; } /* @@ -1857,7 +1765,7 @@ MultiXactShmemInit(void) "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER, LWTRANCHE_MULTIXACTMEMBER_SLRU, SYNC_HANDLER_MULTIXACT_MEMBER, - false); + true); /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ /* Initialize our shared state struct */ @@ -1912,48 +1820,6 @@ BootStrapMultiXact(void) SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0); } -/* - * MaybeExtendOffsetSlru - * Extend the offsets SLRU area, if necessary - * - * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might - * contain files that are shorter than necessary; this would occur if the old - * installation had used multixacts beyond the first page (files cannot be - * copied, because the on-disk representation is different). pg_upgrade would - * update pg_control to set the next offset value to be at that position, so - * that tuples marked as locked by such MultiXacts would be seen as visible - * without having to consult multixact. However, trying to create and use a - * new MultiXactId would result in an error because the page on which the new - * value would reside does not exist. This routine is in charge of creating - * such pages. - */ -static void -MaybeExtendOffsetSlru(void) -{ - int64 pageno; - LWLock *lock; - - pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) - { - int slotno; - - /* - * Fortunately for us, SimpleLruWritePage is already prepared to deal - * with creating a new segment file even if the page we're writing is - * not the first in it, so this is enough. 
- */ - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - } - - LWLockRelease(lock); -} - /* * This must be called ONCE during postmaster or standalone-backend startup. * @@ -2092,8 +1958,8 @@ TrimMultiXact(void) MultiXactState->finishedStartup = true; LWLockRelease(MultiXactGenLock); - /* Now compute how far away the next members wraparound is. */ - SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); + /* Now compute how far away the next multixid wraparound is. */ + SetMultiXactIdLimit(oldestMXact, oldestMXactDB); } /* @@ -2114,7 +1980,7 @@ MultiXactGetCheckptMulti(bool is_shutdown, LWLockRelease(MultiXactGenLock); debug_elog6(DEBUG2, - "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + "MultiXact: checkpoint is nextMulti %u, nextOffset %" PRIu64 ", oldestMulti %u in DB %u", *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); } @@ -2149,26 +2015,12 @@ void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset) { - debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %" PRIu64, nextMulti, nextMultiOffset); LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->nextMXact = nextMulti; MultiXactState->nextOffset = nextMultiOffset; LWLockRelease(MultiXactGenLock); - - /* - * During a binary upgrade, make sure that the offsets SLRU is large - * enough to contain the next value that would be created. - * - * We need to do this pretty early during the first startup in binary - * upgrade mode: before StartupMultiXact() in fact, because this routine - * is called even before that by StartupXLOG(). And we can't do it - * earlier than at this point, because during that first call of this - * routine we determine the MultiXactState->nextMXact value that - * MaybeExtendOffsetSlru needs. 
- */ - if (IsBinaryUpgrade) - MaybeExtendOffsetSlru(); } /* @@ -2176,28 +2028,24 @@ MultiXactSetNextMXact(MultiXactId nextMulti, * datminmxid (ie, the oldest MultiXactId that might exist in any database * of our cluster), and the OID of the (or a) database with that value. * - * is_startup is true when we are just starting the cluster, false when we - * are updating state in a running cluster. This only affects log messages. + * This also updates MultiXactState->oldestOffset, by looking up the offset of + * MultiXactState->oldestMultiXactId. */ void -SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, - bool is_startup) +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) { MultiXactId multiVacLimit; MultiXactId multiWarnLimit; MultiXactId multiStopLimit; MultiXactId multiWrapLimit; MultiXactId curMulti; - bool needs_offset_vacuum; Assert(MultiXactIdIsValid(oldest_datminmxid)); /* * We pretend that a wrap will happen halfway through the multixact ID * space, but that's not really true, because multixacts wrap differently - * from transaction IDs. Note that, separately from any concern about - * multixact IDs wrapping, we must ensure that multixact members do not - * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. + * from transaction IDs. */ multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); if (multiWrapLimit < FirstMultiXactId) @@ -2265,8 +2113,14 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, Assert(!InRecovery); - /* Set limits for offset vacuum. */ - needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); + /* + * Offsets are 64 bits wide and never wrap around, so we don't need to + * consider them for emergency autovacuum purposes. But now that we're in + * a consistent state, determine MultiXactState->oldestOffset. It will be + * used to adjust the freezing cutoff, to keep the members disk usage in + * check. 
+ */ + SetOldestOffset(); /* * If past the autovacuum force point, immediately signal an autovac @@ -2275,8 +2129,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, * database, it'll call here, and we'll signal the postmaster to start * another iteration immediately if there are still any old databases. */ - if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || - needs_offset_vacuum) && IsUnderPostmaster) + if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); /* Give an immediate warning if past the wrap warn point */ @@ -2338,9 +2191,9 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); MultiXactState->nextMXact = minMulti; } - if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) + if (MultiXactState->nextOffset < minMultiOffset) { - debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + debug_elog3(DEBUG2, "MultiXact: setting next offset to %" PRIu64, minMultiOffset); MultiXactState->nextOffset = minMultiOffset; } @@ -2359,7 +2212,7 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) Assert(InRecovery); if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) - SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); + SetMultiXactIdLimit(oldestMulti, oldestMultiDB); } /* @@ -2442,27 +2295,11 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockRelease(lock); } - /* - * Compute the number of items till end of current page. Careful: if - * addition of unsigned ints wraps around, we're at the last page of - * the last segment; since that page holds a different number of items - * than other pages, we need to do it differently. 
- */ - if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) - { - /* - * This is the last page of the last segment; we can compute the - * number of items left to allocate in it without modulo - * arithmetic. - */ - difference = MaxMultiXactOffset - offset + 1; - } - else - difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + /* Compute the number of items till end of current page. */ + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; /* - * Advance to next page, taking care to properly handle the wraparound - * case. OK if nmembers goes negative. + * Advance to next page. OK if nmembers goes negative. */ nmembers -= difference; offset += difference; @@ -2524,28 +2361,17 @@ GetOldestMultiXactId(void) } /* - * Determine how aggressively we need to vacuum in order to prevent member - * wraparound. - * - * To do so determine what's the oldest member offset and install the limit - * info in MultiXactState, where it can be used to prevent overrun of old data - * in the members SLRU area. - * - * The return value is true if emergency autovacuum is required and false - * otherwise. + * Calculate the oldest member offset and install it in MultiXactState, where + * it can be used to adjust multixid freezing cutoffs. 
*/ -static bool -SetOffsetVacuumLimit(bool is_startup) +static void +SetOldestOffset(void) { MultiXactId oldestMultiXactId; MultiXactId nextMXact; MultiXactOffset oldestOffset = 0; /* placate compiler */ - MultiXactOffset prevOldestOffset; MultiXactOffset nextOffset; bool oldestOffsetKnown = false; - bool prevOldestOffsetKnown; - MultiXactOffset offsetStopLimit = 0; - MultiXactOffset prevOffsetStopLimit; /* * NB: Have to prevent concurrent truncation, we might otherwise try to @@ -2558,9 +2384,6 @@ SetOffsetVacuumLimit(bool is_startup) oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; - prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; - prevOldestOffset = MultiXactState->oldestOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; Assert(MultiXactState->finishedStartup); LWLockRelease(MultiXactGenLock); @@ -2583,121 +2406,39 @@ SetOffsetVacuumLimit(bool is_startup) else { /* - * Figure out where the oldest existing multixact's offsets are - * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, - * the supposedly-earliest multixact might not really exist. We are - * careful not to fail in that case. + * Look up the offset at which the oldest existing multixact's members + * are stored. If we cannot find it, be careful not to fail, and + * leave oldestOffset unchanged. oldestOffset is initialized to zero + * at system startup, which prevents truncating members until a proper + * value is calculated. + * + * (We had bugs in early releases of PostgreSQL 9.3.X and 9.4.X where + * the supposedly-earliest multixact might not really exist. Those + * should be long gone by now, so this should not fail, but let's + * still be defensive.) 
*/ oldestOffsetKnown = find_multixact_start(oldestMultiXactId, &oldestOffset); if (oldestOffsetKnown) ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId member is at offset %u", + (errmsg_internal("oldest MultiXactId member is at offset %" PRIu64, oldestOffset))); else ereport(LOG, - (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", + (errmsg("MultiXact member truncation is disabled because oldest checkpointed MultiXact %u does not exist on disk", oldestMultiXactId))); } LWLockRelease(MultiXactTruncationLock); - /* - * If we can, compute limits (and install them MultiXactState) to prevent - * overrun of old data in the members SLRU area. We can only do so if the - * oldest offset is known though. - */ + /* Install the computed value */ if (oldestOffsetKnown) { - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - if (!prevOldestOffsetKnown && !is_startup) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - - ereport(DEBUG1, - (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", - offsetStopLimit, oldestMultiXactId))); - } - else if (prevOldestOffsetKnown) - { - /* - * If we failed to get the oldest offset this time, but we have a - * value from a previous pass through this function, use the old - * values rather than automatically forcing an emergency autovacuum - * cycle again. 
- */ - oldestOffset = prevOldestOffset; - oldestOffsetKnown = true; - offsetStopLimit = prevOffsetStopLimit; + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + LWLockRelease(MultiXactGenLock); } - - /* Install the computed values */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->oldestOffset = oldestOffset; - MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->offsetStopLimit = offsetStopLimit; - LWLockRelease(MultiXactGenLock); - - /* - * Do we need an emergency autovacuum? If we're not sure, assume yes. - */ - return !oldestOffsetKnown || - (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); -} - -/* - * Return whether adding "distance" to "start" would move past "boundary". - * - * We use this to determine whether the addition is "wrapping around" the - * boundary point, hence the name. The reason we don't want to use the regular - * 2^31-modulo arithmetic here is that we want to be able to use the whole of - * the 2^32-1 space here, allowing for more multixacts than would fit - * otherwise. - */ -static bool -MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, - uint32 distance) -{ - MultiXactOffset finish; - - /* - * Note that offset number 0 is not used (see GetMultiXactIdMembers), so - * if the addition wraps around the UINT_MAX boundary, skip that value. - */ - finish = start + distance; - if (finish < start) - finish++; - - /*----------------------------------------------------------------------- - * When the boundary is numerically greater than the starting point, any - * value numerically between the two is not wrapped: - * - * <----S----B----> - * [---) = F wrapped past B (and UINT_MAX) - * [---) = F not wrapped - * [----] = F wrapped past B - * - * When the boundary is numerically less than the starting point (i.e. 
the - * UINT_MAX wraparound occurs somewhere in between) then all values in - * between are wrapped: - * - * <----B----S----> - * [---) = F not wrapped past B (but wrapped past UINT_MAX) - * [---) = F wrapped past B (and UINT_MAX) - * [----] = F not wrapped - *----------------------------------------------------------------------- - */ - if (start < boundary) - return finish >= boundary || finish < start; - else - return finish >= boundary && finish < start; } /* @@ -2751,37 +2492,23 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) * members: Number of member entries (nextOffset - oldestOffset) * oldestMultiXactId: Oldest MultiXact ID still in use * oldestOffset: Oldest offset still in use - * - * Returns false if unable to determine, the oldest offset being unknown. */ -bool +void GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset) { MultiXactOffset nextOffset; MultiXactId nextMultiXactId; - bool oldestOffsetKnown; LWLockAcquire(MultiXactGenLock, LW_SHARED); nextOffset = MultiXactState->nextOffset; *oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMultiXactId = MultiXactState->nextMXact; *oldestOffset = MultiXactState->oldestOffset; - oldestOffsetKnown = MultiXactState->oldestOffsetKnown; LWLockRelease(MultiXactGenLock); - if (!oldestOffsetKnown) - { - *members = 0; - *multixacts = 0; - *oldestMultiXactId = InvalidMultiXactId; - *oldestOffset = 0; - return false; - } - *members = nextOffset - *oldestOffset; *multixacts = nextMultiXactId - *oldestMultiXactId; - return true; } /* @@ -2790,26 +2517,27 @@ GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, * vacuum_multixact_freeze_table_age work together to make sure we never have * too many multixacts; we hope that, at least under normal circumstances, * this will also be sufficient to keep us from using too many offsets. 
- * However, if the average multixact has many members, we might exhaust the - * members space while still using few enough members that these limits fail - * to trigger relminmxid advancement by VACUUM. At that point, we'd have no - * choice but to start failing multixact-creating operations with an error. - * - * To prevent that, if more than a threshold portion of the members space is - * used, we effectively reduce autovacuum_multixact_freeze_max_age and - * to a value just less than the number of multixacts in use. We hope that - * this will quickly trigger autovacuuming on the table or tables with the - * oldest relminmxid, thus allowing datminmxid values to advance and removing - * some members. - * - * As the fraction of the member space currently in use grows, we become - * more aggressive in clamping this value. That not only causes autovacuum - * to ramp up, but also makes any manual vacuums the user issues more - * aggressive. This happens because vacuum_get_cutoffs() will clamp the - * freeze table and the minimum freeze age cutoffs based on the effective - * autovacuum_multixact_freeze_max_age this function returns. In the worst - * case, we'll claim the freeze_max_age to zero, and every vacuum of any - * table will freeze every multixact. + * However, if the average multixact has many members, we might accumulate a + * large amount of members, consuming disk space, while still using few enough + * multixids that the multixid limits fail to trigger relminmxid advancement + * by VACUUM. + * + * To prevent that, if the members space usage exceeds a threshold + * (MULTIXACT_MEMBER_LOW_THRESHOLD), we effectively reduce + * autovacuum_multixact_freeze_max_age to a value just less than the number of + * multixacts in use. We hope that this will quickly trigger autovacuuming on + * the table or tables with the oldest relminmxid, thus allowing datminmxid + * values to advance and removing some members. 
+ * + * As the amount of the member space in use grows, we become more aggressive + * in clamping this value. That not only causes autovacuum to ramp up, but + * also makes any manual vacuums the user issues more aggressive. This + * happens because vacuum_get_cutoffs() will clamp the freeze table and the + * minimum freeze age cutoffs based on the effective + * autovacuum_multixact_freeze_max_age this function returns. At the extreme, + * when the members usage reaches MULTIXACT_MEMBER_HIGH_THRESHOLD, we clamp + * freeze_max_age to zero, and every vacuum of any table will freeze every + * multixact. */ int MultiXactMemberFreezeThreshold(void) @@ -2822,26 +2550,33 @@ MultiXactMemberFreezeThreshold(void) MultiXactId oldestMultiXactId; MultiXactOffset oldestOffset; - /* If we can't determine member space utilization, assume the worst. */ - if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset)) - return 0; + /* Read the current offsets and members usage. */ + GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset); /* If member space utilization is low, no special action is required. */ - if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) + if (members <= MULTIXACT_MEMBER_LOW_THRESHOLD) return autovacuum_multixact_freeze_max_age; /* * Compute a target for relminmxid advancement. The number of multixacts * we try to eliminate from the system is based on how far we are past - * MULTIXACT_MEMBER_SAFE_THRESHOLD. + * MULTIXACT_MEMBER_LOW_THRESHOLD. + * + * The way this formula works is that when members is exactly at the low + * threshold, fraction = 0.0, and we set freeze_max_age equal to + * mxid_age(oldestMultiXactId). As members grows further, towards the + * high threshold, fraction grows linearly from 0.0 to 1.0, and the result + * shrinks from mxid_age(oldestMultiXactId) to 0. Beyond the high + * threshold, fraction > 1.0 and the result is clamped to 0. 
*/ - fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / - (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); - victim_multixacts = multixacts * fraction; + fraction = (double) (members - MULTIXACT_MEMBER_LOW_THRESHOLD) / + (MULTIXACT_MEMBER_HIGH_THRESHOLD - MULTIXACT_MEMBER_LOW_THRESHOLD); /* fraction could be > 1.0, but lowest possible freeze age is zero */ - if (victim_multixacts > multixacts) + if (fraction >= 1.0) return 0; + + victim_multixacts = multixacts * fraction; result = multixacts - victim_multixacts; /* @@ -2877,36 +2612,12 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data /* * Delete members segments [oldest, newOldest) - * - * The members SLRU can, in contrast to the offsets one, be filled to almost - * the full range at once. This means SimpleLruTruncate() can't trivially be - * used - instead the to-be-deleted range is computed using the offsets - * SLRU. C.f. TruncateMultiXact(). */ static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) { - const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); - int64 startsegment = MXOffsetToMemberSegment(oldestOffset); - int64 endsegment = MXOffsetToMemberSegment(newOldestOffset); - int64 segment = startsegment; - - /* - * Delete all the segments but the last one. The last segment can still - * contain, possibly partially, valid data. 
- */ - while (segment != endsegment) - { - elog(DEBUG2, "truncating multixact members segment %" PRIx64, - segment); - SlruDeleteSegment(MultiXactMemberCtl, segment); - - /* move to next segment, handling wraparound correctly */ - if (segment == maxsegment) - segment = 0; - else - segment += 1; - } + SimpleLruTruncate(MultiXactMemberCtl, + MXOffsetToMemberPage(newOldestOffset)); } /* @@ -3050,7 +2761,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) elog(DEBUG1, "performing multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", oldestMulti, newOldestMulti, MultiXactIdToOffsetSegment(oldestMulti), MultiXactIdToOffsetSegment(newOldestMulti), @@ -3091,6 +2802,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->oldestMultiXactId = newOldestMulti; MultiXactState->oldestMultiXactDB = newOldestMultiDB; + MultiXactState->oldestOffset = newOldestOffset; LWLockRelease(MultiXactGenLock); /* First truncate members */ @@ -3130,20 +2842,13 @@ MultiXactOffsetPagePrecedes(int64 page1, int64 page2) /* * Decide whether a MultiXactMember page number is "older" for truncation - * purposes. There is no "invalid offset number" so use the numbers verbatim. + * purposes. There is no "invalid offset number" and members never wrap + * around, so use the numbers verbatim. 
*/ static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2) { - MultiXactOffset offset1; - MultiXactOffset offset2; - - offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; - offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; - - return (MultiXactOffsetPrecedes(offset1, offset2) && - MultiXactOffsetPrecedes(offset1, - offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); + return page1 < page2; } /* @@ -3175,17 +2880,6 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) } -/* - * Decide which of two offsets is earlier. - */ -static bool -MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) -{ - int32 diff = (int32) (offset1 - offset2); - - return (diff < 0); -} - /* * Write a TRUNCATE xlog record * @@ -3278,7 +2972,7 @@ multixact_redo(XLogReaderState *record) elog(DEBUG1, "replaying multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", xlrec.startTruncOff, xlrec.endTruncOff, MultiXactIdToOffsetSegment(xlrec.startTruncOff), MultiXactIdToOffsetSegment(xlrec.endTruncOff), @@ -3293,7 +2987,7 @@ multixact_redo(XLogReaderState *record) * Advance the horizon values, so they're current at the end of * recovery. 
*/ - SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB); PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 89cbda9cc7c..6ced1d57282 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5137,7 +5137,7 @@ BootStrapXLOG(uint32 data_checksum_version) FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; checkPoint.nextMulti = FirstMultiXactId; - checkPoint.nextMultiOffset = 0; + checkPoint.nextMultiOffset = 1; checkPoint.oldestXid = FirstNormalTransactionId; checkPoint.oldestXidDB = Template1DbOid; checkPoint.oldestMulti = FirstMultiXactId; @@ -5153,7 +5153,7 @@ BootStrapXLOG(uint32 data_checksum_version) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); /* Set up the XLOG page header */ @@ -5632,7 +5632,7 @@ StartupXLOG(void) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 21b8f179ba0..51dea342a4d 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ 
b/src/backend/access/transam/xlogrecovery.c @@ -886,7 +886,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, U64FromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, - (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64, checkPoint.nextMulti, checkPoint.nextMultiOffset))); ereport(DEBUG1, (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 29def1e94fa..0528d1b6ecb 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1147,8 +1147,8 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams params, /* * Also compute the multixact age for which freezing is urgent. This is - * normally autovacuum_multixact_freeze_max_age, but may be less if we are - * short of multixact member space. + * normally autovacuum_multixact_freeze_max_age, but may be less if + * multixact members are bloated. */ effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); @@ -1973,7 +1973,7 @@ vac_truncate_clog(TransactionId frozenXID, * signaling twice? */ SetTransactionIdLimit(frozenXID, oldestxid_datoid); - SetMultiXactIdLimit(minMulti, minmulti_datoid, false); + SetMultiXactIdLimit(minMulti, minmulti_datoid); LWLockRelease(WrapLimitsVacuumLock); } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 1c38488f2cb..f4830f896f3 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -1936,8 +1936,8 @@ do_autovacuum(void) /* * Compute the multixact age for which freezing is urgent. This is - * normally autovacuum_multixact_freeze_max_age, but may be less if we are - * short of multixact member space. + * normally autovacuum_multixact_freeze_max_age, but may be less if + * multixact members are bloated. 
*/ effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 30ad46912e1..a4060309ae0 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -271,7 +271,7 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile->checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), + printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"), ControlFile->checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile->checkPointCopy.oldestXid); diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 07c95f9ab80..56012d5f4c4 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -115,6 +115,7 @@ static void KillExistingWALSummaries(void); static void WriteEmptyXLOG(void); static void usage(void); static uint32 strtouint32_strict(const char *restrict s, char **restrict endptr, int base); +static uint64 strtouint64_strict(const char *restrict s, char **restrict endptr, int base); int @@ -293,7 +294,7 @@ main(int argc, char *argv[]) case 'O': errno = 0; - next_mxoff_val = strtouint32_strict(optarg, &endptr, 0); + next_mxoff_val = strtouint64_strict(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-O"); @@ -772,7 +773,7 @@ PrintControlValues(bool guessed) ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile.checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), + printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"), ControlFile.checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile.checkPointCopy.oldestXid); @@ 
-848,7 +849,7 @@ PrintNewControlValues(void) if (next_mxoff_given) { - printf(_("NextMultiOffset: %u\n"), + printf(_("NextMultiOffset: %" PRIu64 "\n"), ControlFile.checkPointCopy.nextMultiOffset); } @@ -1276,3 +1277,34 @@ strtouint32_strict(const char *restrict s, char **restrict endptr, int base) return (uint32) val; } + +/* + * strtouint64_strict -- like strtou64(), but doesn't accept negative values + */ +static uint64 +strtouint64_strict(const char *restrict s, char **restrict endptr, int base) +{ + uint64 val; + bool is_neg; + + /* skip leading whitespace */ + while (isspace((unsigned char) *s)) + s++; + + /* + * Is it negative? We still call strtou64() if it was, to set 'endptr'. + * (The current callers don't care though.) + */ + is_neg = (*s == '-'); + + val = strtou64(s, endptr, base); + + /* reject if it was negative */ + if (errno == 0 && is_neg) + { + errno = ERANGE; + val = 0; + } + + return val; +} diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl index 8717b144bc0..8bab9add74f 100644 --- a/src/bin/pg_resetwal/t/001_basic.pl +++ b/src/bin/pg_resetwal/t/001_basic.pl @@ -237,7 +237,7 @@ push @cmd, sprintf("%d,%d", hex($files[0]) == 0 ? 
3 : hex($files[0]), hex($files[-1])); @files = get_slru_files('pg_multixact/offsets'); -$mult = 32 * $blcksz / 4; +$mult = 32 * $blcksz / 8; # --multixact-ids argument is "new,old" push @cmd, '--multixact-ids' => sprintf("%d,%d", diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index 69fcf593cae..726df4b7525 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -18,11 +18,14 @@ OBJS = \ file.o \ function.o \ info.o \ + multixact_read_v18.o \ + multixact_rewrite.o \ option.o \ parallel.o \ pg_upgrade.o \ relfilenumber.o \ server.o \ + slru_io.o \ tablespace.o \ task.o \ util.o \ diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index ac992f0d14b..41f1126206b 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -8,11 +8,14 @@ pg_upgrade_sources = files( 'file.c', 'function.c', 'info.c', + 'multixact_read_v18.c', + 'multixact_rewrite.c', 'option.c', 'parallel.c', 'pg_upgrade.c', 'relfilenumber.c', 'server.c', + 'slru_io.c', 'tablespace.c', 'task.c', 'util.c', @@ -47,6 +50,7 @@ tests += { 't/004_subscription.pl', 't/005_char_signedness.pl', 't/006_transfer_modes.pl', + 't/007_multixact_conversion.pl', ], 'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow }, diff --git a/src/bin/pg_upgrade/multixact_read_v18.c b/src/bin/pg_upgrade/multixact_read_v18.c new file mode 100644 index 00000000000..e7496a73e0e --- /dev/null +++ b/src/bin/pg_upgrade/multixact_read_v18.c @@ -0,0 +1,340 @@ +/* + * multixact_read_v18.c + * + * Functions to read multixact SLRUs from clusters of PostgreSQL version 18 + * and older. In version 19, the multixid offsets were expanded from 32 to 64 + * bits. 
+ * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_read_v18.c + */ + +#include "postgres_fe.h" + +#include "multixact_read_v18.h" +#include "pg_upgrade.h" + +/* + * NOTE: below are a bunch of definitions that are copy-pasted from + * multixact.c from version 18. It's important that this file doesn't + * #include the new definitions with same names from "multixact_internal.h"! + * + * To further avoid confusion in the functions exposed outside this source + * file, we use MultiXactOffset32 to represent the old-style 32-bit multixid + * offsets. The new 64-bit MultiXactOffset should not be used anywhere in + * this file. + */ +#ifdef MULTIXACT_INTERNAL_H +#error multixact_internal.h should not be included in multixact_read_v18.c +#endif +#define MultiXactOffset should_not_be_used + +/* We need four bytes per offset. */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset32)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ + return multi % MULTIXACT_OFFSETS_PER_PAGE; +} + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. 
+ */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(MultiXactOffset32 offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(MultiXactOffset32 offset) +{ + MultiXactOffset32 group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(MultiXactOffset32 offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +static inline int +MXOffsetToFlagsBitShift(MultiXactOffset32 offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; +} + +/* + * Construct reader of old multixacts. 
+ * + * Returns the malloced memory used by all the other calls in this module. + */ +OldMultiXactReader * +AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti, + MultiXactOffset32 nextOffset) +{ + OldMultiXactReader *state = state = pg_malloc(sizeof(*state)); + char dir[MAXPGPATH] = {0}; + + state->nextMXact = nextMulti; + state->nextOffset = nextOffset; + + pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata); + state->offset = AllocSlruRead(dir, false); + + pg_sprintf(dir, "%s/pg_multixact/members", pgdata); + state->members = AllocSlruRead(dir, false); + + return state; +} + +/* + * This is a simplified version of the GetMultiXactIdMembers() server + * function: + * + * - Only return the updating member, if any. Upgrade only cares about the + * updaters. If there is no updating member, return somewhat arbitrarily + * the first locking-only member, because we don't have any way to represent + * "no members". + * + * - Because there's no concurrent activity, we don't need to worry about + * locking and some corner cases. + * + * - Don't bail out on invalid entries. If the server crashes, it can leave + * invalid or half-written entries on disk. Such multixids won't appear + * anywhere else on disk, so the server will never try to read them. During + * upgrade, however, we scan through all multixids in order, and will + * encounter such invalid but unreferenced multixids too. + * + * Returns true on success, false if the multixact was invalid. 
+ */ +bool +GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi, + MultiXactMember *member) +{ + MultiXactId nextMXact, + nextOffset, + tmpMXact; + int64 pageno, + prev_pageno; + int entryno, + length; + char *buf; + MultiXactOffset32 *offptr, + offset; + MultiXactOffset32 nextMXOffset; + TransactionId result_xid = InvalidTransactionId; + MultiXactStatus result_status = 0; + + nextMXact = state->nextMXact; + nextOffset = state->nextOffset; + + /* + * Comment copied from GetMultiXactIdMembers in PostgreSQL v18 + * multixact.c: + * + * Find out the offset at which we need to start reading MultiXactMembers + * and the number of members in the multixact. We determine the latter as + * the difference between this multixact's starting offset and the next + * one's. However, there are some corner cases to worry about: + * + * 1. This multixact may be the latest one created, in which case there is + * no next one to look at. The next multixact's offset should be set + * already, as we set it in RecordNewMultiXact(), but we used to not do + * that in older minor versions. To cope with that case, if this + * multixact is the latest one created, use the nextOffset value we read + * above as the endpoint. + * + * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero + * for to mean "unset", there is an ambiguity near the point of offset + * wraparound. If we see next multixact's offset is one, is that our + * multixact's actual endpoint, or did it end at zero with a subsequent + * increment? We handle this using the knowledge that if the zero'th + * member slot wasn't filled, it'll contain zero, and zero isn't a valid + * transaction ID so it can't be a multixact member. Therefore, if we + * read a zero from the members array, just ignore it. 
+ */ + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + buf = SlruReadSwitchPage(state->offset, pageno); + offptr = (MultiXactOffset32 *) buf; + offptr += entryno; + offset = *offptr; + + if (offset == 0) + { + /* Invalid entry */ + return false; + } + + /* + * Use the same increment rule as GetNewMultiXactId(), that is, don't + * handle wraparound explicitly until needed. + */ + tmpMXact = multi + 1; + + if (nextMXact == tmpMXact) + { + /* Corner case 1: there is no next multixact */ + nextMXOffset = nextOffset; + } + else + { + /* handle wraparound if needed */ + if (tmpMXact < FirstMultiXactId) + tmpMXact = FirstMultiXactId; + + prev_pageno = pageno; + + pageno = MultiXactIdToOffsetPage(tmpMXact); + entryno = MultiXactIdToOffsetEntry(tmpMXact); + + if (pageno != prev_pageno) + buf = SlruReadSwitchPage(state->offset, pageno); + + offptr = (MultiXactOffset32 *) buf; + offptr += entryno; + nextMXOffset = *offptr; + } + + if (nextMXOffset == 0) + { + /* Invalid entry */ + return false; + } + length = nextMXOffset - offset; + + /* read the members */ + prev_pageno = -1; + for (int i = 0; i < length; i++, offset++) + { + TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; + MultiXactStatus status; + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + + if (pageno != prev_pageno) + { + buf = SlruReadSwitchPage(state->members, pageno); + prev_pageno = pageno; + } + + xactptr = (TransactionId *) (buf + memberoff); + if (!TransactionIdIsValid(*xactptr)) + { + /* + * Corner case 2: we are looking at unused slot zero + */ + if (offset == 0) + continue; + + /* + * Otherwise this is an invalid entry that should not be + * referenced from anywhere in the heap. We could return 'false' + * here, but we prefer to continue reading the members and + * converting them the best we can, to preserve evidence in case + * this is corruption that should not happen. 
+ */ + } + + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (buf + flagsoff); + + status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + + /* + * Remember the updating XID among the members, or first locking XID + * if no updating XID. + */ + if (ISUPDATE_from_mxstatus(status)) + { + /* sanity check */ + if (ISUPDATE_from_mxstatus(result_status)) + { + /* + * We don't expect to see more than one updating member, even + * if the server had crashed. + */ + pg_fatal("multixact %u has more than one updating member", + multi); + } + result_xid = *xactptr; + result_status = status; + } + else if (!TransactionIdIsValid(result_xid)) + { + result_xid = *xactptr; + result_status = status; + } + } + + member->xid = result_xid; + member->status = result_status; + return true; +} + +/* + * Frees the malloced reader. + */ +void +FreeOldMultiXactReader(OldMultiXactReader *state) +{ + FreeSlruRead(state->offset); + FreeSlruRead(state->members); + + pfree(state); +} diff --git a/src/bin/pg_upgrade/multixact_read_v18.h b/src/bin/pg_upgrade/multixact_read_v18.h new file mode 100644 index 00000000000..6ef485b53e1 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_read_v18.h @@ -0,0 +1,37 @@ +/* + * multixact_read_v18.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_read_v18.h + */ +#ifndef MULTIXACT_READ_V18_H +#define MULTIXACT_READ_V18_H + +#include "access/multixact.h" +#include "slru_io.h" + +/* + * MultiXactOffset changed from uint32 to uint64 between versions 18 and 19. + * MultiXactOffset32 is used to represent a 32-bit offset from the old + * cluster. 
+ */ +typedef uint32 MultiXactOffset32; + +typedef struct OldMultiXactReader +{ + MultiXactId nextMXact; + MultiXactOffset32 nextOffset; + + SlruSegState *offset; + SlruSegState *members; +} OldMultiXactReader; + +extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata, + MultiXactId nextMulti, + MultiXactOffset32 nextOffset); +extern bool GetOldMultiXactIdSingleMember(OldMultiXactReader *state, + MultiXactId multi, + MultiXactMember *member); +extern void FreeOldMultiXactReader(OldMultiXactReader *reader); + +#endif /* MULTIXACT_READ_V18_H */ diff --git a/src/bin/pg_upgrade/multixact_rewrite.c b/src/bin/pg_upgrade/multixact_rewrite.c new file mode 100644 index 00000000000..4e56922d83f --- /dev/null +++ b/src/bin/pg_upgrade/multixact_rewrite.c @@ -0,0 +1,191 @@ +/* + * multixact_rewrite.c + * + * Functions to convert multixact SLRUs from the pre-v19 format to the current + * format with 64-bit MultiXactOffsets. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_rewrite.c + */ + +#include "postgres_fe.h" + +#include "access/multixact_internal.h" +#include "multixact_read_v18.h" +#include "pg_upgrade.h" + +static void RecordMultiXactOffset(SlruSegState *offsets_writer, MultiXactId multi, + MultiXactOffset offset); +static void RecordMultiXactMembers(SlruSegState *members_writer, + MultiXactOffset offset, + int nmembers, MultiXactMember *members); + +/* + * Convert pg_multixact/offsets and /members from the old pre-v19 format with + * 32-bit offsets to the current format. + * + * Multixids in the range [from_multi, to_multi) are read from the old + * cluster, and written in the new format. An important edge case is that if + * from_multi == to_multi, this initializes the new pg_multixact files in the + * new format without trying to open any old files. (We rely on that when + * upgrading from PostgreSQL version 9.2 or below.) 
+ * + * Returns the new nextOffset value; the caller should set it in the new + * control file. The new members always start from offset 1, regardless of + * the offset range used in the old cluster. + */ +MultiXactOffset +rewrite_multixacts(MultiXactId from_multi, MultiXactId to_multi) +{ + MultiXactOffset next_offset; + SlruSegState *offsets_writer; + SlruSegState *members_writer; + char dir[MAXPGPATH] = {0}; + bool prev_multixid_valid = false; + + /* + * The range of valid multi XIDs is unchanged by the conversion (they are + * referenced from the heap tables), but the members SLRU is rewritten to + * start from offset 1. + */ + next_offset = 1; + + /* Prepare to write the new SLRU files */ + pg_sprintf(dir, "%s/pg_multixact/offsets", new_cluster.pgdata); + offsets_writer = AllocSlruWrite(dir, false); + SlruWriteSwitchPage(offsets_writer, MultiXactIdToOffsetPage(from_multi)); + + pg_sprintf(dir, "%s/pg_multixact/members", new_cluster.pgdata); + members_writer = AllocSlruWrite(dir, true /* use long segment names */ ); + SlruWriteSwitchPage(members_writer, MXOffsetToMemberPage(next_offset)); + + /* + * Convert old multixids, if needed, by reading them one-by-one from the + * old cluster. + */ + if (to_multi != from_multi) + { + OldMultiXactReader *old_reader; + + old_reader = AllocOldMultiXactRead(old_cluster.pgdata, + old_cluster.controldata.chkpnt_nxtmulti, + old_cluster.controldata.chkpnt_nxtmxoff); + + for (MultiXactId multi = from_multi; multi != to_multi;) + { + MultiXactMember member; + bool multixid_valid; + + /* + * Read this multixid's members. + * + * Locking-only XIDs that may be part of multi-xids don't matter + * after upgrade, as there can be no transactions running across + * upgrade. So as a small optimization, we only read one member + * from each multixid: the one updating one, or if there was no + * update, arbitrarily the first locking xid. 
+ */ + multixid_valid = GetOldMultiXactIdSingleMember(old_reader, multi, &member); + + /* + * Write the new offset to pg_multixact/offsets. + * + * Even if this multixid is invalid, we still need to write its + * offset if the *previous* multixid was valid. That's because + * when reading a multixid, the number of members is calculated + * from the difference between the two offsets. + */ + RecordMultiXactOffset(offsets_writer, multi, + (multixid_valid || prev_multixid_valid) ? next_offset : 0); + + /* Write the members */ + if (multixid_valid) + { + RecordMultiXactMembers(members_writer, next_offset, 1, &member); + next_offset += 1; + } + + /* Advance to next multixid, handling wraparound */ + multi++; + if (multi < FirstMultiXactId) + multi = FirstMultiXactId; + prev_multixid_valid = multixid_valid; + } + + FreeOldMultiXactReader(old_reader); + } + + /* Write the final 'next' offset to the last SLRU page */ + RecordMultiXactOffset(offsets_writer, to_multi, + prev_multixid_valid ? next_offset : 0); + + /* Flush the last SLRU pages */ + FreeSlruWrite(offsets_writer); + FreeSlruWrite(members_writer); + + return next_offset; +} + + +/* + * Write one offset to the offset SLRU + */ +static void +RecordMultiXactOffset(SlruSegState *offsets_writer, MultiXactId multi, + MultiXactOffset offset) +{ + int64 pageno; + int entryno; + char *buf; + MultiXactOffset *offptr; + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + buf = SlruWriteSwitchPage(offsets_writer, pageno); + offptr = (MultiXactOffset *) buf; + offptr[entryno] = offset; +} + +/* + * Write the members for one multixid in the members SLRU + * + * (Currently, this is only ever called with nmembers == 1) + */ +static void +RecordMultiXactMembers(SlruSegState *members_writer, + MultiXactOffset offset, + int nmembers, MultiXactMember *members) +{ + for (int i = 0; i < nmembers; i++, offset++) + { + int64 pageno; + char *buf; + TransactionId *memberptr; + uint32 *flagsptr; + 
uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + + buf = SlruWriteSwitchPage(members_writer, pageno); + + memberptr = (TransactionId *) (buf + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) (buf + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; + } +} diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 490e98fa26f..47119222655 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -43,6 +43,7 @@ #include +#include "access/multixact.h" #include "catalog/pg_class_d.h" #include "common/file_perm.h" #include "common/logging.h" @@ -807,15 +808,15 @@ copy_xact_xlog_xid(void) new_cluster.pgdata); check_ok(); - /* - * If the old server is before the MULTIXACT_FORMATCHANGE_CAT_VER change - * (see pg_upgrade.h) and the new server is after, then we don't copy - * pg_multixact files, but we need to reset pg_control so that the new - * server doesn't attempt to read multis older than the cutoff value. 
- */ - if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && - new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + /* Copy or convert pg_multixact files */ + Assert(new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER); + Assert(new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER); + if (old_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER) { + /* No change in multixact format, just copy the files */ + MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti; + MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff; + copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); copy_subdir_files("pg_multixact/members", "pg_multixact/members"); @@ -826,38 +827,67 @@ copy_xact_xlog_xid(void) * counters here and the oldest multi present on system. */ exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", - new_cluster.bindir, - old_cluster.controldata.chkpnt_nxtmxoff, - old_cluster.controldata.chkpnt_nxtmulti, + "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"", + new_cluster.bindir, new_nxtmxoff, new_nxtmulti, old_cluster.controldata.chkpnt_oldstMulti, new_cluster.pgdata); check_ok(); } - else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + else { + /* Conversion is needed */ + MultiXactId nxtmulti; + MultiXactId oldstMulti; + MultiXactOffset nxtmxoff; + /* - * Remove offsets/0000 file created by initdb that no longer matches - * the new multi-xid value. "members" starts at zero so no need to - * remove it. + * Determine the range of multixacts to convert. 
*/ - remove_new_subdir("pg_multixact/offsets", false); + nxtmulti = old_cluster.controldata.chkpnt_nxtmulti; + if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + { + /* Versions 9.3 - 18: convert all multixids */ + oldstMulti = old_cluster.controldata.chkpnt_oldstMulti; + } + else + { + /* + * In PostgreSQL 9.2 and below, multitransactions were only used + * for row locking, and as such don't need to be preserved during + * upgrade. In that case, we utilize rewrite_multixacts() just to + * initialize new, empty files in the new format. + * + * It's important that the oldest multi is set to the latest value + * used by the old system, so that multixact.c returns the empty + * set for multis that might be present on disk. + */ + oldstMulti = nxtmulti; + } + /* handle wraparound */ + if (nxtmulti < FirstMultiXactId) + nxtmulti = FirstMultiXactId; + if (oldstMulti < FirstMultiXactId) + oldstMulti = FirstMultiXactId; - prep_status("Setting oldest multixact ID in new cluster"); + /* + * Remove the files created by initdb in the new cluster. + * rewrite_multixacts() will create new ones. + */ + remove_new_subdir("pg_multixact/members", false); + remove_new_subdir("pg_multixact/offsets", false); /* - * We don't preserve files in this case, but it's important that the - * oldest multi is set to the latest value used by the old system, so - * that multixact.c returns the empty set for multis that might be - * present on disk. We set next multi to the value following that; it - * might end up wrapped around (i.e. 0) if the old cluster had - * next=MaxMultiXactId, but multixact.c can cope with that just fine. + * Create new pg_multixact files, converting old ones if needed. 
*/ + prep_status("Converting pg_multixact files"); + nxtmxoff = rewrite_multixacts(oldstMulti, nxtmulti); + check_ok(); + + prep_status("Setting next multixact ID and offset for new cluster"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/pg_resetwal\" -m %u,%u \"%s\"", + "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"", new_cluster.bindir, - old_cluster.controldata.chkpnt_nxtmulti + 1, - old_cluster.controldata.chkpnt_nxtmulti, + nxtmxoff, nxtmulti, oldstMulti, new_cluster.pgdata); check_ok(); } diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index e86336f4be9..be30dceed5c 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -114,6 +114,13 @@ extern char *output_files[]; */ #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 +/* + * MultiXactOffset was changed from 32-bit to 64-bit in version 19, at this + * catalog version. pg_multixact files need to be converted when upgrading + * across this version. + */ +#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 202512091 + /* * large object chunk size added to pg_controldata, * commit 5f93c37805e7485488480916b4585e098d3cc883 @@ -235,7 +242,7 @@ typedef struct uint32 chkpnt_nxtepoch; uint32 chkpnt_nxtoid; uint32 chkpnt_nxtmulti; - uint32 chkpnt_nxtmxoff; + uint64 chkpnt_nxtmxoff; uint32 chkpnt_oldstMulti; uint32 chkpnt_oldstxid; uint32 align; @@ -499,6 +506,9 @@ void old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, void report_extension_updates(ClusterInfo *cluster); +/* multixact_rewrite.c */ +MultiXactOffset rewrite_multixacts(MultiXactId from_multi, MultiXactId to_multi); + /* parallel.c */ void parallel_exec_prog(const char *log_file, const char *opt_log_file, const char *fmt,...) 
pg_attribute_printf(3, 4); diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c new file mode 100644 index 00000000000..812a241fe62 --- /dev/null +++ b/src/bin/pg_upgrade/slru_io.c @@ -0,0 +1,268 @@ +/* + * slru_io.c + * + * Routines for reading and writing SLRU files during upgrade. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/slru_io.c + */ + +#include "postgres_fe.h" + +#include + +#include "common/fe_memutils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "pg_upgrade.h" +#include "port/pg_iovec.h" +#include "slru_io.h" + +static SlruSegState *AllocSlruSegState(const char *dir); +static char *SlruFileName(SlruSegState *state, int64 segno); +static void SlruFlush(SlruSegState *state); + +/* common parts of AllocSlruRead and AllocSlruWrite */ +static SlruSegState * +AllocSlruSegState(const char *dir) +{ + SlruSegState *state = pg_malloc(sizeof(*state)); + + state->dir = pstrdup(dir); + state->fn = NULL; + state->fd = -1; + state->segno = -1; + state->pageno = 0; + + /* state->writing and state->long_segment_names must be set by caller! */ + + return state; +} + +/* similar to the backend function with the same name */ +static char * +SlruFileName(SlruSegState *state, int64 segno) +{ + if (state->long_segment_names) + { + Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF)); + return psprintf("%s/%015" PRIX64, state->dir, segno); + } + else + { + Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF)); + return psprintf("%s/%04X", state->dir, (unsigned int) segno); + } +} + +/* + * Create SLRU reader for dir. + */ +SlruSegState * +AllocSlruRead(const char *dir, bool long_segment_names) +{ + SlruSegState *state = AllocSlruSegState(dir); + + state->writing = false; + state->long_segment_names = long_segment_names; + + return state; +} + +/* + * Read the given page into memory buffer. + * + * Reading can be done in random order. 
+ *
+ * If the file containing 'pageno' does not exist, a fatal error is raised.
+ * If the file exists but is shorter than expected, the missing part is read
+ * as zeros and a warning is logged.  That is reasonable behavior for current
+ * callers.
+ *
+ * Returns a pointer to the state's one-page buffer.  The contents are
+ * overwritten by the next call that switches to a different page.
+ *
+ * This is the slow path of the inlineable SlruReadSwitchPage() function.
+ */
+char *
+SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno)
+{
+    int64       segno;
+    off_t       offset;
+    ssize_t     bytes_read;
+
+    Assert(!state->writing);    /* read only mode */
+
+    /* Same check as the inline fast path: the page is already loaded */
+    if (state->segno != -1 && pageno == state->pageno)
+        return state->buf.data;
+
+    /* If the new page is on a different SLRU segment, open the new segment */
+    segno = pageno / SLRU_PAGES_PER_SEGMENT;
+    if (segno != state->segno)
+    {
+        if (state->segno != -1)
+        {
+            close(state->fd);
+            state->fd = -1;
+
+            pg_free(state->fn);
+            state->fn = NULL;
+
+            state->segno = -1;
+        }
+
+        state->fn = SlruFileName(state, segno);
+        if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
+            pg_fatal("could not open file \"%s\": %m", state->fn);
+        state->segno = segno;
+    }
+
+    offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+    bytes_read = 0;
+    while (bytes_read < BLCKSZ)
+    {
+        ssize_t     rc;
+
+        /*
+         * Address the destination as a byte offset into the page buffer, so
+         * that a retry after a short read continues right after the bytes
+         * already received.
+         */
+        rc = pg_pread(state->fd,
+                      state->buf.data + bytes_read,
+                      BLCKSZ - bytes_read,
+                      offset + bytes_read);
+        if (rc < 0)
+        {
+            /* interrupted before any data was transferred: just retry */
+            if (errno == EINTR)
+                continue;
+            pg_fatal("could not read file \"%s\": %m", state->fn);
+        }
+        if (rc == 0)
+        {
+            /*
+             * Unexpected EOF: keep what was read so far and zero-fill only
+             * the remainder of the page.  The offset is an off_t, so cast it
+             * explicitly to match the format specifier.
+             */
+            pg_log(PG_WARNING, "unexpected EOF reading file \"%s\" at offset %lld, reading as zeros", state->fn,
+                   (long long) (offset + bytes_read));
+            memset(state->buf.data + bytes_read, 0, BLCKSZ - bytes_read);
+            break;
+        }
+        bytes_read += rc;
+    }
+    state->pageno = pageno;
+
+    return state->buf.data;
+}
+
+/*
+ * Free the reader.
+ *
+ * Closes the currently open segment file, if any, and releases the state
+ * together with the strings it owns: the directory name copied by
+ * AllocSlruSegState() and the name of the current segment file, if set.
+ */
+void
+FreeSlruRead(SlruSegState *state)
+{
+    Assert(!state->writing);    /* read only mode */
+
+    if (state->fd != -1)
+        close(state->fd);
+    if (state->fn != NULL)
+        pg_free(state->fn);
+    pg_free(state->dir);
+    pg_free(state);
+}
+
+/*
+ * Create SLRU writer for dir.
+ *
+ * No file is opened here; segment files are created on demand when a page
+ * is first switched to (see SlruWriteSwitchPageSlow()).
+ */
+SlruSegState *
+AllocSlruWrite(const char *dir, bool long_segment_names)
+{
+    SlruSegState *state = AllocSlruSegState(dir);
+
+    state->writing = true;
+    state->long_segment_names = long_segment_names;
+
+    return state;
+}
+
+/*
+ * Open the given page for writing.
+ *
+ * Any page currently held in the buffer is written out first, and the
+ * buffer is then zeroed, so the caller always starts from an all-zeros page.
+ * Returns a pointer to that one-page buffer; it is written to disk by the
+ * next page switch or by FreeSlruWrite().
+ *
+ * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that
+ * each segment is written in full before moving on to the next one.  This
+ * limitation would be easy to lift if needed, but it fits the usage pattern
+ * of current callers.
+ *
+ * This is the slow path of the inlineable SlruWriteSwitchPage() function.
+ */
+char *
+SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno)
+{
+    int64       segno;
+    off_t       offset;
+
+    Assert(state->writing);
+
+    /* Same check as the inline fast path: already positioned on this page */
+    if (state->segno != -1 && pageno == state->pageno)
+        return state->buf.data;
+
+    segno = pageno / SLRU_PAGES_PER_SEGMENT;
+    offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+    /*
+     * Write out the previous page (a no-op if no segment is open yet) before
+     * the buffer is reused for the new page.
+     */
+    SlruFlush(state);
+    memset(state->buf.data, 0, BLCKSZ);
+
+    if (segno != state->segno)
+    {
+        if (state->segno != -1)
+        {
+            close(state->fd);
+            state->fd = -1;
+
+            pg_free(state->fn);
+            state->fn = NULL;
+
+            state->segno = -1;
+        }
+
+        /* Create the segment */
+        state->fn = SlruFileName(state, segno);
+        if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+                              pg_file_create_mode)) < 0)
+        {
+            pg_fatal("could not create file \"%s\": %m", state->fn);
+        }
+
+        state->segno = segno;
+
+        /*
+         * If the first page written to this segment is not its first page,
+         * fill the part of the file that precedes it.  (Presumably
+         * pg_pwrite_zeros() writes 'offset' zero bytes starting at file
+         * position 0 -- confirm against its declaration.)
+         */
+        if (offset > 0)
+        {
+            if (pg_pwrite_zeros(state->fd, offset, 0) < 0)
+                pg_fatal("could not write file \"%s\": %m", state->fn);
+        }
+    }
+
+    state->pageno = pageno;
+
+    return state->buf.data;
+}
+
+/*
+ * Write the buffered page to its position in the currently open segment.
+ *
+ * Does nothing if no segment has been opened yet.  A write failure is fatal.
+ */
+static void
+SlruFlush(SlruSegState *state)
+{
+    struct iovec iovec = {
+        .iov_base = &state->buf,
+        .iov_len = BLCKSZ,
+    };
+    off_t       offset;
+
+    if (state->segno == -1)
+        return;
+
+    /* byte position of the current page within its segment file */
+    offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+    if (pg_pwritev_with_retry(state->fd, &iovec, 1, offset) < 0)
+        pg_fatal("could not write file \"%s\": %m", state->fn);
+}
+
+/*
+ * Free the writer.
+ *
+ * Writes out the page still held in the buffer (SlruFlush() does nothing if
+ * no segment was ever opened), closes the segment file, and releases the
+ * state together with the strings it owns: the directory name copied by
+ * AllocSlruSegState() and the name of the current segment file, if set.
+ */
+void
+FreeSlruWrite(SlruSegState *state)
+{
+    Assert(state->writing);
+
+    SlruFlush(state);
+
+    if (state->fd != -1)
+        close(state->fd);
+    if (state->fn != NULL)
+        pg_free(state->fn);
+    pg_free(state->dir);
+    pg_free(state);
+}
diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h
new file mode 100644
index 00000000000..5c80a679b4d
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.h
@@ -0,0 +1,52 @@
+/*
+ * slru_io.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.h
+ */
+
+#ifndef SLRU_IO_H
+#define SLRU_IO_H
+
+/*
+ * State for reading or writing an SLRU, with a one page buffer.
+ */
+typedef struct SlruSegState
+{
+    bool        writing;            /* true for a writer, false for a reader */
+    bool        long_segment_names; /* selects the segment file name format */
+
+    char       *dir;                /* SLRU directory */
+    char       *fn;                 /* name of the open segment file, or NULL */
+    int         fd;                 /* open segment file, or -1 */
+    int64       segno;              /* open segment number, or -1 if none */
+    uint64      pageno;             /* page held in 'buf'; valid if segno != -1 */
+
+    PGAlignedBlock buf;
+} SlruSegState;
+
+extern SlruSegState *AllocSlruRead(const char *dir, bool long_segment_names);
+extern char *SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno);
+extern void FreeSlruRead(SlruSegState *state);
+
+/* Fast path: return the buffer directly if 'pageno' is already loaded */
+static inline char *
+SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
+{
+    if (state->segno != -1 && pageno == state->pageno)
+        return state->buf.data;
+    return SlruReadSwitchPageSlow(state, pageno);
+}
+
+extern SlruSegState *AllocSlruWrite(const char *dir, bool long_segment_names);
+extern char *SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno);
+extern void FreeSlruWrite(SlruSegState *state);
+
+/* Fast path: return the buffer directly if already positioned on 'pageno' */
+static inline char *
+SlruWriteSwitchPage(SlruSegState *state, uint64 pageno)
+{
+    if (state->segno != -1 && pageno == state->pageno)
+        return state->buf.data;
+    return SlruWriteSwitchPageSlow(state, pageno);
+}
+
+#endif                          /* SLRU_IO_H */
diff --git a/src/bin/pg_upgrade/t/007_multixact_conversion.pl b/src/bin/pg_upgrade/t/007_multixact_conversion.pl
new file mode 100644
index 00000000000..443b93c7545
--- /dev/null
+++ b/src/bin/pg_upgrade/t/007_multixact_conversion.pl
@@ -0,0 +1,427 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Version 19 expanded MultiXactOffset from 32 to 64 bits.  Upgrading
+# across that requires rewriting the SLRU files to the new format.
+# This file contains tests for the conversion.
+#
+# To run, set 'oldinstall' ENV variable to point to a pre-v19
+# installation.  If it's not set, or if it points to a v19 or above
+# installation, this still performs a very basic test, upgrading a
+# cluster with some multixacts.  It's not very interesting, however,
+# because there's no conversion involved in that case.
+
+use strict;
+use warnings FATAL => 'all';
+
+use Math::BigInt;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Temp dir for dumps.
+my $tempdir = PostgreSQL::Test::Utils::tempdir;
+
+# A workload that consumes multixids.  The purpose of this is to
+# generate some multixids in the old cluster, so that we can test
+# upgrading them.  The workload is a mix of KEY SHARE locking queries
+# and UPDATEs, and commits and aborts, to generate a mix of multixids
+# with different statuses.  It consumes around 3000 multixids with
+# 30000 members.  That's enough to span more than one multixid
+# 'offsets' page, and more than one 'members' segment.
+#
+# The workload leaves behind a table called 'mxofftest' containing a
+# small number of rows referencing some of the generated multixids.
+#
+# Because this function is used to generate test data on the old
+# installation, it needs to work with older PostgreSQL server
+# versions.
+#
+# The first argument is the cluster to connect to, the second argument
+# is a cluster using the new version.  We need the 'psql' binary from
+# the new version, the new cluster is otherwise unused.  (We need to
+# use the new 'psql' because some of the more advanced background psql
+# perl module features depend on a fairly recent psql version.)
+sub mxact_workload
+{
+    my $node = shift;       # Cluster to connect to
+    my $binnode = shift;    # Use the psql binary from this cluster
+
+    my $connstr = $node->connstr('postgres');
+
+    my $nrows = 50;         # rows in the test table
+    my $nqueries = 3000;    # lock/update statements to run
+    my $nclients = 20;      # concurrent sessions
+    my $update_every = 13;
+    my $abort_every = 11;
+    my @connections = ();
+
+    $node->start;
+    $node->safe_psql(
+        'postgres', qq[
+        CREATE TABLE mxofftest (id INT PRIMARY KEY, n_updated INT)
+          WITH (AUTOVACUUM_ENABLED=FALSE);
+        INSERT INTO mxofftest SELECT G, 0 FROM GENERATE_SERIES(1, $nrows) G;
+    ]);
+
+    # Silence the logging of the statements we run to avoid
+    # unnecessarily bloating the test logs.  This runs before the
+    # upgrade we're testing, so the details should not be very
+    # interesting for debugging.  But if needed, you can make it more
+    # verbose by setting this.
+    my $verbose = 0;
+
+    # Open exactly $nclients connections to the database, and start a
+    # transaction in each of them.  (The round-robin below only ever
+    # indexes connections 0 .. $nclients - 1.)
+    for (1 .. $nclients)
+    {
+        # Use the psql binary from the new installation.  The
+        # BackgroundPsql functionality doesn't work with older psql
+        # versions.
+        my $conn = $binnode->background_psql('', connstr => $connstr);
+
+        $conn->query_safe("SET log_statement=none", verbose => $verbose)
+          unless $verbose;
+        $conn->query_safe("SET enable_seqscan=off", verbose => $verbose);
+        $conn->query_safe("BEGIN", verbose => $verbose);
+
+        push(@connections, $conn);
+    }
+
+    # Run queries, cycling through the connections in a round-robin
+    # fashion.  We keep a transaction open in each connection at all
+    # times, and lock/update the rows.  The intent is that each SELECT
+    # FOR KEY SHARE query generates a new multixid, whose members are
+    # the transactions holding locks on the same rows at the time.
+    for (my $i = 0; $i < $nqueries; $i++)
+    {
+        my $conn = $connections[ $i % $nclients ];
+
+        # End the transaction left open by this connection's previous turn
+        my $sql = ($i % $abort_every == 0) ? "ABORT" : "COMMIT";
+        $conn->query_safe($sql, verbose => $verbose);
+
+        $conn->query_safe("BEGIN", verbose => $verbose);
+        if ($i % $update_every == 0)
+        {
+            # NOTE(review): ids run from 1 to $nrows, so an iteration
+            # where $i % $nrows is 0 updates no row, and the row with
+            # id = $nrows is never updated.  Left as is to keep the
+            # generated workload unchanged.
+            $sql = qq[
+              UPDATE mxofftest SET n_updated = n_updated + 1 WHERE id = ${i} % $nrows;
+            ];
+        }
+        else
+        {
+            # Lock a shrinking tail of the table as the run progresses
+            my $threshold = int($i / $nqueries * $nrows);
+            $sql = qq[
+              select count(*) from (
+                SELECT * FROM mxofftest WHERE id >= $threshold FOR KEY SHARE
+              ) as x
+            ];
+        }
+        $conn->query_safe($sql, verbose => $verbose);
+    }
+
+    for my $conn (@connections)
+    {
+        $conn->quit();
+    }
+
+    $node->stop;
+    return;
+}
+
+# Dump the contents of the 'mxofftest' table, created by
+# mxact_workload, into a file in the temp directory.  Returns the
+# path of the file.
+sub get_test_table_contents
+{
+    my ($node, $filename) = @_;
+
+    my $contents = $node->safe_psql('postgres',
+        "SELECT ctid, xmin, xmax, * FROM mxofftest");
+
+    my $path = $tempdir . '/' . $filename;
+    open(my $fh, '>', $path)
+      || die "could not open $path for writing $!";
+    print $fh $contents;
+    # Buffered write errors only surface at close, so check it
+    close($fh)
+      || die "could not close $path $!";
+
+    return $path;
+}
+
+# Dump the members of all updating multixids in the range [$from, $to)
+# into a file in the temp directory.  Returns the path of the file.
+sub get_updating_multixact_members
+{
+    my ($node, $from, $to, $filename) = @_;
+
+    my $path = $tempdir . '/' . $filename;
+    open(my $fh, '>', $path)
+      || die "could not open $path for writing $!";
+
+    # Fetch the non-locking members of the multixids between the two
+    # given SQL expressions, inclusive.
+    my $query_range = sub {
+        my ($first, $last) = @_;
+        return $node->safe_psql(
+            'postgres', qq[
+            SELECT multi, mode, xid
+            FROM generate_series($first, $last) as multi,
+                 pg_get_multixact_members(multi::text::xid)
+            WHERE mode not in ('keysh', 'sh');
+        ]);
+    };
+
+    if ($to >= $from)
+    {
+        print $fh $query_range->($from, "$to - 1");
+    }
+    else
+    {
+        # Multixids wrapped around.  Split the query into two parts,
+        # before and after the wraparound.
+        print $fh $query_range->($from, 4294967295);
+        print $fh $query_range->(1, "$to - 1");
+    }
+
+    close($fh)
+      || die "could not close $path $!";
+    return $path;
+}
+
+# Read multixid related fields from the control file.  Returns the
+# list (oldestMultiXid, NextMultiXactId, NextMultiOffset).
+#
+# Note: This is used on both the old and the new installation, so the
+# command arguments and the output parsing used here must work with
+# all PostgreSQL versions supported by the test.
+sub read_multixid_fields
+{
+    my $node = shift;
+
+    my $pg_controldata_path = $node->installed_command('pg_controldata');
+    my ($stdout) = run_command([ $pg_controldata_path, $node->data_dir ]);
+
+    # Extract one "Latest checkpoint's <name>:" value from the output
+    my $checkpoint_field = sub {
+        my ($name) = @_;
+        $stdout =~ /^Latest checkpoint's \Q$name\E:\s*(.*)$/m
+          or die "could not read $name from pg_controldata";
+        return $1;
+    };
+
+    my $oldest_multi_xid = $checkpoint_field->('oldestMultiXid');
+    my $next_multi_xid = $checkpoint_field->('NextMultiXactId');
+    my $next_multi_offset = $checkpoint_field->('NextMultiOffset');
+
+    return ($oldest_multi_xid, $next_multi_xid, $next_multi_offset);
+}
+
+# Reset a cluster's next multixid and mxoffset to given values.
+#
+# Note: This is used on the old installation, so the command arguments
+# and the output parsing used here must work with all pre-v19
+# PostgreSQL versions supported by the test.
+sub reset_mxid_mxoffset_pre_v19
+{
+    my $node = shift;
+    my $mxid = shift;
+    my $mxoffset = shift;
+
+    my $pg_resetwal_path = $node->installed_command('pg_resetwal');
+    my ($out) =
+      run_command([ $pg_resetwal_path, '--dry-run', $node->data_dir ]);
+
+    # Extract one numeric "<label>: <value>" field from the
+    # pg_resetwal --dry-run output
+    my $dry_run_field = sub {
+        my ($label) = @_;
+        $out =~ /^\Q$label\E: *(\d+)$/m
+          or die "could not find \"$label\" in pg_resetwal --dry-run output";
+        return $1;
+    };
+
+    # Verify that no multixids are currently in use.  Resetting would
+    # destroy them.  (A freshly initialized cluster has no multixids.)
+    my $next_mxid = $dry_run_field->("Latest checkpoint's NextMultiXactId");
+    my $oldest_mxid = $dry_run_field->("Latest checkpoint's oldestMultiXid");
+    die "cluster has some multixids in use" unless $next_mxid == $oldest_mxid;
+
+    # Extract a few other values from pg_resetwal --dry-run output
+    # that we need for the calculations below
+    my $blcksz = $dry_run_field->("Database block size");
+    # SLRU_PAGES_PER_SEGMENT is always 32 on pre-19 versions
+    my $slru_pages_per_segment = 32;
+    my $bytes_per_seg = $slru_pages_per_segment * $blcksz;
+
+    # Do the reset
+    my @cmd = (
+        $pg_resetwal_path,
+        '--pgdata' => $node->data_dir,
+        '--multixact-offset' => $mxoffset,
+        '--multixact-ids' => "$mxid,$mxid");
+    command_ok(\@cmd, 'reset multixids and offset');
+
+    # pg_resetwal just updates the control file.  The cluster will
+    # refuse to start up if the SLRU segments corresponding to the
+    # next multixid and offset do not exist.  Create segments that
+    # cover the given values, filled with zeros.  But first remove
+    # any old segments.
+    unlink glob($node->data_dir . "/pg_multixact/offsets/*");
+    unlink glob($node->data_dir . "/pg_multixact/members/*");
+
+    # Create one full SLRU segment file consisting only of zeros
+    my $write_zeroed_segment = sub {
+        my ($path) = @_;
+        open my $fh, ">", $path
+          or die "could not open \"$path\": $!";
+        binmode $fh;
+        my $written = syswrite($fh, "\0" x $bytes_per_seg);
+        (defined($written) && $written == $bytes_per_seg)
+          or die "could not write to \"$path\": $!";
+        close $fh
+          or die "could not close \"$path\": $!";
+        return;
+    };
+
+    # Initialize the 'offsets' SLRU file containing the new next multixid
+    # with zeros
+    #
+    # sizeof(MultiXactOffset) == 4 in PostgreSQL versions before 19
+    my $multixact_offsets_per_page = $blcksz / 4;
+    my $segno =
+      int($mxid / $multixact_offsets_per_page / $slru_pages_per_segment);
+    $write_zeroed_segment->(
+        sprintf('%s/pg_multixact/offsets/%04X', $node->data_dir, $segno));
+
+    # Same for the 'members' SLRU
+    my $multixact_members_per_page = int($blcksz / 20) * 4;
+    $segno =
+      int($mxoffset / $multixact_members_per_page / $slru_pages_per_segment);
+    $write_zeroed_segment->(
+        sprintf('%s/pg_multixact/members/%04X', $node->data_dir, $segno));
+
+    return;
+}
+
+# Main test workhorse routine.  Run pg_upgrade, then dump the test
+# data from both the old and the upgraded cluster and compare the
+# dumps.
+sub upgrade_and_compare
+{
+    my $tag = shift;
+    my $oldnode = shift;
+    my $newnode = shift;
+
+    command_ok(
+        [
+            'pg_upgrade', '--no-sync',
+            '--old-datadir' => $oldnode->data_dir,
+            '--new-datadir' => $newnode->data_dir,
+            '--old-bindir' => $oldnode->config_data('--bindir'),
+            '--new-bindir' => $newnode->config_data('--bindir'),
+            '--socketdir' => $newnode->host,
+            '--old-port' => $oldnode->port,
+            '--new-port' => $newnode->port,
+        ],
+        'run of pg_upgrade for new instance');
+
+    # Dump contents of the test table, and the status of all updating
+    # multixids from the old cluster.  (Locking-only multixids don't
+    # need to be preserved so we ignore those)
+    #
+    # Note: we do this *after* running pg_upgrade, to ensure that we
+    # don't set all the hint bits before upgrade by doing the SELECT
+    # on the table.
+    my ($multixids_start, $multixids_end, undef) =
+      read_multixid_fields($oldnode);
+    $oldnode->start;
+    my $old_table_contents =
+      get_test_table_contents($oldnode, "oldnode_${tag}_table_contents");
+    my $old_multixacts =
+      get_updating_multixact_members($oldnode, $multixids_start,
+        $multixids_end, "oldnode_${tag}_multixacts");
+    $oldnode->stop;
+
+    # Compare them with the upgraded cluster.  The multixid range is
+    # the one read from the old cluster's control file.
+    $newnode->start;
+    my $new_table_contents =
+      get_test_table_contents($newnode, "newnode_${tag}_table_contents");
+    my $new_multixacts =
+      get_updating_multixact_members($newnode, $multixids_start,
+        $multixids_end, "newnode_${tag}_multixacts");
+    $newnode->stop;
+
+    compare_files($old_table_contents, $new_table_contents,
+        'test table contents from original and upgraded clusters match');
+    compare_files($old_multixacts, $new_multixacts,
+        'multixact members from original and upgraded clusters match');
+    return;
+}
+
+my $old_version;
+
+# Basic scenario: Create a cluster using old installation, run
+# multixid-creating workload on it, then upgrade.
+#
+# This works even if the old and new version is the same,
+# although it's not very interesting as the conversion routines only
+# run when upgrading from a pre-v19 cluster.
+{ + my $tag = 'basic'; + my $old = + PostgreSQL::Test::Cluster->new("${tag}_oldnode", + install_path => $ENV{oldinstall}); + my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode"); + + $old->init(extra => ['-k']); + + $old_version = $old->pg_version; + note "old installation is version $old_version\n"; + + # Run the workload + my (undef, $start_mxid, $start_mxoff) = read_multixid_fields($old); + mxact_workload($old, $new); + my (undef, $finish_mxid, $finish_mxoff) = read_multixid_fields($old); + + note "Testing upgrade, ${tag} scenario\n" + . " mxid from ${start_mxid} to ${finish_mxid}\n" + . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n"; + + $new->init; + upgrade_and_compare($tag, $old, $new); +} + +# Wraparound scenario: This is the same as the basic scenario, but the +# old cluster goes through multixid and offset wraparound. +# +# This requires the old installation to be version 18 or older, +# because the hacks we use to reset the old cluster to a state just +# before the wraparound rely on the pre-v19 file format. If the old +# cluster is of v19 or above, multixact SLRU conversion is not needed +# anyway. +SKIP: +{ + skip + "skipping mxoffset conversion tests because upgrading from the old version does not require conversion" + if ($old_version >= '19devel'); + + my $tag = 'wraparound'; + my $old = + PostgreSQL::Test::Cluster->new("${tag}_oldnode", + install_path => $ENV{oldinstall}); + my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode"); + + $old->init(extra => ['-k']); + + # Reset the old cluster to just before multixid and 32-bit offset + # wraparound. + reset_mxid_mxoffset_pre_v19($old, 0xFFFFFA00, 0xFFFFEC00); + + # Run the workload. This crosses multixid and offset wraparound. + my (undef, $start_mxid, $start_mxoff) = read_multixid_fields($old); + mxact_workload($old, $new); + my (undef, $finish_mxid, $finish_mxoff) = read_multixid_fields($old); + + note "Testing upgrade, ${tag} scenario\n" + . 
" mxid from ${start_mxid} to ${finish_mxid}\n" + . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n"; + + # Verify that wraparounds happened. + cmp_ok($finish_mxid, '<', $start_mxid, + "multixid wrapped around in old cluster"); + cmp_ok($finish_mxoff, '<', $start_mxoff, + "mxoff wrapped around in old cluster"); + + $new->init; + upgrade_and_compare($tag, $old, $new); +} + +done_testing(); diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 82e4bb90dd5..6433fe16364 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -28,8 +28,6 @@ #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) -#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) - /* * Possible multixact lock modes ("status"). The first four modes are for * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the @@ -111,7 +109,7 @@ extern bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly); extern void MultiXactIdSetOldestMember(void); extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly); -extern bool GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, +extern void GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset); extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); @@ -131,8 +129,7 @@ extern void BootStrapMultiXact(void); extern void StartupMultiXact(void); extern void TrimMultiXact(void); extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, - Oid oldest_datoid, - bool is_startup); + Oid oldest_datoid); extern void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, MultiXactOffset *nextMultiOffset, diff --git a/src/include/access/multixact_internal.h b/src/include/access/multixact_internal.h index f711f0a81eb..f2d6539e8a6 100644 --- a/src/include/access/multixact_internal.h +++ 
b/src/include/access/multixact_internal.h @@ -13,6 +13,11 @@ * src/include/access/multixact_internal.h */ #ifndef MULTIXACT_INTERNAL_H + +/* + * Note: This is not only to prevent including this file twice. + * MULTIXACT_INTERNAL_H is checked explicitly in multixact_read_v18.c. + */ #define MULTIXACT_INTERNAL_H #include "access/multixact.h" @@ -21,17 +26,9 @@ /* * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is * used everywhere else in Postgres. - * - * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, - * MultiXact page numbering also wraps around at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in this module, except when comparing - * segment and page numbers in TruncateMultiXact (see - * MultiXactOffsetPagePrecedes). */ -/* We need four bytes per offset */ +/* We need 8 bytes per offset */ #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) static inline int64 @@ -80,19 +77,6 @@ MultiXactIdToOffsetSegment(MultiXactId multi) #define MULTIXACT_MEMBERS_PER_PAGE \ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) -/* - * Because the number of items per page is not a divisor of the last item - * number (member 0xFFFFFFFF), the last segment does not use the maximum number - * of pages, and moreover the last used page therein does not use the same - * number of items as previous pages. (Another way to say it is that the - * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page - * has some empty space after that item.) - * - * This constant is the number of members in the last page of the last segment. 
- */ -#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ - ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) - /* page in which a member is to be found */ static inline int64 MXOffsetToMemberPage(MultiXactOffset offset) diff --git a/src/include/c.h b/src/include/c.h index ccd2b654d45..62cbf7a2eec 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -669,7 +669,7 @@ typedef uint32 SubTransactionId; /* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */ typedef TransactionId MultiXactId; -typedef uint32 MultiXactOffset; +typedef uint64 MultiXactOffset; typedef uint32 CommandId; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 2fa6c8c60f0..82dc84e4099 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202512061 +#define CATALOG_VERSION_NO 202512091 #endif diff --git a/src/test/modules/test_slru/t/002_multixact_wraparound.pl b/src/test/modules/test_slru/t/002_multixact_wraparound.pl index 169333fc564..272d8e6fb08 100644 --- a/src/test/modules/test_slru/t/002_multixact_wraparound.pl +++ b/src/test/modules/test_slru/t/002_multixact_wraparound.pl @@ -37,7 +37,7 @@ my $slru_pages_per_segment = $1; # initialize the 'offsets' SLRU file containing the new next multixid # with zeros -my $multixact_offsets_per_page = $blcksz / 4; # sizeof(MultiXactOffset) == 4 +my $multixact_offsets_per_page = $blcksz / 8; # sizeof(MultiXactOffset) == 8 my $segno = int(0xFFFFFFF8 / $multixact_offsets_per_page / $slru_pages_per_segment); my $slru_file = sprintf('%s/pg_multixact/offsets/%04X', $node_pgdata, $segno); diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm index 60bbd5dd445..9825aaa9bb4 100644 --- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm +++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm @@ -230,18 +230,23 @@ Executes a query in the current session and returns 
the output in scalar context and (output, error) in list context where error is 1 in case there was output generated on stderr when executing the query. +By default, the query and its results are printed to the test output. This +can be disabled by passing the keyword parameter verbose => 0. + =cut sub query { - my ($self, $query) = @_; + my ($self, $query, %params) = @_; my $ret; my $output; my $query_cnt = $self->{query_cnt}++; + $params{verbose} = 1 unless defined $params{verbose}; + local $Test::Builder::Level = $Test::Builder::Level + 1; - note "issuing query $query_cnt via background psql: $query"; + note "issuing query $query_cnt via background psql: $query" unless !$params{verbose}; $self->{timeout}->start() if (defined($self->{query_timer_restart})); @@ -280,7 +285,7 @@ sub query explain { stdout => $self->{stdout}, stderr => $self->{stderr}, - }; + } unless !$params{verbose}; # Remove banner from stdout and stderr, our caller doesn't care. The # first newline is optional, as there would not be one if consuming an @@ -308,9 +313,9 @@ Query failure is determined by it producing output on stderr. sub query_safe { - my ($self, $query) = @_; + my ($self, $query, %params) = @_; - my $ret = $self->query($query); + my $ret = $self->query($query, %params); if ($self->{stderr} ne "") { diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 747528c4af1..295988b8b87 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -1793,13 +1793,20 @@ sub _get_env return (%inst_env); } -# Private routine to get an installation path qualified command. -# -# IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. Tests -# which use nodes spanning more than one postgres installation path need to -# avoid confusing which installation's binaries get run.
Setting $ENV{PATH} is -# insufficient, as IPC::Run does not check to see if the path has changed since -# caching a command. +=pod + +=item $node->installed_command(cmd) + +Get an installation path qualified command. + +IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. Tests +which use nodes spanning more than one postgres installation path need to +avoid confusing which installation's binaries get run. Setting $ENV{PATH} is +insufficient, as IPC::Run does not check to see if the path has changed since +caching a command. + +=cut + sub installed_command { my ($self, $cmd) = @_; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 6e2ed0c8825..9dd65b10254 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1731,6 +1731,7 @@ MultiXactMember MultiXactOffset MultiXactStateData MultiXactStatus +MultiXactWriter MultirangeIOData MultirangeParseState MultirangeType @@ -1816,6 +1817,7 @@ OffsetVarNodes_context Oid OidOptions OkeysState +OldMultiXactReader OldToNewMapping OldToNewMappingData OnCommitAction @@ -2814,6 +2816,7 @@ SlruCtlData SlruErrorCause SlruPageStatus SlruScanCallback +SlruSegState SlruShared SlruSharedData SlruWriteAll -- 2.39.5