A safe value for the next multitransaction ID (first part) can be
determined by looking for the numerically largest file name in the
directory <filename>pg_multixact/offsets</filename> under the data directory,
- adding one, and then multiplying by 65536 (0x10000). Conversely, a safe
+ adding one, and then multiplying by 32768 (0x8000). Conversely, a safe
value for the oldest multitransaction ID (second part of
<option>-m</option>) can be determined by looking for the numerically smallest
- file name in the same directory and multiplying by 65536. The file
- names are in hexadecimal, so the easiest way to do this is to specify
- the option value in hexadecimal and append four zeroes.
+ file name in the same directory and multiplying by 32768 (0x8000).
+ Note that the file names are in hexadecimal. It is usually easiest
+ to specify the option value in hexadecimal too. For example, if
+ <filename>000F</filename> and <filename>0007</filename> are the greatest and
+ smallest entries in <filename>pg_multixact/offsets</filename>,
+ <literal>-m 0x80000,0x38000</literal> will work.
</para>
- <!-- 65536 = SLRU_PAGES_PER_SEGMENT * BLCKSZ / sizeof(MultiXactOffset) -->
+ <!-- 32768 = SLRU_PAGES_PER_SEGMENT * BLCKSZ / sizeof(MultiXactOffset) -->
</listitem>
</varlistentry>
xl_multixact_create *xlrec = (xl_multixact_create *) rec;
int i;
- appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid,
+ appendStringInfo(buf, "%u offset %" PRIu64 " nmembers %d: ", xlrec->mid,
xlrec->moff, xlrec->nmembers);
for (i = 0; i < xlrec->nmembers; i++)
out_member(buf, &xlrec->members[i]);
{
xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec;
- appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)",
+ appendStringInfo(buf, "offsets [%u, %u), members [%" PRIu64 ", %" PRIu64 ")",
xlrec->startTruncOff, xlrec->endTruncOff,
xlrec->startTruncMemb, xlrec->endTruncMemb);
}
CheckPoint *checkpoint = (CheckPoint *) rec;
appendStringInfo(buf, "redo %X/%08X; "
- "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; "
+ "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; "
"oldest xid %u in DB %u; oldest multi %u in DB %u; "
"oldest/newest commit timestamp xid: %u/%u; "
"oldest running xid %u; %s",
#include "utils/memutils.h"
-/* Multixact members wraparound thresholds. */
-#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
-#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
- (MaxMultiXactOffset - MaxMultiXactOffset / 4)
+/*
+ * Thresholds used to keep members disk usage in check when multixids have a
+ * lot of members. When MULTIXACT_MEMBER_LOW_THRESHOLD is reached, vacuum
+ * starts freezing multixids more aggressively, even if the normal multixid
+ * age limits haven't been reached yet.
+ */
+#define MULTIXACT_MEMBER_LOW_THRESHOLD UINT64CONST(2000000000)
+#define MULTIXACT_MEMBER_HIGH_THRESHOLD UINT64CONST(4000000000)
static inline MultiXactId
PreviousMultiXactId(MultiXactId multi)
/*
* Oldest multixact offset that is potentially referenced by a multixact
- * referenced by a relation. We don't always know this value, so there's
- * a flag here to indicate whether or not we currently do.
+ * referenced by a relation.
*/
MultiXactOffset oldestOffset;
- bool oldestOffsetKnown;
/* support for anti-wraparound measures */
MultiXactId multiVacLimit;
MultiXactId multiStopLimit;
MultiXactId multiWrapLimit;
- /* support for members anti-wraparound measures */
- MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
-
/*
* Per-backend data starts here. We have two arrays stored in the area
* immediately following the MultiXactStateData struct. Each is indexed by
/* management of SLRU infrastructure */
static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
-static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
- MultiXactOffset offset2);
static void ExtendMultiXactOffset(MultiXactId multi);
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
-static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
- MultiXactOffset start, uint32 distance);
-static bool SetOffsetVacuumLimit(bool is_startup);
+static void SetOldestOffset(void);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
ExtendMultiXactOffset(result + 1);
/*
- * Reserve the members space, similarly to above. Also, be careful not to
- * return zero as the starting offset for any multixact. See
- * GetMultiXactIdMembers() for motivation.
+ * Reserve the members space, similarly to above.
*/
nextOffset = MultiXactState->nextOffset;
- if (nextOffset == 0)
- {
- *offset = 1;
- nmembers++; /* allocate member slot 0 too */
- }
- else
- *offset = nextOffset;
-
- /*----------
- * Protect against overrun of the members space as well, with the
- * following rules:
- *
- * If we're past offsetStopLimit, refuse to generate more multis.
- * If we're close to offsetStopLimit, emit a warning.
- *
- * Arbitrarily, we start emitting warnings when we're 20 segments or less
- * from offsetStopLimit.
- *
- * Note we haven't updated the shared state yet, so if we fail at this
- * point, the multixact ID we grabbed can still be used by the next guy.
- *
- * Note that there is no point in forcing autovacuum runs here: the
- * multixact freeze settings would have to be reduced for that to have any
- * effect.
- *----------
- */
-#define OFFSET_WARN_SEGMENTS 20
- if (MultiXactState->oldestOffsetKnown &&
- MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
- nmembers))
- {
- /* see comment in the corresponding offsets wraparound case */
- SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
-
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("multixact \"members\" limit exceeded"),
- errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
- "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
- MultiXactState->offsetStopLimit - nextOffset - 1,
- nmembers,
- MultiXactState->offsetStopLimit - nextOffset - 1),
- errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
- MultiXactState->oldestMultiXactDB)));
- }
/*
- * Check whether we should kick autovacuum into action, to prevent members
- * wraparound. NB we use a much larger window to trigger autovacuum than
- * just the warning limit. The warning is just a measure of last resort -
- * this is in line with GetNewTransactionId's behaviour.
+ * Offsets are 64-bit integers and will never wrap around. Firstly, it
+ * would take an unrealistic amount of time and resources to consume 2^64
+ * offsets. Secondly, multixid creation is WAL-logged, so you would run
+ * out of LSNs before reaching offset wraparound. Nevertheless, check for
+ * wraparound as a sanity check.
*/
- if (!MultiXactState->oldestOffsetKnown ||
- (MultiXactState->nextOffset - MultiXactState->oldestOffset
- > MULTIXACT_MEMBER_SAFE_THRESHOLD))
- {
- /*
- * To avoid swamping the postmaster with signals, we issue the autovac
- * request only when crossing a segment boundary. With default
- * compilation settings that's roughly after 50k members. This still
- * gives plenty of chances before we get into real trouble.
- */
- if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
- (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
- SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
- }
-
- if (MultiXactState->oldestOffsetKnown &&
- MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
- nextOffset,
- nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
- ereport(WARNING,
+ if (nextOffset + nmembers < nextOffset)
+ ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
- "database with OID %u must be vacuumed before %d more multixact members are used",
- MultiXactState->offsetStopLimit - nextOffset + nmembers,
- MultiXactState->oldestMultiXactDB,
- MultiXactState->offsetStopLimit - nextOffset + nmembers),
- errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
+ errmsg("MultiXact members would wrap around")));
+ *offset = nextOffset;
ExtendMultiXactMember(nextOffset, nmembers);
* the next iteration. But note that nextMXact may be InvalidMultiXactId
* or the first value on a segment-beginning page after this routine
* exits, so anyone else looking at the variable must be prepared to deal
- * with either case. Similarly, nextOffset may be zero, but we won't use
- * that as the actual start offset of the next multixact.
+ * with either case.
*/
(MultiXactState->nextMXact)++;
LWLockRelease(MultiXactGenLock);
- debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
+ debug_elog4(DEBUG2, "GetNew: returning %u offset %" PRIu64,
+ result, *offset);
return result;
}
MultiXactOffset *offptr;
MultiXactOffset offset;
int length;
- int truelength;
MultiXactId oldestMXact;
MultiXactId nextMXact;
MultiXactMember *ptr;
* Find out the offset at which we need to start reading MultiXactMembers
* and the number of members in the multixact. We determine the latter as
* the difference between this multixact's starting offset and the next
- * one's. However, there is one corner case to worry about:
- *
- * Because GetNewMultiXactId skips over offset zero, to reserve zero for
- * to mean "unset", there is an ambiguity near the point of offset
- * wraparound. If we see next multixact's offset is one, is that our
- * multixact's actual endpoint, or did it end at zero with a subsequent
- * increment? We handle this using the knowledge that if the zero'th
- * member slot wasn't filled, it'll contain zero, and zero isn't a valid
- * transaction ID so it can't be a multixact member. Therefore, if we
- * read a zero from the members array, just ignore it.
+ * one's.
*/
pageno = MultiXactIdToOffsetPage(multi);
entryno = MultiXactIdToOffsetEntry(multi);
LWLockRelease(lock);
lock = NULL;
+ /* A multixid with zero members should not happen */
+ Assert(length > 0);
+
/* read the members */
ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
-
- truelength = 0;
prev_pageno = -1;
for (int i = 0; i < length; i++, offset++)
{
xactptr = (TransactionId *)
(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
-
- if (!TransactionIdIsValid(*xactptr))
- {
- /* Corner case: we must be looking at unused slot zero */
- Assert(offset == 0);
- continue;
- }
+ Assert(TransactionIdIsValid(*xactptr));
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
- ptr[truelength].xid = *xactptr;
- ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
- truelength++;
+ ptr[i].xid = *xactptr;
+ ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
}
LWLockRelease(lock);
- /* A multixid with zero members should not happen */
- Assert(truelength > 0);
-
/*
* Copy the result into the local cache.
*/
- mXactCachePut(multi, truelength, ptr);
+ mXactCachePut(multi, length, ptr);
debug_elog3(DEBUG2, "GetMembers: no cache for %s",
- mxid_to_string(multi, truelength, ptr));
+ mxid_to_string(multi, length, ptr));
*members = ptr;
- return truelength;
+ return length;
}
/*
"pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
LWTRANCHE_MULTIXACTMEMBER_SLRU,
SYNC_HANDLER_MULTIXACT_MEMBER,
- false);
+ true);
/* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
/* Initialize our shared state struct */
SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0);
}
-/*
- * MaybeExtendOffsetSlru
- * Extend the offsets SLRU area, if necessary
- *
- * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
- * contain files that are shorter than necessary; this would occur if the old
- * installation had used multixacts beyond the first page (files cannot be
- * copied, because the on-disk representation is different). pg_upgrade would
- * update pg_control to set the next offset value to be at that position, so
- * that tuples marked as locked by such MultiXacts would be seen as visible
- * without having to consult multixact. However, trying to create and use a
- * new MultiXactId would result in an error because the page on which the new
- * value would reside does not exist. This routine is in charge of creating
- * such pages.
- */
-static void
-MaybeExtendOffsetSlru(void)
-{
- int64 pageno;
- LWLock *lock;
-
- pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
- lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
-
- LWLockAcquire(lock, LW_EXCLUSIVE);
-
- if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
- {
- int slotno;
-
- /*
- * Fortunately for us, SimpleLruWritePage is already prepared to deal
- * with creating a new segment file even if the page we're writing is
- * not the first in it, so this is enough.
- */
- slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
- SimpleLruWritePage(MultiXactOffsetCtl, slotno);
- }
-
- LWLockRelease(lock);
-}
-
/*
* This must be called ONCE during postmaster or standalone-backend startup.
*
MultiXactState->finishedStartup = true;
LWLockRelease(MultiXactGenLock);
- /* Now compute how far away the next members wraparound is. */
- SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
+ /* Now compute how far away the next multixid wraparound is. */
+ SetMultiXactIdLimit(oldestMXact, oldestMXactDB);
}
/*
LWLockRelease(MultiXactGenLock);
debug_elog6(DEBUG2,
- "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
+ "MultiXact: checkpoint is nextMulti %u, nextOffset %" PRIu64 ", oldestMulti %u in DB %u",
*nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
}
MultiXactSetNextMXact(MultiXactId nextMulti,
MultiXactOffset nextMultiOffset)
{
- debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
+ debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %" PRIu64,
nextMulti, nextMultiOffset);
LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
MultiXactState->nextMXact = nextMulti;
MultiXactState->nextOffset = nextMultiOffset;
LWLockRelease(MultiXactGenLock);
-
- /*
- * During a binary upgrade, make sure that the offsets SLRU is large
- * enough to contain the next value that would be created.
- *
- * We need to do this pretty early during the first startup in binary
- * upgrade mode: before StartupMultiXact() in fact, because this routine
- * is called even before that by StartupXLOG(). And we can't do it
- * earlier than at this point, because during that first call of this
- * routine we determine the MultiXactState->nextMXact value that
- * MaybeExtendOffsetSlru needs.
- */
- if (IsBinaryUpgrade)
- MaybeExtendOffsetSlru();
}
/*
* datminmxid (ie, the oldest MultiXactId that might exist in any database
* of our cluster), and the OID of the (or a) database with that value.
*
- * is_startup is true when we are just starting the cluster, false when we
- * are updating state in a running cluster. This only affects log messages.
+ * This also updates MultiXactState->oldestOffset, by looking up the offset of
+ * MultiXactState->oldestMultiXactId.
*/
void
-SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
- bool is_startup)
+SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
{
MultiXactId multiVacLimit;
MultiXactId multiWarnLimit;
MultiXactId multiStopLimit;
MultiXactId multiWrapLimit;
MultiXactId curMulti;
- bool needs_offset_vacuum;
Assert(MultiXactIdIsValid(oldest_datminmxid));
/*
* We pretend that a wrap will happen halfway through the multixact ID
* space, but that's not really true, because multixacts wrap differently
- * from transaction IDs. Note that, separately from any concern about
- * multixact IDs wrapping, we must ensure that multixact members do not
- * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
+ * from transaction IDs.
*/
multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
if (multiWrapLimit < FirstMultiXactId)
Assert(!InRecovery);
- /* Set limits for offset vacuum. */
- needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
+ /*
+ * Offsets are 64 bits wide and never wrap around, so we don't need to
+ * consider them for emergency autovacuum purposes. But now that we're in
+ * a consistent state, determine MultiXactState->oldestOffset. It will be
+ * used to adjust the freezing cutoff, to keep the members disk usage in
+ * check.
+ */
+ SetOldestOffset();
/*
* If past the autovacuum force point, immediately signal an autovac
* database, it'll call here, and we'll signal the postmaster to start
* another iteration immediately if there are still any old databases.
*/
- if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
- needs_offset_vacuum) && IsUnderPostmaster)
+ if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster)
SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
/* Give an immediate warning if past the wrap warn point */
debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
MultiXactState->nextMXact = minMulti;
}
- if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
+ if (MultiXactState->nextOffset < minMultiOffset)
{
- debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
+ debug_elog3(DEBUG2, "MultiXact: setting next offset to %" PRIu64,
minMultiOffset);
MultiXactState->nextOffset = minMultiOffset;
}
Assert(InRecovery);
if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
- SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
+ SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
}
/*
LWLockRelease(lock);
}
- /*
- * Compute the number of items till end of current page. Careful: if
- * addition of unsigned ints wraps around, we're at the last page of
- * the last segment; since that page holds a different number of items
- * than other pages, we need to do it differently.
- */
- if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
- {
- /*
- * This is the last page of the last segment; we can compute the
- * number of items left to allocate in it without modulo
- * arithmetic.
- */
- difference = MaxMultiXactOffset - offset + 1;
- }
- else
- difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
+ /* Compute the number of items till end of current page. */
+ difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
/*
- * Advance to next page, taking care to properly handle the wraparound
- * case. OK if nmembers goes negative.
+ * Advance to next page. OK if nmembers goes negative.
*/
nmembers -= difference;
offset += difference;
}
/*
- * Determine how aggressively we need to vacuum in order to prevent member
- * wraparound.
- *
- * To do so determine what's the oldest member offset and install the limit
- * info in MultiXactState, where it can be used to prevent overrun of old data
- * in the members SLRU area.
- *
- * The return value is true if emergency autovacuum is required and false
- * otherwise.
+ * Calculate the oldest member offset and install it in MultiXactState, where
+ * it can be used to adjust multixid freezing cutoffs.
*/
-static bool
-SetOffsetVacuumLimit(bool is_startup)
+static void
+SetOldestOffset(void)
{
MultiXactId oldestMultiXactId;
MultiXactId nextMXact;
MultiXactOffset oldestOffset = 0; /* placate compiler */
- MultiXactOffset prevOldestOffset;
MultiXactOffset nextOffset;
bool oldestOffsetKnown = false;
- bool prevOldestOffsetKnown;
- MultiXactOffset offsetStopLimit = 0;
- MultiXactOffset prevOffsetStopLimit;
/*
* NB: Have to prevent concurrent truncation, we might otherwise try to
oldestMultiXactId = MultiXactState->oldestMultiXactId;
nextMXact = MultiXactState->nextMXact;
nextOffset = MultiXactState->nextOffset;
- prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
- prevOldestOffset = MultiXactState->oldestOffset;
- prevOffsetStopLimit = MultiXactState->offsetStopLimit;
Assert(MultiXactState->finishedStartup);
LWLockRelease(MultiXactGenLock);
else
{
/*
- * Figure out where the oldest existing multixact's offsets are
- * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
- * the supposedly-earliest multixact might not really exist. We are
- * careful not to fail in that case.
+ * Look up the offset at which the oldest existing multixact's members
+ * are stored. If we cannot find it, be careful not to fail, and
+ * leave oldestOffset unchanged. oldestOffset is initialized to zero
+ * at system startup, which prevents truncating members until a proper
+ * value is calculated.
+ *
+ * (We had bugs in early releases of PostgreSQL 9.3.X and 9.4.X where
+ * the supposedly-earliest multixact might not really exist. Those
+ * should be long gone by now, so this should not fail, but let's
+ * still be defensive.)
*/
oldestOffsetKnown =
find_multixact_start(oldestMultiXactId, &oldestOffset);
if (oldestOffsetKnown)
ereport(DEBUG1,
- (errmsg_internal("oldest MultiXactId member is at offset %u",
+ (errmsg_internal("oldest MultiXactId member is at offset %" PRIu64,
oldestOffset)));
else
ereport(LOG,
- (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
+ (errmsg("MultiXact member truncation is disabled because oldest checkpointed MultiXact %u does not exist on disk",
oldestMultiXactId)));
}
LWLockRelease(MultiXactTruncationLock);
- /*
- * If we can, compute limits (and install them MultiXactState) to prevent
- * overrun of old data in the members SLRU area. We can only do so if the
- * oldest offset is known though.
- */
+ /* Install the computed value */
if (oldestOffsetKnown)
{
- /* move back to start of the corresponding segment */
- offsetStopLimit = oldestOffset - (oldestOffset %
- (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
-
- /* always leave one segment before the wraparound point */
- offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
-
- if (!prevOldestOffsetKnown && !is_startup)
- ereport(LOG,
- (errmsg("MultiXact member wraparound protections are now enabled")));
-
- ereport(DEBUG1,
- (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
- offsetStopLimit, oldestMultiXactId)));
- }
- else if (prevOldestOffsetKnown)
- {
- /*
- * If we failed to get the oldest offset this time, but we have a
- * value from a previous pass through this function, use the old
- * values rather than automatically forcing an emergency autovacuum
- * cycle again.
- */
- oldestOffset = prevOldestOffset;
- oldestOffsetKnown = true;
- offsetStopLimit = prevOffsetStopLimit;
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ MultiXactState->oldestOffset = oldestOffset;
+ LWLockRelease(MultiXactGenLock);
}
-
- /* Install the computed values */
- LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
- MultiXactState->oldestOffset = oldestOffset;
- MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
- MultiXactState->offsetStopLimit = offsetStopLimit;
- LWLockRelease(MultiXactGenLock);
-
- /*
- * Do we need an emergency autovacuum? If we're not sure, assume yes.
- */
- return !oldestOffsetKnown ||
- (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
-}
-
-/*
- * Return whether adding "distance" to "start" would move past "boundary".
- *
- * We use this to determine whether the addition is "wrapping around" the
- * boundary point, hence the name. The reason we don't want to use the regular
- * 2^31-modulo arithmetic here is that we want to be able to use the whole of
- * the 2^32-1 space here, allowing for more multixacts than would fit
- * otherwise.
- */
-static bool
-MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
- uint32 distance)
-{
- MultiXactOffset finish;
-
- /*
- * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
- * if the addition wraps around the UINT_MAX boundary, skip that value.
- */
- finish = start + distance;
- if (finish < start)
- finish++;
-
- /*-----------------------------------------------------------------------
- * When the boundary is numerically greater than the starting point, any
- * value numerically between the two is not wrapped:
- *
- * <----S----B---->
- * [---) = F wrapped past B (and UINT_MAX)
- * [---) = F not wrapped
- * [----] = F wrapped past B
- *
- * When the boundary is numerically less than the starting point (i.e. the
- * UINT_MAX wraparound occurs somewhere in between) then all values in
- * between are wrapped:
- *
- * <----B----S---->
- * [---) = F not wrapped past B (but wrapped past UINT_MAX)
- * [---) = F wrapped past B (and UINT_MAX)
- * [----] = F not wrapped
- *-----------------------------------------------------------------------
- */
- if (start < boundary)
- return finish >= boundary || finish < start;
- else
- return finish >= boundary && finish < start;
}
/*
* members: Number of member entries (nextOffset - oldestOffset)
* oldestMultiXactId: Oldest MultiXact ID still in use
* oldestOffset: Oldest offset still in use
- *
- * Returns false if unable to determine, the oldest offset being unknown.
*/
-bool
+void
GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members,
MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
{
MultiXactOffset nextOffset;
MultiXactId nextMultiXactId;
- bool oldestOffsetKnown;
LWLockAcquire(MultiXactGenLock, LW_SHARED);
nextOffset = MultiXactState->nextOffset;
*oldestMultiXactId = MultiXactState->oldestMultiXactId;
nextMultiXactId = MultiXactState->nextMXact;
*oldestOffset = MultiXactState->oldestOffset;
- oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
LWLockRelease(MultiXactGenLock);
- if (!oldestOffsetKnown)
- {
- *members = 0;
- *multixacts = 0;
- *oldestMultiXactId = InvalidMultiXactId;
- *oldestOffset = 0;
- return false;
- }
-
*members = nextOffset - *oldestOffset;
*multixacts = nextMultiXactId - *oldestMultiXactId;
- return true;
}
/*
* vacuum_multixact_freeze_table_age work together to make sure we never have
* too many multixacts; we hope that, at least under normal circumstances,
* this will also be sufficient to keep us from using too many offsets.
- * However, if the average multixact has many members, we might exhaust the
- * members space while still using few enough members that these limits fail
- * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
- * choice but to start failing multixact-creating operations with an error.
- *
- * To prevent that, if more than a threshold portion of the members space is
- * used, we effectively reduce autovacuum_multixact_freeze_max_age and
- * to a value just less than the number of multixacts in use. We hope that
- * this will quickly trigger autovacuuming on the table or tables with the
- * oldest relminmxid, thus allowing datminmxid values to advance and removing
- * some members.
- *
- * As the fraction of the member space currently in use grows, we become
- * more aggressive in clamping this value. That not only causes autovacuum
- * to ramp up, but also makes any manual vacuums the user issues more
- * aggressive. This happens because vacuum_get_cutoffs() will clamp the
- * freeze table and the minimum freeze age cutoffs based on the effective
- * autovacuum_multixact_freeze_max_age this function returns. In the worst
- * case, we'll claim the freeze_max_age to zero, and every vacuum of any
- * table will freeze every multixact.
+ * However, if the average multixact has many members, we might accumulate a
+ * large amount of members, consuming disk space, while still using few enough
+ * multixids that the multixid limits fail to trigger relminmxid advancement
+ * by VACUUM.
+ *
+ * To prevent that, if the members space usage exceeds a threshold
+ * (MULTIXACT_MEMBER_LOW_THRESHOLD), we effectively reduce
+ * autovacuum_multixact_freeze_max_age to a value just less than the number of
+ * multixacts in use. We hope that this will quickly trigger autovacuuming on
+ * the table or tables with the oldest relminmxid, thus allowing datminmxid
+ * values to advance and removing some members.
+ *
+ * As the amount of the member space in use grows, we become more aggressive
+ * in clamping this value. That not only causes autovacuum to ramp up, but
+ * also makes any manual vacuums the user issues more aggressive. This
+ * happens because vacuum_get_cutoffs() will clamp the freeze table and the
+ * minimum freeze age cutoffs based on the effective
+ * autovacuum_multixact_freeze_max_age this function returns. At the extreme,
+ * when the members usage reaches MULTIXACT_MEMBER_HIGH_THRESHOLD, we clamp
+ * freeze_max_age to zero, and every vacuum of any table will freeze every
+ * multixact.
*/
int
MultiXactMemberFreezeThreshold(void)
MultiXactId oldestMultiXactId;
MultiXactOffset oldestOffset;
- /* If we can't determine member space utilization, assume the worst. */
- if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset))
- return 0;
+ /* Read the current offsets and members usage. */
+ GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset);
/* If member space utilization is low, no special action is required. */
- if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
+ if (members <= MULTIXACT_MEMBER_LOW_THRESHOLD)
return autovacuum_multixact_freeze_max_age;
/*
* Compute a target for relminmxid advancement. The number of multixacts
* we try to eliminate from the system is based on how far we are past
- * MULTIXACT_MEMBER_SAFE_THRESHOLD.
+ * MULTIXACT_MEMBER_LOW_THRESHOLD.
+ *
+ * The way this formula works is that when members is exactly at the low
+ * threshold, fraction = 0.0, and we set freeze_max_age equal to
+ * mxid_age(oldestMultiXactId). As members grows further, towards the
+ * high threshold, fraction grows linearly from 0.0 to 1.0, and the result
+ * shrinks from mxid_age(oldestMultiXactId) to 0. Beyond the high
+ * threshold, fraction > 1.0 and the result is clamped to 0.
*/
- fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
- (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
- victim_multixacts = multixacts * fraction;
+ fraction = (double) (members - MULTIXACT_MEMBER_LOW_THRESHOLD) /
+ (MULTIXACT_MEMBER_HIGH_THRESHOLD - MULTIXACT_MEMBER_LOW_THRESHOLD);
/* fraction could be > 1.0, but lowest possible freeze age is zero */
- if (victim_multixacts > multixacts)
+ if (fraction >= 1.0)
return 0;
+
+ victim_multixacts = multixacts * fraction;
result = multixacts - victim_multixacts;
/*
/*
* Delete members segments [oldest, newOldest)
- *
- * The members SLRU can, in contrast to the offsets one, be filled to almost
- * the full range at once. This means SimpleLruTruncate() can't trivially be
- * used - instead the to-be-deleted range is computed using the offsets
- * SLRU. C.f. TruncateMultiXact().
*/
static void
PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
{
- const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
- int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
- int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
- int64 segment = startsegment;
-
- /*
- * Delete all the segments but the last one. The last segment can still
- * contain, possibly partially, valid data.
- */
- while (segment != endsegment)
- {
- elog(DEBUG2, "truncating multixact members segment %" PRIx64,
- segment);
- SlruDeleteSegment(MultiXactMemberCtl, segment);
-
- /* move to next segment, handling wraparound correctly */
- if (segment == maxsegment)
- segment = 0;
- else
- segment += 1;
- }
+ SimpleLruTruncate(MultiXactMemberCtl,
+ MXOffsetToMemberPage(newOldestOffset));
}
/*
elog(DEBUG1, "performing multixact truncation: "
"offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
- "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
+ "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")",
oldestMulti, newOldestMulti,
MultiXactIdToOffsetSegment(oldestMulti),
MultiXactIdToOffsetSegment(newOldestMulti),
LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
MultiXactState->oldestMultiXactId = newOldestMulti;
MultiXactState->oldestMultiXactDB = newOldestMultiDB;
+ MultiXactState->oldestOffset = newOldestOffset;
LWLockRelease(MultiXactGenLock);
/* First truncate members */
/*
* Decide whether a MultiXactMember page number is "older" for truncation
- * purposes. There is no "invalid offset number" so use the numbers verbatim.
+ * purposes. There is no "invalid offset number" and members never wrap
+ * around, so use the numbers verbatim.
*/
static bool
MultiXactMemberPagePrecedes(int64 page1, int64 page2)
{
- MultiXactOffset offset1;
- MultiXactOffset offset2;
-
- offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
- offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
-
- return (MultiXactOffsetPrecedes(offset1, offset2) &&
- MultiXactOffsetPrecedes(offset1,
- offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
+ return page1 < page2;
}
/*
}
-/*
- * Decide which of two offsets is earlier.
- */
-static bool
-MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
-{
- int32 diff = (int32) (offset1 - offset2);
-
- return (diff < 0);
-}
-
/*
* Write a TRUNCATE xlog record
*
elog(DEBUG1, "replaying multixact truncation: "
"offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
- "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
+ "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")",
xlrec.startTruncOff, xlrec.endTruncOff,
MultiXactIdToOffsetSegment(xlrec.startTruncOff),
MultiXactIdToOffsetSegment(xlrec.endTruncOff),
* Advance the horizon values, so they're current at the end of
* recovery.
*/
- SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
+ SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB);
PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
checkPoint.nextOid = FirstGenbkiObjectId;
checkPoint.nextMulti = FirstMultiXactId;
- checkPoint.nextMultiOffset = 0;
+ checkPoint.nextMultiOffset = 1;
checkPoint.oldestXid = FirstNormalTransactionId;
checkPoint.oldestXidDB = Template1DbOid;
checkPoint.oldestMulti = FirstMultiXactId;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
- SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
+ SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
/* Set up the XLOG page header */
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
- SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
+ SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
SetCommitTsLimit(checkPoint.oldestCommitTsXid,
checkPoint.newestCommitTsXid);
U64FromFullTransactionId(checkPoint.nextXid),
checkPoint.nextOid)));
ereport(DEBUG1,
- (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
+ (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
checkPoint.nextMulti, checkPoint.nextMultiOffset)));
ereport(DEBUG1,
(errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
/*
* Also compute the multixact age for which freezing is urgent. This is
- * normally autovacuum_multixact_freeze_max_age, but may be less if we are
- * short of multixact member space.
+ * normally autovacuum_multixact_freeze_max_age, but may be less if
+ * multixact members are bloated.
*/
effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
* signaling twice?
*/
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
- SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+ SetMultiXactIdLimit(minMulti, minmulti_datoid);
LWLockRelease(WrapLimitsVacuumLock);
}
/*
* Compute the multixact age for which freezing is urgent. This is
- * normally autovacuum_multixact_freeze_max_age, but may be less if we are
- * short of multixact member space.
+ * normally autovacuum_multixact_freeze_max_age, but may be less if
+ * multixact members are bloated.
*/
effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
ControlFile->checkPointCopy.nextOid);
printf(_("Latest checkpoint's NextMultiXactId: %u\n"),
ControlFile->checkPointCopy.nextMulti);
- printf(_("Latest checkpoint's NextMultiOffset: %u\n"),
+ printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"),
ControlFile->checkPointCopy.nextMultiOffset);
printf(_("Latest checkpoint's oldestXID: %u\n"),
ControlFile->checkPointCopy.oldestXid);
static void WriteEmptyXLOG(void);
static void usage(void);
static uint32 strtouint32_strict(const char *restrict s, char **restrict endptr, int base);
+static uint64 strtouint64_strict(const char *restrict s, char **restrict endptr, int base);
int
case 'O':
errno = 0;
- next_mxoff_val = strtouint32_strict(optarg, &endptr, 0);
+ next_mxoff_val = strtouint64_strict(optarg, &endptr, 0);
if (endptr == optarg || *endptr != '\0' || errno != 0)
{
pg_log_error("invalid argument for option %s", "-O");
ControlFile.checkPointCopy.nextOid);
printf(_("Latest checkpoint's NextMultiXactId: %u\n"),
ControlFile.checkPointCopy.nextMulti);
- printf(_("Latest checkpoint's NextMultiOffset: %u\n"),
+ printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"),
ControlFile.checkPointCopy.nextMultiOffset);
printf(_("Latest checkpoint's oldestXID: %u\n"),
ControlFile.checkPointCopy.oldestXid);
if (next_mxoff_given)
{
- printf(_("NextMultiOffset: %u\n"),
+ printf(_("NextMultiOffset: %" PRIu64 "\n"),
ControlFile.checkPointCopy.nextMultiOffset);
}
return (uint32) val;
}
+
+/*
+ * strtouint64_strict -- like strtou64(), but doesn't accept negative values
+ *
+ * Leading whitespace is skipped before looking for a '-' sign. If the input
+ * is negative and strtou64() did not itself report an error, errno is set to
+ * ERANGE and 0 is returned. errno is only inspected here, never cleared, so
+ * callers must set it to 0 before the call to get a meaningful result.
+ */
+static uint64
+strtouint64_strict(const char *restrict s, char **restrict endptr, int base)
+{
+ uint64 val;
+ bool is_neg;
+
+ /* skip leading whitespace */
+ while (isspace((unsigned char) *s))
+ s++;
+
+ /*
+ * Is it negative? We still call strtou64() if it was, to set 'endptr'.
+ * (The current callers don't care though.)
+ */
+ is_neg = (*s == '-');
+
+ val = strtou64(s, endptr, base);
+
+ /* reject if it was negative */
+ if (errno == 0 && is_neg)
+ {
+ errno = ERANGE;
+ val = 0;
+ }
+
+ return val;
+}
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * $blcksz / 8;
# --multixact-ids argument is "new,old"
push @cmd,
'--multixact-ids' => sprintf("%d,%d",
file.o \
function.o \
info.o \
+ multixact_read_v18.o \
+ multixact_rewrite.o \
option.o \
parallel.o \
pg_upgrade.o \
relfilenumber.o \
server.o \
+ slru_io.o \
tablespace.o \
task.o \
util.o \
'file.c',
'function.c',
'info.c',
+ 'multixact_read_v18.c',
+ 'multixact_rewrite.c',
'option.c',
'parallel.c',
'pg_upgrade.c',
'relfilenumber.c',
'server.c',
+ 'slru_io.c',
'tablespace.c',
'task.c',
'util.c',
't/004_subscription.pl',
't/005_char_signedness.pl',
't/006_transfer_modes.pl',
+ 't/007_multixact_conversion.pl',
],
'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow
},
--- /dev/null
+/*
+ * multixact_read_v18.c
+ *
+ * Functions to read multixact SLRUs from clusters of PostgreSQL version 18
+ * and older. In version 19, the multixid offsets were expanded from 32 to 64
+ * bits.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_read_v18.c
+ */
+
+#include "postgres_fe.h"
+
+#include "multixact_read_v18.h"
+#include "pg_upgrade.h"
+
+/*
+ * NOTE: below are a bunch of definitions that are copy-pasted from
+ * multixact.c from version 18. It's important that this file doesn't
+ * #include the new definitions with same names from "multixact_internal.h"!
+ *
+ * To further avoid confusion in the functions exposed outside this source
+ * file, we use MultiXactOffset32 to represent the old-style 32-bit multixid
+ * offsets. The new 64-bit MultiXactOffset should not be used anywhere in
+ * this file.
+ */
+#ifdef MULTIXACT_INTERNAL_H
+#error multixact_internal.h should not be included in multixact_read_v18.c
+#endif
+#define MultiXactOffset should_not_be_used
+
+/* We need four bytes per offset. */
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset32))
+
+/* Number of the old-format offsets SLRU page that holds the entry for 'multi' */
+static inline int64
+MultiXactIdToOffsetPage(MultiXactId multi)
+{
+ return multi / MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+/* Index of the entry for 'multi' within its old-format offsets SLRU page */
+static inline int
+MultiXactIdToOffsetEntry(MultiXactId multi)
+{
+ return multi % MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+/*
+ * The situation for members is a bit more complex: we store one byte of
+ * additional flag bits for each TransactionId. To do this without getting
+ * into alignment issues, we store four bytes of flags, and then the
+ * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
+ * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offset, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT 8
+#define MXACT_MEMBER_FLAGS_PER_BYTE 1
+#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP 4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
+ (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE \
+ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/* Number of the members SLRU page in which the member at 'offset' is found */
+static inline int64
+MXOffsetToMemberPage(MultiXactOffset32 offset)
+{
+ return offset / MULTIXACT_MEMBERS_PER_PAGE;
+}
+
+/*
+ * Location (byte offset within page) of flag word for a given member.
+ *
+ * The result is the start of the member's group: the group index on the page
+ * multiplied by the size of a complete group.
+ */
+static inline int
+MXOffsetToFlagsOffset(MultiXactOffset32 offset)
+{
+ MultiXactOffset32 group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+ int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
+ int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
+
+ return byteoff;
+}
+
+/*
+ * Location (byte offset within page) of TransactionId of given member.
+ *
+ * Starts from the group's flag word, skips the flag bytes, and then indexes
+ * the member's TransactionId slot within the group.
+ */
+static inline int
+MXOffsetToMemberOffset(MultiXactOffset32 offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+ return MXOffsetToFlagsOffset(offset) +
+ MULTIXACT_FLAGBYTES_PER_GROUP +
+ member_in_group * sizeof(TransactionId);
+}
+
+/* Bit position, within the group's flag word, of the flag bits of given member */
+static inline int
+MXOffsetToFlagsBitShift(MultiXactOffset32 offset)
+{
+ int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+ int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
+
+ return bshift;
+}
+
+/*
+ * Construct reader of old multixacts.
+ *
+ * 'pgdata' is the data directory containing the old pg_multixact
+ * subdirectories. 'nextMulti' and 'nextOffset' are remembered in the reader;
+ * GetOldMultiXactIdSingleMember() uses them to find the end of the newest
+ * multixact.
+ *
+ * Returns the malloced memory used by all the other calls in this module.
+ * Release it with FreeOldMultiXactReader().
+ */
+OldMultiXactReader *
+AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti,
+ MultiXactOffset32 nextOffset)
+{
+ OldMultiXactReader *state = pg_malloc(sizeof(*state));
+ char dir[MAXPGPATH] = {0};
+
+ state->nextMXact = nextMulti;
+ state->nextOffset = nextOffset;
+
+ /*
+ * Build the directory names with a bounded snprintf, so that an overlong
+ * 'pgdata' cannot overrun 'dir'. Both old SLRUs are read with
+ * long_segment_names = false.
+ */
+ snprintf(dir, sizeof(dir), "%s/pg_multixact/offsets", pgdata);
+ state->offset = AllocSlruRead(dir, false);
+
+ snprintf(dir, sizeof(dir), "%s/pg_multixact/members", pgdata);
+ state->members = AllocSlruRead(dir, false);
+
+ return state;
+}
+
+/*
+ * This is a simplified version of the GetMultiXactIdMembers() server
+ * function:
+ *
+ * - Only return the updating member, if any. Upgrade only cares about the
+ * updaters. If there is no updating member, return somewhat arbitrarily
+ * the first locking-only member, because we don't have any way to represent
+ * "no members".
+ *
+ * - Because there's no concurrent activity, we don't need to worry about
+ * locking and some corner cases.
+ *
+ * - Don't bail out on invalid entries. If the server crashes, it can leave
+ * invalid or half-written entries on disk. Such multixids won't appear
+ * anywhere else on disk, so the server will never try to read them. During
+ * upgrade, however, we scan through all multixids in order, and will
+ * encounter such invalid but unreferenced multixids too.
+ *
+ * Returns true on success, false if the multixact was invalid.
+ */
+bool
+GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
+ MultiXactMember *member)
+{
+ MultiXactId nextMXact,
+ tmpMXact;
+ MultiXactOffset32 nextOffset; /* old-style member offset, not a multixid */
+ int64 pageno,
+ prev_pageno;
+ int entryno,
+ length;
+ char *buf;
+ MultiXactOffset32 *offptr,
+ offset;
+ MultiXactOffset32 nextMXOffset;
+ TransactionId result_xid = InvalidTransactionId;
+ MultiXactStatus result_status = 0;
+
+ nextMXact = state->nextMXact;
+ nextOffset = state->nextOffset;
+
+ /*
+ * Comment copied from GetMultiXactIdMembers in PostgreSQL v18
+ * multixact.c:
+ *
+ * Find out the offset at which we need to start reading MultiXactMembers
+ * and the number of members in the multixact. We determine the latter as
+ * the difference between this multixact's starting offset and the next
+ * one's. However, there are some corner cases to worry about:
+ *
+ * 1. This multixact may be the latest one created, in which case there is
+ * no next one to look at. The next multixact's offset should be set
+ * already, as we set it in RecordNewMultiXact(), but we used to not do
+ * that in older minor versions. To cope with that case, if this
+ * multixact is the latest one created, use the nextOffset value we read
+ * above as the endpoint.
+ *
+ * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
+ * for to mean "unset", there is an ambiguity near the point of offset
+ * wraparound. If we see next multixact's offset is one, is that our
+ * multixact's actual endpoint, or did it end at zero with a subsequent
+ * increment? We handle this using the knowledge that if the zero'th
+ * member slot wasn't filled, it'll contain zero, and zero isn't a valid
+ * transaction ID so it can't be a multixact member. Therefore, if we
+ * read a zero from the members array, just ignore it.
+ */
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ buf = SlruReadSwitchPage(state->offset, pageno);
+ offptr = (MultiXactOffset32 *) buf;
+ offptr += entryno;
+ offset = *offptr;
+
+ if (offset == 0)
+ {
+ /* Invalid entry */
+ return false;
+ }
+
+ /*
+ * Use the same increment rule as GetNewMultiXactId(), that is, don't
+ * handle wraparound explicitly until needed.
+ */
+ tmpMXact = multi + 1;
+
+ if (nextMXact == tmpMXact)
+ {
+ /* Corner case 1: there is no next multixact */
+ nextMXOffset = nextOffset;
+ }
+ else
+ {
+ /* handle wraparound if needed */
+ if (tmpMXact < FirstMultiXactId)
+ tmpMXact = FirstMultiXactId;
+
+ prev_pageno = pageno;
+
+ pageno = MultiXactIdToOffsetPage(tmpMXact);
+ entryno = MultiXactIdToOffsetEntry(tmpMXact);
+
+ if (pageno != prev_pageno)
+ buf = SlruReadSwitchPage(state->offset, pageno);
+
+ offptr = (MultiXactOffset32 *) buf;
+ offptr += entryno;
+ nextMXOffset = *offptr;
+ }
+
+ if (nextMXOffset == 0)
+ {
+ /* Invalid entry */
+ return false;
+ }
+
+ /*
+ * Both operands are unsigned 32-bit, so the subtraction is done modulo
+ * 2^32 before the result is stored in 'length'.
+ */
+ length = nextMXOffset - offset;
+
+ /* read the members */
+ prev_pageno = -1;
+ for (int i = 0; i < length; i++, offset++)
+ {
+ TransactionId *xactptr;
+ uint32 *flagsptr;
+ int flagsoff;
+ int bshift;
+ int memberoff;
+ MultiXactStatus status;
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+
+ if (pageno != prev_pageno)
+ {
+ buf = SlruReadSwitchPage(state->members, pageno);
+ prev_pageno = pageno;
+ }
+
+ xactptr = (TransactionId *) (buf + memberoff);
+ if (!TransactionIdIsValid(*xactptr))
+ {
+ /*
+ * Corner case 2: we are looking at unused slot zero
+ */
+ if (offset == 0)
+ continue;
+
+ /*
+ * Otherwise this is an invalid entry that should not be
+ * referenced from anywhere in the heap. We could return 'false'
+ * here, but we prefer to continue reading the members and
+ * converting them the best we can, to preserve evidence in case
+ * this is corruption that should not happen.
+ */
+ }
+
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+ flagsptr = (uint32 *) (buf + flagsoff);
+
+ status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+
+ /*
+ * Remember the updating XID among the members, or first locking XID
+ * if no updating XID.
+ */
+ if (ISUPDATE_from_mxstatus(status))
+ {
+ /* sanity check */
+ if (ISUPDATE_from_mxstatus(result_status))
+ {
+ /*
+ * We don't expect to see more than one updating member, even
+ * if the server had crashed.
+ */
+ pg_fatal("multixact %u has more than one updating member",
+ multi);
+ }
+ result_xid = *xactptr;
+ result_status = status;
+ }
+ else if (!TransactionIdIsValid(result_xid))
+ {
+ result_xid = *xactptr;
+ result_status = status;
+ }
+ }
+
+ member->xid = result_xid;
+ member->status = result_status;
+ return true;
+}
+
+/*
+ * Frees the malloced reader.
+ *
+ * Releases the offsets and members SLRU readers first, then the reader
+ * struct itself; 'state' must not be used after this call.
+ */
+void
+FreeOldMultiXactReader(OldMultiXactReader *state)
+{
+ FreeSlruRead(state->offset);
+ FreeSlruRead(state->members);
+
+ pfree(state);
+}
--- /dev/null
+/*
+ * multixact_read_v18.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_read_v18.h
+ */
+#ifndef MULTIXACT_READ_V18_H
+#define MULTIXACT_READ_V18_H
+
+#include "access/multixact.h"
+#include "slru_io.h"
+
+/*
+ * MultiXactOffset changed from uint32 to uint64 between versions 18 and 19.
+ * MultiXactOffset32 is used to represent a 32-bit offset from the old
+ * cluster.
+ */
+typedef uint32 MultiXactOffset32;
+
+typedef struct OldMultiXactReader
+{
+ /* next multixid and next member offset, as given to AllocOldMultiXactRead() */
+ MultiXactId nextMXact;
+ MultiXactOffset32 nextOffset;
+
+ /* readers for the pg_multixact/offsets and pg_multixact/members SLRUs */
+ SlruSegState *offset;
+ SlruSegState *members;
+} OldMultiXactReader;
+
+extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata,
+ MultiXactId nextMulti,
+ MultiXactOffset32 nextOffset);
+extern bool GetOldMultiXactIdSingleMember(OldMultiXactReader *state,
+ MultiXactId multi,
+ MultiXactMember *member);
+extern void FreeOldMultiXactReader(OldMultiXactReader *reader);
+
+#endif /* MULTIXACT_READ_V18_H */
--- /dev/null
+/*
+ * multixact_rewrite.c
+ *
+ * Functions to convert multixact SLRUs from the pre-v19 format to the current
+ * format with 64-bit MultiXactOffsets.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_rewrite.c
+ */
+
+#include "postgres_fe.h"
+
+#include "access/multixact_internal.h"
+#include "multixact_read_v18.h"
+#include "pg_upgrade.h"
+
+static void RecordMultiXactOffset(SlruSegState *offsets_writer, MultiXactId multi,
+ MultiXactOffset offset);
+static void RecordMultiXactMembers(SlruSegState *members_writer,
+ MultiXactOffset offset,
+ int nmembers, MultiXactMember *members);
+
+/*
+ * Convert pg_multixact/offsets and pg_multixact/members from the old pre-v19
+ * format with 32-bit offsets to the current format.
+ *
+ * Multixids in the range [from_multi, to_multi) are read from the old
+ * cluster, and written in the new format. An important edge case is that if
+ * from_multi == to_multi, this initializes the new pg_multixact files in the
+ * new format without trying to open any old files. (We rely on that when
+ * upgrading from PostgreSQL version 9.2 or below.)
+ *
+ * Returns the new nextOffset value; the caller should set it in the new
+ * control file. The new members always start from offset 1, regardless of
+ * the offset range used in the old cluster.
+ */
+MultiXactOffset
+rewrite_multixacts(MultiXactId from_multi, MultiXactId to_multi)
+{
+ MultiXactOffset next_offset;
+ SlruSegState *offsets_writer;
+ SlruSegState *members_writer;
+ char dir[MAXPGPATH] = {0};
+ bool prev_multixid_valid = false;
+
+ /*
+ * The range of valid multi XIDs is unchanged by the conversion (they are
+ * referenced from the heap tables), but the members SLRU is rewritten to
+ * start from offset 1.
+ */
+ next_offset = 1;
+
+ /*
+ * Prepare to write the new SLRU files. The directory names are built
+ * with a bounded snprintf, so that an overlong data directory path cannot
+ * overrun 'dir'.
+ */
+ snprintf(dir, sizeof(dir), "%s/pg_multixact/offsets", new_cluster.pgdata);
+ offsets_writer = AllocSlruWrite(dir, false);
+ SlruWriteSwitchPage(offsets_writer, MultiXactIdToOffsetPage(from_multi));
+
+ snprintf(dir, sizeof(dir), "%s/pg_multixact/members", new_cluster.pgdata);
+ members_writer = AllocSlruWrite(dir, true /* use long segment names */ );
+ SlruWriteSwitchPage(members_writer, MXOffsetToMemberPage(next_offset));
+
+ /*
+ * Convert old multixids, if needed, by reading them one-by-one from the
+ * old cluster.
+ */
+ if (to_multi != from_multi)
+ {
+ OldMultiXactReader *old_reader;
+
+ old_reader = AllocOldMultiXactRead(old_cluster.pgdata,
+ old_cluster.controldata.chkpnt_nxtmulti,
+ old_cluster.controldata.chkpnt_nxtmxoff);
+
+ for (MultiXactId multi = from_multi; multi != to_multi;)
+ {
+ MultiXactMember member;
+ bool multixid_valid;
+
+ /*
+ * Read this multixid's members.
+ *
+ * Locking-only XIDs that may be part of multi-xids don't matter
+ * after upgrade, as there can be no transactions running across
+ * upgrade. So as a small optimization, we only read one member
+ * from each multixid: the one updating one, or if there was no
+ * update, arbitrarily the first locking xid.
+ */
+ multixid_valid = GetOldMultiXactIdSingleMember(old_reader, multi, &member);
+
+ /*
+ * Write the new offset to pg_multixact/offsets.
+ *
+ * Even if this multixid is invalid, we still need to write its
+ * offset if the *previous* multixid was valid. That's because
+ * when reading a multixid, the number of members is calculated
+ * from the difference between the two offsets.
+ */
+ RecordMultiXactOffset(offsets_writer, multi,
+ (multixid_valid || prev_multixid_valid) ? next_offset : 0);
+
+ /* Write the members */
+ if (multixid_valid)
+ {
+ RecordMultiXactMembers(members_writer, next_offset, 1, &member);
+ next_offset += 1;
+ }
+
+ /* Advance to next multixid, handling wraparound */
+ multi++;
+ if (multi < FirstMultiXactId)
+ multi = FirstMultiXactId;
+ prev_multixid_valid = multixid_valid;
+ }
+
+ FreeOldMultiXactReader(old_reader);
+ }
+
+ /* Write the final 'next' offset to the last SLRU page */
+ RecordMultiXactOffset(offsets_writer, to_multi,
+ prev_multixid_valid ? next_offset : 0);
+
+ /* Flush the last SLRU pages */
+ FreeSlruWrite(offsets_writer);
+ FreeSlruWrite(members_writer);
+
+ return next_offset;
+}
+
+
+/*
+ * Write one offset to the offset SLRU
+ *
+ * Switches the writer to the page that holds the entry for 'multi' and
+ * stores 'offset' in that entry of the returned page buffer.
+ */
+static void
+RecordMultiXactOffset(SlruSegState *offsets_writer, MultiXactId multi,
+ MultiXactOffset offset)
+{
+ int64 pageno;
+ int entryno;
+ char *buf;
+ MultiXactOffset *offptr;
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ buf = SlruWriteSwitchPage(offsets_writer, pageno);
+ offptr = (MultiXactOffset *) buf;
+ offptr[entryno] = offset;
+}
+
+/*
+ * Write the members for one multixid in the members SLRU
+ *
+ * 'offset' is the position of the first member; the remaining members are
+ * written at the following consecutive offsets. For each member, the XID is
+ * stored in its slot and the status is merged into the member's flag word.
+ *
+ * (Currently, this is only ever called with nmembers == 1)
+ */
+static void
+RecordMultiXactMembers(SlruSegState *members_writer,
+ MultiXactOffset offset,
+ int nmembers, MultiXactMember *members)
+{
+ for (int i = 0; i < nmembers; i++, offset++)
+ {
+ int64 pageno;
+ char *buf;
+ TransactionId *memberptr;
+ uint32 *flagsptr;
+ uint32 flagsval;
+ uint32 flagsmask;
+ int bshift;
+ int flagsoff;
+ int memberoff;
+
+ Assert(members[i].status <= MultiXactStatusUpdate);
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+
+ buf = SlruWriteSwitchPage(members_writer, pageno);
+
+ memberptr = (TransactionId *) (buf + memberoff);
+
+ *memberptr = members[i].xid;
+
+ flagsptr = (uint32 *) (buf + flagsoff);
+
+ /*
+ * Replace only this member's bits in the flag word. The mask and the
+ * status are shifted as uint32, so that moving them into the word's
+ * high bits cannot overflow a signed int.
+ */
+ flagsmask = (uint32) ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift;
+
+ flagsval = *flagsptr;
+ flagsval &= ~flagsmask;
+ flagsval |= ((uint32) members[i].status << bshift);
+ *flagsptr = flagsval;
+ }
+}
#include <time.h>
+#include "access/multixact.h"
#include "catalog/pg_class_d.h"
#include "common/file_perm.h"
#include "common/logging.h"
new_cluster.pgdata);
check_ok();
- /*
- * If the old server is before the MULTIXACT_FORMATCHANGE_CAT_VER change
- * (see pg_upgrade.h) and the new server is after, then we don't copy
- * pg_multixact files, but we need to reset pg_control so that the new
- * server doesn't attempt to read multis older than the cutoff value.
- */
- if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ /* Copy or convert pg_multixact files */
+ Assert(new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER);
+ Assert(new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER);
+ if (old_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
{
+ /* No change in multixact format, just copy the files */
+ MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti;
+ MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff;
+
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
* counters here and the oldest multi present on system.
*/
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
- "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"",
- new_cluster.bindir,
- old_cluster.controldata.chkpnt_nxtmxoff,
- old_cluster.controldata.chkpnt_nxtmulti,
+ "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"",
+ new_cluster.bindir, new_nxtmxoff, new_nxtmulti,
old_cluster.controldata.chkpnt_oldstMulti,
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else
{
+ /* Conversion is needed */
+ MultiXactId nxtmulti;
+ MultiXactId oldstMulti;
+ MultiXactOffset nxtmxoff;
+
/*
- * Remove offsets/0000 file created by initdb that no longer matches
- * the new multi-xid value. "members" starts at zero so no need to
- * remove it.
+ * Determine the range of multixacts to convert.
*/
- remove_new_subdir("pg_multixact/offsets", false);
+ nxtmulti = old_cluster.controldata.chkpnt_nxtmulti;
+ if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ {
+ /* Versions 9.3 - 18: convert all multixids */
+ oldstMulti = old_cluster.controldata.chkpnt_oldstMulti;
+ }
+ else
+ {
+ /*
+ * In PostgreSQL 9.2 and below, multitransactions were only used
+ * for row locking, and as such don't need to be preserved during
+ * upgrade. In that case, we utilize rewrite_multixacts() just to
+ * initialize new, empty files in the new format.
+ *
+ * It's important that the oldest multi is set to the latest value
+ * used by the old system, so that multixact.c returns the empty
+ * set for multis that might be present on disk.
+ */
+ oldstMulti = nxtmulti;
+ }
+ /* handle wraparound */
+ if (nxtmulti < FirstMultiXactId)
+ nxtmulti = FirstMultiXactId;
+ if (oldstMulti < FirstMultiXactId)
+ oldstMulti = FirstMultiXactId;
- prep_status("Setting oldest multixact ID in new cluster");
+ /*
+ * Remove the files created by initdb in the new cluster.
+ * rewrite_multixacts() will create new ones.
+ */
+ remove_new_subdir("pg_multixact/members", false);
+ remove_new_subdir("pg_multixact/offsets", false);
/*
- * We don't preserve files in this case, but it's important that the
- * oldest multi is set to the latest value used by the old system, so
- * that multixact.c returns the empty set for multis that might be
- * present on disk. We set next multi to the value following that; it
- * might end up wrapped around (i.e. 0) if the old cluster had
- * next=MaxMultiXactId, but multixact.c can cope with that just fine.
+ * Create new pg_multixact files, converting old ones if needed.
*/
+ prep_status("Converting pg_multixact files");
+ nxtmxoff = rewrite_multixacts(oldstMulti, nxtmulti);
+ check_ok();
+
+ prep_status("Setting next multixact ID and offset for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
- "\"%s/pg_resetwal\" -m %u,%u \"%s\"",
+ "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"",
new_cluster.bindir,
- old_cluster.controldata.chkpnt_nxtmulti + 1,
- old_cluster.controldata.chkpnt_nxtmulti,
+ nxtmxoff, nxtmulti, oldstMulti,
new_cluster.pgdata);
check_ok();
}
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * MultiXactOffset was changed from 32-bit to 64-bit in version 19, at this
+ * catalog version. pg_multixact files need to be converted when upgrading
+ * across this version.
+ */
+#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 202512091
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
uint32 chkpnt_nxtepoch;
uint32 chkpnt_nxtoid;
uint32 chkpnt_nxtmulti;
- uint32 chkpnt_nxtmxoff;
+ uint64 chkpnt_nxtmxoff;
uint32 chkpnt_oldstMulti;
uint32 chkpnt_oldstxid;
uint32 align;
void report_extension_updates(ClusterInfo *cluster);
+/* multixact_rewrite.c */
+MultiXactOffset rewrite_multixacts(MultiXactId from_multi, MultiXactId to_multi);
+
/* parallel.c */
void parallel_exec_prog(const char *log_file, const char *opt_log_file,
const char *fmt,...) pg_attribute_printf(3, 4);
--- /dev/null
+/*
+ * slru_io.c
+ *
+ * Routines for reading and writing SLRU files during upgrade.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.c
+ */
+
+#include "postgres_fe.h"
+
+#include <fcntl.h>
+
+#include "common/fe_memutils.h"
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "pg_upgrade.h"
+#include "port/pg_iovec.h"
+#include "slru_io.h"
+
+static SlruSegState *AllocSlruSegState(const char *dir);
+static char *SlruFileName(SlruSegState *state, int64 segno);
+static void SlruFlush(SlruSegState *state);
+
+/*
+ * Allocate a SlruSegState for directory 'dir', with no segment open yet.
+ *
+ * This holds the initialization shared by AllocSlruRead() and
+ * AllocSlruWrite().
+ */
+static SlruSegState *
+AllocSlruSegState(const char *dir)
+{
+	SlruSegState *result;
+
+	result = pg_malloc(sizeof(SlruSegState));
+
+	/* No segment file is open until the first page switch */
+	result->fd = -1;
+	result->segno = -1;
+	result->pageno = 0;
+	result->fn = NULL;
+
+	/* Keep a private copy of the directory name */
+	result->dir = pstrdup(dir);
+
+	/* result->writing and result->long_segment_names must be set by caller! */
+
+	return result;
+}
+
+/*
+ * Build the path of the segment file 'segno' under state->dir.
+ *
+ * The result is a newly allocated string (psprintf).  With long segment
+ * names the segment number is formatted as 15 hexadecimal digits, otherwise
+ * as (at least) 4.  Similar to the backend function with the same name.
+ */
+static char *
+SlruFileName(SlruSegState *state, int64 segno)
+{
+	if (!state->long_segment_names)
+	{
+		/* short naming scheme */
+		Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
+		return psprintf("%s/%04X", state->dir, (unsigned int) segno);
+	}
+
+	/* long naming scheme */
+	Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+	return psprintf("%s/%015" PRIX64, state->dir, segno);
+}
+
+/*
+ * Create an SLRU reader for the files in 'dir'.
+ *
+ * 'long_segment_names' selects the file naming scheme, see SlruFileName().
+ * Release the reader with FreeSlruRead().
+ */
+SlruSegState *
+AllocSlruRead(const char *dir, bool long_segment_names)
+{
+	SlruSegState *reader = AllocSlruSegState(dir);
+
+	reader->long_segment_names = long_segment_names;
+	reader->writing = false;
+
+	return reader;
+}
+
+/*
+ * Read the given page into memory buffer.
+ *
+ * Reading can be done in random order.
+ *
+ * If the file containing 'pageno' does not exist, a fatal error is raised.
+ * If the file exists but is shorter than expected, the missing part is read
+ * as zeros and a warning is logged.  That is reasonable behavior for current
+ * callers.
+ *
+ * Returns a pointer to the state's one-page buffer, which stays valid until
+ * the next page switch on this reader.
+ *
+ * This is the slow path of the inlineable SlruReadSwitchPage() function.
+ */
+char *
+SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno)
+{
+	int64		segno;
+	off_t		offset;
+	ssize_t		bytes_read;
+
+	Assert(!state->writing);	/* read only mode */
+
+	/* Nothing to do if the requested page is already in the buffer */
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	/* If the new page is on a different SLRU segment, open the new segment */
+	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+
+			pg_free(state->fn);
+			state->fn = NULL;
+
+			state->segno = -1;
+		}
+
+		state->fn = SlruFileName(state, segno);
+		if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", state->fn);
+		state->segno = segno;
+	}
+
+	offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+	bytes_read = 0;
+	while (bytes_read < BLCKSZ)
+	{
+		ssize_t		rc;
+
+		/*
+		 * Note: the destination must be computed from the char array itself
+		 * (state->buf.data), not from its address.  "&state->buf.data + n"
+		 * is arithmetic on a pointer-to-array and advances by n * BLCKSZ
+		 * bytes, which would land far outside the buffer after a short read.
+		 */
+		rc = pg_pread(state->fd,
+					  state->buf.data + bytes_read,
+					  BLCKSZ - bytes_read,
+					  offset + bytes_read);
+		if (rc < 0)
+		{
+			if (errno == EINTR)
+				continue;
+			pg_fatal("could not read file \"%s\": %m", state->fn);
+		}
+		if (rc == 0)
+		{
+			/*
+			 * Unexpected EOF.  The offset is within one segment, so it
+			 * always fits in ssize_t; the cast makes the argument match the
+			 * %zd conversion even where off_t is wider.
+			 */
+			pg_log(PG_WARNING, "unexpected EOF reading file \"%s\" at offset %zd, reading as zeros", state->fn,
+				   (ssize_t) (offset + bytes_read));
+			memset(state->buf.data + bytes_read, 0, BLCKSZ - bytes_read);
+			break;
+		}
+		bytes_read += rc;
+	}
+	state->pageno = pageno;
+
+	return state->buf.data;
+}
+
+/*
+ * Free the reader.
+ *
+ * Closes the currently open segment file, if any, and releases the state
+ * together with the strings it owns (directory name and current file name).
+ */
+void
+FreeSlruRead(SlruSegState *state)
+{
+	Assert(!state->writing);	/* read only mode */
+
+	if (state->fd != -1)
+		close(state->fd);
+
+	/* fn is NULL if no segment was ever opened; pg_free() copes with that */
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
+
+/*
+ * Create an SLRU writer for the files in 'dir'.
+ *
+ * 'long_segment_names' selects the file naming scheme, see SlruFileName().
+ * Release the writer with FreeSlruWrite(), which also flushes the last page.
+ */
+SlruSegState *
+AllocSlruWrite(const char *dir, bool long_segment_names)
+{
+	SlruSegState *writer = AllocSlruSegState(dir);
+
+	writer->long_segment_names = long_segment_names;
+	writer->writing = true;
+
+	return writer;
+}
+
+/*
+ * Open the given page for writing.
+ *
+ * Returns a pointer to the state's one-page buffer, zero-filled, for the
+ * caller to fill in. The previously buffered page, if any, is written out
+ * first. The page returned here is written out by the next page switch or
+ * by FreeSlruWrite().
+ *
+ * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that
+ * each segment is written in full before moving on to the next one. This
+ * limitation would be easy to lift if needed, but it fits the usage pattern
+ * of current callers.
+ *
+ * This is the slow path of the inlineable SlruWriteSwitchPage() function.
+ */
+char *
+SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno)
+{
+ int64 segno;
+ off_t offset;
+
+ Assert(state->writing);
+
+ /* Nothing to do if the requested page is already the buffered one */
+ if (state->segno != -1 && pageno == state->pageno)
+ return state->buf.data;
+
+ segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+ /*
+ * Write out the old page while state->fd and state->pageno still refer
+ * to it, then start the new page from all zeros.
+ */
+ SlruFlush(state);
+ memset(state->buf.data, 0, BLCKSZ);
+
+ if (segno != state->segno)
+ {
+ /* Close the previous segment, if one is open */
+ if (state->segno != -1)
+ {
+ close(state->fd);
+ state->fd = -1;
+
+ pg_free(state->fn);
+ state->fn = NULL;
+
+ state->segno = -1;
+ }
+
+ /* Create the segment */
+ state->fn = SlruFileName(state, segno);
+ if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ {
+ pg_fatal("could not create file \"%s\": %m", state->fn);
+ }
+
+ state->segno = segno;
+
+ /*
+ * If the first page written to this segment is not the segment's first
+ * page, zero-fill the part of the file that precedes it.
+ */
+ if (offset > 0)
+ {
+ if (pg_pwrite_zeros(state->fd, offset, 0) < 0)
+ pg_fatal("could not write file \"%s\": %m", state->fn);
+ }
+ }
+
+ state->pageno = pageno;
+
+ return state->buf.data;
+}
+
+/*
+ * Write the buffered page to its position in the currently open segment.
+ *
+ * Does nothing if no segment is open.  Any write failure is fatal.
+ */
+static void
+SlruFlush(SlruSegState *state)
+{
+	struct iovec iov;
+	off_t		file_pos;
+
+	/* No segment open, so there is no page to write */
+	if (state->segno == -1)
+		return;
+
+	iov.iov_base = &state->buf;
+	iov.iov_len = BLCKSZ;
+
+	/* position of the buffered page within its segment file */
+	file_pos = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	if (pg_pwritev_with_retry(state->fd, &iov, 1, file_pos) < 0)
+		pg_fatal("could not write file \"%s\": %m", state->fn);
+}
+
+/*
+ * Free the writer.
+ *
+ * Writes out the last buffered page, closes the currently open segment
+ * file, if any, and releases the state together with the strings it owns
+ * (directory name and current file name).
+ */
+void
+FreeSlruWrite(SlruSegState *state)
+{
+	Assert(state->writing);
+
+	/* The last page is still only in the buffer; write it out */
+	SlruFlush(state);
+
+	if (state->fd != -1)
+		close(state->fd);
+
+	/* fn is NULL if no segment was ever created; pg_free() copes with that */
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
--- /dev/null
+/*
+ * slru_io.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.h
+ */
+
+#ifndef SLRU_IO_H
+#define SLRU_IO_H
+
+/*
+ * State for reading or writing an SLRU, with a one page buffer.
+ *
+ * Created with AllocSlruRead() or AllocSlruWrite(), and released with the
+ * matching FreeSlruRead() or FreeSlruWrite().
+ */
+typedef struct SlruSegState
+{
+ bool writing; /* true for a writer, false for a reader */
+ bool long_segment_names; /* selects the segment file naming scheme */
+
+ char *dir; /* directory holding the segment files */
+ char *fn; /* path of the open segment file, or NULL */
+ int fd; /* descriptor of the open segment, or -1 */
+ int64 segno; /* number of the open segment, or -1 */
+ uint64 pageno; /* page held in 'buf'; valid if segno != -1 */
+
+ PGAlignedBlock buf; /* the one page buffer */
+} SlruSegState;
+
+extern SlruSegState *AllocSlruRead(const char *dir, bool long_segment_names);
+extern char *SlruReadSwitchPageSlow(SlruSegState *state, uint64 pageno);
+extern void FreeSlruRead(SlruSegState *state);
+
+/*
+ * Return a buffer holding page 'pageno', reading it in if it is not the
+ * page that is currently buffered.
+ */
+static inline char *
+SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	/* Take the slow path unless the requested page is already buffered */
+	if (state->segno == -1 || pageno != state->pageno)
+		return SlruReadSwitchPageSlow(state, pageno);
+
+	return state->buf.data;
+}
+
+extern SlruSegState *AllocSlruWrite(const char *dir, bool long_segment_names);
+extern char *SlruWriteSwitchPageSlow(SlruSegState *state, uint64 pageno);
+extern void FreeSlruWrite(SlruSegState *state);
+
+/*
+ * Return the buffer for writing page 'pageno', switching to that page if it
+ * is not the one that is currently buffered.
+ */
+static inline char *
+SlruWriteSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	/* Take the slow path unless the requested page is already buffered */
+	if (state->segno == -1 || pageno != state->pageno)
+		return SlruWriteSwitchPageSlow(state, pageno);
+
+	return state->buf.data;
+}
+
+#endif /* SLRU_IO_H */
--- /dev/null
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Version 19 expanded MultiXactOffset from 32 to 64 bits. Upgrading
+# across that requires rewriting the SLRU files to the new format.
+# This file contains tests for the conversion.
+#
+# To run, set 'oldinstall' ENV variable to point to a pre-v19
+# installation. If it's not set, or if it points to a v19 or above
+# installation, this still performs a very basic test, upgrading a
+# cluster with some multixacts. It's not very interesting, however,
+# because there's no conversion involved in that case.
+
+use strict;
+use warnings FATAL => 'all';
+
+use Math::BigInt;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Temp dir for dumps.
+my $tempdir = PostgreSQL::Test::Utils::tempdir;
+
+# A workload that consumes multixids. The purpose of this is to
+# generate some multixids in the old cluster, so that we can test
+# upgrading them. The workload is a mix of KEY SHARE locking queries
+# and UPDATEs, and commits and aborts, to generate a mix of multixids
+# with different statuses. It consumes around 3000 multixids with
+# 30000 members. That's enough to span more than one multixid
+# 'offsets' page, and more than one 'members' segment.
+#
+# The workload leaves behind a table called 'mxofftest' containing a
+# small number of rows referencing some of the generated multixids.
+#
+# Because this function is used to generate test data on the old
+# installation, it needs to work with older PostgreSQL server
+# versions.
+#
+# The first argument is the cluster to connect to, the second argument
+# is a cluster using the new version. We need the 'psql' binary from
+# the new version, the new cluster is otherwise unused. (We need to
+# use the new 'psql' because some of the more advanced background psql
+# perl module features depend on a fairly recent psql version.)
+sub mxact_workload
+{
+	my $node = shift;       # Cluster to connect to
+	my $binnode = shift;    # Use the psql binary from this cluster
+
+	my $connstr = $node->connstr('postgres');
+
+	$node->start;
+	$node->safe_psql(
+		'postgres', qq[
+		CREATE TABLE mxofftest (id INT PRIMARY KEY, n_updated INT)
+		  WITH (AUTOVACUUM_ENABLED=FALSE);
+		INSERT INTO mxofftest SELECT G, 0 FROM GENERATE_SERIES(1, 50) G;
+	]);
+
+	my $nclients = 20;
+	my $nqueries = 3000;
+	my $update_every = 13;
+	my $abort_every = 11;
+	my @connections = ();
+
+	# Silence the logging of the statements we run to avoid
+	# unnecessarily bloating the test logs.  This runs before the
+	# upgrade we're testing, so the details should not be very
+	# interesting for debugging.  But if needed, you can make it more
+	# verbose by setting this.
+	my $verbose = 0;
+
+	# Open $nclients connections to the database.  Start a transaction
+	# in each connection.  (The round-robin loop below only ever picks
+	# connections 0 .. $nclients - 1, so do not open more than that.)
+	for (1 .. $nclients)
+	{
+		# Use the psql binary from the new installation.  The
+		# BackgroundPsql functionality doesn't work with older psql
+		# versions.
+		my $conn = $binnode->background_psql('', connstr => $connstr);
+
+		$conn->query_safe("SET log_statement=none", verbose => $verbose)
+		  unless $verbose;
+		$conn->query_safe("SET enable_seqscan=off", verbose => $verbose);
+		$conn->query_safe("BEGIN", verbose => $verbose);
+
+		push(@connections, $conn);
+	}
+
+	# Run queries, cycling through the connections in a round-robin
+	# fashion.  We keep a transaction open in each connection at all
+	# times, and lock/update the rows.  Each SELECT FOR KEY SHARE query
+	# generates a new multixid, containing the XIDs of the transactions
+	# running at the time.
+	for (my $i = 0; $i < $nqueries; $i++)
+	{
+		my $conn = $connections[ $i % $nclients ];
+
+		# Finish the connection's previous transaction, mostly with
+		# COMMIT but every now and then with ABORT, and start a new one.
+		my $sql = ($i % $abort_every == 0) ? "ABORT" : "COMMIT";
+		$conn->query_safe($sql, verbose => $verbose);
+
+		$conn->query_safe("BEGIN", verbose => $verbose);
+		if ($i % $update_every == 0)
+		{
+			$sql = qq[
+			  UPDATE mxofftest SET n_updated = n_updated + 1 WHERE id = ${i} % 50;
+			];
+		}
+		else
+		{
+			# The set of locked rows shrinks as the workload progresses
+			my $threshold = int($i / $nqueries * 50);
+			$sql = qq[
+			  select count(*) from (
+				SELECT * FROM mxofftest WHERE id >= $threshold FOR KEY SHARE
+			  ) as x
+			];
+		}
+		$conn->query_safe($sql, verbose => $verbose);
+	}
+
+	for my $conn (@connections)
+	{
+		$conn->quit();
+	}
+
+	$node->stop;
+	return;
+}
+
+# Dump the contents of the 'mxofftest' table, created by
+# mxact_workload, into the file $filename under the temp directory.
+# Returns the path of the file written.
+sub get_test_table_contents
+{
+	my $node = shift;
+	my $filename = shift;
+
+	my $dump = $node->safe_psql('postgres',
+		"SELECT ctid, xmin, xmax, * FROM mxofftest");
+
+	my $path = "$tempdir/$filename";
+	open(my $out, '>', $path)
+	  or die "could not open $path for writing $!";
+	print $out $dump;
+	close($out);
+
+	return $path;
+}
+
+# Dump the members of all updating multixids in the range [$from, $to)
+# into the file $filename under the temp directory.  Returns the path
+# of the file written.
+sub get_updating_multixact_members
+{
+	my ($node, $from, $to, $filename) = @_;
+
+	my $path = $tempdir . '/' . $filename;
+	open(my $fh, '>', $path)
+	  || die "could not open $path for writing $!";
+
+	# Bounds to pass to generate_series(), as [lower, upper] pairs.  If
+	# the multixids wrapped around, the range is split into two parts,
+	# before and after the wraparound.
+	my @series =
+	  ($to >= $from)
+	  ? ([ $from, "$to - 1" ])
+	  : ([ $from, "4294967295" ], [ 1, "$to - 1" ]);
+
+	foreach my $bounds (@series)
+	{
+		my ($lower, $upper) = @$bounds;
+		my $res = $node->safe_psql(
+			'postgres', qq[
+			SELECT multi, mode, xid
+			FROM generate_series($lower, $upper) as multi,
+				 pg_get_multixact_members(multi::text::xid)
+			WHERE mode not in ('keysh', 'sh');
+		]);
+		print $fh $res;
+	}
+
+	close($fh);
+	return $path;
+}
+
+# Read multixid related fields from the control file.  Returns the
+# list (oldestMultiXid, NextMultiXactId, NextMultiOffset).
+#
+# Note: This is used on both the old and the new installation, so the
+# command arguments and the output parsing used here must work with
+# all PostgreSQL versions supported by the test.
+sub read_multixid_fields
+{
+	my $node = shift;
+
+	my $pg_controldata_path = $node->installed_command('pg_controldata');
+	my ($stdout, $stderr) =
+	  run_command([ $pg_controldata_path, $node->data_dir ]);
+
+	# Pick out each "Latest checkpoint's <field>:" line in turn
+	my @values;
+	foreach my $field (qw(oldestMultiXid NextMultiXactId NextMultiOffset))
+	{
+		$stdout =~ /^Latest checkpoint's \Q$field\E:\s*(.*)$/m
+		  or die "could not read $field from pg_controldata";
+		push(@values, $1);
+	}
+
+	return ($values[0], $values[1], $values[2]);
+}
+
+# Reset a cluster's next multixid and mxoffset to given values.
+#
+# Note: This is used on the old installation, so the command arguments
+# and the output parsing used here must work with all pre-v19
+# PostgreSQL versions supported by the test.
+sub reset_mxid_mxoffset_pre_v19
+{
+	my ($node, $mxid, $mxoffset) = @_;
+
+	my $pg_resetwal_path = $node->installed_command('pg_resetwal');
+
+	# Look at the current control file values with a dry run
+	my ($out, $err) =
+	  run_command([ $pg_resetwal_path, '--dry-run', $node->data_dir ]);
+
+	# Get block size; it is needed for the calculations below
+	$out =~ /^Database block size: *(\d+)$/m or die;
+	my $blcksz = $1;
+
+	# Verify that no multixids are currently in use.  Resetting would
+	# destroy them.  (A freshly initialized cluster has no multixids.)
+	$out =~ /^Latest checkpoint's NextMultiXactId: *(\d+)$/m or die;
+	my $next_mxid = $1;
+	$out =~ /^Latest checkpoint's oldestMultiXid: *(\d+)$/m or die;
+	my $oldest_mxid = $1;
+	die "cluster has some multixids in use" unless $next_mxid == $oldest_mxid;
+
+	# SLRU_PAGES_PER_SEGMENT is always 32 on pre-19 versions
+	my $slru_pages_per_segment = 32;
+	my $bytes_per_seg = $slru_pages_per_segment * $blcksz;
+
+	# Do the reset
+	my @cmd = (
+		$pg_resetwal_path,
+		'--pgdata' => $node->data_dir,
+		'--multixact-offset' => $mxoffset,
+		'--multixact-ids' => "$mxid,$mxid");
+	command_ok(\@cmd, 'reset multixids and offset');
+
+	# pg_resetwal just updates the control file.  The cluster will
+	# refuse to start up if the SLRU segments corresponding to the
+	# next multixid and offset do not exist.  Create segments that
+	# cover the given values, filled with zeros.  But first remove
+	# any old segments.
+	unlink glob $node->data_dir . "/pg_multixact/offsets/*";
+	unlink glob $node->data_dir . "/pg_multixact/members/*";
+
+	# Create segment number $segno of the given multixact SLRU
+	# ('offsets' or 'members'), filled with zeros.
+	my $create_zeroed_segment = sub {
+		my ($slru, $segno) = @_;
+
+		my $path = sprintf('%s/pg_multixact/%s/%04X',
+			$node->data_dir, $slru, $segno);
+		open my $fh, ">", $path
+		  or die "could not open \"$path\": $!";
+		binmode $fh;
+		syswrite($fh, "\0" x $bytes_per_seg) == $bytes_per_seg
+		  or die "could not write to \"$path\": $!";
+		close($fh);
+	};
+
+	# Initialize the 'offsets' SLRU file containing the new next
+	# multixid.
+	#
+	# sizeof(MultiXactOffset) == 4 in PostgreSQL versions before 19
+	my $multixact_offsets_per_page = $blcksz / 4;
+	$create_zeroed_segment->(
+		'offsets',
+		int($mxid / $multixact_offsets_per_page / $slru_pages_per_segment));
+
+	# Same for the 'members' SLRU
+	my $multixact_members_per_page = int($blcksz / 20) * 4;
+	$create_zeroed_segment->(
+		'members',
+		int($mxoffset / $multixact_members_per_page / $slru_pages_per_segment)
+	);
+}
+
+# Main test workhorse routine.  Run pg_upgrade, dump data from the old
+# and the upgraded cluster, and compare the dumps.
+#
+# $tag is used to build unique names for the dump files.
+sub upgrade_and_compare
+{
+	my ($tag, $oldnode, $newnode) = @_;
+
+	my @pg_upgrade_cmd = (
+		'pg_upgrade', '--no-sync',
+		'--old-datadir' => $oldnode->data_dir,
+		'--new-datadir' => $newnode->data_dir,
+		'--old-bindir' => $oldnode->config_data('--bindir'),
+		'--new-bindir' => $newnode->config_data('--bindir'),
+		'--socketdir' => $newnode->host,
+		'--old-port' => $oldnode->port,
+		'--new-port' => $newnode->port,);
+	command_ok(\@pg_upgrade_cmd, 'run of pg_upgrade for new instance');
+
+	# The range of multixids to look at comes from the old cluster's
+	# control file.
+	my ($first_multi, $next_multi) = read_multixid_fields($oldnode);
+
+	# Dump contents of the test table, and the status of all updating
+	# multixids, first from the old cluster and then from the upgraded
+	# one.  (Locking-only multixids don't need to be preserved so we
+	# ignore those)
+	#
+	# Note: the old cluster is dumped *after* running pg_upgrade, to
+	# ensure that we don't set all the hint bits before upgrade by
+	# doing the SELECT on the table.
+	my %dump;
+	foreach my $side ([ 'oldnode', $oldnode ], [ 'newnode', $newnode ])
+	{
+		my ($prefix, $node) = @$side;
+
+		$node->start;
+		$dump{$prefix}{table} =
+		  get_test_table_contents($node, "${prefix}_${tag}_table_contents");
+		$dump{$prefix}{multixacts} =
+		  get_updating_multixact_members($node, $first_multi,
+			$next_multi, "${prefix}_${tag}_multixacts");
+		$node->stop;
+	}
+
+	compare_files(
+		$dump{oldnode}{table},
+		$dump{newnode}{table},
+		'test table contents from original and upgraded clusters match');
+	compare_files(
+		$dump{oldnode}{multixacts},
+		$dump{newnode}{multixacts},
+		'multixact members from original and upgraded clusters match');
+}
+
+my $old_version;
+
+# Basic scenario: Create a cluster using old installation, run
+# multixid-creating workload on it, then upgrade.
+#
+# This works even if the old and new versions are the same, although
+# it's not very interesting as the conversion routines only run when
+# upgrading from a pre-v19 cluster.
+{
+ my $tag = 'basic';
+ my $old =
+ PostgreSQL::Test::Cluster->new("${tag}_oldnode",
+ install_path => $ENV{oldinstall});
+ my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode");
+
+ $old->init(extra => ['-k']);
+
+ # Remember the old version; the wraparound scenario below uses it to
+ # decide whether to run.
+ $old_version = $old->pg_version;
+ note "old installation is version $old_version\n";
+
+ # Run the workload
+ my (undef, $start_mxid, $start_mxoff) = read_multixid_fields($old);
+ mxact_workload($old, $new);
+ my (undef, $finish_mxid, $finish_mxoff) = read_multixid_fields($old);
+
+ note "Testing upgrade, ${tag} scenario\n"
+ . " mxid from ${start_mxid} to ${finish_mxid}\n"
+ . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n";
+
+ $new->init;
+ upgrade_and_compare($tag, $old, $new);
+}
+
+# Wraparound scenario: This is the same as the basic scenario, but the
+# old cluster goes through multixid and offset wraparound.
+#
+# This requires the old installation to be version 18 or older,
+# because the hacks we use to reset the old cluster to a state just
+# before the wraparound rely on the pre-v19 file format. If the old
+# cluster is of v19 or above, multixact SLRU conversion is not needed
+# anyway.
+SKIP:
+{
+ skip
+ "skipping mxoffset conversion tests because upgrading from the old version does not require conversion"
+ if ($old_version >= '19devel');
+
+ my $tag = 'wraparound';
+ my $old =
+ PostgreSQL::Test::Cluster->new("${tag}_oldnode",
+ install_path => $ENV{oldinstall});
+ my $new = PostgreSQL::Test::Cluster->new("${tag}_newnode");
+
+ $old->init(extra => ['-k']);
+
+ # Reset the old cluster to just before multixid and 32-bit offset
+ # wraparound. The workload then only needs to consume 0x600
+ # multixids and 0x1400 members to cross both limits.
+ reset_mxid_mxoffset_pre_v19($old, 0xFFFFFA00, 0xFFFFEC00);
+
+ # Run the workload. This crosses multixid and offset wraparound.
+ my (undef, $start_mxid, $start_mxoff) = read_multixid_fields($old);
+ mxact_workload($old, $new);
+ my (undef, $finish_mxid, $finish_mxoff) = read_multixid_fields($old);
+
+ note "Testing upgrade, ${tag} scenario\n"
+ . " mxid from ${start_mxid} to ${finish_mxid}\n"
+ . " oldnode mxoff from ${start_mxoff} to ${finish_mxoff}\n";
+
+ # Verify that wraparounds happened: after wrapping, the counters are
+ # numerically smaller than where they started.
+ cmp_ok($finish_mxid, '<', $start_mxid,
+ "multixid wrapped around in old cluster");
+ cmp_ok($finish_mxoff, '<', $start_mxoff,
+ "mxoff wrapped around in old cluster");
+
+ $new->init;
+ upgrade_and_compare($tag, $old, $new);
+}
+
+done_testing();
#define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId)
-#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF)
-
/*
* Possible multixact lock modes ("status"). The first four modes are for
* tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the
extern void MultiXactIdSetOldestMember(void);
extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
bool from_pgupgrade, bool isLockOnly);
-extern bool GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members,
+extern void GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members,
MultiXactId *oldestMultiXactId,
MultiXactOffset *oldestOffset);
extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
extern void StartupMultiXact(void);
extern void TrimMultiXact(void);
extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid,
- Oid oldest_datoid,
- bool is_startup);
+ Oid oldest_datoid);
extern void MultiXactGetCheckptMulti(bool is_shutdown,
MultiXactId *nextMulti,
MultiXactOffset *nextMultiOffset,
* src/include/access/multixact_internal.h
*/
#ifndef MULTIXACT_INTERNAL_H
+
+/*
+ * Note: This is not only to prevent including this file twice.
+ * MULTIXACT_INTERNAL_H is checked explicitly in multixact_read_v18.c.
+ */
#define MULTIXACT_INTERNAL_H
#include "access/multixact.h"
/*
* Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
* used everywhere else in Postgres.
- *
- * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
- * MultiXact page numbering also wraps around at
- * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
- * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
- * take no explicit notice of that fact in this module, except when comparing
- * segment and page numbers in TruncateMultiXact (see
- * MultiXactOffsetPagePrecedes).
*/
-/* We need four bytes per offset */
+/* We need 8 bytes per offset */
#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
static inline int64
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
-/*
- * Because the number of items per page is not a divisor of the last item
- * number (member 0xFFFFFFFF), the last segment does not use the maximum number
- * of pages, and moreover the last used page therein does not use the same
- * number of items as previous pages. (Another way to say it is that the
- * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
- * has some empty space after that item.)
- *
- * This constant is the number of members in the last page of the last segment.
- */
-#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
- ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
-
/* page in which a member is to be found */
static inline int64
MXOffsetToMemberPage(MultiXactOffset offset)
/* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */
typedef TransactionId MultiXactId;
-typedef uint32 MultiXactOffset;
+typedef uint64 MultiXactOffset;
typedef uint32 CommandId;
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202512061
+#define CATALOG_VERSION_NO 202512091
#endif
# initialize the 'offsets' SLRU file containing the new next multixid
# with zeros
-my $multixact_offsets_per_page = $blcksz / 4; # sizeof(MultiXactOffset) == 4
+my $multixact_offsets_per_page = $blcksz / 8; # sizeof(MultiXactOffset) == 8
my $segno =
int(0xFFFFFFF8 / $multixact_offsets_per_page / $slru_pages_per_segment);
my $slru_file = sprintf('%s/pg_multixact/offsets/%04X', $node_pgdata, $segno);
context and (output, error) in list context where error is 1 in case there
was output generated on stderr when executing the query.
+By default, the query and its results are printed to the test output. This
+can be disabled by passing the keyword parameter verbose => false.
+
=cut
sub query
{
- my ($self, $query) = @_;
+ my ($self, $query, %params) = @_;
my $ret;
my $output;
my $query_cnt = $self->{query_cnt}++;
+ $params{verbose} = 1 unless defined $params{verbose};
+
local $Test::Builder::Level = $Test::Builder::Level + 1;
- note "issuing query $query_cnt via background psql: $query";
+ note "issuing query $query_cnt via background psql: $query" unless !$params{verbose};
$self->{timeout}->start() if (defined($self->{query_timer_restart}));
explain {
stdout => $self->{stdout},
stderr => $self->{stderr},
- };
+ } unless !$params{verbose};
# Remove banner from stdout and stderr, our caller doesn't care. The
# first newline is optional, as there would not be one if consuming an
sub query_safe
{
- my ($self, $query) = @_;
+ my ($self, $query, %params) = @_;
- my $ret = $self->query($query);
+ my $ret = $self->query($query, %params);
if ($self->{stderr} ne "")
{
return (%inst_env);
}
-# Private routine to get an installation path qualified command.
-#
-# IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. Tests
-# which use nodes spanning more than one postgres installation path need to
-# avoid confusing which installation's binaries get run. Setting $ENV{PATH} is
-# insufficient, as IPC::Run does not check to see if the path has changed since
-# caching a command.
+=pod
+
+=item $node->installed_command(cmd)
+
+Get an installation path qualified command.
+
+IPC::Run maintains a cache, %cmd_cache, mapping commands to paths. Tests
+which use nodes spanning more than one postgres installation path need to
+avoid confusing which installation's binaries get run. Setting $ENV{PATH} is
+insufficient, as IPC::Run does not check to see if the path has changed since
+caching a command.
+
+=cut
+
sub installed_command
{
my ($self, $cmd) = @_;
MultiXactOffset
MultiXactStateData
MultiXactStatus
+MultiXactWriter
MultirangeIOData
MultirangeParseState
MultirangeType
Oid
OidOptions
OkeysState
+OldMultiXactReader
OldToNewMapping
OldToNewMappingData
OnCommitAction
SlruErrorCause
SlruPageStatus
SlruScanCallback
+SlruSegState
SlruShared
SlruSharedData
SlruWriteAll