*/
TimeLineID ThisTimeLineID = 0;
-/* Are we doing recovery from XLOG? */
+/*
+ * Are we doing recovery from XLOG?
+ *
+ * This is only ever true in the startup process, when it's replaying WAL.
+ * It's used in functions that need to act differently when called from a
+ * redo function (e.g. skip WAL logging). To check whether the system is in
+ * recovery regardless of what process you're running in, use
+ * IsRecoveryProcessingMode().
+ */
bool InRecovery = false;
/* Are we recovering using offline XLOG archives? */
* ControlFileLock: must be held to read/update control file or create
* new log file.
*
- * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring
- * we get just one of those at any time. In 8.4+ recovery, both startup and
- * bgwriter processes may take restartpoints, so this locking must be strict
- * to ensure there are no mistakes.
+ * CheckpointLock: must be held to do a checkpoint (ensures only one
+ * checkpointer at a time; currently, with all checkpoints done by the
+ * bgwriter, this is just pro forma).
*
- * In 8.4 we progress through a number of states at startup. Initially, the
- * postmaster is in PM_STARTUP state and spawns the Startup process. We then
- * progress until the database is in a consistent state, then if we are in
- * InArchiveRecovery we go into PM_RECOVERY state. The bgwriter then starts
- * up and takes over responsibility for performing restartpoints. We then
- * progress until the end of recovery when we enter PM_RUN state upon
- * termination of the Startup process. In summary:
- *
- * PM_STARTUP state: Startup process performs restartpoints
- * PM_RECOVERY state: bgwriter process performs restartpoints
- * PM_RUN state: bgwriter process performs checkpoints
- *
- * These transitions are fairly delicate, with many things that need to
- * happen at the same time in order to change state successfully throughout
- * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can
- * prove the databases are in a consistent state. Changing from PM_RECOVERY
- * to PM_RUN happens whenever recovery ends, which could be forced upon us
- * externally or it can occur becasue of damage or termination of the WAL
- * sequence.
*----------
*/
/*
* SharedRecoveryProcessingMode indicates if we're still in crash or
- * archive recovery. You should use IsRecoveryProcessingMode() instead
- * of peeking at this variable directly, because it uses a cached value
- * after we exit recovery.
- *
- * We also retain a local state variable InRecovery. InRecovery=true
- * means the code is being executed by Startup process and therefore
- * always during Recovery Processing Mode. This allows us to identify
- * code executed *during* Recovery Processing Mode but not necessarily
- * by Startup process itself.
+ * archive recovery. It's checked by IsRecoveryProcessingMode().
*/
bool SharedRecoveryProcessingMode;
static void exitRecovery(void);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
-static XLogRecPtr GetRedoLocationForCheckpoint(void);
static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
XLogRecPtr *lsn, BkpBlock *bkpb);
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
+ /* Disabled during REDO */
if (IsRecoveryProcessingMode())
return;
XLogRecPtr recptr;
XLogCtlInsert *Insert = &XLogCtl->Insert;
XLogRecData rdata;
+ uint32 freespace;
uint32 _logId;
uint32 _logSeg;
TransactionId *inCommitXids;
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
- * That shouldn't be happening, but checkpoints are an important aspect
- * of our resilience, so we take no chances.
+ * (This is just pro forma, since in the present system structure there is
+ * only one process that is allowed to issue checkpoints at any given
+ * time.)
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
* the buffer flush work. Those XLOG records are logically after the
* checkpoint, even though physically before it. Got that?
*/
- checkPoint.redo = GetRedoLocationForCheckpoint();
+ freespace = INSERT_FREESPACE(Insert);
+ if (freespace < SizeOfXLogRecord)
+ {
+ (void) AdvanceXLInsertBuffer(false);
+ /* OK to ignore update return flag, since we will do flush anyway */
+ freespace = INSERT_FREESPACE(Insert);
+ }
+ INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
+
+ /*
+ * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+ * must be done while holding the insert lock AND the info_lck.
+ *
+ * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+ * pointing past where it really needs to point. This is okay; the only
+ * consequence is that XLogInsert might back up whole buffers that it
+ * didn't really need to. We can't postpone advancing RedoRecPtr because
+ * XLogInserts that happen while we are dumping buffers must assume that
+ * their buffer changes are not included in the checkpoint.
+ */
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
+ SpinLockRelease(&xlogctl->info_lck);
+ }
/*
* Now we can release WAL insert lock, allowing other xacts to proceed
* that this is executed by bgwriter after the death of Startup process.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
else
ControlFile->state = DB_IN_PRODUCTION;
-
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
-
LWLockRelease(ControlFileLock);
/* Update shared-memory copy of checkpoint XID/epoch */
LWLockRelease(CheckpointLock);
}
-/*
- * GetRedoLocationForCheckpoint()
- *
- * When !IsRecoveryProcessingMode() this must be called while holding
- * WALInsertLock.
- */
-static XLogRecPtr
-GetRedoLocationForCheckpoint()
-{
- XLogCtlInsert *Insert = &XLogCtl->Insert;
- uint32 freespace;
- XLogRecPtr redo;
-
- freespace = INSERT_FREESPACE(Insert);
- if (freespace < SizeOfXLogRecord)
- {
- (void) AdvanceXLInsertBuffer(false);
- /* OK to ignore update return flag, since we will do flush anyway */
- freespace = INSERT_FREESPACE(Insert);
- }
- INSERT_RECPTR(redo, Insert, Insert->curridx);
-
- /*
- * Here we update the shared RedoRecPtr for future XLogInsert calls; this
- * must be done while holding the insert lock AND the info_lck.
- *
- * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
- * pointing past where it really needs to point. This is okay; the only
- * consequence is that XLogInsert might back up whole buffers that it
- * didn't really need to. We can't postpone advancing RedoRecPtr because
- * XLogInserts that happen while we are dumping buffers must assume that
- * their buffer changes are not included in the checkpoint.
- */
- {
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
-
- SpinLockAcquire(&xlogctl->info_lck);
- RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
- SpinLockRelease(&xlogctl->info_lck);
- }
-
- return redo;
-}
-
/*
* Flush all data in shared memory to disk, and fsync
*
}
/*
- * XLOG resource manager's routines.
+ * XLOG resource manager's routines
*
* Definitions of message info are in include/catalog/pg_control.h,
* though not all messages relate to control file processing.
BgWriterRecoveryMode = IsRecoveryProcessingMode();
if (BgWriterRecoveryMode)
- elog(DEBUG1, "bgwriter starting during recovery, pid = %u",
- BgWriterShmem->bgwriter_pid);
+ elog(DEBUG1, "bgwriter starting during recovery");
+ else
+ InitXLOGAccess();
- /* If someone requested a checkpoint before we started up, process that */
+ /*
+ * If someone requested a checkpoint before we started up, process that.
+ *
+ * This check exists primarily for crash recovery: after the startup
+ * process is finished with WAL replay, it will request a checkpoint, but
+ * the background writer might not have started yet. This check will
+ * actually not notice a checkpoint that's been requested without any
+ * flags, but it's good enough for the startup checkpoint.
+ */
SpinLockAcquire(&bgs->ckpt_lck);
if (bgs->ckpt_flags)
checkpoint_requested = true;
/* Normal exit from the bgwriter is here */
proc_exit(0); /* done */
}
+ if (BgWriterRecoveryMode && !IsRecoveryProcessingMode())
+ {
+ elog(DEBUG1, "bgwriter changing from recovery to normal mode");
+
+ InitXLOGAccess();
+ BgWriterRecoveryMode = false;
+ }
/*
* Force a checkpoint if too much time has elapsed since the last one.
flags |= CHECKPOINT_CAUSE_TIME;
}
- if (BgWriterRecoveryMode && !IsRecoveryProcessingMode())
- {
- elog(DEBUG2, "bgwriter changing from recovery to normal mode");
-
- InitXLOGAccess();
- BgWriterRecoveryMode = false;
- }
-
/*
* Do a checkpoint if requested, otherwise do one cycle of
* dirty-buffer writing.
/*
* We use a simple state machine to control startup, shutdown, and
- * crash recovery (which is rather like shutdown followed by startup).
+ * recovery.
*
- * Recovery is split into two phases: crash recovery and archive recovery.
- * The startup process begins with crash recovery, replaying WAL until
- * a self-consistent database state is reached. At that point, it signals
- * postmaster, and we switch to archive recovery phase. The background
- * writer is launched, and we can start accepting connections to perform
- * read-only queries, while the startup process continues applying WAL.
- * When the startup process exits, we switch to PM_RUN state. The startup
- * process can also skip the archive recovery altogether, as it will during
- * normal startup when there's no recovery to be done, for example.
+ * Recovery is split into two phases: crash recovery and consistent (archive)
+ * recovery. The startup process begins with crash recovery, replaying WAL
+ * until a self-consistent database state is reached. At that point, it
+ * signals postmaster, and we switch to consistent recovery phase. The
+ * background writer is launched, while the startup process continues
+ * applying WAL. We could start accepting connections to perform read-only
+ * queries at this point, if we had the infrastructure to do that. When the
+ * startup process exits, we switch to PM_RUN state. The startup process can
+ * also skip the consistent recovery altogether, as it will during normal
+ * startup when there's no recovery to be done, for example.
*
* Normal child backends can only be launched when we are in PM_RUN state.
* (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
- PM_RECOVERY, /* archive recovery mode */
+ PM_RECOVERY, /* consistent recovery mode */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
/*
* Crank up the background writer, if we didn't do that already
- * when we entered archive recovery phase. It doesn't matter if
- * this fails, we'll just try again later.
+ * when we entered consistent recovery phase. It doesn't matter
+ * if this fails, we'll just try again later.
*/
if (BgWriterPID == 0)
BgWriterPID = StartBackgroundWriter();