Change shutdown sequence to terminate checkpointer last

author Andres Freund <andres@anarazel.de>
Sat, 25 Jan 2025 16:37:13 +0000 (11:37 -0500)
committer Andres Freund <andres@anarazel.de>
Sat, 25 Jan 2025 16:37:13 +0000 (11:37 -0500)
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c

index dd2c8376c6efee037d5095908110e4a325b63aae..b94f9cdff21c4ff0eba12a1af10a4736eec7f29d 100644 (file)
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -10,10 +10,13 @@
   * fill WAL segments; the checkpointer itself doesn't watch for the
   * condition.)
   *
- * Normal termination is by SIGUSR2, which instructs the checkpointer to
- * execute a shutdown checkpoint and then exit(0).  (All backends must be
- * stopped before SIGUSR2 is issued!)  Emergency termination is by SIGQUIT;
- * like any backend, the checkpointer will simply abort and exit on SIGQUIT.
+ * The normal termination sequence is that checkpointer is instructed to
+ * execute the shutdown checkpoint by SIGINT.  After that checkpointer waits
+ * to be terminated via SIGUSR2, which instructs the checkpointer to exit(0).
+ * All backends must be stopped before SIGINT or SIGUSR2 is issued!
+ *
+ * Emergency termination is by SIGQUIT; like any backend, the checkpointer
+ * will simply abort and exit on SIGQUIT.
   *
   * If the checkpointer exits unexpectedly, the postmaster treats that the same
   * as a backend crash: shared memory may be corrupted, so remaining backends
@@ -51,6 +54,7 @@
  #include "storage/fd.h"
  #include "storage/ipc.h"
  #include "storage/lwlock.h"
+#include "storage/pmsignal.h"
  #include "storage/proc.h"
  #include "storage/procsignal.h"
  #include "storage/shmem.h"
@@ -141,6 +145,7 @@ double      CheckPointCompletionTarget = 0.9;
   * Private state
   */
  static bool ckpt_active = false;
+static volatile sig_atomic_t ShutdownXLOGPending = false;
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
@@ -159,6 +164,9 @@ static bool ImmediateCheckpointRequested(void);
  static bool CompactCheckpointerRequestQueue(void);
  static void UpdateSharedMemoryConfig(void);
  
+/* Signal handlers */
+static void ReqShutdownXLOG(SIGNAL_ARGS);
+
  
  /*
   * Main entry point for checkpointer process
@@ -188,7 +196,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
      * tell us it's okay to shut down (via SIGUSR2).
      */
     pqsignal(SIGHUP, SignalHandlerForConfigReload);
-   pqsignal(SIGINT, SIG_IGN);
+   pqsignal(SIGINT, ReqShutdownXLOG);
     pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
     /* SIGQUIT handler was already set up by InitPostmasterChild */
     pqsignal(SIGALRM, SIG_IGN);
@@ -211,8 +219,11 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
      * process during a normal shutdown, and since checkpointer is shut down
      * very late...
      *
-    * Walsenders are shut down after the checkpointer, but currently don't
-    * report stats. If that changes, we need a more complicated solution.
+    * While e.g. walsenders are active after the shutdown checkpoint has been
+    * written (and thus could produce more stats), checkpointer stays around
+    * after the shutdown checkpoint has been written. postmaster will only
+    * signal checkpointer to exit after all processes that could emit stats
+    * have been shut down.
      */
     before_shmem_exit(pgstat_before_server_shutdown, 0);
  
@@ -327,7 +338,8 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
     ProcGlobal->checkpointerProc = MyProcNumber;
  
     /*
-    * Loop forever
+    * Loop until we've been asked to write the shutdown checkpoint or
+    * terminate.
      */
     for (;;)
     {
@@ -346,7 +358,10 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
          * Process any requests or signals received recently.
          */
         AbsorbSyncRequests();
+
         HandleCheckpointerInterrupts();
+       if (ShutdownXLOGPending || ShutdownRequestPending)
+           break;
  
         /*
          * Detect a pending checkpoint request by checking whether the flags
@@ -517,8 +532,13 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
  
             ckpt_active = false;
  
-           /* We may have received an interrupt during the checkpoint. */
+           /*
+            * We may have received an interrupt during the checkpoint and the
+            * latch might have been reset (e.g. in CheckpointWriteDelay).
+            */
             HandleCheckpointerInterrupts();
+           if (ShutdownXLOGPending || ShutdownRequestPending)
+               break;
         }
  
         /* Check for archive_timeout and switch xlog files if necessary. */
@@ -557,6 +577,57 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
                          cur_timeout * 1000L /* convert to ms */ ,
                          WAIT_EVENT_CHECKPOINTER_MAIN);
     }
+
+   /*
+    * From here on, elog(ERROR) should end with exit(1), not send control
+    * back to the sigsetjmp block above.
+    */
+   ExitOnAnyError = true;
+
+   if (ShutdownXLOGPending)
+   {
+       /*
+        * Close down the database.
+        *
+        * Since ShutdownXLOG() creates a restartpoint or checkpoint and
+        * updates the statistics, increment the requested-checkpoint counter
+        * and flush out pending statistics.
+        */
+       PendingCheckpointerStats.num_requested++;
+       ShutdownXLOG(0, 0);
+       pgstat_report_checkpointer();
+       pgstat_report_wal(true);
+
+       /*
+        * Tell postmaster that we're done.
+        */
+       SendPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN);
+       ShutdownXLOGPending = false;
+   }
+
+   /*
+    * Wait until we're asked to shut down. By separating the writing of the
+    * shutdown checkpoint from checkpointer exiting, checkpointer can perform
+    * some should-be-as-late-as-possible work like writing out stats.
+    */
+   for (;;)
+   {
+       /* Clear any already-pending wakeups */
+       ResetLatch(MyLatch);
+
+       HandleCheckpointerInterrupts();
+
+       if (ShutdownRequestPending)
+           break;
+
+       (void) WaitLatch(MyLatch,
+                        WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+                        0,
+                        WAIT_EVENT_CHECKPOINTER_SHUTDOWN);
+   }
+
+   /* Normal exit from the checkpointer is here */
+   proc_exit(0);               /* done */
  }
  
  /*
@@ -586,29 +657,6 @@ HandleCheckpointerInterrupts(void)
          */
         UpdateSharedMemoryConfig();
     }
-   if (ShutdownRequestPending)
-   {
-       /*
-        * From here on, elog(ERROR) should end with exit(1), not send control
-        * back to the sigsetjmp block above
-        */
-       ExitOnAnyError = true;
-
-       /*
-        * Close down the database.
-        *
-        * Since ShutdownXLOG() creates restartpoint or checkpoint, and
-        * updates the statistics, increment the checkpoint request and flush
-        * out pending statistic.
-        */
-       PendingCheckpointerStats.num_requested++;
-       ShutdownXLOG(0, 0);
-       pgstat_report_checkpointer();
-       pgstat_report_wal(true);
-
-       /* Normal exit from the checkpointer is here */
-       proc_exit(0);           /* done */
-   }
  
     /* Perform logging of memory contexts of this process */
     if (LogMemoryContextPending)
@@ -729,6 +777,7 @@ CheckpointWriteDelay(int flags, double progress)
      * in which case we just try to catch up as quickly as possible.
      */
     if (!(flags & CHECKPOINT_IMMEDIATE) &&
+       !ShutdownXLOGPending &&
         !ShutdownRequestPending &&
         !ImmediateCheckpointRequested() &&
         IsCheckpointOnSchedule(progress))
@@ -857,6 +906,20 @@ IsCheckpointOnSchedule(double progress)
  }
  
  
+/* --------------------------------
+ *     signal handler routines
+ * --------------------------------
+ */
+
+/* SIGINT: set flag to trigger writing of shutdown checkpoint */
+static void
+ReqShutdownXLOG(SIGNAL_ARGS)
+{
+   ShutdownXLOGPending = true;
+   SetLatch(MyLatch);
+}
+
+
  /* --------------------------------
   *     communication with backends
   * --------------------------------
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c

index f410600f7a4ed385055fa5a3bd2d18e25a12d047..bb22b13adef8760f6a08b94a3bee82105b3e62f0 100644 (file)
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -341,6 +341,7 @@ typedef enum
                                  * ckpt */
     PM_WAIT_XLOG_ARCHIVAL,      /* waiting for archiver and walsenders to
                                  * finish */
+   PM_WAIT_CHECKPOINTER,       /* waiting for checkpointer to shut down */
     PM_WAIT_DEAD_END,           /* waiting for dead-end children to exit */
     PM_NO_CHILDREN,             /* all important children have exited */
  } PMState;
@@ -2363,35 +2364,20 @@ process_pm_child_exit(void)
         {
             ReleasePostmasterChildSlot(CheckpointerPMChild);
             CheckpointerPMChild = NULL;
-           if (EXIT_STATUS_0(exitstatus) && pmState == PM_WAIT_XLOG_SHUTDOWN)
+           if (EXIT_STATUS_0(exitstatus) && pmState == PM_WAIT_CHECKPOINTER)
             {
                 /*
                  * OK, we saw normal exit of the checkpointer after it's been
-                * told to shut down.  We expect that it wrote a shutdown
-                * checkpoint.  (If for some reason it didn't, recovery will
-                * occur on next postmaster start.)
+                * told to shut down.  We know checkpointer wrote a shutdown
+                * checkpoint; otherwise we'd still be in
+                * PM_WAIT_XLOG_SHUTDOWN state.
                  *
-                * At this point we should have no normal backend children
-                * left (else we'd not be in PM_WAIT_XLOG_SHUTDOWN state) but
-                * we might have dead-end children to wait for.
-                *
-                * If we have an archiver subprocess, tell it to do a last
-                * archive cycle and quit. Likewise, if we have walsender
-                * processes, tell them to send any remaining WAL and quit.
+                * At this point only dead-end children and logger should be
+                * left.
                  */
-               Assert(Shutdown > NoShutdown);
-
-               /* Waken archiver for the last time */
-               if (PgArchPMChild != NULL)
-                   signal_child(PgArchPMChild, SIGUSR2);
-
-               /*
-                * Waken walsenders for the last time. No regular backends
-                * should be around anymore.
-                */
-               SignalChildren(SIGUSR2, btmask(B_WAL_SENDER));
-
-               UpdatePMState(PM_WAIT_XLOG_ARCHIVAL);
+               UpdatePMState(PM_WAIT_DEAD_END);
+               ConfigurePostmasterWaitSet(false);
+               SignalChildren(SIGTERM, btmask_all_except(B_LOGGER));
             }
             else
             {
@@ -2737,6 +2723,7 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
  
         case PM_WAIT_XLOG_SHUTDOWN:
         case PM_WAIT_XLOG_ARCHIVAL:
+       case PM_WAIT_CHECKPOINTER:
  
             /*
              * NB: Similar code exists in PostmasterStateMachine()'s handling
@@ -3012,10 +2999,10 @@ PostmasterStateMachine(void)
                 /* Start the checkpointer if not running */
                 if (CheckpointerPMChild == NULL)
                     CheckpointerPMChild = StartChildProcess(B_CHECKPOINTER);
-               /* And tell it to shut down */
+               /* And tell it to write the shutdown checkpoint */
                 if (CheckpointerPMChild != NULL)
                 {
-                   signal_child(CheckpointerPMChild, SIGUSR2);
+                   signal_child(CheckpointerPMChild, SIGINT);
                     UpdatePMState(PM_WAIT_XLOG_SHUTDOWN);
                 }
                 else
@@ -3043,22 +3030,40 @@ PostmasterStateMachine(void)
         }
     }
  
+   /*
+    * The state transition from PM_WAIT_XLOG_SHUTDOWN to
+    * PM_WAIT_XLOG_ARCHIVAL is in process_pm_pmsignal(), in response to
+    * PMSIGNAL_XLOG_IS_SHUTDOWN.
+    */
+
     if (pmState == PM_WAIT_XLOG_ARCHIVAL)
     {
         /*
-        * PM_WAIT_XLOG_ARCHIVAL state ends when there's no other children
-        * than dead-end children left. There shouldn't be any regular
-        * backends left by now anyway; what we're really waiting for is
-        * walsenders and archiver.
+        * PM_WAIT_XLOG_ARCHIVAL state ends when there are no children other
+        * than checkpointer, dead-end children and logger left. There
+        * shouldn't be any regular backends left by now anyway; what we're
+        * really waiting for is for walsenders and archiver to exit.
          */
-       if (CountChildren(btmask_all_except(B_LOGGER, B_DEAD_END_BACKEND)) == 0)
+       if (CountChildren(btmask_all_except(B_CHECKPOINTER, B_LOGGER, B_DEAD_END_BACKEND)) == 0)
         {
-           UpdatePMState(PM_WAIT_DEAD_END);
-           ConfigurePostmasterWaitSet(false);
-           SignalChildren(SIGTERM, btmask_all_except(B_LOGGER));
+           UpdatePMState(PM_WAIT_CHECKPOINTER);
+
+           /*
+            * Now that the processes mentioned above are gone, tell
+            * checkpointer to shut down too. That allows checkpointer to
+            * perform some last bits of cleanup without other processes
+            * interfering.
+            */
+           if (CheckpointerPMChild != NULL)
+               signal_child(CheckpointerPMChild, SIGUSR2);
         }
     }
  
+   /*
+    * The state transition from PM_WAIT_CHECKPOINTER to PM_WAIT_DEAD_END is
+    * in process_pm_child_exit().
+    */
+
     if (pmState == PM_WAIT_DEAD_END)
     {
         /*
@@ -3195,6 +3200,7 @@ pmstate_name(PMState state)
             PM_TOSTR_CASE(PM_WAIT_XLOG_SHUTDOWN);
             PM_TOSTR_CASE(PM_WAIT_XLOG_ARCHIVAL);
             PM_TOSTR_CASE(PM_WAIT_DEAD_END);
+           PM_TOSTR_CASE(PM_WAIT_CHECKPOINTER);
             PM_TOSTR_CASE(PM_NO_CHILDREN);
     }
  #undef PM_TOSTR_CASE
@@ -3613,6 +3619,8 @@ ExitPostmaster(int status)
  static void
  process_pm_pmsignal(void)
  {
+   bool        request_state_update = false;
+
     pending_pm_pmsignal = false;
  
     ereport(DEBUG2,
@@ -3724,9 +3732,67 @@ process_pm_pmsignal(void)
         WalReceiverRequested = true;
     }
  
+   if (CheckPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN))
+   {
+       /* Checkpointer completed the shutdown checkpoint */
+       if (pmState == PM_WAIT_XLOG_SHUTDOWN)
+       {
+           /*
+            * If we have an archiver subprocess, tell it to do a last archive
+            * cycle and quit. Likewise, if we have walsender processes, tell
+            * them to send any remaining WAL and quit.
+            */
+           Assert(Shutdown > NoShutdown);
+
+           /* Waken archiver for the last time */
+           if (PgArchPMChild != NULL)
+               signal_child(PgArchPMChild, SIGUSR2);
+
+           /*
+            * Waken walsenders for the last time. No regular backends should
+            * be around anymore.
+            */
+           SignalChildren(SIGUSR2, btmask(B_WAL_SENDER));
+
+           UpdatePMState(PM_WAIT_XLOG_ARCHIVAL);
+       }
+       else if (!FatalError && Shutdown != ImmediateShutdown)
+       {
+           /*
+            * Checkpointer only ought to perform the shutdown checkpoint
+            * during shutdown.  If somehow checkpointer did so in another
+            * situation, we have no choice but to crash-restart.
+            *
+            * It's possible however that we get PMSIGNAL_XLOG_IS_SHUTDOWN
+            * outside of PM_WAIT_XLOG_SHUTDOWN if an orderly shutdown was
+            * "interrupted" by a crash or an immediate shutdown.
+            */
+           ereport(LOG,
+                   (errmsg("WAL was shut down unexpectedly")));
+
+           /*
+            * Doesn't seem likely to help to take send_abort_for_crash into
+            * account here.
+            */
+           HandleFatalError(PMQUIT_FOR_CRASH, false);
+       }
+
+       /*
+        * Need to run PostmasterStateMachine() to check if we already can go
+        * to the next state.
+        */
+       request_state_update = true;
+   }
+
     /*
      * Try to advance postmaster's state machine, if a child requests it.
-    *
+    */
+   if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE))
+   {
+       request_state_update = true;
+   }
+
+   /*
      * Be careful about the order of this action relative to this function's
      * other actions.  Generally, this should be after other actions, in case
      * they have effects PostmasterStateMachine would need to know about.
@@ -3734,7 +3800,7 @@ process_pm_pmsignal(void)
      * cannot have any (immediate) effect on the state machine, but does
      * depend on what state we're in now.
      */
-   if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE))
+   if (request_state_update)
     {
         PostmasterStateMachine();
     }
@@ -4045,6 +4111,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
     switch (pmState)
     {
         case PM_NO_CHILDREN:
+       case PM_WAIT_CHECKPOINTER:
         case PM_WAIT_DEAD_END:
         case PM_WAIT_XLOG_ARCHIVAL:
         case PM_WAIT_XLOG_SHUTDOWN:
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt

index 0b53cba807d4be841b549a9a402b41a25475d8b2..e199f071628987ec0626847d8c18632293dd3e89 100644 (file)
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -56,6 +56,7 @@ AUTOVACUUM_MAIN   "Waiting in main loop of autovacuum launcher process."
  BGWRITER_HIBERNATE "Waiting in background writer process, hibernating."
  BGWRITER_MAIN  "Waiting in main loop of background writer process."
  CHECKPOINTER_MAIN  "Waiting in main loop of checkpointer process."
+CHECKPOINTER_SHUTDOWN  "Waiting for checkpointer process to be terminated."
  LOGICAL_APPLY_MAIN "Waiting in main loop of logical replication apply process."
  LOGICAL_LAUNCHER_MAIN  "Waiting in main loop of logical replication launcher process."
  LOGICAL_PARALLEL_APPLY_MAIN    "Waiting in main loop of logical replication parallel apply process."
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h

index 3fbe5bf11367036c34c907ce3f1622f49777b8ee..d84a383047e02d8b21665ef7fdfb2d0318522461 100644 (file)
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -40,9 +40,10 @@ typedef enum
     PMSIGNAL_BACKGROUND_WORKER_CHANGE,  /* background worker state change */
     PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
     PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
+   PMSIGNAL_XLOG_IS_SHUTDOWN,  /* ShutdownXLOG() completed */
  } PMSignalReason;
  
-#define NUM_PMSIGNALS (PMSIGNAL_ADVANCE_STATE_MACHINE+1)
+#define NUM_PMSIGNALS (PMSIGNAL_XLOG_IS_SHUTDOWN+1)
  
  /*
   * Reasons why the postmaster would send SIGQUIT to its children.
author	Andres Freund <andres@anarazel.de>
	Sat, 25 Jan 2025 16:37:13 +0000 (11:37 -0500)
committer	Andres Freund <andres@anarazel.de>
	Sat, 25 Jan 2025 16:37:13 +0000 (11:37 -0500)
src/backend/postmaster/checkpointer.c		patch \| blob \| blame \| history
src/backend/postmaster/postmaster.c		patch \| blob \| blame \| history
src/backend/utils/activity/wait_event_names.txt		patch \| blob \| blame \| history
src/include/storage/pmsignal.h		patch \| blob \| blame \| history