Implementing the follow_primary command-locking over the watchdog channel.
authorMuhammad Usama <m.usama@highgo.ca>
Fri, 16 Jul 2021 07:17:27 +0000 (12:17 +0500)
committerMuhammad Usama <m.usama@gmail.com>
Fri, 16 Jul 2021 07:21:42 +0000 (12:21 +0500)
Supplementary fix for [pgpool-hackers: 3892] Problem with detach_false_primary..

commit:455f00dd5f5b7b94bd91aa0b6b40aab21dceabb9 fixed a race condition between
detach_false_primary and follow_primary commands. Part of the fix was to make
sure that the detach_false_primary should only be executed on the
leader watchdog node.

The mentioned commit ensures the execution of detach_false_primary on the
watchdog leader by getting the watchdog status from within the main process.
The design is good enough for most cases, but has the potential to fail if
the cluster goes into the election process just after the main process
has read the status.

To fix that, this commit implements the synchronization of follow_primary_command
execution using the distributed locks over the watchdog channel.

The idea is, just before executing the follow_primary during the failover process
we instruct all standby watchdog nodes to acquire a lock on their respective
nodes to block the false primary detection during the period when the
follow_primary is being executed on the leader watchdog node.

Moreover, to avoid keeping the watchdog process blocked while waiting for the
lock, the commit introduces a pending remote lock mechanism, so that remote locks
can be acquired in the background after the in-flight replication checks complete.

Finally, the REQ_DETAIL_CONFIRMED flag is removed from the degenerate_backend_set()
request that gets issued to detach the false primary. That means all quorum
and consensus rules must be satisfied for the detach to happen.

src/include/pool.h
src/include/watchdog/wd_internal_commands.h
src/include/watchdog/wd_ipc_defines.h
src/main/main.c
src/main/pgpool_main.c
src/streaming_replication/pool_worker_child.c
src/watchdog/watchdog.c
src/watchdog/wd_internal_commands.c

index fe19d2d98afd8ae0d0377769e0e6ec93affb182a..d15a0a13e6530ba1918d00fcf51a6f0464d5a931 100644 (file)
@@ -426,7 +426,7 @@ typedef enum
 #define REQ_DETAIL_WATCHDOG            0x00000002      /* failover req from watchdog */
 #define REQ_DETAIL_CONFIRMED   0x00000004      /* failover req that does not
                                                                                         * require majority vote */
-#define REQ_DETAIL_UPDATE              0x00000008      /* failover req is just and update
+#define REQ_DETAIL_UPDATE              0x00000008      /* failover req is just an update
                                                                                         * node status request */
 #define REQ_DETAIL_PROMOTE             0x00000010      /* failover req is actually promoting the specified standby node.
                                                                                         * current primary will be detached */
@@ -454,6 +454,16 @@ typedef struct
        /* greater than 0 if follow primary command or detach_false_primary in
         * execution */
        bool            follow_primary_count;
+       bool            follow_primary_lock_pending; /* The watchdog process can't wait
+                                                                                         * for follow_primary lock acquisition
+                                                                                         * in case the lock is already held
+                                                                                         * at the time of the request.
+                                                                                         * This flag indicates that the lock was requested
+                                                                                         * by the watchdog coordinator and the next contender
+                                                                                         * should wait for the coordinator to release the lock
+                                                                                         */
+       bool            follow_primary_lock_held_remotely; /* true when lock is held by
+                                                                                                       the watchdog coordinator */
        bool            follow_primary_ongoing; /* true if follow primary command is ongoing */
 }                      POOL_REQUEST_INFO;
 
@@ -633,8 +643,8 @@ extern POOL_NODE_STATUS * verify_backend_node_status(POOL_CONNECTION_POOL_SLOT *
 extern POOL_NODE_STATUS * pool_get_node_status(void);
 extern void pool_set_backend_status_changed_time(int backend_id);
 extern int     get_next_main_node(void);
-extern bool pool_acquire_follow_primary_lock(bool block);
-extern void pool_release_follow_primary_lock(void);
+extern bool pool_acquire_follow_primary_lock(bool block, bool remote_request);
+extern void pool_release_follow_primary_lock(bool remote_request);
 
 /* strlcpy.c */
 #ifndef HAVE_STRLCPY
index 4f0b37a2facbc4a8601ccd7a06591ce35d5f2dc8..5093eecc5e109a6cd3c5d8c1857b0cf033c7a147 100644 (file)
 #include "watchdog/wd_ipc_conn.h"
 #include "watchdog/wd_commands.h"
 
+/*
+ * These locks can only be acquired by the
+ * coordinator watchdog node on a standby
+ * watchdog node.
+ */
+typedef enum WD_LOCK_STANDBY_TYPE
+{
+       WD_INVALID_LOCK,
+       /* currently we have only one lock */
+       WD_FOLLOW_PRIMARY_LOCK
+}WD_LOCK_STANDBY_TYPE;
+
+
 extern WdCommandResult wd_start_recovery(void);
 extern WdCommandResult wd_end_recovery(void);
 extern WDFailoverCMDResults wd_send_failback_request(int node_id, unsigned char flags);
@@ -60,4 +73,8 @@ extern void set_watchdog_node_escalated(void);
 extern void reset_watchdog_node_escalated(void);
 extern bool get_watchdog_node_escalation_state(void);
 extern size_t wd_ipc_get_shared_mem_size(void);
+
+extern WdCommandResult wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type);
+extern WdCommandResult wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type);
+
 #endif                                                 /* WD_INTERNAL_COMMANDS_H */
index e66c4551d76701badbf6c699cf392e3193959259..16ff5a4158ec9722958e05b47744201ce867fbc0 100644 (file)
@@ -75,6 +75,7 @@ typedef enum WDValueDataType
 #define WD_COMMAND_REELECT_LEADER              "REELECT_LEADER"
 #define WD_COMMAND_SHUTDOWN_CLUSTER    "SHUTDOWN_CLUSTER"
 #define WD_COMMAND_RELOAD_CONFIG_CLUSTER               "RELOAD_CONFIG_CLUSTER"
+#define WD_COMMAND_LOCK_ON_STANDBY             "APPLY_LOCK_ON_STANDBY"
 
 
 #define WD_FUNCTION_START_RECOVERY             "START_RECOVERY"
index 2702bf28352ff5e373109717ddaff39d4fe8b0d2..575bee4a6fe875f66cc1be36081317ff0a6b1423 100644 (file)
@@ -77,6 +77,7 @@ int                   myargc;
 char     **myargv;
 int                    assert_enabled = 0;
 char      *pool_key = NULL;
+
 int
 main(int argc, char **argv)
 {
index d3e593af1e7d911ce6a9a6ab66cc1b3cd5ad4add..c4453a76428369f576ae2c7bcb919a6507d8a9fe 100644 (file)
@@ -1305,6 +1305,13 @@ sigusr1_interrupt_processor(void)
                                        (errmsg("we have joined the watchdog cluster as STANDBY node"),
                                         errdetail("syncing the backend states from the LEADER watchdog node")));
                        sync_backend_from_watchdog();
+                       /*
+                        * We also want to release the follow_primary lock if it was held
+                        * by the remote node, because a change of the watchdog
+                        * coordinator would otherwise leave us stuck in the
+                        * locked state forever.
+                        */
+                       pool_release_follow_primary_lock(true);
                }
        }
        if (user1SignalSlot->signalFlags[SIG_FAILOVER_INTERRUPT])
@@ -3250,9 +3257,9 @@ find_primary_node(void)
                pfree(password);
 
        /* Verify backend status */
-       pool_acquire_follow_primary_lock(true);
+       pool_acquire_follow_primary_lock(true, false);
        status = verify_backend_node_status(slots);
-       pool_release_follow_primary_lock();
+       pool_release_follow_primary_lock(false);
 
        for (i = 0; i < NUM_BACKENDS; i++)
        {
@@ -3390,7 +3397,16 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 #endif
 
                SetProcessGlobalVariables(PT_FOLLOWCHILD);
-               pool_acquire_follow_primary_lock(true);
+               /*
+                * When the watchdog is enabled, we can only get here
+                * on the coordinator node.
+                * So before acquiring the local lock, lock all the
+                * standby nodes so that they stop false primary
+                * detection until we are finished with the follow primary
+                * command.
+                */
+               wd_lock_standby(WD_FOLLOW_PRIMARY_LOCK);
+               pool_acquire_follow_primary_lock(true, false);
                Req_info->follow_primary_ongoing = true;
                ereport(LOG,
                                (errmsg("start triggering follow command.")));
@@ -3404,7 +3420,9 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
                                                                                 old_main_node, new_primary, old_primary);
                }
                Req_info->follow_primary_ongoing = false;
-               pool_release_follow_primary_lock();
+               pool_release_follow_primary_lock(false);
+               /* inform standby watchdog nodes to release the lock as well */
+               wd_unlock_standby(WD_FOLLOW_PRIMARY_LOCK);
                exit(0);
        }
        else if (pid == -1)
@@ -4345,9 +4363,11 @@ pool_set_backend_status_changed_time(int backend_id)
  * they are conflicting each other.  If argument "block" is true, this
  * function will not return until it succeeds in acquiring the lock.  This
  * function returns true if succeeded in acquiring the lock.
+ *
+ * The first argument "block" is ignored when remote_request is set.
  */
 bool
-pool_acquire_follow_primary_lock(bool block)
+pool_acquire_follow_primary_lock(bool block, bool remote_request)
 {
        pool_sigset_t oldmask;
        volatile int    follow_primary_count;
@@ -4365,6 +4385,29 @@ pool_acquire_follow_primary_lock(bool block)
                                        (errmsg("pool_acquire_follow_primary_lock: lock was not held by anyone")));
                        break;
                }
+               else if (follow_primary_count > 0 && remote_request)
+               {
+                       if (Req_info->follow_primary_lock_held_remotely)
+                       {
+                               /* The lock was already held by remote node and we only
+                                * support one remote lock
+                                */
+                               ereport(LOG,
+                                               (errmsg("pool_acquire_follow_primary_lock: received remote locking request while lock is already held by the remote node")));
+
+                       }
+                       else
+                       {
+                               /* set the flag that watchdog has requested the lock */
+                               Req_info->follow_primary_lock_pending = true;
+                       }
+                       pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+                       POOL_SETMASK(&oldmask);
+                       /* return and inform that the lock was held by someone */
+                       ereport(DEBUG1,
+                                       (errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d", follow_primary_count)));
+                       return false;
+               }
 
                else if (follow_primary_count > 0 && !block)
                {
@@ -4384,6 +4427,8 @@ pool_acquire_follow_primary_lock(bool block)
        }
 
        /* acquire lock */
+       Req_info->follow_primary_lock_held_remotely = remote_request;
+
        Req_info->follow_primary_count = 1;
        pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
        POOL_SETMASK(&oldmask);
@@ -4398,13 +4443,71 @@ pool_acquire_follow_primary_lock(bool block)
  * Release lock on follow primary command execution.
  */
 void
-pool_release_follow_primary_lock(void)
+pool_release_follow_primary_lock(bool remote_request)
 {
        pool_sigset_t oldmask;
 
        POOL_SETMASK2(&BlockSig, &oldmask);
        pool_semaphore_lock(FOLLOW_PRIMARY_SEM);
-       Req_info->follow_primary_count = 0;
+       if (remote_request)
+       {
+               if (Req_info->follow_primary_lock_held_remotely)
+               {
+                       /* remote request can only release locks held by remote nodes */
+                       Req_info->follow_primary_count = 0;
+                       Req_info->follow_primary_lock_held_remotely = false;
+                       ereport(DEBUG1,
+                                       (errmsg("pool_release_follow_primary_lock released the remote lock")));
+               }
+               else if (Req_info->follow_primary_count)
+               {
+                       /*
+                        * we have received the release lock request from remote
+                        * but the lock is not held by remote node.
+                        * Just ignore the request
+                        */
+                       ereport(DEBUG1,
+                                       (errmsg("pool_release_follow_primary_lock is not releasing the lock since it was not held by the remote node")));
+               }
+               /*
+                * Silently ignore if we received the release request from remote while no lock was held.
+                * Also clear the pending lock request, as we only support a single remote lock.
+                */
+               Req_info->follow_primary_lock_pending = false;
+
+       }
+       else /*local request */
+       {
+               /*
+                * if we have a pending lock request from watchdog
+                * do not remove the actual lock, Just clear the pending flag
+                */
+               if (Req_info->follow_primary_lock_pending)
+               {
+                       Req_info->follow_primary_lock_held_remotely = true;
+                       Req_info->follow_primary_count = 1;
+                       /* also clear the pending lock flag */
+                       Req_info->follow_primary_lock_pending = false;
+                       ereport(DEBUG1,
+                                       (errmsg("pool_release_follow_primary_lock is not releasing the lock and shifting it to the coordinator watchdog node")));
+               }
+               else
+               {
+                       if (Req_info->follow_primary_lock_held_remotely)
+                       {
+                               /*
+                                * Ideally this should not happen. Yet if for some
+                                * reason our local node is trying to release a lock
+                                * that is held by the remote node, just produce a LOG
+                                * message and release the lock.
+                                */
+                               ereport(LOG,
+                                               (errmsg("pool_release_follow_primary_lock is releasing the remote lock by local request")));
+                       }
+                       Req_info->follow_primary_count = 0;
+                       Req_info->follow_primary_lock_held_remotely = false;
+               }
+       }
        pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
        POOL_SETMASK(&oldmask);
 
index fb15ceb0b9a1e8cd082ed285a58dd000aca775b2..61b94446a52d09fdba0b2b514e784bf9c0f22911 100644 (file)
@@ -196,7 +196,7 @@ do_worker_child(void)
                         */
                        follow_primary_lock_acquired = false;
 
-                       if (pool_acquire_follow_primary_lock(false) == true)
+                       if (pool_acquire_follow_primary_lock(false, false) == true)
                        {
                                follow_primary_lock_acquired = true;
 
@@ -227,17 +227,11 @@ do_worker_child(void)
                                                        /*
                                                         * If detach_false_primary is enabled, send
                                                         * degenerate request to detach invalid node.
-                                                        * This should only happen on leader watchdog node
-                                                        * and quorum exists if watchdog is enabled. Other
-                                                        * nodes will be informed by the leader node later
-                                                        * on.
                                                         */
-                                                       if ((pool_config->detach_false_primary && !pool_config->use_watchdog) ||
-                                                               (pool_config->detach_false_primary && pool_config->use_watchdog &&
-                                                                wd_internal_get_watchdog_quorum_state() >= 0 && wd_status == WD_COORDINATOR))
+                                                       if (pool_config->detach_false_primary)
                                                        {
                                                                n = i;
-                                                               degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED);
+                                                               degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER);
                                                        }
                                                }
                                        }
@@ -245,7 +239,7 @@ do_worker_child(void)
                                PG_CATCH();
                                {
                                        discard_persistent_connection();
-                                       pool_release_follow_primary_lock();
+                                       pool_release_follow_primary_lock(false);
                                        follow_primary_lock_acquired = false;
                                        sleep(pool_config->sr_check_period);
                                        PG_RE_THROW();
@@ -256,7 +250,7 @@ do_worker_child(void)
                                discard_persistent_connection();
                                if (follow_primary_lock_acquired)
                                {
-                                       pool_release_follow_primary_lock();
+                                       pool_release_follow_primary_lock(false);
                                        follow_primary_lock_acquired = false;
                                }
                        }
@@ -656,6 +650,6 @@ static void
 sr_check_will_die(int code, Datum arg)
 {
        if (follow_primary_lock_acquired)
-               pool_release_follow_primary_lock();
+               pool_release_follow_primary_lock(false);
 
 }
index e167bd4151075fd3ed7a36c643c61fe22bd9068c..6adea4286d6f37ca99d9e90b6e38d793de6eb25f 100644 (file)
@@ -2102,6 +2102,16 @@ process_IPC_execute_cluster_command(WDCommandData * ipcCommand)
                ereport(LOG,
                                (errmsg("Watchdog has received reload config cluster command from IPC channel")));
        }
+       else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
+       {
+               ereport(LOG,
+                               (errmsg("Watchdog has received 'LOCK ON STANDBY' command from IPC channel")));
+               if (get_local_node_state() != WD_COORDINATOR)
+               {
+                       ereport(LOG,
+                                       (errmsg("'LOCK ON STANDBY' command can only be processed on coordinator node")));
+               }
+       }
        else
        {
                ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
@@ -3011,7 +3021,6 @@ static IPC_CMD_PROCESS_RES process_IPC_failover_indication(WDCommandData * ipcCo
                                                         errdetail("failed to get failover state from json data in command packet")));
                                        res = FAILOVER_RES_INVALID_FUNCTION;
                                }
-
                        }
                        else
                        {
@@ -4092,6 +4101,73 @@ wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt)
                                (errmsg("processing reload config command from remote node \"%s\"", wdNode->nodeName)));
                pool_signal_parent(SIGHUP);
        }
+       else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
+       {
+               int i;
+               int lock_type = -1;
+               char *operation = NULL;
+               if (get_local_node_state() == WD_STANDBY && wdNode->state == WD_COORDINATOR)
+               {
+                       if (nArgs == 2)
+                       {
+                               for (i = 0; i < nArgs; i++)
+                               {
+                                       if (strcmp(wdExecCommandArg[i].arg_name, "StandbyLockType") == 0)
+                                       {
+                                               lock_type = atoi(wdExecCommandArg[i].arg_value);
+                                       }
+                                       else if (strcmp(wdExecCommandArg[i].arg_name, "LockingOperation") == 0)
+                                       {
+                                               operation = wdExecCommandArg[i].arg_value;
+                                       }
+                                       else
+                                               ereport(LOG,
+                                                               (errmsg("unsupported argument \"%s\" in 'LOCK ON STANDBY' from remote node \"%s\"", wdExecCommandArg[i].arg_name, wdNode->nodeName)));
+                               }
+                               if (lock_type < 0 || operation == NULL)
+                               {
+                                       ereport(LOG,
+                                                       (errmsg("missing argument in 'LOCK ON STANDBY' from remote node \"%s\"", wdNode->nodeName),
+                                                        errdetail("command ignored")));
+                               }
+                               else if (lock_type == WD_FOLLOW_PRIMARY_LOCK)
+                               {
+                                       ereport(LOG,
+                                                       (errmsg("processing follow primary locking[%s] request from remote node \"%s\"", operation, wdNode->nodeName)));
+
+                                       if (strcasecmp("acquire", operation) == 0)
+                                               pool_acquire_follow_primary_lock(false, true);
+                                       else if (strcasecmp("release", operation) == 0)
+                                               pool_release_follow_primary_lock(true);
+                                       else
+                                               ereport(LOG,
+                                                               (errmsg("invalid locking operation[%s] in 'LOCK ON STANDBY' from remote node \"%s\"", operation, wdNode->nodeName),
+                                                                errdetail("command ignored")));
+                               }
+                               else
+                                       ereport(LOG,
+                                                       (errmsg("unsupported lock-type:%d in 'LOCK ON STANDBY' from remote node \"%s\"", lock_type, wdNode->nodeName)));
+
+                       }
+                       else
+                       {
+                               ereport(LOG,
+                                               (errmsg("invalid arguments in 'LOCK ON STANDBY' command from remote node \"%s\"",  wdNode->nodeName)));
+                       }
+               }
+               else if (get_local_node_state() != WD_STANDBY)
+               {
+                       ereport(LOG,
+                                       (errmsg("invalid node state to execute 'LOCK ON STANDBY' command")));
+
+               }
+               else
+               {
+                       ereport(LOG,
+                                       (errmsg("'LOCK ON STANDBY' command can only be accepted from the coordinator watchdog node"),
+                                        errdetail("ignoring...")));
+               }
+       }
        else
        {
                ereport(WARNING,
index d628ead42397120932ef5298ceb66bd73730d3bf..d2eb16fda3c5f877abacfa6ba326750cdcd9e73c 100644 (file)
@@ -72,6 +72,8 @@ static WDFailoverCMDResults wd_get_failover_result_from_data(WDIPCCmdResult * re
 static WDFailoverCMDResults wd_issue_failover_command(char *func_name, int *node_id_set,
                                                                                                                        int count, unsigned char flags);
 
+static WdCommandResult wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type,
+                                                                                                                       bool acquire);
 
 void
 wd_ipc_initialize_data(void)
@@ -552,3 +554,37 @@ wd_internal_get_watchdog_local_node_state(void)
 {
        return get_watchdog_local_node_state(pool_config->wd_authkey);
 }
+
+static WdCommandResult
+wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type, bool acquire)
+{
+       WDExecCommandArg wdExecCommandArg[2];
+
+       strncpy(wdExecCommandArg[0].arg_name, "StandbyLockType", sizeof(wdExecCommandArg[0].arg_name) - 1);
+       snprintf(wdExecCommandArg[0].arg_value, sizeof(wdExecCommandArg[0].arg_value), "%d", lock_type);
+
+       strncpy(wdExecCommandArg[1].arg_name, "LockingOperation", sizeof(wdExecCommandArg[1].arg_name) - 1);
+       snprintf(wdExecCommandArg[1].arg_value, sizeof(wdExecCommandArg[1].arg_value),
+                        "%s", acquire ? "acquire" : "release");
+
+       ereport(DEBUG1,
+                       (errmsg("sending standby locking request to watchdog")));
+
+       return wd_execute_cluster_command(WD_COMMAND_LOCK_ON_STANDBY, 2, wdExecCommandArg);
+}
+
+WdCommandResult
+wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type)
+{
+       if (pool_config->use_watchdog)
+               return wd_send_locking_command(lock_type, true);
+       return COMMAND_OK;
+}
+
+WdCommandResult
+wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type)
+{
+       if (pool_config->use_watchdog)
+               return wd_send_locking_command(lock_type, false);
+       return COMMAND_OK;
+}