Implementing the follow_primary command-locking over the watchdog channel.
authorMuhammad Usama <m.usama@highgo.ca>
Fri, 16 Jul 2021 07:17:27 +0000 (12:17 +0500)
committerMuhammad Usama <m.usama@gmail.com>
Fri, 16 Jul 2021 07:21:42 +0000 (12:21 +0500)
Supplementary fix for [pgpool-hackers: 3892] Problem with detach_false_primary..

commit:455f00dd5f5b7b94bd91aa0b6b40aab21dceabb9 fixed a race condition between
detach_false_primary and follow_primary commands. Part of the fix was to make
sure that the detach_false_primary should only be executed on the
leader watchdog node.

The mentioned commit ensures the execution of detach_false_primary on the
watchdog leader by getting the watchdog status from within the main process.
The design is good enough for most cases, but has the potential to fail if
the cluster goes into the election process just after the main process
has read the status.

To fix that, this commit implements the synchronization of follow_primary_command
execution using the distributed locks over the watchdog channel.

The idea is, just before executing the follow_primary during the failover process
we instruct all standby watchdog nodes to acquire a lock on their respective
nodes to block the false primary detection during the period when the
follow_primary is being executed on the leader watchdog node.

Moreover, to avoid keeping the watchdog process blocked while waiting for the
lock, the commit introduces a pending remote lock mechanism, so that remote locks
can be acquired in the background after the in-flight replication checks complete.

Finally, the REQ_DETAIL_CONFIRMED flag is removed from the degenerate_backend_set()
request that gets issued to detach the false primary. That means all quorum
and consensus rules must be satisfied for the detach to happen.

src/include/pool.h
src/include/watchdog/wd_internal_commands.h
src/include/watchdog/wd_ipc_defines.h
src/main/main.c
src/main/pgpool_main.c
src/streaming_replication/pool_worker_child.c
src/watchdog/watchdog.c
src/watchdog/wd_internal_commands.c

index fe19d2d98afd8ae0d0377769e0e6ec93affb182a..d15a0a13e6530ba1918d00fcf51a6f0464d5a931 100644 (file)
@@ -426,7 +426,7 @@ typedef enum
 #define REQ_DETAIL_WATCHDOG            0x00000002      /* failover req from watchdog */
 #define REQ_DETAIL_CONFIRMED   0x00000004      /* failover req that does not
                                                                                         * require majority vote */
-#define REQ_DETAIL_UPDATE              0x00000008      /* failover req is just and update
+#define REQ_DETAIL_UPDATE              0x00000008      /* failover req is just an update
                                                                                         * node status request */
 #define REQ_DETAIL_PROMOTE             0x00000010      /* failover req is actually promoting the specified standby node.
                                                                                         * current primary will be detached */
@@ -454,6 +454,16 @@ typedef struct
        /* greater than 0 if follow primary command or detach_false_primary in
         * execution */
        bool            follow_primary_count;
+       bool            follow_primary_lock_pending; /* The watchdog process can't wait
+                                                                                         * for follow_primary lock acquisition
+                                                                                         * in case the lock is already held
+                                                                                         * at the time of the request.
+                                                                                         * This flag indicates that the lock was requested
+                                                                                         * by the watchdog coordinator and the next contender
+                                                                                         * should wait for the coordinator to release the lock
+                                                                                         */
+       bool            follow_primary_lock_held_remotely; /* true when lock is held by
+                                                                                                       the watchdog coordinator */
        bool            follow_primary_ongoing; /* true if follow primary command is ongoing */
 }                      POOL_REQUEST_INFO;
 
@@ -633,8 +643,8 @@ extern POOL_NODE_STATUS * verify_backend_node_status(POOL_CONNECTION_POOL_SLOT *
 extern POOL_NODE_STATUS * pool_get_node_status(void);
 extern void pool_set_backend_status_changed_time(int backend_id);
 extern int     get_next_main_node(void);
-extern bool pool_acquire_follow_primary_lock(bool block);
-extern void pool_release_follow_primary_lock(void);
+extern bool pool_acquire_follow_primary_lock(bool block, bool remote_request);
+extern void pool_release_follow_primary_lock(bool remote_request);
 
 /* strlcpy.c */
 #ifndef HAVE_STRLCPY
index 4f0b37a2facbc4a8601ccd7a06591ce35d5f2dc8..5093eecc5e109a6cd3c5d8c1857b0cf033c7a147 100644 (file)
 #include "watchdog/wd_ipc_conn.h"
 #include "watchdog/wd_commands.h"
 
+/*
+ * These locks can only be acquired by the
+ * coordinator watchdog node on a standby
+ * watchdog node.
+ */
+typedef enum WD_LOCK_STANDBY_TYPE
+{
+       WD_INVALID_LOCK,
+       /* currently we have only one lock */
+       WD_FOLLOW_PRIMARY_LOCK
+}WD_LOCK_STANDBY_TYPE;
+
+
 extern WdCommandResult wd_start_recovery(void);
 extern WdCommandResult wd_end_recovery(void);
 extern WDFailoverCMDResults wd_send_failback_request(int node_id, unsigned char flags);
@@ -60,4 +73,8 @@ extern void set_watchdog_node_escalated(void);
 extern void reset_watchdog_node_escalated(void);
 extern bool get_watchdog_node_escalation_state(void);
 extern size_t wd_ipc_get_shared_mem_size(void);
+
+extern WdCommandResult wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type);
+extern WdCommandResult wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type);
+
 #endif                                                 /* WD_INTERNAL_COMMANDS_H */
index e66c4551d76701badbf6c699cf392e3193959259..16ff5a4158ec9722958e05b47744201ce867fbc0 100644 (file)
@@ -75,6 +75,7 @@ typedef enum WDValueDataType
 #define WD_COMMAND_REELECT_LEADER              "REELECT_LEADER"
 #define WD_COMMAND_SHUTDOWN_CLUSTER    "SHUTDOWN_CLUSTER"
 #define WD_COMMAND_RELOAD_CONFIG_CLUSTER               "RELOAD_CONFIG_CLUSTER"
+#define WD_COMMAND_LOCK_ON_STANDBY             "APPLY_LOCK_ON_STANDBY"
 
 
 #define WD_FUNCTION_START_RECOVERY             "START_RECOVERY"
index 2702bf28352ff5e373109717ddaff39d4fe8b0d2..575bee4a6fe875f66cc1be36081317ff0a6b1423 100644 (file)
@@ -77,6 +77,7 @@ int                   myargc;
 char     **myargv;
 int                    assert_enabled = 0;
 char      *pool_key = NULL;
+
 int
 main(int argc, char **argv)
 {
index d3e593af1e7d911ce6a9a6ab66cc1b3cd5ad4add..c4453a76428369f576ae2c7bcb919a6507d8a9fe 100644 (file)
@@ -1305,6 +1305,13 @@ sigusr1_interrupt_processor(void)
                                        (errmsg("we have joined the watchdog cluster as STANDBY node"),
                                         errdetail("syncing the backend states from the LEADER watchdog node")));
                        sync_backend_from_watchdog();
+                       /*
+                        * We also want to release the follow_primary lock if it was held
+                        * by the remote node, because a change of the watchdog
+                        * coordinator would otherwise leave us stuck in the
+                        * locked state forever.
+                        */
+                       pool_release_follow_primary_lock(true);
                }
        }
        if (user1SignalSlot->signalFlags[SIG_FAILOVER_INTERRUPT])
@@ -3250,9 +3257,9 @@ find_primary_node(void)
                pfree(password);
 
        /* Verify backend status */
-       pool_acquire_follow_primary_lock(true);
+       pool_acquire_follow_primary_lock(true, false);
        status = verify_backend_node_status(slots);
-       pool_release_follow_primary_lock();
+       pool_release_follow_primary_lock(false);
 
        for (i = 0; i < NUM_BACKENDS; i++)
        {
@@ -3390,7 +3397,16 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 #endif
 
                SetProcessGlobalVariables(PT_FOLLOWCHILD);
-               pool_acquire_follow_primary_lock(true);
+               /*
+                * When the watchdog is enabled, we can only get here
+                * on the coordinator node.
+                * So before acquiring the local lock, lock all the
+                * standby nodes so that they stop false primary
+                * detection until we are finished with the follow primary
+                * command.
+                */
+               wd_lock_standby(WD_FOLLOW_PRIMARY_LOCK);
+               pool_acquire_follow_primary_lock(true, false);
                Req_info->follow_primary_ongoing = true;
                ereport(LOG,
                                (errmsg("start triggering follow command.")));
@@ -3404,7 +3420,9 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
                                                                                 old_main_node, new_primary, old_primary);
                }
                Req_info->follow_primary_ongoing = false;
-               pool_release_follow_primary_lock();
+               pool_release_follow_primary_lock(false);
+               /* inform standby watchdog nodes to release the lock as well */
+               wd_unlock_standby(WD_FOLLOW_PRIMARY_LOCK);
                exit(0);
        }
        else if (pid == -1)
@@ -4345,9 +4363,11 @@ pool_set_backend_status_changed_time(int backend_id)
  * they are conflicting each other.  If argument "block" is true, this
  * function will not return until it succeeds in acquiring the lock.  This
  * function returns true if succeeded in acquiring the lock.
+ *
+ * The first argument "block" is ignored when remote_request is set.
  */
 bool
-pool_acquire_follow_primary_lock(bool block)
+pool_acquire_follow_primary_lock(bool block, bool remote_request)
 {
        pool_sigset_t oldmask;
        volatile int    follow_primary_count;
@@ -4365,6 +4385,29 @@ pool_acquire_follow_primary_lock(bool block)
                                        (errmsg("pool_acquire_follow_primary_lock: lock was not held by anyone")));
                        break;
                }
+               else if (follow_primary_count > 0 && remote_request)
+               {
+                       if (Req_info->follow_primary_lock_held_remotely)
+                       {
+                               /* The lock was already held by remote node and we only
+                                * support one remote lock
+                                */
+                               ereport(LOG,
+                                               (errmsg("pool_acquire_follow_primary_lock: received remote locking request while lock is already held by the remote node")));
+
+                       }
+                       else
+                       {
+                               /* set the flag that watchdog has requested the lock */
+                               Req_info->follow_primary_lock_pending = true;
+                       }
+                       pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+                       POOL_SETMASK(&oldmask);
+                       /* return and inform that the lock was held by someone */
+                       ereport(DEBUG1,
+                                       (errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d", follow_primary_count)));
+                       return false;
+               }
 
                else if (follow_primary_count > 0 && !block)
                {
@@ -4384,6 +4427,8 @@ pool_acquire_follow_primary_lock(bool block)
        }
 
        /* acquire lock */
+       Req_info->follow_primary_lock_held_remotely = remote_request;
+
        Req_info->follow_primary_count = 1;
        pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
        POOL_SETMASK(&oldmask);
@@ -4398,13 +4443,71 @@ pool_acquire_follow_primary_lock(bool block)
  * Release lock on follow primary command execution.
  */
 void
-pool_release_follow_primary_lock(void)
+pool_release_follow_primary_lock(bool remote_request)
 {
        pool_sigset_t oldmask;
 
        POOL_SETMASK2(&BlockSig, &oldmask);
        pool_semaphore_lock(FOLLOW_PRIMARY_SEM);
-       Req_info->follow_primary_count = 0;
+       if (remote_request)
+       {
+               if (Req_info->follow_primary_lock_held_remotely)
+               {
+                       /* remote request can only release locks held by remote nodes */
+                       Req_info->follow_primary_count = 0;
+                       Req_info->follow_primary_lock_held_remotely = false;
+                       ereport(DEBUG1,
+                                       (errmsg("pool_release_follow_primary_lock released the remote lock")));
+               }
+               else if (Req_info->follow_primary_count)
+               {
+                       /*
+                        * we have received the release lock request from remote
+                        * but the lock is not held by remote node.
+                        * Just ignore the request
+                        */
+                       ereport(DEBUG1,
+                                       (errmsg("pool_release_follow_primary_lock is not releasing the lock since it was not held by the remote node")));
+               }
+               /*
+                * Silently ignore if we received the release request from remote while no lock was held.
+                * Also clear the pending lock request, as we only support a single remote lock.
+                */
+               Req_info->follow_primary_lock_pending = false;
+
+       }
+       else /*local request */
+       {
+               /*
+                * if we have a pending lock request from watchdog
+                * do not remove the actual lock, Just clear the pending flag
+                */
+               if (Req_info->follow_primary_lock_pending)
+               {
+                       Req_info->follow_primary_lock_held_remotely = true;
+                       Req_info->follow_primary_count = 1;
+                       /* also clear the pending lock flag */
+                       Req_info->follow_primary_lock_pending = false;
+                       ereport(DEBUG1,
+                                       (errmsg("pool_release_follow_primary_lock is not releasing the lock and shifting it to the coordinator watchdog node")));
+               }
+               else
+               {
+                       if (Req_info->follow_primary_lock_held_remotely)
+                       {
+                               /*
+                                * Ideally this should not happen. Yet if for some
+                                * reason our local node is trying to release a lock
+                                * that is held by the remote node, just produce a LOG
+                                * message and release the lock.
+                                */
+                               ereport(LOG,
+                                               (errmsg("pool_release_follow_primary_lock is releasing the remote lock by local request")));
+                       }
+                       Req_info->follow_primary_count = 0;
+                       Req_info->follow_primary_lock_held_remotely = false;
+               }
+       }
        pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
        POOL_SETMASK(&oldmask);
 
index fb15ceb0b9a1e8cd082ed285a58dd000aca775b2..61b94446a52d09fdba0b2b514e784bf9c0f22911 100644 (file)
@@ -196,7 +196,7 @@ do_worker_child(void)
                         */
                        follow_primary_lock_acquired = false;
 
-                       if (pool_acquire_follow_primary_lock(false) == true)
+                       if (pool_acquire_follow_primary_lock(false, false) == true)
                        {
                                follow_primary_lock_acquired = true;
 
@@ -227,17 +227,11 @@ do_worker_child(void)
                                                        /*
                                                         * If detach_false_primary is enabled, send
                                                         * degenerate request to detach invalid node.
-                                                        * This should only happen on leader watchdog node
-                                                        * and quorum exists if watchdog is enabled. Other
-                                                        * nodes will be informed by the leader node later
-                                                        * on.
                                                         */
-                                                       if ((pool_config->detach_false_primary && !pool_config->use_watchdog) ||
-                                                               (pool_config->detach_false_primary && pool_config->use_watchdog &&
-                                                                wd_internal_get_watchdog_quorum_state() >= 0 && wd_status == WD_COORDINATOR))
+                                                       if (pool_config->detach_false_primary)
                                                        {
                                                                n = i;
-                                                               degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED);
+                                                               degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER);
                                                        }
                                                }
                                        }
@@ -245,7 +239,7 @@ do_worker_child(void)
                                PG_CATCH();
                                {
                                        discard_persistent_connection();
-                                       pool_release_follow_primary_lock();
+                                       pool_release_follow_primary_lock(false);
                                        follow_primary_lock_acquired = false;
                                        sleep(pool_config->sr_check_period);
                                        PG_RE_THROW();
@@ -256,7 +250,7 @@ do_worker_child(void)
                                discard_persistent_connection();
                                if (follow_primary_lock_acquired)
                                {
-                                       pool_release_follow_primary_lock();
+                                       pool_release_follow_primary_lock(false);
                                        follow_primary_lock_acquired = false;
                                }
                        }
@@ -656,6 +650,6 @@ static void
 sr_check_will_die(int code, Datum arg)
 {
        if (follow_primary_lock_acquired)
-               pool_release_follow_primary_lock();
+               pool_release_follow_primary_lock(false);
 
 }
index e167bd4151075fd3ed7a36c643c61fe22bd9068c..6adea4286d6f37ca99d9e90b6e38d793de6eb25f 100644 (file)
@@ -2102,6 +2102,16 @@ process_IPC_execute_cluster_command(WDCommandData * ipcCommand)
                ereport(LOG,
                                (errmsg("Watchdog has received reload config cluster command from IPC channel")));
        }
+       else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
+       {
+               ereport(LOG,
+                               (errmsg("Watchdog has received 'LOCK ON STANDBY' command from IPC channel")));
+               if (get_local_node_state() != WD_COORDINATOR)
+               {
+                       ereport(LOG,
+                                       (errmsg("'LOCK ON STANDBY' command can only be processed on coordinator node")));
+               }
+       }
        else
        {
                ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
@@ -3011,7 +3021,6 @@ static IPC_CMD_PROCESS_RES process_IPC_failover_indication(WDCommandData * ipcCo
                                                         errdetail("failed to get failover state from json data in command packet")));
                                        res = FAILOVER_RES_INVALID_FUNCTION;
                                }
-
                        }
                        else
                        {
@@ -4092,6 +4101,73 @@ wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt)
                                (errmsg("processing reload config command from remote node \"%s\"", wdNode->nodeName)));
                pool_signal_parent(SIGHUP);
        }
+       else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
+       {
+               int i;
+               int lock_type = -1;
+               char *operation = NULL;
+               if (get_local_node_state() == WD_STANDBY && wdNode->state == WD_COORDINATOR)
+               {
+                       if (nArgs == 2)
+                       {
+                               for (i = 0; i < nArgs; i++)
+                               {
+                                       if (strcmp(wdExecCommandArg[i].arg_name, "StandbyLockType") == 0)
+                                       {
+                                               lock_type = atoi(wdExecCommandArg[i].arg_value);
+                                       }
+                                       else if (strcmp(wdExecCommandArg[i].arg_name, "LockingOperation") == 0)
+                                       {
+                                               operation = wdExecCommandArg[i].arg_value;
+                                       }
+                                       else
+                                               ereport(LOG,
+                                                               (errmsg("unsupported argument \"%s\" in 'LOCK ON STANDBY' from remote node \"%s\"", wdExecCommandArg[i].arg_name, wdNode->nodeName)));
+                               }
+                               if (lock_type < 0 || operation == NULL)
+                               {
+                                       ereport(LOG,
+                                                       (errmsg("missing argument in 'LOCK ON STANDBY' from remote node \"%s\"", wdNode->nodeName),
+                                                        errdetail("command ignored")));
+                               }
+                               else if (lock_type == WD_FOLLOW_PRIMARY_LOCK)
+                               {
+                                       ereport(LOG,
+                                                       (errmsg("processing follow primary locking[%s] request from remote node \"%s\"", operation, wdNode->nodeName)));
+
+                                       if (strcasecmp("acquire", operation) == 0)
+                                               pool_acquire_follow_primary_lock(false, true);
+                                       else if (strcasecmp("release", operation) == 0)
+                                               pool_release_follow_primary_lock(true);
+                                       else
+                                               ereport(LOG,
+                                                               (errmsg("invalid locking operation[%s] in 'LOCK ON STANDBY' from remote node \"%s\"", operation, wdNode->nodeName),
+                                                                errdetail("command ignored")));
+                               }
+                               else
+                                       ereport(LOG,
+                                                       (errmsg("unsupported lock-type:%d in 'LOCK ON STANDBY' from remote node \"%s\"", lock_type, wdNode->nodeName)));
+
+                       }
+                       else
+                       {
+                               ereport(LOG,
+                                               (errmsg("invalid arguments in 'LOCK ON STANDBY' command from remote node \"%s\"",  wdNode->nodeName)));
+                       }
+               }
+               else if (get_local_node_state() != WD_STANDBY)
+               {
+                       ereport(LOG,
+                                       (errmsg("invalid node state to execute 'LOCK ON STANDBY' command")));
+
+               }
+               else
+               {
+                       ereport(LOG,
+                                       (errmsg("'LOCK ON STANDBY' command can only be accepted from the coordinator watchdog node"),
+                                        errdetail("ignoring...")));
+               }
+       }
        else
        {
                ereport(WARNING,
index d628ead42397120932ef5298ceb66bd73730d3bf..d2eb16fda3c5f877abacfa6ba326750cdcd9e73c 100644 (file)
@@ -72,6 +72,8 @@ static WDFailoverCMDResults wd_get_failover_result_from_data(WDIPCCmdResult * re
 static WDFailoverCMDResults wd_issue_failover_command(char *func_name, int *node_id_set,
                                                                                                                        int count, unsigned char flags);
 
+static WdCommandResult wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type,
+                                                                                                                       bool acquire);
 
 void
 wd_ipc_initialize_data(void)
@@ -552,3 +554,37 @@ wd_internal_get_watchdog_local_node_state(void)
 {
        return get_watchdog_local_node_state(pool_config->wd_authkey);
 }
+
+static WdCommandResult
+wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type, bool acquire)
+{
+       WDExecCommandArg wdExecCommandArg[2];
+
+       strncpy(wdExecCommandArg[0].arg_name, "StandbyLockType", sizeof(wdExecCommandArg[0].arg_name) - 1);
+       snprintf(wdExecCommandArg[0].arg_value, sizeof(wdExecCommandArg[0].arg_value), "%d", lock_type);
+
+       strncpy(wdExecCommandArg[1].arg_name, "LockingOperation", sizeof(wdExecCommandArg[1].arg_name) - 1);
+       snprintf(wdExecCommandArg[1].arg_value, sizeof(wdExecCommandArg[1].arg_value),
+                        "%s", acquire ? "acquire" : "release");
+
+       ereport(DEBUG1,
+                       (errmsg("sending standby locking request to watchdog")));
+
+       return wd_execute_cluster_command(WD_COMMAND_LOCK_ON_STANDBY, 2, wdExecCommandArg);
+}
+
+WdCommandResult
+wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type)
+{
+       if (pool_config->use_watchdog)
+               return wd_send_locking_command(lock_type, true);
+       return COMMAND_OK;
+}
+
+WdCommandResult
+wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type)
+{
+       if (pool_config->use_watchdog)
+               return wd_send_locking_command(lock_type, false);
+       return COMMAND_OK;
+}