Supplementary fix for [pgpool-hackers: 3892] Problem with detach_false_primary.

Commit 455f00dd5f5b7b94bd91aa0b6b40aab21dceabb9 fixed a race condition between
the detach_false_primary and follow_primary commands. Part of that fix was to
ensure that detach_false_primary is only executed on the leader watchdog node.

That commit enforces the execution of detach_false_primary on the watchdog
leader by reading the watchdog status from within the main process. The design
is good enough for most cases, but it can fail if the cluster goes into the
election process just after the main process has read the status.

To fix that, this commit synchronizes follow_primary_command execution using
distributed locks over the watchdog channel. The idea is that, just before
executing the follow primary command during the failover process, we instruct
all standby watchdog nodes to acquire a lock on their respective nodes, which
blocks false primary detection for as long as the follow primary command is
being executed on the leader watchdog node.

Moreover, to avoid keeping the watchdog process blocked waiting for the lock,
the commit introduces a pending remote lock mechanism, so that remote locks can
be acquired in the background once the in-flight replication checks complete.

Finally, the REQ_DETAIL_CONFIRMED flag is removed from the
degenerate_backend_set() request that is issued to detach the false primary.
That means all quorum and consensus rules must be satisfied for the detach to
happen.
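
In outline, the leader-side failover path now brackets the follow primary
command with the new distributed lock. A simplified sketch of the sequence,
based on the hunks below (error handling and the actual follow primary loop
omitted):

    /* on the leader watchdog node, in the forked follow-primary process */
    wd_lock_standby(WD_FOLLOW_PRIMARY_LOCK);        /* standbys pause false primary detection */
    pool_acquire_follow_primary_lock(true, false);  /* blocking local lock request */
    Req_info->follow_primary_ongoing = true;

    /* ... run follow_primary_command for each backend node ... */

    Req_info->follow_primary_ongoing = false;
    pool_release_follow_primary_lock(false);        /* release the local lock */
    wd_unlock_standby(WD_FOLLOW_PRIMARY_LOCK);      /* standbys resume false primary detection */
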
#define REQ_DETAIL_WATCHDOG 0x00000002 /* failover req from watchdog */
#define REQ_DETAIL_CONFIRMED 0x00000004 /* failover req that does not
* require majority vote */
-#define REQ_DETAIL_UPDATE 0x00000008 /* failover req is just and update
+#define REQ_DETAIL_UPDATE 0x00000008 /* failover req is just an update
* node status request */
#define REQ_DETAIL_PROMOTE 0x00000010 /* failover req is actually promoting the specified standby node.
* current primary will be detached */
/* greater than 0 if follow primary command or detach_false_primary in
* execution */
bool follow_primary_count;
+ bool follow_primary_lock_pending; /* The watchdog process cannot wait
+ * for follow_primary lock acquisition
+ * if the lock is already held at the
+ * time of the request. This flag
+ * indicates that the lock was requested
+ * by the watchdog coordinator and that
+ * the next contender should wait for
+ * the coordinator to release it.
+ */
+ bool follow_primary_lock_held_remotely; /* true when the lock is held by
+ * the watchdog coordinator */
bool follow_primary_ongoing; /* true if follow primary command is ongoing */
} POOL_REQUEST_INFO;
extern POOL_NODE_STATUS * pool_get_node_status(void);
extern void pool_set_backend_status_changed_time(int backend_id);
extern int get_next_main_node(void);
-extern bool pool_acquire_follow_primary_lock(bool block);
-extern void pool_release_follow_primary_lock(void);
+extern bool pool_acquire_follow_primary_lock(bool block, bool remote_request);
+extern void pool_release_follow_primary_lock(bool remote_request);
/* strlcpy.c */
#ifndef HAVE_STRLCPY
#include "watchdog/wd_ipc_conn.h"
#include "watchdog/wd_commands.h"
+/*
+ * These locks can only be acquired by the
+ * coordinator watchdog node on a standby
+ * watchdog node.
+ */
+typedef enum WD_LOCK_STANDBY_TYPE
+{
+ WD_INVALID_LOCK,
+ /* currently we have only one lock */
+ WD_FOLLOW_PRIMARY_LOCK
+} WD_LOCK_STANDBY_TYPE;
+
+
extern WdCommandResult wd_start_recovery(void);
extern WdCommandResult wd_end_recovery(void);
extern WDFailoverCMDResults wd_send_failback_request(int node_id, unsigned char flags);
extern void reset_watchdog_node_escalated(void);
extern bool get_watchdog_node_escalation_state(void);
extern size_t wd_ipc_get_shared_mem_size(void);
+
+extern WdCommandResult wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type);
+extern WdCommandResult wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type);
+
#endif /* WD_INTERNAL_COMMANDS_H */
#define WD_COMMAND_REELECT_LEADER "REELECT_LEADER"
#define WD_COMMAND_SHUTDOWN_CLUSTER "SHUTDOWN_CLUSTER"
#define WD_COMMAND_RELOAD_CONFIG_CLUSTER "RELOAD_CONFIG_CLUSTER"
+#define WD_COMMAND_LOCK_ON_STANDBY "APPLY_LOCK_ON_STANDBY"
#define WD_FUNCTION_START_RECOVERY "START_RECOVERY"
char **myargv;
int assert_enabled = 0;
char *pool_key = NULL;
+
int
main(int argc, char **argv)
{
(errmsg("we have joined the watchdog cluster as STANDBY node"),
errdetail("syncing the backend states from the LEADER watchdog node")));
sync_backend_from_watchdog();
+ /*
+ * We also want to release the follow_primary lock if it was
+ * held by the remote node, because a change of the watchdog
+ * coordinator would otherwise leave us stuck in the locked
+ * state forever.
+ */
+ pool_release_follow_primary_lock(true);
}
}
if (user1SignalSlot->signalFlags[SIG_FAILOVER_INTERRUPT])
pfree(password);
/* Verify backend status */
- pool_acquire_follow_primary_lock(true);
+ pool_acquire_follow_primary_lock(true, false);
status = verify_backend_node_status(slots);
- pool_release_follow_primary_lock();
+ pool_release_follow_primary_lock(false);
for (i = 0; i < NUM_BACKENDS; i++)
{
#endif
SetProcessGlobalVariables(PT_FOLLOWCHILD);
- pool_acquire_follow_primary_lock(true);
+ /*
+ * When the watchdog is enabled, we only get here on the
+ * coordinator node. So, before acquiring the local lock, lock
+ * all the standby nodes so that they stop false primary
+ * detection until we are finished with the follow primary
+ * command.
+ */
+ wd_lock_standby(WD_FOLLOW_PRIMARY_LOCK);
+ pool_acquire_follow_primary_lock(true, false);
Req_info->follow_primary_ongoing = true;
ereport(LOG,
(errmsg("start triggering follow command.")));
old_main_node, new_primary, old_primary);
}
Req_info->follow_primary_ongoing = false;
- pool_release_follow_primary_lock();
+ pool_release_follow_primary_lock(false);
+ /* inform standby watchdog nodes to release the lock as well */
+ wd_unlock_standby(WD_FOLLOW_PRIMARY_LOCK);
exit(0);
}
else if (pid == -1)
* they are conflicting each other. If argument "block" is true, this
* function will not return until it succeeds in acquiring the lock. This
* function returns true if succeeded in acquiring the lock.
+ *
+ * The first argument (block) is ignored when remote_request is set.
*/
bool
-pool_acquire_follow_primary_lock(bool block)
+pool_acquire_follow_primary_lock(bool block, bool remote_request)
{
pool_sigset_t oldmask;
volatile int follow_primary_count;
(errmsg("pool_acquire_follow_primary_lock: lock was not held by anyone")));
break;
}
+ else if (follow_primary_count > 0 && remote_request)
+ {
+ if (Req_info->follow_primary_lock_held_remotely)
+ {
+ /*
+ * The lock is already held by the remote node, and we only
+ * support one remote lock.
+ */
+ ereport(LOG,
+ (errmsg("pool_acquire_follow_primary_lock: received remote locking request while lock is already held by the remote node")));
+ }
+ else
+ {
+ /* set the flag that watchdog has requested the lock */
+ Req_info->follow_primary_lock_pending = true;
+ }
+ pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+ POOL_SETMASK(&oldmask);
+ /* return and inform that the lock was held by someone */
+ ereport(DEBUG1,
+ (errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d", follow_primary_count)));
+ return false;
+ }
else if (follow_primary_count > 0 && !block)
{
}
/* acquire lock */
+ Req_info->follow_primary_lock_held_remotely = remote_request;
+
Req_info->follow_primary_count = 1;
pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
POOL_SETMASK(&oldmask);
* Release lock on follow primary command execution.
*/
void
-pool_release_follow_primary_lock(void)
+pool_release_follow_primary_lock(bool remote_request)
{
pool_sigset_t oldmask;
POOL_SETMASK2(&BlockSig, &oldmask);
pool_semaphore_lock(FOLLOW_PRIMARY_SEM);
- Req_info->follow_primary_count = 0;
+ if (remote_request)
+ {
+ if (Req_info->follow_primary_lock_held_remotely)
+ {
+ /* remote request can only release locks held by remote nodes */
+ Req_info->follow_primary_count = 0;
+ Req_info->follow_primary_lock_held_remotely = false;
+ ereport(DEBUG1,
+ (errmsg("pool_release_follow_primary_lock relased the remote lock")));
+ }
+ else if (Req_info->follow_primary_count)
+ {
+ /*
+ * We have received a release lock request from the remote node,
+ * but the lock is not held by a remote node. Just ignore the
+ * request.
+ */
+ ereport(DEBUG1,
+ (errmsg("pool_release_follow_primary_lock is not relasing the lock since it was not held by remote node")));
+ }
+ /*
+ * Silently ignore the release request from the remote node if no
+ * lock was held. Also clear the pending lock request, as we only
+ * support a single remote lock.
+ */
+ Req_info->follow_primary_lock_pending = false;
+
+ }
+ else /* local request */
+ {
+ /*
+ * If we have a pending lock request from the watchdog, do not
+ * remove the actual lock; just clear the pending flag.
+ */
+ if (Req_info->follow_primary_lock_pending)
+ {
+ Req_info->follow_primary_lock_held_remotely = true;
+ Req_info->follow_primary_count = 1;
+ /* also clear the pending lock flag */
+ Req_info->follow_primary_lock_pending = false;
+ ereport(DEBUG1,
+ (errmsg("pool_release_follow_primary_lock is not relasing the lock and shifting it to coordinator watchdog node")));
+ }
+ else
+ {
+ if (Req_info->follow_primary_lock_held_remotely)
+ {
+ /*
+ * Ideally this should not happen, yet if for some reason our
+ * local node is trying to release a lock that is held by the
+ * remote node, just produce a LOG message and release the lock.
+ */
+ ereport(LOG,
+ (errmsg("pool_release_follow_primary_lock is relasing the remote lock by local request")));
+ }
+ Req_info->follow_primary_count = 0;
+ Req_info->follow_primary_lock_held_remotely = false;
+ }
+ }
pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
POOL_SETMASK(&oldmask);
*/
follow_primary_lock_acquired = false;
- if (pool_acquire_follow_primary_lock(false) == true)
+ if (pool_acquire_follow_primary_lock(false, false) == true)
{
follow_primary_lock_acquired = true;
/*
* If detach_false_primary is enabled, send
* degenerate request to detach invalid node.
- * This should only happen on leader watchdog node
- * and quorum exists if watchdog is enabled. Other
- * nodes will be informed by the leader node later
- * on.
*/
- if ((pool_config->detach_false_primary && !pool_config->use_watchdog) ||
- (pool_config->detach_false_primary && pool_config->use_watchdog &&
- wd_internal_get_watchdog_quorum_state() >= 0 && wd_status == WD_COORDINATOR))
+ if (pool_config->detach_false_primary)
{
n = i;
- degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED);
+ degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER);
}
}
}
PG_CATCH();
{
discard_persistent_connection();
- pool_release_follow_primary_lock();
+ pool_release_follow_primary_lock(false);
follow_primary_lock_acquired = false;
sleep(pool_config->sr_check_period);
PG_RE_THROW();
discard_persistent_connection();
if (follow_primary_lock_acquired)
{
- pool_release_follow_primary_lock();
+ pool_release_follow_primary_lock(false);
follow_primary_lock_acquired = false;
}
}
sr_check_will_die(int code, Datum arg)
{
if (follow_primary_lock_acquired)
- pool_release_follow_primary_lock();
+ pool_release_follow_primary_lock(false);
}
ereport(LOG,
(errmsg("Watchdog has received reload config cluster command from IPC channel")));
}
+ else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
+ {
+ ereport(LOG,
+ (errmsg("Watchdog has received 'LOCK ON STANDBY' command from IPC channel")));
+ if (get_local_node_state() != WD_COORDINATOR)
+ {
+ ereport(LOG,
+ (errmsg("'LOCK ON STANDBY' command can only be processed on coordinator node")));
+ }
+ }
else
{
ipcCommand->errorMessage = MemoryContextStrdup(ipcCommand->memoryContext,
errdetail("failed to get failover state from json data in command packet")));
res = FAILOVER_RES_INVALID_FUNCTION;
}
-
}
else
{
(errmsg("processing reload config command from remote node \"%s\"", wdNode->nodeName)));
pool_signal_parent(SIGHUP);
}
+ else if (strcasecmp(WD_COMMAND_LOCK_ON_STANDBY, clusterCommand) == 0)
+ {
+ int i;
+ int lock_type = -1;
+ char *operation = NULL;
+ if (get_local_node_state() == WD_STANDBY && wdNode->state == WD_COORDINATOR)
+ {
+ if (nArgs == 2)
+ {
+ for ( i =0; i < nArgs; i++)
+ {
+ if (strcmp(wdExecCommandArg[i].arg_name, "StandbyLockType") == 0)
+ {
+ lock_type = atoi(wdExecCommandArg[i].arg_value);
+ }
+ else if (strcmp(wdExecCommandArg[i].arg_name, "LockingOperation") == 0)
+ {
+ operation = wdExecCommandArg[i].arg_value;
+ }
+ else
+ ereport(LOG,
+ (errmsg("unsupported argument \"%s\" in 'LOCK ON STANDBY' from remote node \"%s\"", wdExecCommandArg[i].arg_name, wdNode->nodeName)));
+ }
+ if (lock_type < 0 || operation == NULL)
+ {
+ ereport(LOG,
+ (errmsg("missing argument in 'LOCK ON STANDBY' from remote node \"%s\"", wdNode->nodeName),
+ errdetail("command ignored")));
+ }
+ else if (lock_type == WD_FOLLOW_PRIMARY_LOCK)
+ {
+ ereport(LOG,
+ (errmsg("processing follow primary looking[%s] request from remote node \"%s\"", operation,wdNode->nodeName)));
+
+ if (strcasecmp("acquire", operation) == 0)
+ pool_acquire_follow_primary_lock(false, true);
+ else if (strcasecmp("release", operation) == 0)
+ pool_release_follow_primary_lock(true);
+ else
+ ereport(LOG,
+ (errmsg("invalid looking operaition[%s] in 'LOCK ON STANDBY' from remote node \"%s\"", operation, wdNode->nodeName),
+ errdetail("command ignored")));
+ }
+ else
+ ereport(LOG,
+ (errmsg("unsupported lock-type:%d in 'LOCK ON STANDBY' from remote node \"%s\"", lock_type, wdNode->nodeName)));
+
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("invalid arguments in 'LOCK ON STANDBY' command from remote node \"%s\"", wdNode->nodeName)));
+ }
+ }
+ else if (get_local_node_state() != WD_STANDBY)
+ {
+ ereport(LOG,
+ (errmsg("invalid node state to execute 'LOCK ON STANDBY' command")));
+
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("'LOCK ON STANDBY' command can only be accepted from the coordinator watchdog node"),
+ errdetail("ignoring...")));
+ }
+ }
else
{
ereport(WARNING,
static WDFailoverCMDResults wd_issue_failover_command(char *func_name, int *node_id_set,
int count, unsigned char flags);
+static WdCommandResult wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type,
+ bool acquire);
void
wd_ipc_initialize_data(void)
{
return get_watchdog_local_node_state(pool_config->wd_authkey);
}
+
+static WdCommandResult
+wd_send_locking_command(WD_LOCK_STANDBY_TYPE lock_type, bool acquire)
+{
+ WDExecCommandArg wdExecCommandArg[2];
+
+ strncpy(wdExecCommandArg[0].arg_name, "StandbyLockType", sizeof(wdExecCommandArg[0].arg_name) - 1);
+ snprintf(wdExecCommandArg[0].arg_value, sizeof(wdExecCommandArg[0].arg_value) - 1, "%d", lock_type);
+
+ strncpy(wdExecCommandArg[1].arg_name, "LockingOperation", sizeof(wdExecCommandArg[1].arg_name) - 1);
+ snprintf(wdExecCommandArg[1].arg_value, sizeof(wdExecCommandArg[1].arg_value) - 1,
+ "%s", acquire ? "acquire" : "release");
+
+ ereport(DEBUG1,
+ (errmsg("sending standby locking request to watchdog")));
+
+ return wd_execute_cluster_command(WD_COMMAND_LOCK_ON_STANDBY, 2, wdExecCommandArg);
+}
+
+WdCommandResult
+wd_lock_standby(WD_LOCK_STANDBY_TYPE lock_type)
+{
+ if (pool_config->use_watchdog)
+ return wd_send_locking_command(lock_type, true);
+ return COMMAND_OK;
+}
+
+WdCommandResult
+wd_unlock_standby(WD_LOCK_STANDBY_TYPE lock_type)
+{
+ if (pool_config->use_watchdog)
+ return wd_send_locking_command(lock_type, false);
+ return COMMAND_OK;
+}