Fixing a problem with the watchdog failover_command locking mechanism
author	Muhammad Usama <m.usama@gmail.com>
Mon, 19 Sep 2016 19:38:49 +0000 (00:38 +0500)
committer	Muhammad Usama <m.usama@gmail.com>
Mon, 19 Sep 2016 19:38:49 +0000 (00:38 +0500)
Since pgpool-II 3.5 the watchdog has used a separate individual lock for each
node-failover command (failover, failback and follow-master). Each lock was
acquired just before executing the respective failover script and released as
soon as the script execution finished. This technique was very efficient but
had a problem: if the failover script finished quickly, before the lock request
from the other pgpool-II node arrived, that node was also granted the lock,
since the first node had already released it by then. Consequently, both nodes
ended up executing the failover script.
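
The race, sketched against the per-command locking API this patch removes (a
condensed illustration, not the literal code; the real call sites are in
failover() in src/main/pgpool_main.c):

    /* node A: takes the per-command lock, runs the script, releases */
    if (wd_failover_command_start(NODE_FAILED_CMD) == FAILOVER_RES_I_AM_LOCK_HOLDER)
    {
        trigger_failover_command(i, pool_config->failover_command,
                                 MASTER_NODE_ID, new_master, PRIMARY_NODE_ID);
        wd_release_failover_command_lock(NODE_FAILED_CMD);  /* lock is gone now */
    }

    /* node B: its lock request reaches the master after the release above,
     * so it too is granted the lock and runs the same failover script */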
To fix this we are reverting to the proven failover interlocking design used
prior to pgpool-II 3.5, in which the node that becomes the lock holder acquires
all command locks at failover start, and each command lock is released only
after that command's execution finishes. Only the lock-holder node is allowed
to acquire or release the individual command locks. This way the lock-holder
node keeps its lock-holder status throughout the span of the failover execution
and the system becomes far less timing sensitive.
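
With this patch the call pattern in failover() becomes, in condensed form (the
full flow, including the failback and follow-master sections, is in the diff of
src/main/pgpool_main.c below):

    /* all command locks are taken once, at failover start */
    WDFailoverCMDResults wdInterlockingRes = wd_start_failover_interlocking();

    if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
    {
        trigger_failover_command(i, pool_config->failover_command,
                                 MASTER_NODE_ID, new_master, PRIMARY_NODE_ID);
        wd_failover_lock_release(FAILOVER_LOCK);  /* release only this command's lock */
        /* ... the follow-master section later releases FOLLOW_MASTER_LOCK ... */
        wd_end_failover_interlocking();           /* resign lock holder at the very end */
    }
    else
    {
        /* not the lock holder: wait until the holder finishes this command */
        wd_wait_until_command_complete_or_timeout(FAILOVER_LOCK);
    }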

The issue was identified by Yugo Nagata <nagata@sraoss.co.jp>

src/include/pool.h
src/include/watchdog/wd_ipc_commands.h
src/include/watchdog/wd_ipc_defines.h
src/main/pgpool_main.c
src/watchdog/watchdog.c
src/watchdog/wd_commands.c

index 59533d27e4f51cc8885202848547ea3b33b4999f..65b3c92f6f8ef5b1704fd76056b74db999c5d2d9 100644 (file)
@@ -373,7 +373,9 @@ extern int my_master_node_id;
 #define ACCEPT_FD_SEM                  5
 #define MAX_REQUEST_QUEUE_SIZE 10
 
-#define MAX_SEC_WAIT_FOR_CLUSTER_TRANSATION 6 /*number of sec to wait for watchdog command if cluster is stabalizing */
+#define MAX_SEC_WAIT_FOR_CLUSTER_TRANSATION 6 /* time in seconds to keep retrying for a
+                                                                                          * watchdog command if the cluster is not
+                                                                                          * in stable state */
 
 #define SERIALIZE_ACCEPT (pool_config->serialize_accept == true && \
                                                  pool_config->child_life_time == 0)
index 1c4175c1aa0ba4de344a92ab2c44a1b576608139..d5dd148304c48969e8de93d906a22e6d2a4ccf5c 100644 (file)
@@ -63,7 +63,6 @@ extern WdCommandResult wd_end_recovery(void);
 extern WdCommandResult wd_send_failback_request(int node_id);
 extern WdCommandResult wd_degenerate_backend_set(int *node_id_set, int count);
 extern WdCommandResult wd_promote_backend(int node_id);
-extern WDFailoverCMDResults wd_send_failover_sync_command(WDFailoverCMDTypes cmdType, char* syncReqType);
 
 extern WDPGBackendStatus* get_pg_backend_status_from_master_wd_node(void);
 
@@ -72,13 +71,12 @@ extern char* wd_get_watchdog_nodes(int nodeID);
 extern WDIPCCmdResult* issue_command_to_watchdog(char type, int timeout_sec, char* data, int data_len, bool blocking);
 
 
-/* wd_interlock.c */
-
-extern WDFailoverCMDResults wd_release_failover_command_lock(WDFailoverCMDTypes cmdType);
-extern WDFailoverCMDResults wd_failover_command_check_lock(WDFailoverCMDTypes cmdType);
-extern WDFailoverCMDResults wd_failover_command_end(WDFailoverCMDTypes cmdType);
-extern WDFailoverCMDResults wd_failover_command_start(WDFailoverCMDTypes cmdType);
-extern void wd_wati_until_lock_or_timeout(WDFailoverCMDTypes cmdType);
+/* functions for failover commands interlocking */
+extern WDFailoverCMDResults wd_end_failover_interlocking(void);
+extern WDFailoverCMDResults wd_start_failover_interlocking(void);
+extern WDFailoverCMDResults wd_failover_lock_release(enum WDFailoverLocks lock);
+extern WDFailoverCMDResults wd_failover_lock_status(enum WDFailoverLocks lock);
+extern void wd_wait_until_command_complete_or_timeout(enum WDFailoverLocks lock);
 
 
 
index cd8adf5e751c602401c1a4b3044a13ffba14e3b6..9301f2a98a0379ec1b25f2447dccf5c106c1b3df 100644 (file)
 #ifndef WD_IPC_DEFINES_H
 #define WD_IPC_DEFINES_H
 
-typedef enum WDFailoverCMDTypes
+typedef enum WDFailoverLocks
 {
-       NODE_FAILED_CMD = 0,
-       NODE_FAILBACK_CMD,
-       NODE_PROMOTE_CMD,
-       MAX_FAILOVER_CMDS
-}WDFailoverCMDTypes;
+       FAILOVER_LOCK = 0,
+       FAILBACK_LOCK,
+       FOLLOW_MASTER_LOCK,
+       MAX_FAILOVER_LOCKS
+}WDFailoverLock;
 
 typedef enum WDFailoverCMDResults
 {
        FAILOVER_RES_ERROR = 0,                         /* processing of command is failed */
-       FAILOVER_RES_TRANSITION,                        /* cluster is transitioning and is 
+       FAILOVER_RES_TRANSITION,                        /* cluster is transitioning and is
                                                                                 * currently not accepting any commands.
                                                                                 * retry is the best option when this result
                                                                                 * is returned by watchdog
                                                                                 */
        FAILOVER_RES_I_AM_LOCK_HOLDER,          /* node successfully becomes a lock holder */
-       FAILOVER_RES_LOCK_UNLOCKED,                     /* the node is not a lock holder but associated
-                                                                                * lock is unlocked */
-       FAILOVER_RES_BLOCKED                            /* the node is neither a lock holder and
-                                                                                * associated lock is also locked
-                                                                                */
+       FAILOVER_RES_I_AM_NOT_LOCK_HOLDER,      /* some other node is a lock holder */
+       FAILOVER_RES_UNLOCKED,                          /* the lock is not acquired */
+       FAILOVER_RES_LOCKED,                            /* lock is acquired */
+       FAILOVER_RES_SUCCESS,                           /* the interlocking command succeeded */
+       FAILOVER_RES_NO_LOCKHOLDER                      /* currently no lock holder exists */
 }WDFailoverCMDResults;
 
 
@@ -73,6 +73,14 @@ typedef enum WDFailoverCMDResults
 
 #define WD_DATE_REQ_PG_BACKEND_DATA            "BackendStatus"
 
+
+#define WD_REQ_FAILOVER_START                  "FAILOVER_START"
+#define WD_REQ_FAILOVER_END                            "FAILOVER_FINISH"
+#define WD_REQ_FAILOVER_RELEASE_LOCK   "RELEASE_LOCK"
+#define WD_REQ_FAILOVER_LOCK_STATUS            "CHECK_LOCKED"
+
+
+
 #define WD_IPC_AUTH_KEY                        "IPCAuthKey"    /* JSON data key for authentication.
                                                                                                 * watchdog IPC server use the value for this key
                                                                                                 * to authenticate the external IPC clients
index ecbf18f547502b500448faae84f6a563a49bf885..04156f7022bedea9a31429b5fa33cd8a6e36eb77 100644 (file)
@@ -1549,6 +1549,7 @@ static void failover(void)
                kill(pcp_pid, SIGUSR2);
                return;
        }
+
        Req_info->switching = true;
        switching = 1;
        for(;;)
@@ -1558,7 +1559,7 @@ static void failover(void)
                int node_id_set[MAX_NUM_BACKENDS];
                int node_count;
                unsigned char request_details;
-               WDFailoverCMDResults failoverLockRes;
+               WDFailoverCMDResults wdInterlockingRes;
 
                pool_semaphore_lock(REQUEST_INFO_SEM);
 
@@ -1590,6 +1591,9 @@ static void failover(void)
                        continue;
                }
 
+               /* start watchdog interlocking */
+               wdInterlockingRes = wd_start_failover_interlocking();
+
                /*
                 * if not in replication mode/master slave mode, we treat this a restart request.
                 * otherwise we need to check if we have already failovered.
@@ -1607,7 +1611,6 @@ static void failover(void)
                                BACKEND_INFO(node_id).backend_status == CON_DOWN) && VALID_BACKEND(node_id)) ||
                                (reqkind == NODE_DOWN_REQUEST && !VALID_BACKEND(node_id)))
                        {
-
                                if (node_id < 0 || node_id >= MAX_NUM_BACKENDS)
                                        ereport(LOG,
                                                (errmsg("invalid failback request, node id: %d is invalid. node id must be between [0 and %d]",node_id,MAX_NUM_BACKENDS)));
@@ -1615,6 +1618,9 @@ static void failover(void)
                                        ereport(LOG,
                                                        (errmsg("invalid failback request, status: [%d] of node id : %d is invalid for failback",BACKEND_INFO(node_id).backend_status,node_id)));
 
+                               if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
+                                       wd_end_failover_interlocking();
+
                                continue;
                        }
 
@@ -1627,24 +1633,20 @@ static void failover(void)
                        (void)write_status_file();
 
                        /* Acquire failback start command lock */
-                       failoverLockRes = wd_failover_command_start(NODE_FAILBACK_CMD);
-
-                       if (failoverLockRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
+                       if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
                        {
                                trigger_failover_command(node_id, pool_config->failback_command,
-                                                                               MASTER_NODE_ID, get_next_master_node(), PRIMARY_NODE_ID);
-
-                               wd_release_failover_command_lock(NODE_FAILBACK_CMD);
-
+                                                                                       MASTER_NODE_ID, get_next_master_node(), PRIMARY_NODE_ID);
+                               wd_failover_lock_release(FAILBACK_LOCK);
                        }
                        else
                        {
                                /*
                                 * Okay we are not allowed to execute the failover command
-                                * so we need to wait untle the one who is executing the command
+                                * so we need to wait until the one who is executing the command
                                 * finish with it.
                                 */
-                               wd_wati_until_lock_or_timeout(NODE_FAILBACK_CMD);
+                               wd_wait_until_command_complete_or_timeout(FAILBACK_LOCK);
                        }
                }
                else if (reqkind == PROMOTE_NODE_REQUEST)
@@ -1660,10 +1662,10 @@ static void failover(void)
                        {
                                ereport(LOG,
                                                (errmsg("failover: no backends are promoted")));
+                               if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
+                                       wd_end_failover_interlocking();
                                continue;
                        }
-
-                       failoverLockRes = wd_failover_command_start(NODE_PROMOTE_CMD);
                }
                else    /* NODE_DOWN_REQUEST */
                {
@@ -1694,6 +1696,9 @@ static void failover(void)
                                ereport(LOG,
                                                (errmsg("failover: no backends are degenerated")));
 
+                               if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
+                                       wd_end_failover_interlocking();
+
                                continue;
                        }
                }
@@ -1798,10 +1803,7 @@ static void failover(void)
                        need_to_restart_children = true;
                        partial_restart = false;
                }
-
-               failoverLockRes = wd_failover_command_start(NODE_FAILED_CMD);
-
-               if (failoverLockRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
+               if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
                {
                        /* Exec failover_command if needed */
                        for (i = 0; i < pool_config->backend_desc->num_backends; i++)
@@ -1810,10 +1812,12 @@ static void failover(void)
                                        trigger_failover_command(i, pool_config->failover_command,
                                                                                                MASTER_NODE_ID, new_master, PRIMARY_NODE_ID);
                        }
+                       wd_failover_lock_release(FAILOVER_LOCK);
                }
                else
-                       wd_wati_until_lock_or_timeout(NODE_FAILED_CMD);
-
+               {
+                       wd_wait_until_command_complete_or_timeout(FAILOVER_LOCK);
+               }
 
        /* no need to wait since it will be done in reap_handler */
 #ifdef NOT_USED
@@ -1900,26 +1904,20 @@ static void failover(void)
                /*
                 * follow master command also uses the same locks used by the triggering command
                 */
-               if (failoverLockRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
+               if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
                {
                        if ((follow_cnt > 0) && (*pool_config->follow_master_command != '\0'))
                        {
                                follow_pid = fork_follow_child(Req_info->master_node_id, new_primary,
                                                                                        Req_info->primary_node_id);
                        }
-                       if (reqkind == PROMOTE_NODE_REQUEST)
-                               wd_release_failover_command_lock(NODE_PROMOTE_CMD);
-                       else if (reqkind == NODE_DOWN_REQUEST)
-                               wd_release_failover_command_lock(NODE_FAILED_CMD);
+                       wd_failover_lock_release(FOLLOW_MASTER_LOCK);
                }
                else
                {
-                       if (reqkind == PROMOTE_NODE_REQUEST)
-                               wd_wati_until_lock_or_timeout(NODE_PROMOTE_CMD);
-                       else if (reqkind == NODE_DOWN_REQUEST)
-                               wd_wati_until_lock_or_timeout(NODE_FAILED_CMD);
-               }
+                       wd_wait_until_command_complete_or_timeout(FOLLOW_MASTER_LOCK);
 
+               }
 
                /* Save primary node id */
                Req_info->primary_node_id = new_primary;
@@ -2006,9 +2004,11 @@ static void failover(void)
                 */
                kill(worker_pid, SIGUSR1);
 
+               if (wdInterlockingRes == FAILOVER_RES_I_AM_LOCK_HOLDER)
+                       wd_end_failover_interlocking();
+
                if (reqkind == NODE_UP_REQUEST)
                {
-                       wd_failover_command_end(NODE_FAILBACK_CMD);
                        ereport(LOG,
                                        (errmsg("failback done. reconnect host %s(%d)",
                                         BACKEND_INFO(node_id).backend_hostname,
@@ -2017,7 +2017,6 @@ static void failover(void)
                }
                else if (reqkind == PROMOTE_NODE_REQUEST)
                {
-                       wd_failover_command_end(NODE_PROMOTE_CMD);
                        ereport(LOG,
                                        (errmsg("promotion done. promoted host %s(%d)",
                                         BACKEND_INFO(node_id).backend_hostname,
@@ -2025,9 +2024,6 @@ static void failover(void)
                }
                else
                {
-                       /* Release the locks and interlocking */
-                       wd_failover_command_end(NODE_FAILED_CMD);
-
                        /* Temporary black magic. Without this regression 055 does not finish */
                        fprintf(stderr, "failover done. shutdown host %s(%d)",
                                         BACKEND_INFO(node_id).backend_hostname,
index 12d8b2fef967ebc04497b6cfb6199a6bf965495a..04eb07304704a0046e6458e46f9631ccd763417c 100644 (file)
@@ -132,11 +132,11 @@ packet_types all_packet_types[] = {
        {WD_NO_MESSAGE,""}
 };
 
-char* wd_failover_cmd_type_name[] =
+char* wd_failover_lock_name[] =
 {
        "FAILOVER",
        "FAILBACK",
-       "PROMOTE"
+       "FOLLOW MASTER"
 };
 
 char *wd_event_name[] =
@@ -232,10 +232,12 @@ typedef struct WDCommandTimerData
        WDFunctionCommandData*  wd_func_command;
 }WDCommandTimerData;
 
+
 typedef struct InterlockingNode
 {
-       WatchdogNode*   lockHolderNode;
-       bool                    locked;
+       WatchdogNode*           lockHolderNode;
+       bool                            locks[MAX_FAILOVER_LOCKS];
+       struct timeval          lock_time;
 }InterlockingNode;
 
 
@@ -275,7 +277,7 @@ typedef struct wd_cluster
        WatchdogNode*           remoteNodes;
        WatchdogNode*           masterNode;
        WatchdogNode*           lockHolderNode;
-       InterlockingNode        interlockingNodes[MAX_FAILOVER_CMDS];
+       InterlockingNode        interlockingNode;
        int                                     remoteNodeCount;
        int                                     aliveNodeCount;
        int                                     quorum_status;
@@ -289,7 +291,6 @@ typedef struct wd_cluster
        bool                    escalated;
        bool                    clusterInitialized;
        bool                    ipc_auth_needed;
-
        List                    *unidentified_socks;
        List                    *notify_clients;
        List                    *ipc_command_socks;
@@ -413,8 +414,10 @@ static bool process_wd_command_function(WatchdogNode* wdNode, WDPacketData* pkt,
 static bool process_pgpool_replicate_command(WatchdogNode* wdNode, WDPacketData* pkt);
 
 static void process_failover_command_sync_requests(WatchdogNode* wdNode, WDPacketData* pkt, WDIPCCommandData* ipcCommand);
-static WDFailoverCMDResults node_is_asking_for_failover_cmd_end(WatchdogNode* wdNode, WDPacketData* pkt, int failoverCmdType, bool resign);
-static WDFailoverCMDResults node_is_asking_for_failover_cmd_start(WatchdogNode* wdNode, WDPacketData* pkt, int failoverCmdType, bool check);
+static WDFailoverCMDResults node_is_asking_for_failover_end(WatchdogNode* wdNode, WDPacketData* pkt);
+static WDFailoverCMDResults node_is_asking_for_failover_start(WatchdogNode* wdNode, WDPacketData* pkt);
+static WDFailoverCMDResults node_is_asking_for_failover_lock_status(WatchdogNode* wdNode, WDPacketData* pkt, WDFailoverLock failoverLock);
+static WDFailoverCMDResults node_is_asking_for_failover_lock_release(WatchdogNode* wdNode, WDPacketData* pkt, WDFailoverLock failoverLock);
 static void wd_system_will_go_down(int code, Datum arg);
 static bool verify_pool_configurations(POOL_CONFIG* config);
 
@@ -1958,19 +1961,24 @@ static int node_has_requested_for_interlocking(WatchdogNode* wdNode, WDPacketDat
        return false;
 }
 
+/*
+ * process_failover_command_sync_requests()
+ * This function is the main processor of all interlocking-related requests.
+ * It parses the request JSON and executes the requested interlocking command.
+ */
 static void process_failover_command_sync_requests(WatchdogNode* wdNode, WDPacketData* pkt, WDIPCCommandData* ipcCommand)
 {
        
        WDFailoverCMDResults res = FAILOVER_RES_TRANSITION;
        JsonNode* jNode = NULL;
-       int failoverCmdType = -1;
-       
+       int failoverLockID = -1;
+
        /* only coordinator(master) node can process this request */
        if (get_local_node_state() == WD_COORDINATOR)
        {
                char* json_data = NULL;
                int data_len = 0;
-               json_value *root;
+               json_value *root = NULL;
                char* syncRequestType = NULL;
                
                /* We need to identify failover command type and sync function */
@@ -1986,76 +1994,68 @@ static void process_failover_command_sync_requests(WatchdogNode* wdNode, WDPacke
                }
 
                if (data_len > 0 && json_data)
-                       root = json_parse(json_data,data_len);
-               else
-                       root = NULL;
-
-               /* The root node must be object */
-               if (root == NULL || root->type != json_object)
                {
-                       ereport(LOG,
-                                       (errmsg("unable to parse json data from replicate command")));
-                       res = FAILOVER_RES_ERROR;
+                       root = json_parse(json_data,data_len);
+                       if (root && root->type == json_object)
+                       {
+                               syncRequestType = json_get_string_value_for_key(root, "SyncRequestType");
+                               json_get_int_value_for_key(root, "FailoverLockID", &failoverLockID);
+                       }
+                       else
+                       {
+                               ereport(LOG,
+                                               (errmsg("unable to parse json data of interlocking command")));
+                       }
                }
-               if (root)
-                       syncRequestType = json_get_string_value_for_key(root, "SyncRequestType");
-               
-               if (syncRequestType == NULL)
+               if (syncRequestType)
                {
-                       ereport(LOG,
-                                       (errmsg("invalid json data"),
-                                        errdetail("unable to find Watchdog Function Name")));
-                       res = FAILOVER_RES_ERROR;
+
+                       if (strcasecmp(WD_REQ_FAILOVER_START, syncRequestType) == 0)
+                               res = node_is_asking_for_failover_start(wdNode, pkt);
+
+                       else if (strcasecmp(WD_REQ_FAILOVER_END, syncRequestType) == 0)
+                               res = node_is_asking_for_failover_end(wdNode, pkt);
+
+                       else if (strcasecmp(WD_REQ_FAILOVER_RELEASE_LOCK, syncRequestType) == 0)
+                               res = node_is_asking_for_failover_lock_release(wdNode, pkt, failoverLockID);
+
+                       else if (strcasecmp(WD_REQ_FAILOVER_LOCK_STATUS, syncRequestType) == 0)
+                               res = node_is_asking_for_failover_lock_status(wdNode, pkt, failoverLockID);
+
+                       else
+                               res = FAILOVER_RES_ERROR;
                }
                else
-                       syncRequestType = pstrdup(syncRequestType);
-               
-               if (root && json_get_int_value_for_key(root, "FailoverCMDType", &failoverCmdType))
                {
+                       ereport(LOG,
+                                       (errmsg("invalid json data"),
+                                        errdetail("unable to find interlocking command type")));
                        res = FAILOVER_RES_ERROR;
                }
-               
+
                if (root)
+               {
                        json_value_free(root);
-               
-               /* verify the failoverCmdType */
-               if (failoverCmdType < 0 || failoverCmdType >= MAX_FAILOVER_CMDS)
-                       res = FAILOVER_RES_ERROR;
-               
-               if (syncRequestType == NULL)
-                       res = FAILOVER_RES_ERROR;
-               
-               if (res != FAILOVER_RES_ERROR)
-               {
-                       if (strcasecmp("START_COMMAND", syncRequestType) == 0)
-                               res = node_is_asking_for_failover_cmd_start(wdNode, pkt, failoverCmdType, false);
-                       else if (strcasecmp("END_COMMAND", syncRequestType) == 0)
-                               res = node_is_asking_for_failover_cmd_end(wdNode, pkt, failoverCmdType, true);
-                       else if (strcasecmp("UNLOCK_COMMAND", syncRequestType) == 0)
-                               res = node_is_asking_for_failover_cmd_end(wdNode, pkt, failoverCmdType, false);
-                       else if (strcasecmp("CHECK_LOCKED", syncRequestType) == 0)
-                               res = node_is_asking_for_failover_cmd_start(wdNode, pkt, failoverCmdType, true);
-                       else
-                               res = FAILOVER_RES_ERROR;
                }
        }
        else
        {
+               /* I am not the coordinator node. So just return an error */
                res = FAILOVER_RES_ERROR;
        }
-       
+
        if (res != FAILOVER_RES_ERROR)
        {
                /* create the json result */
                jNode = jw_create_with_object(true);
                /* add the node count */
-               jw_put_int(jNode, "FailoverCMDType", failoverCmdType);
+               jw_put_int(jNode, "FailoverLockID", failoverLockID);
                jw_put_int(jNode, "InterlockingResult", res);
                /* create the packet */
                jw_end_element(jNode);
                jw_finish_document(jNode);
        }
-       
+
        if (wdNode != g_cluster.localNode)
        {
                if (jNode == NULL)
@@ -2069,7 +2069,7 @@ static void process_failover_command_sync_requests(WatchdogNode* wdNode, WDPacke
        }
        else
        {
-               /* Reply to IPC Socket */
+               /* reply back on the IPC socket */
                bool ret;
                if (jNode != NULL)
                {
@@ -2083,75 +2083,133 @@ static void process_failover_command_sync_requests(WatchdogNode* wdNode, WDPacke
                }
 
                if (ret == false)
+               {
                        ereport(LOG,
                                        (errmsg("failed to write results for failover sync request to IPC socket")));
+               }
        }
 }
 
+/*
+ * node_is_asking_for_failover_start()
+ * This function processes lock-holder requests. If the lock-holding node
+ * is the same as the requesting node, or no lock holder exists when the
+ * request arrives, the requesting node is registered as the lock holder.
+ * When the lock-holder request succeeds, all command locks are set to the
+ * locked state.
+ * Only the coordinator/master node can execute interlocking requests.
+ */
 static WDFailoverCMDResults
-node_is_asking_for_failover_cmd_start(WatchdogNode* wdNode, WDPacketData* pkt, int failoverCmdType, bool check)
+node_is_asking_for_failover_start(WatchdogNode* wdNode, WDPacketData* pkt)
 {
        WDFailoverCMDResults res = FAILOVER_RES_TRANSITION;
-       /* only coordinator(master) node can process this request */
 
        ereport(LOG,
-               (errmsg("%s pgpool-II node \"%s\" is %s [%s] lock to start the failover command",
-                               (g_cluster.localNode == wdNode)? "local":"remote",
-                                       wdNode->nodeName,
-                                       check?"checking the availability of":"requesting to acquire",
-                                       wd_failover_cmd_type_name[failoverCmdType])));
+                       (errmsg("%s pgpool-II node \"%s\" is requesting to become a lock holder",
+                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                       wdNode->nodeName)));
 
+       /* only coordinator(master) node can process this request */
        if (get_local_node_state() == WD_COORDINATOR)
        {
-               InterlockingNode* lockingNode = NULL;
-               if (failoverCmdType < 0 || failoverCmdType >= MAX_FAILOVER_CMDS)
-                       res = FAILOVER_RES_ERROR;
-               else
-                       lockingNode = &g_cluster.interlockingNodes[failoverCmdType];
-               
-               if (res != FAILOVER_RES_ERROR)
+               /* check if no node currently holds the locks or the
+                * requesting node is itself the lock holder
+                */
+               if (g_cluster.interlockingNode.lockHolderNode == NULL ||
+                       g_cluster.interlockingNode.lockHolderNode == wdNode)
                {
-                       /* check if we already have no lockholder node */
-                       if (lockingNode->lockHolderNode == NULL || lockingNode->lockHolderNode == wdNode)
+                       int i = 0;
+                       /* lock all command locks */
+                       for (i = 0; i < MAX_FAILOVER_LOCKS; i++)
                        {
-                               if (check == false)
-                               {
-                                       lockingNode->lockHolderNode = wdNode;
-                                       lockingNode->locked = true;
-                               }
-                               res = FAILOVER_RES_I_AM_LOCK_HOLDER;
-                               ereport(LOG,
-                                       (errmsg("%s pgpool-II node \"%s\" %s [%s] lock to start the failover command",
+                               g_cluster.interlockingNode.locks[i] = true;
+                       }
+                       g_cluster.interlockingNode.lockHolderNode = wdNode;
+                       gettimeofday(&g_cluster.interlockingNode.lock_time, NULL);
+                       res = FAILOVER_RES_I_AM_LOCK_HOLDER;
+                       ereport(LOG,
+                                       (errmsg("%s pgpool-II node \"%s\" is the lock holder",
+                                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                                       wdNode->nodeName)));
+               }
+               else
+               {
+                       /* some other node is holding the lock */
+                       res = FAILOVER_RES_I_AM_NOT_LOCK_HOLDER;
+                       ereport(LOG,
+                                       (errmsg("lock holder request denied to %s pgpool-II node \"%s\"",
                                                        (g_cluster.localNode == wdNode)? "local":"remote",
-                                                       wdNode->nodeName,
-                                                       check?"can acquire":"has",
-                                                       wd_failover_cmd_type_name[failoverCmdType])));
+                                                       wdNode->nodeName),
+                                        errdetail("%s pgpool-II node \"%s\" is already holding the locks",
+                                                          (g_cluster.localNode == g_cluster.interlockingNode.lockHolderNode)? "local":"remote",
+                                                          g_cluster.interlockingNode.lockHolderNode->nodeName)));
+               }
+       }
+       else
+       {
+               ereport(LOG,
+                               (errmsg("failed to process interlocking request from %s pgpool-II node \"%s\"",
+                                               (g_cluster.localNode == wdNode)? "local":"remote",
+                                               wdNode->nodeName),
+                                errdetail("I am standby node and request can only be processed by master watchdog node")));
+               res = FAILOVER_RES_ERROR;
+       }
+       return res;
+}
 
-                       }
-                       else /* some other node is holding the lock */
-                       {
-                               ereport(LOG,
-                                               (errmsg("[%s] lock %s %s pgpool-II node \"%s\" to start the failover command",
-                                                               wd_failover_cmd_type_name[failoverCmdType],
-                                                               check?"is not available for":"request denied to",
-                                                               (g_cluster.localNode == wdNode)? "local":"remote",
-                                                               wdNode->nodeName),
-                                                errdetail("%s pgpool-II node \"%s\" is holding the lock",
-                                                                  (g_cluster.localNode == lockingNode->lockHolderNode)? "local":"remote",
-                                                                  lockingNode->lockHolderNode->nodeName)));
+/*
+ * node_is_asking_for_failover_end()
+ * This function processes the request to resign from being the lock holder.
+ * A node may resign only if it is the current lock-holding node. When the
+ * resign request succeeds, all command locks become unlocked.
+ * Only the coordinator/master node can execute interlocking requests.
+ */
+static WDFailoverCMDResults
+node_is_asking_for_failover_end(WatchdogNode* wdNode, WDPacketData* pkt)
+{
+       WDFailoverCMDResults res = FAILOVER_RES_TRANSITION;
 
-                               if (lockingNode->locked)
-                                       res = FAILOVER_RES_BLOCKED;
-                               else
-                                       res = FAILOVER_RES_LOCK_UNLOCKED;
+       ereport(LOG,
+                       (errmsg("%s pgpool-II node \"%s\" is requesting to resign from a lock holder",
+                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                       wdNode->nodeName)));
+
+       if (get_local_node_state() == WD_COORDINATOR)
+       {
+               /* check if the resigning node is the one currently holding the locks
+                */
+               if (g_cluster.interlockingNode.lockHolderNode == NULL ||
+                       g_cluster.interlockingNode.lockHolderNode == wdNode)
+               {
+                       int i;
+                       /* unlock all the locks */
+                       for (i = 0; i < MAX_FAILOVER_LOCKS; i++)
+                       {
+                               g_cluster.interlockingNode.locks[i] = false;
                        }
+                       g_cluster.interlockingNode.lockHolderNode = NULL;
+                       res = FAILOVER_RES_SUCCESS;
+                       ereport(LOG,
+                                       (errmsg("%s pgpool-II node \"%s\" has resigned from the lock holder",
+                                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                                       wdNode->nodeName)));
+               }
+               else /* some other node is holding the lock */
+               {
+                       res = FAILOVER_RES_I_AM_NOT_LOCK_HOLDER;
+                       ereport(LOG,
+                                       (errmsg("request of resigning from lock holder is denied to %s pgpool-II node \"%s\"",
+                                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                                       wdNode->nodeName),
+                                        errdetail("%s pgpool-II node \"%s\" is the lock holder node",
+                                                          (g_cluster.localNode == g_cluster.interlockingNode.lockHolderNode)? "local":"remote",
+                                                          g_cluster.interlockingNode.lockHolderNode->nodeName)));
                }
        }
        else
        {
                ereport(LOG,
-                               (errmsg("failed to process [%s] lock request from %s pgpool-II node \"%s\" to start the failover command",
-                                               wd_failover_cmd_type_name[failoverCmdType],
+                               (errmsg("failed to process release interlocking request from %s pgpool-II node \"%s\"",
                                                (g_cluster.localNode == wdNode)? "local":"remote",
                                                wdNode->nodeName),
                                 errdetail("I am standby node and request can only be processed by master watchdog node")));
@@ -2160,68 +2218,129 @@ node_is_asking_for_failover_cmd_start(WatchdogNode* wdNode, WDPacketData* pkt, i
        return res;
 }
 
+/*
+ * node_is_asking_for_failover_lock_release()
+ * This function processes a request from the lock holder node to
+ * release a specific failover command lock.
+ * Only the coordinator/master node can execute interlocking requests.
+ */
 static WDFailoverCMDResults
-node_is_asking_for_failover_cmd_end(WatchdogNode* wdNode, WDPacketData* pkt, int failoverCmdType, bool resign)
+node_is_asking_for_failover_lock_release(WatchdogNode* wdNode, WDPacketData* pkt, WDFailoverLock failoverLock)
 {
        WDFailoverCMDResults res = FAILOVER_RES_TRANSITION;
-       /* only coordinator(master) node can process this request */
 
        ereport(LOG,
-                       (errmsg("%s pgpool-II node \"%s\" is %s to release [%s] lock to end the failover command",
+                       (errmsg("%s pgpool-II node \"%s\" is requesting to release [%s] lock",
                                        (g_cluster.localNode == wdNode)? "local":"remote",
                                        wdNode->nodeName,
-                                       resign?"requesting":"testing",
-                                       wd_failover_cmd_type_name[failoverCmdType])));
+                                       wd_failover_lock_name[failoverLock])));
 
        if (get_local_node_state() == WD_COORDINATOR)
        {
-               InterlockingNode* lockingNode = NULL;
-               
-               if (failoverCmdType < 0 || failoverCmdType >= MAX_FAILOVER_CMDS)
-                       res = FAILOVER_RES_ERROR;
-               else
-                       lockingNode = &g_cluster.interlockingNodes[failoverCmdType];
-               
-               if (res != FAILOVER_RES_ERROR)
+               /* check if the node requesting to release a lock is the lock holder */
+               if (g_cluster.interlockingNode.lockHolderNode == wdNode)
                {
-                       /* check if we already have no lockholder node */
-                       if (lockingNode->lockHolderNode == NULL || lockingNode->lockHolderNode == wdNode)
+                       /* make sure the request is for a valid lock */
+                       if (failoverLock < MAX_FAILOVER_LOCKS)
                        {
-                               if (resign)
-                               {
-                                       lockingNode->lockHolderNode = NULL;
-                                       lockingNode->locked  = false;
-                               }
-                               res = FAILOVER_RES_LOCK_UNLOCKED;
+                               g_cluster.interlockingNode.locks[failoverLock] = false;
+                               res = FAILOVER_RES_SUCCESS;
 
                                ereport(LOG,
-                                               (errmsg("%s pgpool-II node \"%s\" %s the [%s] lock to end the failover command",
+                                               (errmsg("%s pgpool-II node \"%s\" has released the [%s] lock",
                                                                (g_cluster.localNode == wdNode)? "local":"remote",
                                                                wdNode->nodeName,
-                                                               resign?"has released":"can release",
-                                                               wd_failover_cmd_type_name[failoverCmdType])));
-
+                                                               wd_failover_lock_name[failoverLock])));
+                       }
+                       else
+                       {
+                               res = FAILOVER_RES_ERROR;
                        }
-                       else /* some other node is holding the lock */
+               }
+               else
+               {
+                       /* the requesting node is not the lock holder, so it may not release the lock */
+                       ereport(LOG,
+                                       (errmsg("[%s] lock release request denied to %s pgpool-II node \"%s\"",
+                                                       wd_failover_lock_name[failoverLock],
+                                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                                       wdNode->nodeName),
+                                        errdetail("requesting node is not the lock holder")));
+                       res = FAILOVER_RES_I_AM_NOT_LOCK_HOLDER;
+               }
+       }
+       else
+       {
+               ereport(LOG,
+                               (errmsg("failed to process release lock request from %s pgpool-II node \"%s\"",
+                                               (g_cluster.localNode == wdNode)? "local":"remote",
+                                               wdNode->nodeName),
+                                errdetail("I am standby node and request can only be processed by master watchdog node")));
+               res = FAILOVER_RES_ERROR;
+       }
+       return res;
+}
+
+/*
+ * node_is_asking_for_failover_lock_status()
+ * This interlocking-family function returns the status of a specific
+ * failover lock.
+ * Only the coordinator/master node can execute interlocking requests.
+ */
+static WDFailoverCMDResults
+node_is_asking_for_failover_lock_status(WatchdogNode* wdNode, WDPacketData* pkt, WDFailoverLock failoverLock)
+{
+       WDFailoverCMDResults res = FAILOVER_RES_TRANSITION;
+
+       ereport(LOG,
+                       (errmsg("%s pgpool-II node \"%s\" is checking the status of [%s] lock",
+                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                       wdNode->nodeName,
+                                       wd_failover_lock_name[failoverLock])));
+
+       if (get_local_node_state() == WD_COORDINATOR)
+       {
+               /* check if any node currently holds the locks */
+               if (g_cluster.interlockingNode.lockHolderNode)
+               {
+                       /* make sure the request is for a valid lock */
+                       if (failoverLock < MAX_FAILOVER_LOCKS)
                        {
+                               if (g_cluster.interlockingNode.locks[failoverLock])
+                                       res = FAILOVER_RES_LOCKED;
+                               else
+                                       res = FAILOVER_RES_UNLOCKED;
+
                                ereport(LOG,
-                                               (errmsg("[%s] lock %s %s pgpool-II node \"%s\" to end the failover command",
-                                                               wd_failover_cmd_type_name[failoverCmdType],
-                                                               resign?"release request denied to":"cannot be released by",
-                                                               (g_cluster.localNode == wdNode)? "local":"remote",
-                                                               wdNode->nodeName),
-                                                errdetail("%s pgpool-II node \"%s\" is holding the lock",
-                                                                  (g_cluster.localNode == lockingNode->lockHolderNode)? "local":"remote",
-                                                                  lockingNode->lockHolderNode->nodeName)));
-                               res = FAILOVER_RES_BLOCKED;
+                                               (errmsg("%s lock is currently %s",
+                                                               wd_failover_lock_name[failoverLock],
+                                                               (res == FAILOVER_RES_LOCKED)?"LOCKED":"FREE"),
+                                                errdetail("request was from %s pgpool-II node \"%s\" and lock holder is %s pgpool-II node \"%s\"",
+                                                                  (g_cluster.localNode == wdNode)? "local":"remote",
+                                                                  wdNode->nodeName,
+                                                                  (g_cluster.localNode == g_cluster.interlockingNode.lockHolderNode)? "local":"remote",
+                                                                  g_cluster.interlockingNode.lockHolderNode->nodeName)));
+                       }
+                       else
+                       {
+                               res = FAILOVER_RES_ERROR;
                        }
                }
+               else
+               {
+                       /* no lock holder exists */
+                       ereport(LOG,
+                                       (errmsg("[%s] lock status check request denied to %s pgpool-II node \"%s\"",
+                                                       wd_failover_lock_name[failoverLock],
+                                                       (g_cluster.localNode == wdNode)? "local":"remote",
+                                                       wdNode->nodeName),
+                                        errdetail("no lock holder exists")));
+                       res = FAILOVER_RES_NO_LOCKHOLDER;
+               }
        }
        else
        {
                ereport(LOG,
-                               (errmsg("failed to process [%s] lock request from %s pgpool-II node \"%s\" to end the failover command",
-                                               wd_failover_cmd_type_name[failoverCmdType],
+                               (errmsg("failed to process lock status check request from %s pgpool-II node \"%s\"",
                                                (g_cluster.localNode == wdNode)? "local":"remote",
                                                wdNode->nodeName),
                                 errdetail("I am standby node and request can only be processed by master watchdog node")));
index c1ec5a61e71f54a90fd785a18cd8a1ef52d89fd4..0133267a350a10f4ace11c2d512d532e31164b6f 100644 (file)
 
 static void sleep_in_waiting(void);
 static void FreeCmdResult(WDIPCCmdResult* res);
-static WDFailoverCMDResults wd_issue_failover_lock_command(WDFailoverCMDTypes cmdType, char* syncReqType);
 
-
-static char* get_wd_failover_cmd_type_json(WDFailoverCMDTypes cmdType, char* reqType);
-WDFailoverCMDResults wd_send_failover_sync_command(WDFailoverCMDTypes cmdType, char* syncReqType);
+static WDFailoverCMDResults wd_issue_failover_lock_command(char* syncReqType, enum WDFailoverLocks lockID);
+static char* get_wd_failover_cmd_type_json(char* reqType, enum WDFailoverLocks lockID);
+static WDFailoverCMDResults wd_send_failover_sync_command(char* syncReqType, enum WDFailoverLocks lockID);
 
 static int wd_set_node_mask (unsigned char req_mask, int *node_id_set, int count);
 static int wd_chk_node_mask (unsigned char req_mask, int *node_id_set, int count);
@@ -181,6 +180,7 @@ issue_command_to_watchdog(char type, int timeout_sec, char* data, int data_len,
                if (timeout_sec > 0)
                {
                        tv.tv_sec = timeout_sec;
+                       tv.tv_usec = 0;
                        timeout_st = &tv;
                }
                FD_ZERO(&fds);
@@ -477,7 +477,7 @@ wd_send_failback_request(int node_id)
        return COMMAND_FAILED;
 }
 
-static char* get_wd_failover_cmd_type_json(WDFailoverCMDTypes cmdType, char* reqType)
+static char* get_wd_failover_cmd_type_json(char* reqType, enum WDFailoverLocks lockID)
 {
        char* json_str;
        JsonNode* jNode = jw_create_with_object(true);
@@ -487,24 +487,22 @@ static char* get_wd_failover_cmd_type_json(WDFailoverCMDTypes cmdType, char* req
        if (pool_config->wd_authkey != NULL && strlen(pool_config->wd_authkey) > 0)
                jw_put_string(jNode, WD_IPC_AUTH_KEY, pool_config->wd_authkey); /*  put the auth key*/
 
-       jw_put_int(jNode, "FailoverCMDType", cmdType);
        jw_put_string(jNode, "SyncRequestType", reqType);
+       jw_put_int(jNode, "FailoverLockID", lockID);
        jw_finish_document(jNode);
        json_str = pstrdup(jw_get_json_string(jNode));
        jw_destroy(jNode);
        return json_str;
 }
 
-
-WDFailoverCMDResults
-wd_send_failover_sync_command(WDFailoverCMDTypes cmdType, char* syncReqType)
+static WDFailoverCMDResults
+wd_send_failover_sync_command(char* syncReqType, enum WDFailoverLocks lockID)
 {
-       int failoverResCmdType;
-       int interlockingResult;
+       int interlockingResult = FAILOVER_RES_ERROR;
        json_value *root;
        
-       char* json_data = get_wd_failover_cmd_type_json(cmdType, syncReqType);
-       
+       char* json_data = get_wd_failover_cmd_type_json(syncReqType, lockID);
+
        WDIPCCmdResult *result = issue_command_to_watchdog(WD_FAILOVER_CMD_SYNC_REQUEST
                                                                                                           ,pool_config->recovery_timeout,
                                                                                                           json_data, strlen(json_data), true);
@@ -544,28 +542,20 @@ wd_send_failover_sync_command(WDFailoverCMDTypes cmdType, char* syncReqType)
                FreeCmdResult(result);
                return FAILOVER_RES_ERROR;
        }
-       
-       if (json_get_int_value_for_key(root, "FailoverCMDType", &failoverResCmdType))
-       {
-               json_value_free(root);
-               FreeCmdResult(result);
-               return FAILOVER_RES_ERROR;
-       }
+
        if (root && json_get_int_value_for_key(root, "InterlockingResult", &interlockingResult))
        {
                json_value_free(root);
                FreeCmdResult(result);
                return FAILOVER_RES_ERROR;
        }
+
        json_value_free(root);
        FreeCmdResult(result);
        
-       if (failoverResCmdType != cmdType)
+       if (interlockingResult < 0 || interlockingResult > FAILOVER_RES_NO_LOCKHOLDER)
                return FAILOVER_RES_ERROR;
-       
-       if (interlockingResult < 0 || interlockingResult > FAILOVER_RES_BLOCKED)
-               return FAILOVER_RES_ERROR;
-       
+
        return interlockingResult;
 }
 
@@ -770,43 +760,44 @@ open_wd_command_sock(bool throw_error)
        return sock;
 }
 
-WDFailoverCMDResults wd_failover_command_start(WDFailoverCMDTypes cmdType)
+WDFailoverCMDResults wd_start_failover_interlocking(void)
 {
        if (pool_config->use_watchdog)
-               return wd_issue_failover_lock_command(cmdType,"START_COMMAND");
+               return wd_issue_failover_lock_command(WD_REQ_FAILOVER_START, 0);
        return FAILOVER_RES_I_AM_LOCK_HOLDER;
 }
 
-WDFailoverCMDResults wd_failover_command_end(WDFailoverCMDTypes cmdType)
+WDFailoverCMDResults wd_end_failover_interlocking(void)
 {
        if (pool_config->use_watchdog)
-               return wd_issue_failover_lock_command(cmdType,"END_COMMAND");
-       return FAILOVER_RES_I_AM_LOCK_HOLDER;
+               return wd_issue_failover_lock_command(WD_REQ_FAILOVER_END, 0);
+       return FAILOVER_RES_SUCCESS;
 }
 
-WDFailoverCMDResults wd_failover_command_check_lock(WDFailoverCMDTypes cmdType)
+WDFailoverCMDResults wd_failover_lock_release(enum WDFailoverLocks lock)
 {
        if (pool_config->use_watchdog)
-               return wd_issue_failover_lock_command(cmdType,"CHECK_LOCKED");
-       return FAILOVER_RES_I_AM_LOCK_HOLDER;
+               return wd_issue_failover_lock_command(WD_REQ_FAILOVER_RELEASE_LOCK, lock);
+       return FAILOVER_RES_SUCCESS;
 }
 
-WDFailoverCMDResults wd_release_failover_command_lock(WDFailoverCMDTypes cmdType)
+WDFailoverCMDResults wd_failover_lock_status(enum WDFailoverLocks lock)
 {
        if (pool_config->use_watchdog)
-               return wd_issue_failover_lock_command(cmdType,"UNLOCK_COMMAND");
-       return FAILOVER_RES_I_AM_LOCK_HOLDER;
+               return wd_issue_failover_lock_command(WD_REQ_FAILOVER_LOCK_STATUS, lock);
+       return FAILOVER_RES_UNLOCKED;
 }
 
-void wd_wati_until_lock_or_timeout(WDFailoverCMDTypes cmdType)
+void wd_wait_until_command_complete_or_timeout(enum WDFailoverLocks lock)
 {
        WDFailoverCMDResults res = FAILOVER_RES_TRANSITION;
        int     count = WD_INTERLOCK_WAIT_COUNT;
-       while (1)
+
+       while (pool_config->use_watchdog)
        {
-               res = wd_failover_command_check_lock(cmdType);
-               if (res == FAILOVER_RES_I_AM_LOCK_HOLDER ||
-                       res == FAILOVER_RES_LOCK_UNLOCKED)
+               res = wd_failover_lock_status(lock);
+               if (res == FAILOVER_RES_UNLOCKED ||
+                       res == FAILOVER_RES_NO_LOCKHOLDER)
                {
                        /* we have the permission */
                        return;
@@ -823,17 +814,17 @@ void wd_wati_until_lock_or_timeout(WDFailoverCMDTypes cmdType)
 
 /*
  * This is just a wrapper over wd_send_failover_sync_command()
- * but tries to wait for WD_INTERLOCK_TIMEOUT_SEC amount of time
+ * but keeps retrying for up to MAX_SEC_WAIT_FOR_CLUSTER_TRANSATION seconds
  * if watchdog is in transition state
  */
 
-static WDFailoverCMDResults wd_issue_failover_lock_command(WDFailoverCMDTypes cmdType, char* syncReqType)
+static WDFailoverCMDResults wd_issue_failover_lock_command(char* syncReqType, enum WDFailoverLocks lockID)
 {
        WDFailoverCMDResults res;
        int x;
        for (x=0; x < MAX_SEC_WAIT_FOR_CLUSTER_TRANSATION; x++)
        {
-               res = wd_send_failover_sync_command(cmdType, syncReqType);
+               res = wd_send_failover_sync_command(syncReqType, lockID);
                if (res != FAILOVER_RES_TRANSITION)
                        break;
                sleep(1);