Fix for [pgpool-general: 7896] Possible race condition..
author: Muhammad Usama <m.usama@gmail.com>
Wed, 27 Apr 2022 10:30:58 +0000 (15:30 +0500)
committer: Muhammad Usama <m.usama@gmail.com>
Sun, 1 May 2022 21:52:30 +0000 (02:52 +0500)
The watchdog does not allow remote nodes reported lost by life-check to rejoin
the cluster until the life-check process confirms that the previously lost
nodes are alive again. This is good enough except for the case where the
(lost by life-check) node tries to rejoin the cluster after it has been
restarted (i.e. the Pgpool-II service was restarted).
In that case the cluster keeps rejecting the restarted node because the
cluster's life-check does not yet consider it alive, while the restarted node's
life-check waits to be added to the cluster before it starts sending
heartbeats, so each side ends up waiting on the other.

The fix is to allow a previously lost remote node to become part of the
cluster again after a restart, regardless of the reason it was marked lost.

Issue report:
https://www.pgpool.net/pipermail/pgpool-general/2021-November/007954.html

src/watchdog/watchdog.c

index 91e8225df99c60ea2ae1dd736731ed001a20299f..889854e5d418c70d6888631485b3b1e916888aff 100644 (file)
@@ -641,7 +641,7 @@ wd_check_config(void)
        {
                if (pool_config->num_hb_dest_if <= 0)
                        ereport(ERROR,
-                                       (errmsg("invalid lifecheck configuration. no heartbeat interfaces defined")));
+                                       (errmsg("invalid life-check configuration. no heartbeat interfaces defined")));
        }
 }
 
@@ -1664,18 +1664,9 @@ read_sockets(fd_set *rmask, int pending_fds_count)
                                                                                                   wd_node_lost_reasons[wdNode->node_lost_reason],
                                                                                                   abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)))));
 
-                                                               if (abs((int)(previous_startup_time.tv_sec - wdNode->startup_time.tv_sec)) <= 2 &&
-                                                                       wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
-                                                               {
-                                                                       ereport(LOG,
-                                                                               (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
-                                                                                        errdetail("only lifecheck process can mark this node alive again")));
-                                                                       /* restore the node's lost state */
-                                                                       wdNode->state = oldNodeState;
-                                                               }
-                                                               else
-                                                                       watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
-
+                                                               watchdog_state_machine(WD_EVENT_PACKET_RCV, wdNode, pkt, NULL);
+                                                               /* Since the node was lost. Fire node found event as well */
+                                                               watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL);
                                                        }
 
                                                }
@@ -4357,8 +4348,8 @@ standard_packet_processor(WatchdogNode * wdNode, WDPacketData * pkt)
                                        if (wdNode->node_lost_reason == NODE_LOST_BY_LIFECHECK)
                                        {
                                                ereport(LOG,
-                                                       (errmsg("node:\"%s\" was reported lost by the lifecheck process",wdNode->nodeName),
-                                                                errdetail("only life-check process can mark this node alive again")));
+                                                       (errmsg("node:\"%s\" was reported lost by the life-check process",wdNode->nodeName),
+                                                                errdetail("node will be added to cluster once life-check mark it as reachable again")));
                                                /* restore the node's lost state */
                                                wdNode->state = oldNodeState;
                                        }