From 3a1216d779e5e5fc73618db53056e65c90702a45 Mon Sep 17 00:00:00 2001
From: Muhammad Usama
Date: Sun, 7 Nov 2021 16:00:43 +0500
Subject: [PATCH] [New-Feature] Enable dynamic cluster membership of watchdog
 nodes

Enables a dynamic watchdog cluster definition by introducing the concept of
member and non-member watchdog nodes. If a node's membership in the watchdog
cluster is revoked, the cluster dynamically re-calibrates itself so that all
subsequent majority-rule computations are adjusted accordingly.

Three new settings, wd_remove_shutdown_nodes, wd_lost_node_removal_timeout
and wd_initial_node_showup_time, are added to configure the membership
criteria of the watchdog nodes.
---
 src/config/pool_config_variables.c   |  29 ++++++
 src/include/pcp/pcp.h                |   4 +
 src/include/pool_config.h            |  12 +++
 src/include/watchdog/watchdog.h      |  12 +++
 src/include/watchdog/wd_commands.h   |   2 +
 src/libs/pcp/pcp.c                   |  25 +++++
 src/sample/pgpool.conf.sample-stream |  23 +++++
 src/tools/pcp/pcp_frontend_client.c  |  42 ++++----
 src/watchdog/watchdog.c              | 137 +++++++++++++++++++++++++--
 src/watchdog/wd_commands.c           |  14 +++
 10 files changed, 272 insertions(+), 28 deletions(-)

diff --git a/src/config/pool_config_variables.c b/src/config/pool_config_variables.c
index bc0298817..116f89417 100644
--- a/src/config/pool_config_variables.c
+++ b/src/config/pool_config_variables.c
@@ -413,6 +413,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"wd_remove_shutdown_nodes", CFGCXT_RELOAD, WATCHDOG_CONFIG,
+			"Revoke the cluster membership of properly shutdown watchdog nodes.",
+			CONFIG_VAR_TYPE_BOOL, false, 0
+		},
+		&g_pool_config.wd_remove_shutdown_nodes,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"log_connections", CFGCXT_RELOAD, LOGGING_CONFIG,
 			"Logs each successful connection.",
@@ -1983,6 +1992,26 @@ static struct config_int ConfigureNamesInt[] =
 		0, INT_MAX,
 		NULL, NULL, NULL
 	},
+	{
+		{"wd_lost_node_removal_timeout", CFGCXT_RELOAD, WATCHDOG_CONFIG,
+			"Timeout in seconds to revoke the cluster membership of LOST watchdog nodes.",
+			CONFIG_VAR_TYPE_INT, false, GUC_UNIT_S
+		},
+		&g_pool_config.wd_lost_node_removal_timeout,
+		0,
+		0, INT_MAX,
+		NULL, NULL, NULL
+	},
+	{
+		{"wd_initial_node_showup_time", CFGCXT_RELOAD, WATCHDOG_CONFIG,
+			"Timeout in seconds to revoke the cluster membership of NO-SHOW watchdog nodes.",
+			CONFIG_VAR_TYPE_INT, false, GUC_UNIT_S
+		},
+		&g_pool_config.wd_initial_node_showup_time,
+		0,
+		0, INT_MAX,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"wd_life_point", CFGCXT_INIT, WATCHDOG_CONFIG,
diff --git a/src/include/pcp/pcp.h b/src/include/pcp/pcp.h
index d4fd8832c..2aab91a10 100644
--- a/src/include/pcp/pcp.h
+++ b/src/include/pcp/pcp.h
@@ -38,9 +38,11 @@
 typedef struct PCPWDNodeInfo
 {
 	int			state;
+	int			membership_status;
 	char		nodeName[WD_MAX_HOST_NAMELEN];
 	char		hostName[WD_MAX_HOST_NAMELEN];	/* host name */
 	char		stateName[WD_MAX_HOST_NAMELEN];	/* state name */
+	char		membership_status_string[WD_MAX_HOST_NAMELEN];	/* membership status of this node */
 	int			wd_port;		/* watchdog port */
 	int			wd_priority;	/* node priority in leader election */
 	int			pgpool_port;	/* pgpool port */
@@ -51,6 +53,8 @@ typedef struct PCPWDNodeInfo
 typedef struct PCPWDClusterInfo
 {
 	int			remoteNodeCount;
+	int			memberRemoteNodeCount;
+	int			nodesRequiredForQuorum;
 	int			quorumStatus;
 	int			aliveNodeCount;
 	bool		escalated;
diff --git a/src/include/pool_config.h b/src/include/pool_config.h
index 8ec4da35b..8ebf283e9 100644
--- a/src/include/pool_config.h
+++ b/src/include/pool_config.h
@@ -538,6 +538,18 @@ typedef struct
 								 * votes in a cluster with an even
 								 * number of nodes. */
+	bool		wd_remove_shutdown_nodes;
+								/* revoke membership of properly shutdown watchdog
+								 * nodes.
+								 */
+	int			wd_lost_node_removal_timeout;
+								/* timeout in seconds to revoke membership of
+								 * LOST watchdog nodes
+								 */
+	int			wd_initial_node_showup_time;
+								/* time in seconds to revoke membership of
+								 * NO-SHOW watchdog node
+								 */
 	WdLifeCheckMethod wd_lifecheck_method;	/* method of lifecheck.
 											 * 'heartbeat' or 'query' */
diff --git a/src/include/watchdog/watchdog.h b/src/include/watchdog/watchdog.h
index 65ba3189f..b501492d5 100644
--- a/src/include/watchdog/watchdog.h
+++ b/src/include/watchdog/watchdog.h
@@ -130,6 +130,13 @@ typedef enum {
 	NODE_LOST_SHUTDOWN
 } WD_NODE_LOST_REASONS;
 
+typedef enum {
+	WD_NODE_MEMBERSHIP_ACTIVE,
+	WD_NODE_REVOKED_SHUTDOWN,
+	WD_NODE_REVOKED_NO_SHOW,
+	WD_NODE_REVOKED_LOST
+} WD_NODE_MEMBERSHIP_STATUS;
+
 typedef struct SocketConnection
 {
 	int			sock;			/* socket descriptor */
@@ -148,6 +155,11 @@ typedef struct WatchdogNode
 								 * from the node */
 	struct timeval last_sent_time;	/* timestamp when last packet was sent on
 									 * the node */
+	struct timeval lost_time;	/* timestamp when the remote node was lost
+								 * on coordinator */
+	WD_NODE_MEMBERSHIP_STATUS membership_status;	/* status of node membership
+													 * in watchdog cluster.
+													 * Only valid for remote nodes */
 	bool		has_lost_us;	/*
 								 * True when this remote node thinks
 								 * we are lost
diff --git a/src/include/watchdog/wd_commands.h b/src/include/watchdog/wd_commands.h
index 8e43c5bde..34c5b9ac6 100644
--- a/src/include/watchdog/wd_commands.h
+++ b/src/include/watchdog/wd_commands.h
@@ -32,6 +32,8 @@
 typedef struct WDNodeInfo
 {
 	int			state;
+	int			membership_status;
+	char		membership_status_string[WD_MAX_HOST_NAMELEN];
 	char		nodeName[WD_MAX_HOST_NAMELEN];
 	char		hostName[WD_MAX_HOST_NAMELEN];	/* host name */
 	char		stateName[WD_MAX_HOST_NAMELEN];	/* watchdog state name */
diff --git a/src/libs/pcp/pcp.c b/src/libs/pcp/pcp.c
index 1c5ef57e4..75693b44d 100644
--- a/src/libs/pcp/pcp.c
+++ b/src/libs/pcp/pcp.c
@@ -1518,6 +1518,15 @@ process_watchdog_info_response(PCPConnInfo * pcpConn, char *buf, int len)
 		json_value_free(root);
 		goto INVALID_RESPONSE;
 	}
+	if (json_get_int_value_for_key(root, "MemberRemoteNodeCount", &wd_cluster_info->memberRemoteNodeCount))
+	{
+		wd_cluster_info->memberRemoteNodeCount = -1;
+	}
+	if (json_get_int_value_for_key(root, "NodesRequireForQuorum", &wd_cluster_info->nodesRequiredForQuorum))
+	{
+		wd_cluster_info->nodesRequiredForQuorum = -1;
+	}
+
 	if (json_get_int_value_for_key(root, "QuorumStatus", &wd_cluster_info->quorumStatus))
 	{
 		json_value_free(root);
@@ -1594,6 +1603,22 @@ process_watchdog_info_response(PCPConnInfo * pcpConn, char *buf, int len)
 	}
 	strncpy(wdNodeInfo->delegate_ip, ptr, sizeof(wdNodeInfo->delegate_ip) - 1);
 
+	if (json_get_int_value_for_key(nodeInfoValue, "Membership", &wdNodeInfo->membership_status))
+	{
+		/* would be from the older version. No need to panic */
+		wdNodeInfo->membership_status = 0;
+	}
+
+	ptr = json_get_string_value_for_key(nodeInfoValue, "MembershipString");
+	if (ptr == NULL)
+	{
+		strncpy(wdNodeInfo->membership_status_string, "NOT-Available",
+				sizeof(wdNodeInfo->membership_status_string) - 1);
+	}
+	else
+		strncpy(wdNodeInfo->membership_status_string, ptr,
+				sizeof(wdNodeInfo->membership_status_string) - 1);
+
 	if (json_get_int_value_for_key(nodeInfoValue, "WdPort", &wdNodeInfo->wd_port))
 	{
 		json_value_free(root);
diff --git a/src/sample/pgpool.conf.sample-stream b/src/sample/pgpool.conf.sample-stream
index 8222fd355..f8e422557 100644
--- a/src/sample/pgpool.conf.sample-stream
+++ b/src/sample/pgpool.conf.sample-stream
@@ -751,6 +751,29 @@ hostname0 = ''
                                     # half of the total votes.
                                     # (change requires restart)
 
+# - Watchdog cluster membership settings for quorum computation -
+
+#wd_remove_shutdown_nodes = off
+                                    # When enabled, the cluster membership of properly shutdown
+                                    # watchdog nodes gets revoked. After that the node does not
+                                    # count towards the quorum and consensus computations.
+
+#wd_lost_node_removal_timeout = 0s
+                                    # Timeout after which the cluster membership of LOST watchdog
+                                    # nodes gets revoked. After that the node does not count
+                                    # towards the quorum and consensus computations.
+                                    # Setting the timeout to 0 never revokes the membership
+                                    # of LOST nodes.
+
+#wd_initial_node_showup_time = 0s
+                                    # Time to wait for a watchdog node to connect to the cluster.
+                                    # After that time the cluster membership of the NO-SHOW node
+                                    # gets revoked and it does not count towards the quorum and
+                                    # consensus computations.
+                                    # Setting the time to 0 never revokes the membership
+                                    # of NO-SHOW nodes.
+
+
 # - Lifecheck Setting -
 
 # -- common --
diff --git a/src/tools/pcp/pcp_frontend_client.c b/src/tools/pcp/pcp_frontend_client.c
index 235fcf67f..45a154868 100644
--- a/src/tools/pcp/pcp_frontend_client.c
+++ b/src/tools/pcp/pcp_frontend_client.c
@@ -775,33 +775,37 @@ output_watchdog_info_result(PCPResultInfo * pcpResInfo, bool verbose)
 			quorumStatus = "UNKNOWN";
 
 		printf("Watchdog Cluster Information \n");
-		printf("Total Nodes          : %d\n", cluster->remoteNodeCount + 1);
-		printf("Remote Nodes         : %d\n", cluster->remoteNodeCount);
-		printf("Quorum state         : %s\n", quorumStatus);
-		printf("Alive Remote Nodes   : %d\n", cluster->aliveNodeCount);
-		printf("VIP up on local node : %s\n", cluster->escalated ? "YES" : "NO");
-		printf("Leader Node Name     : %s\n", cluster->leaderNodeName);
-		printf("Leader Host Name     : %s\n\n", cluster->leaderHostName);
+		printf("Total Nodes              : %d\n", cluster->remoteNodeCount + 1);
+		printf("Remote Nodes             : %d\n", cluster->remoteNodeCount);
+		printf("Member Remote Nodes      : %d\n", cluster->memberRemoteNodeCount);
+		printf("Alive Remote Nodes       : %d\n", cluster->aliveNodeCount);
+		printf("Nodes required for quorum: %d\n", cluster->nodesRequiredForQuorum);
+		printf("Quorum state             : %s\n", quorumStatus);
+		printf("VIP up on local node     : %s\n", cluster->escalated ? "YES" : "NO");
"YES" : "NO"); + printf("Leader Node Name : %s\n", cluster->leaderNodeName); + printf("Leader Host Name : %s\n\n", cluster->leaderHostName); printf("Watchdog Node Information \n"); for (i = 0; i < cluster->nodeCount; i++) { PCPWDNodeInfo *watchdog_info = &cluster->nodeList[i]; - printf("Node Name : %s\n", watchdog_info->nodeName); - printf("Host Name : %s\n", watchdog_info->hostName); - printf("Delegate IP : %s\n", watchdog_info->delegate_ip); - printf("Pgpool port : %d\n", watchdog_info->pgpool_port); - printf("Watchdog port : %d\n", watchdog_info->wd_port); - printf("Node priority : %d\n", watchdog_info->wd_priority); - printf("Status : %d\n", watchdog_info->state); - printf("Status Name : %s\n\n", watchdog_info->stateName); + printf("Node Name : %s\n", watchdog_info->nodeName); + printf("Host Name : %s\n", watchdog_info->hostName); + printf("Delegate IP : %s\n", watchdog_info->delegate_ip); + printf("Pgpool port : %d\n", watchdog_info->pgpool_port); + printf("Watchdog port : %d\n", watchdog_info->wd_port); + printf("Node priority : %d\n", watchdog_info->wd_priority); + printf("Status : %d\n", watchdog_info->state); + printf("Status Name : %s\n", watchdog_info->stateName); + printf("Membership Status : %s\n\n", watchdog_info->membership_status_string); } } else { - printf("%d %s %s %s\n\n", + printf("%d %d %s %s %s\n\n", cluster->remoteNodeCount + 1, + cluster->memberRemoteNodeCount + 1, cluster->escalated ? "YES" : "NO", cluster->leaderNodeName, cluster->leaderHostName); @@ -810,13 +814,15 @@ output_watchdog_info_result(PCPResultInfo * pcpResInfo, bool verbose) { PCPWDNodeInfo *watchdog_info = &cluster->nodeList[i]; - printf("%s %s %d %d %d %s\n", + printf("%s %s %d %d %d %s %d %s\n", watchdog_info->nodeName, watchdog_info->hostName, watchdog_info->pgpool_port, watchdog_info->wd_port, watchdog_info->state, - watchdog_info->stateName); + watchdog_info->stateName, + watchdog_info->membership_status, + watchdog_info->membership_status_string); } } } diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c index d00fa1d3b..4603d18f3 100644 --- a/src/watchdog/watchdog.c +++ b/src/watchdog/watchdog.c @@ -250,6 +250,12 @@ char *wd_node_lost_reasons[] = { "SHUTDOWN" }; +char *wd_cluster_membership_status[] = { + "MEMBER", + "REVOKED-SHUTDOWN", + "REVOKED-NO-SHOW", + "REVOKED-LOST" +}; /* * Command packet definition. 
 */
@@ -362,6 +368,7 @@ typedef struct wd_cluster
 	WatchdogNode *remoteNodes;
 	WDClusterLeaderInfo clusterLeaderInfo;
 	int			remoteNodeCount;
+	int			memberRemoteNodeCount;	/* no of nodes that count towards quorum and consensus */
 	int			quorum_status;
 	unsigned int nextCommandID;
 	pid_t		escalation_pid;
@@ -594,6 +601,10 @@ static void set_cluster_leader_node(WatchdogNode * wdNode);
 static void clear_standby_nodes_list(void);
 static int	standby_node_left_cluster(WatchdogNode * wdNode);
 static int	standby_node_join_cluster(WatchdogNode * wdNode);
+static void reset_lost_timers(void);
+static int	update_cluster_memberships(void);
+static int	revoke_cluster_membership_of_node(WatchdogNode * wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status);
+static int	restore_cluster_membership_of_node(WatchdogNode * wdNode);
 static void update_missed_beacon_count(WDCommandData * ipcCommand, bool clear);
 static void wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt);
@@ -761,10 +772,10 @@ wd_cluster_initialize(void)
 
 	/* initialize remote nodes */
 	g_cluster.remoteNodeCount = pool_config->wd_nodes.num_wd - 1;
+	g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
 	if (g_cluster.remoteNodeCount == 0)
 		ereport(ERROR,
 				(errmsg("invalid watchdog configuration. other pgpools setting is not defined")));
-
 	ereport(LOG, (errmsg("watchdog cluster is configured with %d remote nodes", g_cluster.remoteNodeCount)));
 	g_cluster.remoteNodes = palloc0((sizeof(WatchdogNode) * g_cluster.remoteNodeCount));
@@ -1638,6 +1649,7 @@ read_sockets(fd_set *rmask, int pending_fds_count)
 					}
 					if (found)
 					{
+						restore_cluster_membership_of_node(wdNode);
 						/* reply with node info message */
 						ereport(LOG,
 								(errmsg("new node joined the cluster hostname:\"%s\" port:%d pgpool_port:%d",
 										wdNode->hostname,
@@ -3633,6 +3645,8 @@ add_nodeinfo_to_json(JsonNode * jNode, WatchdogNode * node)
 
 	jw_put_int(jNode, "ID", nodeIfNull_int(pgpool_node_id, -1));
 	jw_put_int(jNode, "State", nodeIfNull_int(state, -1));
+	jw_put_int(jNode, "Membership", nodeIfNull_int(membership_status, -1));
+	jw_put_string(jNode, "MembershipString", node ? wd_cluster_membership_status[node->membership_status] : NotSet);
 	jw_put_string(jNode, "NodeName", nodeIfNull_str(nodeName, NotSet));
 	jw_put_string(jNode, "HostName", nodeIfNull_str(hostname, NotSet));
 	jw_put_string(jNode, "StateName", node ? wd_state_names[node->state] : NotSet);
@@ -3652,6 +3666,8 @@ static JsonNode * get_node_list_json(int id)
 	JsonNode   *jNode = jw_create_with_object(true);
 
 	jw_put_int(jNode, "RemoteNodeCount", g_cluster.remoteNodeCount);
+	jw_put_int(jNode, "MemberRemoteNodeCount", g_cluster.memberRemoteNodeCount);
+	jw_put_int(jNode, "NodesRequireForQuorum", get_minimum_votes_to_resolve_consensus());
 	jw_put_int(jNode, "QuorumStatus", WD_LEADER_NODE ? WD_LEADER_NODE->quorum_status : -2);
 	jw_put_int(jNode, "AliveNodeCount", WD_LEADER_NODE ? WD_LEADER_NODE->standby_nodes_count : 0);
 	jw_put_int(jNode, "Escalated", g_cluster.localNode->escalated);
@@ -4755,6 +4771,34 @@ service_unreachable_nodes(void)
 	{
 		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
 
+		if (wdNode->state == WD_LOST && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
+			&& pool_config->wd_lost_node_removal_timeout)
+		{
+			int			lost_seconds = WD_TIME_DIFF_SEC(currTime, wdNode->lost_time);
+			if (lost_seconds >= pool_config->wd_lost_node_removal_timeout)
+			{
+				ereport(LOG,
+						(errmsg("remote node \"%s\" has been lost for %d seconds", wdNode->nodeName, lost_seconds),
+						 errdetail("revoking the node's membership")));
+				revoke_cluster_membership_of_node(wdNode, WD_NODE_REVOKED_LOST);
+			}
+			continue;
+		}
+
+		if (wdNode->state == WD_DEAD && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
+			&& pool_config->wd_initial_node_showup_time)
+		{
+			int			no_show_seconds = WD_TIME_DIFF_SEC(currTime, g_cluster.localNode->startup_time);
+			if (no_show_seconds >= pool_config->wd_initial_node_showup_time)
+			{
+				ereport(LOG,
+						(errmsg("remote node \"%s\" didn't show up in %d seconds", wdNode->nodeName, no_show_seconds),
+						 errdetail("revoking the node's membership")));
+				revoke_cluster_membership_of_node(wdNode, WD_NODE_REVOKED_NO_SHOW);
+			}
+			continue;
+		}
+
 		if (is_node_active(wdNode) == false)
 			continue;
@@ -5402,6 +5446,8 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt)
 			{
 				ereport(LOG,
 						(errmsg("remote node \"%s\" is shutting down", wdNode->nodeName)));
+				if (pool_config->wd_remove_shutdown_nodes)
+					revoke_cluster_membership_of_node(wdNode, WD_NODE_REVOKED_SHUTDOWN);
 			}
 			else
 			{
@@ -5440,6 +5486,8 @@ watchdog_state_machine(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt)
 			wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
 			wdNode->state = WD_LOADING;
 			send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_FOUND);
+			/* if this node was kicked out of the quorum calculation, add it back */
+			restore_cluster_membership_of_node(wdNode);
 		}
 		else if (event == WD_EVENT_PACKET_RCV)
 		{
@@ -6004,8 +6052,9 @@ watchdog_state_machine_coordinator(WD_EVENTS event, WatchdogNode * wdNode, WDPacketData * pkt)
 			if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
 				clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
 			{
+				update_cluster_memberships();
 				update_quorum_status();
-
+				reset_lost_timers();
 				ereport(DEBUG1,
 						(errmsg("declare coordinator command finished with status:[%s]",
 								clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ?
@@ -7060,7 +7109,7 @@ update_quorum_status(void)
 	}
 	else if (g_cluster.clusterLeaderInfo.standby_nodes_count == get_minimum_remote_nodes_required_for_quorum())
 	{
-		if (g_cluster.remoteNodeCount % 2 != 0)
+		if (g_cluster.memberRemoteNodeCount % 2 != 0)
 		{
 			if (pool_config->enable_consensus_with_half_votes)
 				g_cluster.quorum_status = 0;	/* on the edge */
@@ -7091,14 +7140,14 @@ get_minimum_remote_nodes_required_for_quorum(void)
 	 * Even number of remote nodes, That means total number of nodes are odd,
 	 * so minimum quorum is just remote/2.
 	 */
-	if (g_cluster.remoteNodeCount % 2 == 0)
-		return (g_cluster.remoteNodeCount / 2);
+	if (g_cluster.memberRemoteNodeCount % 2 == 0)
+		return (g_cluster.memberRemoteNodeCount / 2);
 
 	/*
 	 * Total nodes including self are even, So we return 50% nodes as quorum
 	 * requirements
 	 */
-	return ((g_cluster.remoteNodeCount - 1) / 2);
+	return ((g_cluster.memberRemoteNodeCount - 1) / 2);
 }
 
 /*
@@ -7143,7 +7192,7 @@ get_minimum_votes_to_resolve_consensus(void)
 	 * So for even number of nodes when enable_consensus_with_half_votes is
 	 * not allowed than we would add one more vote than exact 50%
 	 */
-	if (g_cluster.remoteNodeCount % 2 != 0)
+	if (g_cluster.memberRemoteNodeCount % 2 != 0)
 	{
 		if (pool_config->enable_consensus_with_half_votes == false)
 			required_node_count += 1;
@@ -7922,18 +7971,85 @@ set_cluster_leader_node(WatchdogNode * wdNode)
 	}
 }
 
-static WatchdogNode * getLeaderWatchdogNode(void)
+static WatchdogNode *
+getLeaderWatchdogNode(void)
 {
 	return g_cluster.clusterLeaderInfo.leaderNode;
 }
 
+static int
+update_cluster_memberships(void)
+{
+	int			i;
+	g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
+	for (i = 0; i < g_cluster.remoteNodeCount; i++)
+	{
+		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
+		if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
+			g_cluster.memberRemoteNodeCount--;
+	}
+	return g_cluster.memberRemoteNodeCount;
+}
+
+static int
+revoke_cluster_membership_of_node(WatchdogNode * wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status)
+{
+	if (wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE)
+	{
+		wdNode->membership_status = revoke_status;
+
+		ereport(LOG,
+				(errmsg("revoking the membership of [%s] node:\"%s\" [node_id:%d]",
+						wd_state_names[wdNode->state], wdNode->nodeName, wdNode->pgpool_node_id),
+				 errdetail("membership revoke reason: \"%s\"",
+						   wd_cluster_membership_status[wdNode->membership_status])));
+
+		g_cluster.memberRemoteNodeCount--;
+	}
+	return g_cluster.memberRemoteNodeCount;
+}
+
+static int
+restore_cluster_membership_of_node(WatchdogNode * wdNode)
+{
+	if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
+	{
+		ereport(LOG,
+				(errmsg("restoring the cluster membership of node:\"%s\"", wdNode->nodeName),
+				 errdetail("membership of the node was revoked because it was \"%s\"",
+						   wd_cluster_membership_status[wdNode->membership_status])));
+
+		wdNode->membership_status = WD_NODE_MEMBERSHIP_ACTIVE;
+		/* reset the lost time on the node */
+		wdNode->lost_time.tv_sec = 0;
+		wdNode->lost_time.tv_usec = 0;
+		g_cluster.memberRemoteNodeCount++;
+	}
+	return g_cluster.memberRemoteNodeCount;
+}
+
+static void
+reset_lost_timers(void)
+{
+	int			i;
+	for (i = 0; i < g_cluster.remoteNodeCount; i++)
+	{
+		WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
+		wdNode->lost_time.tv_sec = 0;
+		wdNode->lost_time.tv_usec = 0;
+	}
+}
+
 static int
 standby_node_join_cluster(WatchdogNode * wdNode)
 {
 	if (get_local_node_state() == WD_COORDINATOR)
 	{
 		int			i;
-
+		/* Just reset the lost timestamp, */
+		/* since the node is back and is no longer tracked as lost */
+		wdNode->lost_time.tv_sec = 0;
+		wdNode->lost_time.tv_usec = 0;
 		/* First check if the node is already in the List */
 		for (i = 0; i < g_cluster.clusterLeaderInfo.standby_nodes_count; i++)
 		{
@@ -7983,7 +8099,8 @@ standby_node_left_cluster(WatchdogNode * wdNode)
 			 */
 			ereport(LOG,
 					(errmsg("removing watchdog node \"%s\" from the standby list", wdNode->nodeName)));
-
+			/* set the timestamp on the node to track for how long it stays lost */
+			gettimeofday(&wdNode->lost_time, NULL);
 			g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
 			g_cluster.clusterLeaderInfo.standby_nodes_count--;
 			removed = true;
diff --git a/src/watchdog/wd_commands.c b/src/watchdog/wd_commands.c
index e1aae9e07..145955865 100644
--- a/src/watchdog/wd_commands.c
+++ b/src/watchdog/wd_commands.c
@@ -346,6 +346,20 @@ parse_watchdog_node_info_from_wd_node_json(json_value * source)
 				(errmsg("invalid json data"),
 				 errdetail("unable to find Watchdog Node ID")));
 	}
+	if (json_get_int_value_for_key(source, "Membership", &wdNodeInfo->membership_status))
+	{
+		/* would be from the older version. No need to panic */
+		wdNodeInfo->membership_status = WD_NODE_MEMBERSHIP_ACTIVE;
+	}
+
+	ptr = json_get_string_value_for_key(source, "MembershipString");
+	if (ptr == NULL)
+	{
+		strncpy(wdNodeInfo->membership_status_string, "NOT-Available", sizeof(wdNodeInfo->membership_status_string) - 1);
+	}
+	else
+		strncpy(wdNodeInfo->membership_status_string, ptr, sizeof(wdNodeInfo->membership_status_string) - 1);
+
 	ptr = json_get_string_value_for_key(source, "NodeName");
 	if (ptr == NULL)
-- 
2.39.5
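
Reviewer note, not part of the patch: below is a minimal standalone sketch of how the
member-node count drives the quorum threshold, mirroring the arithmetic of
get_minimum_remote_nodes_required_for_quorum() above. The node counts used in main()
are hypothetical examples.

/*
 * Standalone illustration only; it is not part of the patch and only mirrors
 * the quorum arithmetic used by the watchdog on the member-node count.
 */
#include <stdio.h>

static int
min_remote_nodes_for_quorum(int member_remote_node_count)
{
	/*
	 * Even number of member remote nodes: the total cluster size including
	 * the local node is odd, so a strict majority needs remote/2 other nodes.
	 */
	if (member_remote_node_count % 2 == 0)
		return member_remote_node_count / 2;

	/*
	 * Odd number of member remote nodes: the total cluster size is even,
	 * so 50% of the nodes is the local node plus (remote - 1) / 2 others.
	 */
	return (member_remote_node_count - 1) / 2;
}

int
main(void)
{
	/*
	 * A 5-node cluster has 4 remote nodes. While all of them are members the
	 * leader needs 2 remote standbys for quorum; once two of them have their
	 * membership revoked (shutdown, LOST or NO-SHOW), only 1 is needed.
	 */
	printf("member remotes = 4 -> quorum needs %d remote node(s)\n",
		   min_remote_nodes_for_quorum(4));
	printf("member remotes = 2 -> quorum needs %d remote node(s)\n",
		   min_remote_nodes_for_quorum(2));
	return 0;
}

In other words, once wd_remove_shutdown_nodes, wd_lost_node_removal_timeout or
wd_initial_node_showup_time revokes a node's membership, memberRemoteNodeCount shrinks
and the remaining nodes can still reach quorum and consensus on their own.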