false,
NULL, NULL, NULL
},
+ {
+ {"wd_remove_shutdown_nodes", CFGCXT_RELOAD, WATCHDOG_CONFIG,
+ "Revoke the cluster membership of properly shutdown watchdog nodes.",
+ CONFIG_VAR_TYPE_BOOL, false, 0
+ },
+ &g_pool_config.wd_remove_shutdown_nodes,
+ false,
+ NULL, NULL, NULL
+ },
{
{"log_connections", CFGCXT_RELOAD, LOGGING_CONFIG,
"Logs each successful connection.",
0, INT_MAX,
NULL, NULL, NULL
},
+ {
+ {"wd_lost_node_removal_timeout", CFGCXT_RELOAD, WATCHDOG_CONFIG,
+ "Timeout in seconds to revoke the cluster membership of LOST watchdog nodes.",
+ CONFIG_VAR_TYPE_INT, false, GUC_UNIT_S
+ },
+ &g_pool_config.wd_lost_node_removal_timeout,
+ 0,
+ 0, INT_MAX,
+ NULL, NULL, NULL
+ },
+ {
+ {"wd_initial_node_showup_time", CFGCXT_RELOAD, WATCHDOG_CONFIG,
+ "Timeout in seconds to revoke the cluster membership of NO-SHOW watchdog nodes.",
+ CONFIG_VAR_TYPE_INT, false, GUC_UNIT_S
+ },
+ &g_pool_config.wd_initial_node_showup_time,
+ 0,
+ 0, INT_MAX,
+ NULL, NULL, NULL
+ },
{
{"wd_life_point", CFGCXT_INIT, WATCHDOG_CONFIG,
typedef struct PCPWDNodeInfo
{
int state;
+ int membership_status;
char nodeName[WD_MAX_HOST_NAMELEN];
char hostName[WD_MAX_HOST_NAMELEN]; /* host name */
char stateName[WD_MAX_HOST_NAMELEN]; /* state name */
+ char membership_status_string[WD_MAX_HOST_NAMELEN]; /* membership status of this node */
int wd_port; /* watchdog port */
int wd_priority; /* node priority in leader election */
int pgpool_port; /* pgpool port */
typedef struct PCPWDClusterInfo
{
int remoteNodeCount;
+ int memberRemoteNodeCount;
+ int nodesRequiredForQuorum;
int quorumStatus;
int aliveNodeCount;
bool escalated;
* votes in a cluster with an even
* number of nodes.
*/
+ bool wd_remove_shutdown_nodes;
+ /* revoke membership of properly shutdown watchdog
+ * nodes.
+ */
+ int wd_lost_node_removal_timeout;
+ /* timeout in seconds to revoke membership of
+ * LOST watchdog nodes
+ */
+ int wd_initial_node_showup_time;
+ /* time in seconds to revoke membership of
+ * NO-SHOW watchdog node
+ */
WdLifeCheckMethod wd_lifecheck_method; /* method of lifecheck.
* 'heartbeat' or 'query' */
NODE_LOST_SHUTDOWN
} WD_NODE_LOST_REASONS;
+typedef enum {
+ WD_NODE_MEMBERSHIP_ACTIVE,
+ WD_NODE_REVOKED_SHUTDOWN,
+ WD_NODE_REVOKED_NO_SHOW,
+ WD_NODE_REVOKED_LOST
+}WD_NODE_MEMBERSHIP_STATUS;
+
typedef struct SocketConnection
{
int sock; /* socket descriptor */
* from the node */
struct timeval last_sent_time; /* timestamp when last packet was sent on
* the node */
+ struct timeval lost_time; /* timestamp when the remote node was lost on coordinator
+ */
+ WD_NODE_MEMBERSHIP_STATUS membership_status; /* status of node membership
+ *in watchdog cluster
+ Only valid for remote nodes */
bool has_lost_us; /*
* True when this remote node thinks
* we are lost
typedef struct WDNodeInfo
{
int state;
+ int membership_status;
+ char membership_status_string[WD_MAX_HOST_NAMELEN];
char nodeName[WD_MAX_HOST_NAMELEN];
char hostName[WD_MAX_HOST_NAMELEN]; /* host name */
char stateName[WD_MAX_HOST_NAMELEN]; /* watchdog state name */
json_value_free(root);
goto INVALID_RESPONSE;
}
+ if (json_get_int_value_for_key(root, "MemberRemoteNodeCount", &wd_cluster_info->memberRemoteNodeCount))
+ {
+ wd_cluster_info->memberRemoteNodeCount = -1;
+ }
+ if (json_get_int_value_for_key(root, "NodesRequireForQuorum", &wd_cluster_info->nodesRequiredForQuorum))
+ {
+ wd_cluster_info->nodesRequiredForQuorum = -1;
+ }
+
if (json_get_int_value_for_key(root, "QuorumStatus", &wd_cluster_info->quorumStatus))
{
json_value_free(root);
}
strncpy(wdNodeInfo->delegate_ip, ptr, sizeof(wdNodeInfo->delegate_ip) - 1);
+ if (json_get_int_value_for_key(nodeInfoValue, "Membership", &wdNodeInfo->membership_status))
+ {
+ /* would be from the older version. No need to panic */
+ wdNodeInfo->membership_status = 0;
+ }
+
+ ptr = json_get_string_value_for_key(nodeInfoValue, "MembershipString");
+ if (ptr == NULL)
+ {
+ strncpy(wdNodeInfo->membership_status_string, "NOT-Available",
+ sizeof(wdNodeInfo->membership_status_string) - 1);
+ }
+ else
+ strncpy(wdNodeInfo->membership_status_string, ptr,
+ sizeof(wdNodeInfo->membership_status_string) - 1);
+
if (json_get_int_value_for_key(nodeInfoValue, "WdPort", &wdNodeInfo->wd_port))
{
json_value_free(root);
# half of the total votes.
# (change requires restart)
+# - Watchdog cluster membership settings for quorum computation -
+
+#wd_remove_shutdown_nodes = off
+ # when enabled cluster membership of properly shutdown
+ # watchdog nodes gets revoked, After that the node does
+ # not count towards the quorum and consensus computations
+
+#wd_lost_node_removal_timeout = 0s
+ # Timeout after which the cluster membership of LOST watchdog
+ # nodes gets revoked. After that the node node does not
+ # count towards the quorum and consensus computations
+ # setting timeout to 0 will never revoke the membership
+ # of LOST nodes
+
+#wd_initial_node_showup_time = 0s
+ # Time to wait for Watchdog node to connect to the cluster.
+ # After that time the cluster membership of NO-SHOW node gets
+ # revoked and it does not count towards the quorum and
+ # consensus computations
+ # setting timeout to 0 will not revoke the membership
+ # of NO-SHOW nodes
+
+
# - Lifecheck Setting -
# -- common --
quorumStatus = "UNKNOWN";
printf("Watchdog Cluster Information \n");
- printf("Total Nodes : %d\n", cluster->remoteNodeCount + 1);
- printf("Remote Nodes : %d\n", cluster->remoteNodeCount);
- printf("Quorum state : %s\n", quorumStatus);
- printf("Alive Remote Nodes : %d\n", cluster->aliveNodeCount);
- printf("VIP up on local node : %s\n", cluster->escalated ? "YES" : "NO");
- printf("Leader Node Name : %s\n", cluster->leaderNodeName);
- printf("Leader Host Name : %s\n\n", cluster->leaderHostName);
+ printf("Total Nodes : %d\n", cluster->remoteNodeCount + 1);
+ printf("Remote Nodes : %d\n", cluster->remoteNodeCount);
+ printf("Member Remote Nodes : %d\n", cluster->memberRemoteNodeCount);
+ printf("Alive Remote Nodes : %d\n", cluster->aliveNodeCount);
+ printf("Nodes required for quorum: %d\n", cluster->nodesRequiredForQuorum);
+ printf("Quorum state : %s\n", quorumStatus);
+ printf("VIP up on local node : %s\n", cluster->escalated ? "YES" : "NO");
+ printf("Leader Node Name : %s\n", cluster->leaderNodeName);
+ printf("Leader Host Name : %s\n\n", cluster->leaderHostName);
printf("Watchdog Node Information \n");
for (i = 0; i < cluster->nodeCount; i++)
{
PCPWDNodeInfo *watchdog_info = &cluster->nodeList[i];
- printf("Node Name : %s\n", watchdog_info->nodeName);
- printf("Host Name : %s\n", watchdog_info->hostName);
- printf("Delegate IP : %s\n", watchdog_info->delegate_ip);
- printf("Pgpool port : %d\n", watchdog_info->pgpool_port);
- printf("Watchdog port : %d\n", watchdog_info->wd_port);
- printf("Node priority : %d\n", watchdog_info->wd_priority);
- printf("Status : %d\n", watchdog_info->state);
- printf("Status Name : %s\n\n", watchdog_info->stateName);
+ printf("Node Name : %s\n", watchdog_info->nodeName);
+ printf("Host Name : %s\n", watchdog_info->hostName);
+ printf("Delegate IP : %s\n", watchdog_info->delegate_ip);
+ printf("Pgpool port : %d\n", watchdog_info->pgpool_port);
+ printf("Watchdog port : %d\n", watchdog_info->wd_port);
+ printf("Node priority : %d\n", watchdog_info->wd_priority);
+ printf("Status : %d\n", watchdog_info->state);
+ printf("Status Name : %s\n", watchdog_info->stateName);
+ printf("Membership Status : %s\n\n", watchdog_info->membership_status_string);
}
}
else
{
- printf("%d %s %s %s\n\n",
+ printf("%d %d %s %s %s\n\n",
cluster->remoteNodeCount + 1,
+ cluster->memberRemoteNodeCount + 1,
cluster->escalated ? "YES" : "NO",
cluster->leaderNodeName,
cluster->leaderHostName);
{
PCPWDNodeInfo *watchdog_info = &cluster->nodeList[i];
- printf("%s %s %d %d %d %s\n",
+ printf("%s %s %d %d %d %s %d %s\n",
watchdog_info->nodeName,
watchdog_info->hostName,
watchdog_info->pgpool_port,
watchdog_info->wd_port,
watchdog_info->state,
- watchdog_info->stateName);
+ watchdog_info->stateName,
+ watchdog_info->membership_status,
+ watchdog_info->membership_status_string);
}
}
}
"SHUTDOWN"
};
+char *wd_cluster_membership_status[] = {
+ "MEMBER",
+ "REVOKED-SHUTDOWN",
+ "REVOKED-NO-SHOW",
+ "REVOKED-LOST"
+};
/*
* Command packet definition.
*/
WatchdogNode *remoteNodes;
WDClusterLeaderInfo clusterLeaderInfo;
int remoteNodeCount;
+ int memberRemoteNodeCount; /* no of nodes that count towards quorum and consensus */
int quorum_status;
unsigned int nextCommandID;
pid_t escalation_pid;
static void clear_standby_nodes_list(void);
static int standby_node_left_cluster(WatchdogNode * wdNode);
static int standby_node_join_cluster(WatchdogNode * wdNode);
+static void reset_lost_timers(void);
+static int update_cluster_memberships(void);
+static int revoke_cluster_membership_of_node(WatchdogNode* wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status);
+static int restore_cluster_membership_of_node(WatchdogNode* wdNode);
static void update_missed_beacon_count(WDCommandData* ipcCommand, bool clear);
static void wd_execute_cluster_command_processor(WatchdogNode * wdNode, WDPacketData * pkt);
/* initialize remote nodes */
g_cluster.remoteNodeCount = pool_config->wd_nodes.num_wd - 1;
+ g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
if (g_cluster.remoteNodeCount == 0)
ereport(ERROR,
(errmsg("invalid watchdog configuration. other pgpools setting is not defined")));
-
ereport(LOG,
(errmsg("watchdog cluster is configured with %d remote nodes", g_cluster.remoteNodeCount)));
g_cluster.remoteNodes = palloc0((sizeof(WatchdogNode) * g_cluster.remoteNodeCount));
}
if (found)
{
+ restore_cluster_membership_of_node(wdNode);
/* reply with node info message */
ereport(LOG,
(errmsg("new node joined the cluster hostname:\"%s\" port:%d pgpool_port:%d", wdNode->hostname,
jw_put_int(jNode, "ID", nodeIfNull_int(pgpool_node_id, -1));
jw_put_int(jNode, "State", nodeIfNull_int(state, -1));
+ jw_put_int(jNode, "Membership", nodeIfNull_int(membership_status, -1));
+ jw_put_string(jNode, "MembershipString", node ? wd_cluster_membership_status[node->membership_status] : NotSet);
jw_put_string(jNode, "NodeName", nodeIfNull_str(nodeName, NotSet));
jw_put_string(jNode, "HostName", nodeIfNull_str(hostname, NotSet));
jw_put_string(jNode, "StateName", node ? wd_state_names[node->state] : NotSet);
JsonNode *jNode = jw_create_with_object(true);
jw_put_int(jNode, "RemoteNodeCount", g_cluster.remoteNodeCount);
+ jw_put_int(jNode, "MemberRemoteNodeCount", g_cluster.memberRemoteNodeCount);
+ jw_put_int(jNode, "NodesRequireForQuorum", get_minimum_votes_to_resolve_consensus());
jw_put_int(jNode, "QuorumStatus", WD_LEADER_NODE ? WD_LEADER_NODE->quorum_status : -2);
jw_put_int(jNode, "AliveNodeCount", WD_LEADER_NODE ? WD_LEADER_NODE->standby_nodes_count : 0);
jw_put_int(jNode, "Escalated", g_cluster.localNode->escalated);
{
WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
+ if (wdNode->state == WD_LOST && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
+ && pool_config->wd_lost_node_removal_timeout)
+ {
+ int lost_seconds = WD_TIME_DIFF_SEC(currTime, wdNode->lost_time);
+ if (lost_seconds >= pool_config->wd_lost_node_removal_timeout)
+ {
+ ereport(LOG,
+ (errmsg("remote node \"%s\" is lost for %d seconds", wdNode->nodeName,lost_seconds),
+ errdetail("revoking the node's membership")));
+ revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_LOST);
+ }
+ continue;
+ }
+
+ if (wdNode->state == WD_DEAD && wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE
+ && pool_config->wd_initial_node_showup_time)
+ {
+ int no_show_seconds = WD_TIME_DIFF_SEC(currTime, g_cluster.localNode->startup_time);
+ if (no_show_seconds >= pool_config->wd_initial_node_showup_time)
+ {
+ ereport(LOG,
+ (errmsg("remote node \"%s\" didn't showed-up in %d seconds", wdNode->nodeName,no_show_seconds),
+ errdetail("revoking the node's membership")));
+ revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_NO_SHOW);
+ }
+ continue;
+ }
+
if (is_node_active(wdNode) == false)
continue;
{
ereport(LOG,
(errmsg("remote node \"%s\" is shutting down", wdNode->nodeName)));
+ if (pool_config->wd_remove_shutdown_nodes)
+ revoke_cluster_membership_of_node(wdNode,WD_NODE_REVOKED_SHUTDOWN);
}
else
{
wdNode->node_lost_reason = NODE_LOST_UNKNOWN_REASON;
wdNode->state = WD_LOADING;
send_cluster_service_message(wdNode, pkt, CLUSTER_NODE_APPEARING_FOUND);
+ /* if this node was kicked out of quorum calculation. add it back */
+ restore_cluster_membership_of_node(wdNode);
}
else if (event == WD_EVENT_PACKET_RCV)
{
if (clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ||
clusterCommand->commandStatus == COMMAND_FINISHED_TIMEOUT)
{
+ update_cluster_memberships();
update_quorum_status();
-
+ reset_lost_timers();
ereport(DEBUG1,
(errmsg("declare coordinator command finished with status:[%s]",
clusterCommand->commandStatus == COMMAND_FINISHED_ALL_REPLIED ?
}
else if (g_cluster.clusterLeaderInfo.standby_nodes_count == get_minimum_remote_nodes_required_for_quorum())
{
- if (g_cluster.remoteNodeCount % 2 != 0)
+ if (g_cluster.memberRemoteNodeCount % 2 != 0)
{
if (pool_config->enable_consensus_with_half_votes)
g_cluster.quorum_status = 0; /* on the edge */
* Even number of remote nodes, That means total number of nodes are odd,
* so minimum quorum is just remote/2.
*/
- if (g_cluster.remoteNodeCount % 2 == 0)
- return (g_cluster.remoteNodeCount / 2);
+ if (g_cluster.memberRemoteNodeCount % 2 == 0)
+ return (g_cluster.memberRemoteNodeCount / 2);
/*
* Total nodes including self are even, So we return 50% nodes as quorum
* requirements
*/
- return ((g_cluster.remoteNodeCount - 1) / 2);
+ return ((g_cluster.memberRemoteNodeCount - 1) / 2);
}
/*
* So for even number of nodes when enable_consensus_with_half_votes is
* not allowed than we would add one more vote than exact 50%
*/
- if (g_cluster.remoteNodeCount % 2 != 0)
+ if (g_cluster.memberRemoteNodeCount % 2 != 0)
{
if (pool_config->enable_consensus_with_half_votes == false)
required_node_count += 1;
}
}
-static WatchdogNode * getLeaderWatchdogNode(void)
+static WatchdogNode*
+getLeaderWatchdogNode(void)
{
return g_cluster.clusterLeaderInfo.leaderNode;
}
+static int
+update_cluster_memberships(void)
+{
+ int i;
+ g_cluster.memberRemoteNodeCount = g_cluster.remoteNodeCount;
+ for (i = 0; i < g_cluster.remoteNodeCount; i++)
+ {
+ WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
+ if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
+ g_cluster.memberRemoteNodeCount--;
+ }
+ return g_cluster.memberRemoteNodeCount;
+}
+
+static int
+revoke_cluster_membership_of_node(WatchdogNode* wdNode, WD_NODE_MEMBERSHIP_STATUS revoke_status)
+{
+ if (wdNode->membership_status == WD_NODE_MEMBERSHIP_ACTIVE)
+ {
+ wdNode->membership_status = revoke_status;
+
+ ereport(LOG,
+ (errmsg("revoking the membership of [%s] node:\"%s\" [node_id:%d]",
+ wd_state_names[wdNode->state], wdNode->nodeName,wdNode->pgpool_node_id),
+ errdetail("membership revoke reason: \"%s\"",
+ wd_cluster_membership_status[wdNode->membership_status])));
+
+ g_cluster.memberRemoteNodeCount--;
+ }
+ return g_cluster.memberRemoteNodeCount;
+}
+
+static int
+restore_cluster_membership_of_node(WatchdogNode* wdNode)
+{
+ if (wdNode->membership_status != WD_NODE_MEMBERSHIP_ACTIVE)
+ {
+ ereport(LOG,
+ (errmsg("Restoring cluster membership of node:\"%s\"",wdNode->nodeName),
+ errdetail("membership of node was revoked because it was \"%s\"",
+ wd_cluster_membership_status[wdNode->membership_status])));
+
+ wdNode->membership_status = WD_NODE_MEMBERSHIP_ACTIVE;
+ /* reset the lost time on the node */
+ wdNode->lost_time.tv_sec = 0;
+ wdNode->lost_time.tv_usec = 0;
+ g_cluster.memberRemoteNodeCount++;
+ }
+ return g_cluster.memberRemoteNodeCount;
+}
+
+static void
+reset_lost_timers(void)
+{
+ int i;
+ for (i = 0; i < g_cluster.remoteNodeCount; i++)
+ {
+ WatchdogNode *wdNode = &(g_cluster.remoteNodes[i]);
+ wdNode->lost_time.tv_sec = 0;
+ wdNode->lost_time.tv_usec = 0;
+ }
+}
+
static int
standby_node_join_cluster(WatchdogNode * wdNode)
{
if (get_local_node_state() == WD_COORDINATOR)
{
int i;
-
+ /* Just rest the lost time stamp*/
+ /* set the timestamp on node to track for how long this node is lost */
+ wdNode->lost_time.tv_sec = 0;
+ wdNode->lost_time.tv_usec = 0;
/* First check if the node is already in the List */
for (i = 0; i < g_cluster.clusterLeaderInfo.standby_nodes_count; i++)
{
*/
ereport(LOG,
(errmsg("removing watchdog node \"%s\" from the standby list", wdNode->nodeName)));
-
+ /* set the timestamp on node to track for how long this node is lost */
+ gettimeofday(&wdNode->lost_time, NULL);
g_cluster.clusterLeaderInfo.standbyNodes[i] = NULL;
g_cluster.clusterLeaderInfo.standby_nodes_count--;
removed = true;
(errmsg("invalid json data"),
errdetail("unable to find Watchdog Node ID")));
}
+ if (json_get_int_value_for_key(source, "Membership", &wdNodeInfo->membership_status))
+ {
+ /* would be from the older version. No need to panic */
+ wdNodeInfo->membership_status = WD_NODE_MEMBERSHIP_ACTIVE;
+ }
+
+ ptr = json_get_string_value_for_key(source, "MembershipString");
+ if (ptr == NULL)
+ {
+ strncpy(wdNodeInfo->membership_status_string, "NOT-Available", sizeof(wdNodeInfo->membership_status_string) - 1);
+ }
+ else
+ strncpy(wdNodeInfo->membership_status_string, ptr, sizeof(wdNodeInfo->membership_status_string) - 1);
+
ptr = json_get_string_value_for_key(source, "NodeName");
if (ptr == NULL)