From fe16d6f96a58f835020b5cb28cb982f84dc3c10f Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Tue, 13 Aug 2024 21:37:45 -0400 Subject: [PATCH 1/2] node: Document usage of ReconfigureMembership and ReconfigureMembershipExt Signed-off-by: Cole Miller --- node.go | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/node.go b/node.go index 9bc5961c..cb542b90 100644 --- a/node.go +++ b/node.go @@ -220,8 +220,29 @@ func GenerateID(address string) uint64 { // ReconfigureMembership can be used to recover a cluster whose majority of // nodes have died, and therefore has become unavailable. // -// It forces appending a new configuration to the raft log stored in the given -// directory, effectively replacing the current configuration. +// Deprecated: Use ReconfigureMembershipExt instead, which correctly accounts +// for node roles. +// +// ReconfigureMembership forces appending a new configuration to the raft log +// stored in the given directory, effectively replacing the current +// configuration. This is an unsafe operation, and you should follow these +// steps to avoid data loss: +// +// 1. Make sure no dqlite node in the cluster is running. +// +// 2. Identify all dqlite nodes that have survived and that you want to be part +// of the recovered cluster. Call this the "new member list". +// +// 3. From the nodes in the new member list, find the one with the most +// up-to-date raft term and log. Call this the "template node". +// +// 4. Invoke ReconfigureMembership exactly one time, on the template node, +// passing in the step 3, passing in the new member list. +// +// 5. Copy the data directory of the template node to all other nodes in the +// new member list, replacing their previous data directories. +// +// 6. Restart all nodes in the new member list. func ReconfigureMembership(dir string, cluster []NodeInfo) error { server, err := bindings.NewNode(context.Background(), 1, "1", dir) if err != nil { @@ -235,10 +256,24 @@ func ReconfigureMembership(dir string, cluster []NodeInfo) error { // nodes have died, and therefore has become unavailable. // // It forces appending a new configuration to the raft log stored in the given -// directory, effectively replacing the current configuration. -// In comparision with ReconfigureMembership, this function takes the node role -// into account and makes use of a dqlite API that supports extending the -// NodeInfo struct. +// directory, effectively replacing the current configuration. This is an +// unsafe operation, and you should follow these steps to avoid data loss: +// +// 1. Make sure no dqlite node in the cluster is running. +// +// 2. Identify all dqlite nodes that have survived and that you want to be part +// of the recovered cluster. Call this the "new member list". +// +// 3. From the nodes in the new member list, find the one with the most +// up-to-date raft term and log. Call this the "template node". +// +// 4. Invoke ReconfigureMembershipExt exactly one time, on the template node, +// passing in the step 3, passing in the new member list. +// +// 5. Copy the data directory of the template node to all other nodes in the +// new member list, replacing their previous data directories. +// +// 6. Restart all nodes in the new member list. func ReconfigureMembershipExt(dir string, cluster []NodeInfo) error { server, err := bindings.NewNode(context.Background(), 1, "1", dir) if err != nil { From 9c5fd75ec1f708151559642a15383778c1f4cea3 Mon Sep 17 00:00:00 2001 From: Cole Miller Date: Thu, 22 Aug 2024 22:21:14 -0400 Subject: [PATCH 2/2] Address review comments Signed-off-by: Cole Miller --- node.go | 46 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/node.go b/node.go index cb542b90..0a2dbf58 100644 --- a/node.go +++ b/node.go @@ -176,7 +176,7 @@ func (s *Node) Start() error { // Recover a node by forcing a new cluster configuration. // -// DEPRECATED: Use ReconfigureMembership instead, which does not require +// Deprecated: use ReconfigureMembershipExt instead, which does not require // instantiating a new Node object. func (s *Node) Recover(cluster []NodeInfo) error { return s.server.Recover(cluster) @@ -217,32 +217,11 @@ func GenerateID(address string) uint64 { return bindings.GenerateID(address) } -// ReconfigureMembership can be used to recover a cluster whose majority of -// nodes have died, and therefore has become unavailable. +// ReconfigureMembership forces a new cluster configuration. // -// Deprecated: Use ReconfigureMembershipExt instead, which correctly accounts -// for node roles. -// -// ReconfigureMembership forces appending a new configuration to the raft log -// stored in the given directory, effectively replacing the current -// configuration. This is an unsafe operation, and you should follow these -// steps to avoid data loss: -// -// 1. Make sure no dqlite node in the cluster is running. -// -// 2. Identify all dqlite nodes that have survived and that you want to be part -// of the recovered cluster. Call this the "new member list". -// -// 3. From the nodes in the new member list, find the one with the most -// up-to-date raft term and log. Call this the "template node". -// -// 4. Invoke ReconfigureMembership exactly one time, on the template node, -// passing in the step 3, passing in the new member list. -// -// 5. Copy the data directory of the template node to all other nodes in the -// new member list, replacing their previous data directories. -// -// 6. Restart all nodes in the new member list. +// Deprecated: this function ignores the provided node roles and makes every +// node in the new configuration a voter. Use ReconfigureMembershipExt, which +// respects the provided roles. func ReconfigureMembership(dir string, cluster []NodeInfo) error { server, err := bindings.NewNode(context.Background(), 1, "1", dir) if err != nil { @@ -252,12 +231,12 @@ func ReconfigureMembership(dir string, cluster []NodeInfo) error { return server.Recover(cluster) } -// ReconfigureMembershipExt can be used to recover a cluster whose majority of -// nodes have died, and therefore has become unavailable. +// ReconfigureMembershipExt forces a new cluster configuration. // -// It forces appending a new configuration to the raft log stored in the given -// directory, effectively replacing the current configuration. This is an -// unsafe operation, and you should follow these steps to avoid data loss: +// This function is useful to revive a cluster that can't achieve quorum in its +// old configuration because some nodes can't be brought online. Forcing a new +// configuration is unsafe, and you should follow these steps to avoid data +// loss and inconsistency: // // 1. Make sure no dqlite node in the cluster is running. // @@ -267,8 +246,9 @@ func ReconfigureMembership(dir string, cluster []NodeInfo) error { // 3. From the nodes in the new member list, find the one with the most // up-to-date raft term and log. Call this the "template node". // -// 4. Invoke ReconfigureMembershipExt exactly one time, on the template node, -// passing in the step 3, passing in the new member list. +// 4. Invoke ReconfigureMembershipExt exactly one time, on the template node. +// The arguments are the data directory of the template node and the new +// member list. // // 5. Copy the data directory of the template node to all other nodes in the // new member list, replacing their previous data directories.