From 70559dc1d8d3f88dc08fbe38a387b3507bba3410 Mon Sep 17 00:00:00 2001 From: Nirbhay Choubey Date: Tue, 25 Aug 2015 16:31:37 -0400 Subject: [PATCH] issue#283 : Cluster automatic recovery after graceful shutdown of all nodes Introduced a new configuration option 'pc.recover_minimum_weight', (default = 0) which is designed to work in tandem with pc.recovery. When set, this new option controls PC state recovery file's lifecycle. Also added logic to abort bootstrapping if PC state recovery file is found on start. --- gcomm/src/conf.cpp | 3 +++ gcomm/src/defaults.cpp | 1 + gcomm/src/defaults.hpp | 1 + gcomm/src/gcomm/conf.hpp | 2 ++ gcomm/src/gcomm/view.hpp | 1 + gcomm/src/pc.cpp | 53 +++++++++++++++++++++++++++++++++++----- gcomm/src/pc.hpp | 8 +++--- gcomm/src/pc_proto.cpp | 4 +-- gcomm/src/pc_proto.hpp | 7 ++++++ gcomm/src/view.cpp | 9 +++++++ 10 files changed, 78 insertions(+), 11 deletions(-) diff --git a/gcomm/src/conf.cpp b/gcomm/src/conf.cpp index 37b313da1..dbad82a26 100644 --- a/gcomm/src/conf.cpp +++ b/gcomm/src/conf.cpp @@ -109,6 +109,8 @@ std::string const gcomm::Conf::PcWaitPrimTimeout = PcPrefix + "wait_prim_timeout"; std::string const gcomm::Conf::PcWeight = PcPrefix + "weight"; std::string const gcomm::Conf::PcRecovery = PcPrefix + "recovery"; +std::string const gcomm::Conf::PcRecoverMinimumWeight = + PcPrefix + "recover_minimum_weight"; void gcomm::Conf::register_params(gu::Config& cnf) @@ -172,6 +174,7 @@ gcomm::Conf::register_params(gu::Config& cnf) GCOMM_CONF_ADD_DEFAULT(PcWaitPrimTimeout); GCOMM_CONF_ADD_DEFAULT(PcWeight); GCOMM_CONF_ADD_DEFAULT(PcRecovery); + GCOMM_CONF_ADD_DEFAULT(PcRecoverMinimumWeight); #undef GCOMM_CONF_ADD #undef GCOMM_CONF_ADD_DEFAULT diff --git a/gcomm/src/defaults.cpp b/gcomm/src/defaults.cpp index e2da55183..a892e9bc3 100644 --- a/gcomm/src/defaults.cpp +++ b/gcomm/src/defaults.cpp @@ -51,4 +51,5 @@ namespace gcomm std::string const Defaults::PcWaitPrimTimeout = "P30S"; std::string const Defaults::PcWeight = "1"; std::string const Defaults::PcRecovery = "1"; + std::string const Defaults::PcRecoverMinimumWeight = "0"; } diff --git a/gcomm/src/defaults.hpp b/gcomm/src/defaults.hpp index ef5c8ef38..f7ecce3df 100644 --- a/gcomm/src/defaults.hpp +++ b/gcomm/src/defaults.hpp @@ -49,6 +49,7 @@ namespace gcomm static std::string const PcWaitPrimTimeout ; static std::string const PcWeight ; static std::string const PcRecovery ; + static std::string const PcRecoverMinimumWeight ; }; } diff --git a/gcomm/src/gcomm/conf.hpp b/gcomm/src/gcomm/conf.hpp index 98cebdfc7..dae48ad96 100644 --- a/gcomm/src/gcomm/conf.hpp +++ b/gcomm/src/gcomm/conf.hpp @@ -410,6 +410,8 @@ namespace gcomm */ static std::string const PcRecovery; + static std::string const PcRecoverMinimumWeight; + static void register_params(gu::Config&); }; diff --git a/gcomm/src/gcomm/view.hpp b/gcomm/src/gcomm/view.hpp index d9e074702..da53c860f 100644 --- a/gcomm/src/gcomm/view.hpp +++ b/gcomm/src/gcomm/view.hpp @@ -236,6 +236,7 @@ namespace gcomm std::istream& read_stream(std::istream& is); void write_file() const; bool read_file(); + static bool file_exists(const char* fname = NULL); static void remove_file(gu::Config& conf); bool operator== (const ViewState& vst) const { diff --git a/gcomm/src/pc.cpp b/gcomm/src/pc.cpp index d75b6cbc9..d26a48627 100644 --- a/gcomm/src/pc.cpp +++ b/gcomm/src/pc.cpp @@ -24,12 +24,33 @@ void gcomm::PC::handle_up(const void* cid, const Datagram& rb, um.has_view() && um.view().id().type() == V_PRIM) { - ViewState vst(const_cast(uuid()), - const_cast(um.view()), - conf_); - log_info << "save pc into disk"; - vst.write_file(); + size_t total_weight_new= weighted_sum(um.view().members(), + pc_->instances()); + /* + When pc.recover_minimum_weight is set to non-zero, save the PC + state on disk iff the new total weight is greater than or equal + to the specified value. + */ + if (pc_recover_minimum_weight_ == 0 || /* backward compatibility */ + total_weight_new >= pc_recover_minimum_weight_) + { + ViewState vst(const_cast(uuid()), + const_cast(um.view()), + conf_); + log_info << "Saving PC state on disk."; + vst.write_file(); + } + else + { + log_info << "New PC weight is less than the configured " + "pc.recover_minimum_weight, state file not " + "being saved."; + } + + // Update total_weight + total_weight_= total_weight_new; } + send_up(rb, um); } @@ -74,6 +95,11 @@ std::string gcomm::PC::listen_addr() const void gcomm::PC::connect(bool start_prim) { + if (start_prim && pc_recovery_ && ViewState::file_exists()) + { + gu_throw_error(EPROTO) << "PC state file exists, aborting bootstrap."; + } + try { // for backward compatibility with old approach: gcomm://0.0.0.0 @@ -217,7 +243,18 @@ void gcomm::PC::close(bool force) pstack_.pop_proto(pc_); pstack_.pop_proto(evs_); pstack_.pop_proto(gmcast_); - ViewState::remove_file(conf_); + + if (pc_recover_minimum_weight_ == 0 || /* backward compatibility */ + total_weight_ > pc_recover_minimum_weight_) + { + ViewState::remove_file(conf_); + } + else if (ViewState::file_exists()) + { + log_info << "New PC weight has fallen below the configured " + "pc.recover_minimum_weight; state file not being " + "deleted."; + } closed_ = true; } @@ -240,6 +277,10 @@ gcomm::PC::PC(Protonet& net, const gu::URI& uri) : Defaults::PcAnnounceTimeout)), pc_recovery_ (param(conf_, uri, Conf::PcRecovery, Defaults::PcRecovery)), + pc_recover_minimum_weight_ (param(conf_, uri, + Conf::PcRecoverMinimumWeight, + Defaults::PcRecoverMinimumWeight)), + total_weight_(0), rst_uuid_(), rst_view_() diff --git a/gcomm/src/pc.hpp b/gcomm/src/pc.hpp index e8c837810..545393c59 100644 --- a/gcomm/src/pc.hpp +++ b/gcomm/src/pc.hpp @@ -47,11 +47,13 @@ namespace gcomm evs::Proto* evs_; // EVS protocol layer pc::Proto* pc_; // PC protocol layer bool closed_; // flag for destructor - // Period to wait graceful leave - gu::datetime::Period linger_; + gu::datetime::Period linger_; // Period to wait graceful leave gu::datetime::Period announce_timeout_; - bool pc_recovery_; + bool pc_recovery_; // Automatic PC recovery + // Minimum PC weight for recovery to kick-in + size_t pc_recover_minimum_weight_; + size_t total_weight_; // Current total PC weight UUID rst_uuid_; View rst_view_; diff --git a/gcomm/src/pc_proto.cpp b/gcomm/src/pc_proto.cpp index 14c6a9245..2eb144e14 100644 --- a/gcomm/src/pc_proto.cpp +++ b/gcomm/src/pc_proto.cpp @@ -423,8 +423,8 @@ void gcomm::pc::Proto::handle_first_trans(const View& view) // Compute weighted sum of members in node list. If member cannot be found // from node_map its weight is assumed to be zero. -static size_t weighted_sum(const gcomm::NodeList& node_list, - const gcomm::pc::NodeMap& node_map) +size_t weighted_sum(const gcomm::NodeList& node_list, + const gcomm::pc::NodeMap& node_map) { size_t sum(0); for (gcomm::NodeList::const_iterator i(node_list.begin()); diff --git a/gcomm/src/pc_proto.hpp b/gcomm/src/pc_proto.hpp index 70d39e810..5d1022b93 100644 --- a/gcomm/src/pc_proto.hpp +++ b/gcomm/src/pc_proto.hpp @@ -190,6 +190,11 @@ class gcomm::pc::Proto : public Protolay rst_view -> id().seq())); } const View* restored_view() const { return rst_view_; } + const NodeMap& instances() const + { + return instances_; + } + private: friend std::ostream& operator<<(std::ostream& os, const Proto& p); Proto (const Proto&); @@ -229,5 +234,7 @@ class gcomm::pc::Proto : public Protolay View* rst_view_; // restored PC view }; +size_t weighted_sum(const gcomm::NodeList& node_list, + const gcomm::pc::NodeMap& node_map); #endif // PC_PROTO_HPP diff --git a/gcomm/src/view.cpp b/gcomm/src/view.cpp index 4ab203720..235ebc649 100644 --- a/gcomm/src/view.cpp +++ b/gcomm/src/view.cpp @@ -322,6 +322,15 @@ bool gcomm::ViewState::read_file() } } +bool gcomm::ViewState::file_exists(const char* fname) +{ + if (fname == NULL) fname = COMMON_VIEW_STAT_FILE; + if (access(fname, F_OK) == 0) { + return true; + } + return false; +} + // remove_file is static function, it should remove the view // state file even if there is no ViewState object around. // View state file name is derived in the same way as for