From 88ddf9f981448a29aec6fa1431530932a5cab4c9 Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Tue, 19 Mar 2019 22:38:05 +0100 Subject: [PATCH] rabbit: raise failure-timeout (noref) The failure timeout of 30s is far too low. Essentially it means that a failed node is considered ready again after 30s, even though any start or stop operation takes considerably more than 30s. We should only expire failures after around 30 minutes to prevent flapping services. --- chef/cookbooks/rabbitmq/attributes/default.rb | 11 ++++++----- chef/cookbooks/rabbitmq/recipes/ha_cluster.rb | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/chef/cookbooks/rabbitmq/attributes/default.rb b/chef/cookbooks/rabbitmq/attributes/default.rb index dfc7c13518..b1865a26c6 100644 --- a/chef/cookbooks/rabbitmq/attributes/default.rb +++ b/chef/cookbooks/rabbitmq/attributes/default.rb @@ -42,13 +42,14 @@ default[:rabbitmq][:ha][:op][:start][:timeout] = "300s" default[:rabbitmq][:ha][:op][:promote][:timeout] = "180s" default[:rabbitmq][:ha][:op][:monitor][:interval] = "10s" -default[:rabbitmq][:ha][:clustered_op][:start][:timeout] = "360s" -default[:rabbitmq][:ha][:clustered_op][:stop][:timeout] = "120s" -default[:rabbitmq][:ha][:clustered_op][:promote][:timeout] = "120s" -default[:rabbitmq][:ha][:clustered_op][:demote][:timeout] = "120s" +default[:rabbitmq][:ha][:clustered_op][:start][:timeout] = "540s" +default[:rabbitmq][:ha][:clustered_op][:stop][:timeout] = "180s" +default[:rabbitmq][:ha][:clustered_op][:promote][:timeout] = "180s" +default[:rabbitmq][:ha][:clustered_op][:demote][:timeout] = "180s" default[:rabbitmq][:ha][:clustered_op][:notify][:timeout] = "180s" default[:rabbitmq][:ha][:clustered_op][:monitor] = [ - { interval: "30s" }, { interval: "27s", role: "Master" } + { interval: "60s", timeout: "90s" }, + { interval: "57s", timeout: "90s", role: "Master" } ] default[:rabbitmq][:hipe_compile] = false diff --git a/chef/cookbooks/rabbitmq/recipes/ha_cluster.rb 
b/chef/cookbooks/rabbitmq/recipes/ha_cluster.rb index 9de397e5cf..1690d67959 100644 --- a/chef/cookbooks/rabbitmq/recipes/ha_cluster.rb +++ b/chef/cookbooks/rabbitmq/recipes/ha_cluster.rb @@ -122,7 +122,7 @@ op node[:rabbitmq][:ha][:clustered_op] meta ({ "migration-threshold" => "10", - "failure-timeout" => "30s", + "failure-timeout" => "1800s", "resource-stickiness" => "100" }) action :update