Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

storcon: deny external node configuration if an operation is ongoing #8727

Merged
merged 9 commits into from
Aug 15, 2024
41 changes: 41 additions & 0 deletions test_runner/regress/test_storage_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2091,3 +2091,44 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
)
== 0
)

def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
    """
    Restart a pageserver while the storage controller is draining it, then
    verify that external node configuration requests are rejected for as long
    as the drain operation is still ongoing.
    """
    # single unsharded tenant, two locations
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()

    # Per the original author's note, this placement policy is applied so that
    # there is something to drain; wait for the controller to settle on it.
    env.storage_controller.tenant_policy_update(
        env.initial_tenant,
        {"placement": {"Attached": 1}},
    )
    env.storage_controller.reconcile_until_idle()

    # NOTE: the drained node is simply the first of the two pageservers; it is
    # not looked up through storage_controller.locate().
    drained_ps, _other_ps = env.pageservers

    def drained_ps_is_draining():
        # Polled by wait_until: logs the node status and asserts on its
        # scheduling policy.
        details = env.storage_controller.node_status(drained_ps.id)
        log.info(f"{details}")
        assert details["scheduling"] == "Draining"

    # The failpoint is configured before the drain is requested, so the drain
    # is still in progress for the checks below (presumably it slows the drain
    # loop down -- confirm against the storage controller failpoint).
    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)"))
    env.storage_controller.node_drain(drained_ps.id)

    wait_until(10, 0.5, drained_ps_is_draining)

    # Per the original author: restarting (or the re-attach request from the
    # pageserver) does not cancel the drain, but it makes it so that the drain
    # cannot be cancelled from the API -- possibly a bug.
    drained_ps.restart()

    # we are unable to reconfigure node while the operation is still ongoing
    rejection = "Precondition failed: Ongoing background operation forbids configuring: drain.*"
    forbidden_configs = (
        {"scheduling": "Pause"},
        {"availability": "Offline"},
    )
    for config in forbidden_configs:
        with pytest.raises(StorageControllerApiException, match=rejection):
            env.storage_controller.node_configure(drained_ps.id, config)

    # NOTE: cancelling the drain (storage_controller.cancel_node_drain) is not
    # exercised here; per the original author it fails because the check for
    # the scheduling policy happens before the operation check.