Skip to content

Commit

Permalink
Fail alloc if alloc runner prestart hooks fail
Browse files Browse the repository at this point in the history
When an alloc runner prestart hook fails, the task runners aren't invoked
and they remain in a pending state.

This leads to terrible results, some of which are:
* Lockup in GC process as reported in hashicorp#5861
* Lockup in shutdown process as TR.Shutdown() waits for WaitCh to be closed
* Alloc not being restarted/rescheduled to another node (as it's still in
  pending state)
* Unexpected restart of alloc on a client restart, potentially days/weeks after
  alloc expected start time!

Here, we treat all tasks to have failed if alloc runner prestart hook fails.
This fixes the lockups, and permits the alloc to be rescheduled on another node.

While it's desirable to retry alloc runner in such failures, I opted to treat it
out of scope.  I'm afraid of some subtles about alloc and task runners and their
idempotency that's better handled in a follow up PR.

This might be one of the root causes for
hashicorp#5840 .
  • Loading branch information
Mahmood Ali authored and chrisboulton committed Jul 15, 2019
1 parent 218b944 commit 705f1ea
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 0 deletions.
8 changes: 8 additions & 0 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,18 @@ func (ar *allocRunner) Run() {
default:
}

// When handling (potentially restored) terminal alloc, ensure tasks and post-run hooks are run
// to perform any cleanup that's necessary, potentially not done prior to earlier termination

// Run the prestart hooks if non-terminal
if ar.shouldRun() {
if err := ar.prerun(); err != nil {
ar.logger.Error("prerun failed", "error", err)

for _, tr := range ar.tasks {
tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", err))
}

goto POST
}
}
Expand Down
76 changes: 76 additions & 0 deletions client/allocrunner/alloc_runner_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,3 +215,79 @@ func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
events := ar2.AllocState().TaskStates[task.Name].Events
require.Equal(t, initialRunEvents, events)
}

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc
// prestart hooks failed, then the alloc and subsequent tasks transition
// to failed state
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
t.Parallel()

alloc := mock.Alloc()
alloc.Job.Type = structs.JobTypeBatch
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{
"run_for": "2ms",
}
alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
Attempts: 0,
}

conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
defer cleanup()

// Maintain state for subsequent run
conf.StateDB = state.NewMemDB(conf.Logger)

// Start and wait for task to be running
ar, err := NewAllocRunner(conf)
require.NoError(t, err)

ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

go ar.Run()
defer destroy(ar)

select {
case <-ar.WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "alloc.waitCh wasn't closed")
}

testutil.WaitForResult(func() (bool, error) {
s := ar.AllocState()
if s.ClientStatus != structs.AllocClientStatusFailed {
return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
}
return true, nil
}, func(err error) {
require.NoError(t, err)
})

// once job finishes, it shouldn't run again
require.False(t, ar.shouldRun())
initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
require.Len(t, initialRunEvents, 2)

ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
require.NoError(t, err)
require.NotNil(t, ls)
require.NotNil(t, ts)
require.Equal(t, structs.TaskStateDead, ts.State)
require.True(t, ts.Failed)

// TR waitCh must be closed too!
select {
case <-ar.tasks[task.Name].WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "tr.waitCh wasn't closed")
}
}

type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
return fmt.Errorf("failing prestart hooks")
}
22 changes: 22 additions & 0 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,28 @@ func (tr *TaskRunner) initLabels() {
}
}

// Mark a task as failed and not to run. Aimed to be invoked when alloc runner
// prestart hooks failed.
// Should never be called with Run().
func (tr *TaskRunner) MarkFailedDead(reason string) {
defer close(tr.waitCh)

tr.stateLock.Lock()
if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
//TODO Nomad will be unable to restore this task; try to kill
// it now and fail? In general we prefer to leave running
// tasks running even if the agent encounters an error.
tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
"error", err)
}
tr.stateLock.Unlock()

event := structs.NewTaskEvent(structs.TaskSetupFailure).
SetDisplayMessage(reason).
SetFailsTask()
tr.UpdateState(structs.TaskStateDead, event)
}

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
Expand Down

0 comments on commit 705f1ea

Please # to comment.