diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 544f9ba6ff..170dabe244 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -248,10 +248,18 @@ func (ar *allocRunner) Run() { default: } + // When handling (potentially restored) terminal alloc, ensure tasks and post-run hooks are run + // to perform any cleanup that's necessary, potentially not done prior to earlier termination + // Run the prestart hooks if non-terminal if ar.shouldRun() { if err := ar.prerun(); err != nil { ar.logger.Error("prerun failed", "error", err) + + for _, tr := range ar.tasks { + tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", err)) + } + goto POST } } diff --git a/client/allocrunner/alloc_runner_unix_test.go b/client/allocrunner/alloc_runner_unix_test.go index 8c31832e3c..a438266a49 100644 --- a/client/allocrunner/alloc_runner_unix_test.go +++ b/client/allocrunner/alloc_runner_unix_test.go @@ -215,3 +215,79 @@ func TestAllocRunner_Restore_CompletedBatch(t *testing.T) { events := ar2.AllocState().TaskStates[task.Name].Events require.Equal(t, initialRunEvents, events) } + +// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc +// prestart hooks failed, then the alloc and subsequent tasks transition +// to failed state +func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) { + t.Parallel() + + alloc := mock.Alloc() + alloc.Job.Type = structs.JobTypeBatch + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "run_for": "2ms", + } + alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{ + Attempts: 0, + } + + conf, cleanup := testAllocRunnerConfig(t, alloc.Copy()) + defer cleanup() + + // Maintain state for subsequent run + conf.StateDB = state.NewMemDB(conf.Logger) + + // Start and wait for task to be running + ar, err := NewAllocRunner(conf) + require.NoError(t, err) + + ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{}) + + go ar.Run() + defer destroy(ar) + + select { + case <-ar.WaitCh(): + case <-time.After(10 * time.Second): + require.Fail(t, "alloc.waitCh wasn't closed") + } + + testutil.WaitForResult(func() (bool, error) { + s := ar.AllocState() + if s.ClientStatus != structs.AllocClientStatusFailed { + return false, fmt.Errorf("expected complete, got %s", s.ClientStatus) + } + return true, nil + }, func(err error) { + require.NoError(t, err) + }) + + // once job finishes, it shouldn't run again + require.False(t, ar.shouldRun()) + initialRunEvents := ar.AllocState().TaskStates[task.Name].Events + require.Len(t, initialRunEvents, 2) + + ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name) + require.NoError(t, err) + require.NotNil(t, ls) + require.NotNil(t, ts) + require.Equal(t, structs.TaskStateDead, ts.State) + require.True(t, ts.Failed) + + // TR waitCh must be closed too! + select { + case <-ar.tasks[task.Name].WaitCh(): + case <-time.After(10 * time.Second): + require.Fail(t, "tr.waitCh wasn't closed") + } +} + +type allocFailingPrestartHook struct{} + +func (*allocFailingPrestartHook) Name() string { return "failing_prestart" } + +func (*allocFailingPrestartHook) Prerun() error { + return fmt.Errorf("failing prestart hooks") +} diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 9ea8c0416b..0159b91bcc 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -384,6 +384,28 @@ func (tr *TaskRunner) initLabels() { } } +// Mark a task as failed and not to run. Aimed to be invoked when alloc runner +// prestart hooks failed. +// Should never be called with Run(). +func (tr *TaskRunner) MarkFailedDead(reason string) { + defer close(tr.waitCh) + + tr.stateLock.Lock() + if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil { + //TODO Nomad will be unable to restore this task; try to kill + // it now and fail? In general we prefer to leave running + // tasks running even if the agent encounters an error. + tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart", + "error", err) + } + tr.stateLock.Unlock() + + event := structs.NewTaskEvent(structs.TaskSetupFailure). + SetDisplayMessage(reason). + SetFailsTask() + tr.UpdateState(structs.TaskStateDead, event) +} + // Run the TaskRunner. Starts the user's task or reattaches to a restored task. // Run closes WaitCh when it exits. Should be started in a goroutine. func (tr *TaskRunner) Run() {