Fail alloc if alloc runner prestart hooks fail

When an alloc runner prestart hook fails, the task runners aren't invoked and they remain in a pending state. This leads to terrible results, some of which are: * Lockup in GC process as reported in hashicorp#5861 * Lockup in shutdown process as TR.Shutdown() waits for WaitCh to be closed * Alloc not being restarted/rescheduled to another node (as it's still in pending state) * Unexpected restart of alloc on a client restart, potentially days/weeks after alloc expected start time! Here, we treat all tasks to have failed if alloc runner prestart hook fails. This fixes the lockups, and permits the alloc to be rescheduled on another node. While it's desirable to retry alloc runner in such failures, I opted to treat it out of scope. I'm afraid of some subtles about alloc and task runners and their idempotency that's better handled in a follow up PR. This might be one of the root causes for hashicorp#5840 .
bigcommerce · Jul 15, 2019 · 705f1ea · 705f1ea
1 parent 218b944
commit 705f1ea
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 0 deletions.
diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go
@@ -248,10 +248,18 @@ func (ar *allocRunner) Run() {
 	default:
 	}
 
+	// When handling (potentially restored) terminal alloc, ensure tasks and post-run hooks are run
+	// to perform any cleanup that's necessary, potentially not done prior to earlier termination
+
 	// Run the prestart hooks if non-terminal
 	if ar.shouldRun() {
 		if err := ar.prerun(); err != nil {
 			ar.logger.Error("prerun failed", "error", err)
+
+			for _, tr := range ar.tasks {
+				tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", err))
+			}
+
 			goto POST
 		}
 	}

diff --git a/client/allocrunner/alloc_runner_unix_test.go b/client/allocrunner/alloc_runner_unix_test.go
@@ -215,3 +215,79 @@ func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
 	events := ar2.AllocState().TaskStates[task.Name].Events
 	require.Equal(t, initialRunEvents, events)
 }
+
+// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc
+// prestart hooks failed, then the alloc and subsequent tasks transition
+// to failed state
+func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.Alloc()
+	alloc.Job.Type = structs.JobTypeBatch
+	task := alloc.Job.TaskGroups[0].Tasks[0]
+	task.Driver = "mock_driver"
+	task.Config = map[string]interface{}{
+		"run_for": "2ms",
+	}
+	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
+		Attempts: 0,
+	}
+
+	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
+	defer cleanup()
+
+	// Maintain state for subsequent run
+	conf.StateDB = state.NewMemDB(conf.Logger)
+
+	// Start and wait for task to be running
+	ar, err := NewAllocRunner(conf)
+	require.NoError(t, err)
+
+	ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})
+
+	go ar.Run()
+	defer destroy(ar)
+
+	select {
+	case <-ar.WaitCh():
+	case <-time.After(10 * time.Second):
+		require.Fail(t, "alloc.waitCh wasn't closed")
+	}
+
+	testutil.WaitForResult(func() (bool, error) {
+		s := ar.AllocState()
+		if s.ClientStatus != structs.AllocClientStatusFailed {
+			return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
+		}
+		return true, nil
+	}, func(err error) {
+		require.NoError(t, err)
+	})
+
+	// once job finishes, it shouldn't run again
+	require.False(t, ar.shouldRun())
+	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
+	require.Len(t, initialRunEvents, 2)
+
+	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
+	require.NoError(t, err)
+	require.NotNil(t, ls)
+	require.NotNil(t, ts)
+	require.Equal(t, structs.TaskStateDead, ts.State)
+	require.True(t, ts.Failed)
+
+	// TR waitCh must be closed too!
+	select {
+	case <-ar.tasks[task.Name].WaitCh():
+	case <-time.After(10 * time.Second):
+		require.Fail(t, "tr.waitCh wasn't closed")
+	}
+}
+
+type allocFailingPrestartHook struct{}
+
+func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }
+
+func (*allocFailingPrestartHook) Prerun() error {
+	return fmt.Errorf("failing prestart hooks")
+}
diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go
@@ -384,6 +384,28 @@ func (tr *TaskRunner) initLabels() {
 	}
 }
 
+// Mark a task as failed and not to run.  Aimed to be invoked when alloc runner
+// prestart hooks failed.
+// Should never be called with Run().
+func (tr *TaskRunner) MarkFailedDead(reason string) {
+	defer close(tr.waitCh)
+
+	tr.stateLock.Lock()
+	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
+		//TODO Nomad will be unable to restore this task; try to kill
+		//     it now and fail? In general we prefer to leave running
+		//     tasks running even if the agent encounters an error.
+		tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
+			"error", err)
+	}
+	tr.stateLock.Unlock()
+
+	event := structs.NewTaskEvent(structs.TaskSetupFailure).
+		SetDisplayMessage(reason).
+		SetFailsTask()
+	tr.UpdateState(structs.TaskStateDead, event)
+}
+
 // Run the TaskRunner. Starts the user's task or reattaches to a restored task.
 // Run closes WaitCh when it exits. Should be started in a goroutine.
 func (tr *TaskRunner) Run() {