Skip to content

Commit

Permalink
fix: RP error handling (#514)
Browse files Browse the repository at this point in the history
  • Loading branch information
walkah authored Feb 18, 2025
1 parent aa39a51 commit cd39bfc
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
7 changes: 7 additions & 0 deletions pkg/executor/bacalhau/bacalhau.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,13 @@ func (executor *BacalhauExecutor) RunJob(
if jobInfo.Job.State.StateType == models.JobStateTypeCompleted {
break
}

// Jobs have 3 terminal states: completed, stopped and failed. We catch the latter two here.
// Most likely the job failed, but it could also have been stopped by the operator.
// In any other state, the job is still running or has yet to run.
if jobInfo.Job.State.StateType.IsTerminal() {
return nil, fmt.Errorf("job failed to execute %s : %s", jobID, jobInfo.Job.State.Message)
}
}

time.Sleep(time.Duration(executor.Options.JobStatusPollInterval) * time.Second)
Expand Down
7 changes: 5 additions & 2 deletions pkg/resourceprovider/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,11 @@ func (controller *ResourceProviderController) runJobs(ctx context.Context) error
// we've already updated controller.runningJobs so we know this will only
// run once
func (controller *ResourceProviderController) runJob(ctx context.Context, deal data.DealContainer) {
defer func() {
// Every time we finish a job we need to ensure we have resource offers
controller.ensureResourceOffers()
}()

controller.log.Info("run job", deal)
controller.log.Info("deal ID", deal.Deal.ID)

Expand Down Expand Up @@ -551,7 +556,5 @@ func (controller *ResourceProviderController) runJob(ctx context.Context, deal d
}
span.AddEvent("solver.transaction_hash.added")

controller.ensureResourceOffers()

span.AddEvent("done")
}

0 comments on commit cd39bfc

Please sign in to comment.