Skip to content

Commit fa96104

Browse files
authored
Improve batch job and restart behavior (#30)
The motivation for this was starting to use Nomad batch jobs. Using these depends on being able to get an exit code from the container, which the driver didn't support so far. Changes to pot to make this work are in bsdpot/pot#200. When a batch job returns an exit code != 0, the restart behavior in Nomad's restart stanza is applied. Restarts happen in the context of the same allocation - this is also true for jobs of type `service`. As the pot nomad driver would base the potName on this data, the container name would be recycled. This resulted in all kinds of problems when restarting tasks rapidly. To correct this, I changed the naming of pots from `jobname + taskname + "_" + allocId` to `taskname + "_" + invocationId + "_" + allocId`. Having invocationId in there makes sure that each container name is actually unique (for the sake of not changing pot in this respect, `invocationId + "_" + allocId` is passed in as allocId when calling `pot prepare`). The resulting pot names look quite okay (the way I structured jobs, they actually look better/are easier on the eyes, but that's subjective). Retrieving the exit code makes use of a new pot feature in the review mentioned above. This is a two-level process: 1. If in potWait (no Nomad restart happened): Check if `pot start` returned a distinct error code and, if it did, use `pot last-run-stats` to retrieve the process' exit code. 2. If in recoverWait (Nomad restart happened): Always use `pot last-run-stats` to retrieve the process' exit code. In both cases, the pot container is destroyed immediately once finished to avoid piling up stale pots that would need to be garbage collected with `pot prune` (which can get quite expensive). In the future, a parameter could be added to make this behavior configurable. I hope I didn't miss any potential code paths (batch jobs rely on getting reliable results from the driver).
Example batch job definition:

job "cmd" {
  datacenters = ["dc1"]
  type = "batch"

  group "cmd-group" {
    task "command" {
      driver = "pot"

      restart {
        # aggressive
        interval = "30m"
        attempts = 200
        delay = "0s"
        mode = "fail"
      }

      config {
        image = "https://pottery.example.org"
        pot = "command_13_0"
        tag = "0.1"
        command = "/bin/sh"
        args = ["-c", "'date; false'"]
      }
    }
  }
}
1 parent 36f92e4 commit fa96104

File tree

4 files changed

+116
-20
lines changed

4 files changed

+116
-20
lines changed

driver/driver.go

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import (
66
"fmt"
77
"os"
88
"os/exec"
9+
"strings"
10+
"syscall"
911
"time"
1012

1113
"github.com/hashicorp/go-hclog"
@@ -295,7 +297,8 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
295297
return fmt.Errorf("unable to recover a container that is not running")
296298
} else {
297299
se.containerPid = alive
298-
completeName := handle.Config.JobName + handle.Config.Name + "_" + handle.Config.AllocID
300+
parts := strings.Split(handle.Config.ID, "/")
301+
completeName := parts[1] + "_" + parts[2] + "_" + parts[0]
299302
Sout, err := se.Stdout()
300303
if err != nil {
301304
d.logger.Error("Error setting stdout with", "err", err)
@@ -433,7 +436,8 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive
433436
}
434437
} else {
435438
se.containerPid = alive
436-
completeName := cfg.JobName + cfg.Name + "_" + cfg.AllocID
439+
parts := strings.Split(cfg.ID, "/")
440+
completeName := parts[1] + "_" + parts[2] + "_" + parts[0]
437441

438442
se.cmd = &exec.Cmd{
439443
Args: []string{"/usr/local/bin/pot", "start", completeName},
@@ -505,16 +509,45 @@ OuterLoop:
505509

506510
handle, _ := d.tasks.Get(id)
507511
handle.procState = drivers.TaskStateExited
512+
513+
last_run, err := se.getContainerLastRunStats(handle.taskConfig)
514+
if err != nil {
515+
d.logger.Error("Error getting container last-run-stats with err: ", err)
516+
handle.exitResult.ExitCode = defaultFailedCode
517+
} else {
518+
handle.exitResult.ExitCode = last_run.ExitCode
519+
}
520+
521+
err = se.destroyContainer(handle.taskConfig)
522+
if err != nil {
523+
d.logger.Error("Error destroying container with err: ", err)
524+
}
508525
}
509526

510527
func (d *Driver) potWait(taskID string, se syexec) {
511528
handle, _ := d.tasks.Get(taskID)
512529
err := se.cmd.Wait()
530+
handle.procState = drivers.TaskStateExited
513531
if err != nil {
514532
d.logger.Error("Error exiting se.cmd.Wait in potWait", "Err", err)
533+
handle.exitResult.ExitCode = defaultFailedCode
534+
if exitError, ok := err.(*exec.ExitError); ok {
535+
ws := exitError.Sys().(syscall.WaitStatus)
536+
if ws.ExitStatus() == 125 { // enclosed process exited with error
537+
last_run_stats, err := se.getContainerLastRunStats(handle.taskConfig)
538+
if err != nil {
539+
d.logger.Error("Error getting container last-run-stats with err: ", err)
540+
} else {
541+
handle.exitResult.ExitCode = last_run_stats.ExitCode
542+
}
543+
}
544+
}
515545
}
516-
handle.procState = drivers.TaskStateExited
517546

547+
err = se.destroyContainer(handle.taskConfig)
548+
if err != nil {
549+
d.logger.Error("Error destroying container with err: ", err)
550+
}
518551
}
519552

520553
// WaitTask waits for task completion

driver/handle.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ func (h *taskHandle) run() {
6161

6262
if h.syexec.ExitError != nil {
6363
h.exitResult.Err = h.syexec.ExitError
64+
h.exitResult.ExitCode = h.syexec.exitCode
6465
h.procState = drivers.TaskStateUnknown
6566
h.completedAt = time.Now()
6667
return

driver/pot.go

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ type syexec struct {
3131
argvStart []string
3232
argvStop []string
3333
argvStats []string
34+
argvLastRunStats []string
3435
argvDestroy []string
3536
cmd *exec.Cmd
3637
cachedir string
@@ -67,6 +68,10 @@ type potStats struct {
6768
} `json:"ResourceUsage"`
6869
}
6970

71+
// lastRunStats is the JSON document decoded from the output of
// `pot last-run-stats` by getContainerLastRunStats.
type lastRunStats struct {
72+
// ExitCode is read from the "ExitCode" JSON key.
ExitCode int `json:"ExitCode"`
73+
}
74+
7075
var potStatistics map[string]potStats
7176

7277
func init() {
@@ -364,8 +369,8 @@ func (s *syexec) containerStats(commandCfg *drivers.TaskConfig) (stats potStats,
364369

365370
func (s *syexec) checkContainerAlive(commandCfg *drivers.TaskConfig) int {
366371
s.logger.Trace("Checking if pot is alive", "Checking")
367-
completeName := commandCfg.JobName + commandCfg.Name
368-
potName := completeName + "_" + commandCfg.AllocID
372+
parts := strings.Split(commandCfg.ID, "/")
373+
potName := parts[1] + "_" + parts[2] + "_" + parts[0]
369374
s.logger.Trace("Allocation name beeing check for liveness", "alive", potName)
370375

371376
psCommand := "/bin/sh /usr/local/bin/pot start " + potName
@@ -395,8 +400,8 @@ func (s *syexec) checkContainerAlive(commandCfg *drivers.TaskConfig) int {
395400

396401
func (s *syexec) checkContainerExists(commandCfg *drivers.TaskConfig) int {
397402
s.logger.Debug("Checking if pot is alive")
398-
completeName := commandCfg.JobName + commandCfg.Name
399-
potName := completeName + "_" + commandCfg.AllocID
403+
parts := strings.Split(commandCfg.ID, "/")
404+
potName := parts[1] + "_" + parts[2] + "_" + parts[0]
400405
s.logger.Trace("Allocation name beeing check for liveness", "alive", potName)
401406

402407
pidCommand := "/usr/local/bin/pot ls -q | grep " + potName
@@ -420,3 +425,53 @@ func (s *syexec) checkContainerExists(commandCfg *drivers.TaskConfig) int {
420425

421426
return 0
422427
}
428+
429+
func (s *syexec) getContainerLastRunStats(commandCfg *drivers.TaskConfig) (stats lastRunStats, err error) {
430+
s.logger.Debug("launching LastRunStatsContainer command", strings.Join(s.argvLastRunStats, " "))
431+
432+
cmd := exec.Command(potBIN, s.argvLastRunStats...)
433+
434+
// set the task dir as the working directory for the command
435+
cmd.Dir = commandCfg.TaskDir().Dir
436+
cmd.Path = potBIN
437+
cmd.Args = append([]string{cmd.Path}, s.argvLastRunStats...)
438+
439+
var outb, errb bytes.Buffer
440+
cmd.Stdout = &outb
441+
cmd.Stderr = &errb
442+
443+
// Start the process
444+
if err := cmd.Run(); err != nil {
445+
// try to get the exit code
446+
if exitError, ok := err.(*exec.ExitError); ok {
447+
ws := exitError.Sys().(syscall.WaitStatus)
448+
s.exitCode = ws.ExitStatus()
449+
} else {
450+
s.logger.Error("Could not get exit code for container last-run-stats ", "pot", s.argvLastRunStats)
451+
s.exitCode = defaultFailedCode
452+
}
453+
} else {
454+
// success, exitCode should be 0 if go is ok
455+
ws := cmd.ProcessState.Sys().(syscall.WaitStatus)
456+
s.exitCode = ws.ExitStatus()
457+
}
458+
459+
s.cmd = cmd
460+
461+
s.state = &psState{Pid: s.cmd.Process.Pid, ExitCode: s.exitCode, Time: time.Now()}
462+
463+
var lastRunStats lastRunStats
464+
465+
if s.exitCode != 0 {
466+
err = errors.New("Pot exit code different than 0")
467+
return lastRunStats, err
468+
}
469+
470+
err = json.Unmarshal([]byte(outb.String()), &lastRunStats)
471+
if err != nil {
472+
s.logger.Error("Error unmarshaling json with err: ", err)
473+
return lastRunStats, err
474+
}
475+
476+
return lastRunStats, nil
477+
}

driver/prepare.go

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,6 @@ func prepareContainer(cfg *drivers.TaskConfig, taskCfg TaskConfig) (syexec, erro
2424

2525
argv = append(argv, "prepare", "-U", taskCfg.Image, "-p", taskCfg.Pot, "-t", taskCfg.Tag)
2626

27-
if cfg.AllocID != "" {
28-
argv = append(argv, "-a", cfg.AllocID)
29-
}
30-
3127
if len(taskCfg.Args) > 0 {
3228
if taskCfg.Command == "" {
3329
err := errors.New("command can not be empty if arguments are provided")
@@ -59,12 +55,17 @@ func prepareContainer(cfg *drivers.TaskConfig, taskCfg TaskConfig) (syexec, erro
5955
}
6056
}
6157

62-
completeName := cfg.JobName + cfg.Name
63-
argv = append(argv, "-n", completeName, "-v")
58+
parts := strings.Split(cfg.ID, "/")
59+
baseName := parts[1]
60+
jobIDAllocID := parts[2] + "_" + parts[0]
61+
if jobIDAllocID != "" {
62+
argv = append(argv, "-a", jobIDAllocID)
63+
}
64+
argv = append(argv, "-n", baseName, "-v")
6465

6566
se.argvCreate = argv
6667

67-
potName := completeName + "_" + cfg.AllocID
68+
potName := baseName + "_" + jobIDAllocID
6869

6970
//Mount local
7071
commandLocal := "mount-in -p " + potName + " -d " + cfg.TaskDir().LocalDir + " -m /local"
@@ -140,14 +141,18 @@ func prepareContainer(cfg *drivers.TaskConfig, taskCfg TaskConfig) (syexec, erro
140141
argvStop = append(argvStop, "stop", potName)
141142
se.argvStop = argvStop
142143

143-
argvDestroy := make([]string, 0, 50)
144-
argvDestroy = append(argvDestroy, "destroy", "-p", potName)
145-
se.argvDestroy = argvDestroy
146-
147144
argvStats := make([]string, 0, 50)
148145
argvStats = append(argvStats, "get-rss", "-p", potName, "-J")
149146
se.argvStats = argvStats
150147

148+
argvLastRunStats := make([]string, 0, 50)
149+
argvLastRunStats = append(argvLastRunStats, "last-run-stats", "-p", potName)
150+
se.argvLastRunStats = argvLastRunStats
151+
152+
argvDestroy := make([]string, 0, 50)
153+
argvDestroy = append(argvDestroy, "destroy", "-p", potName)
154+
se.argvDestroy = argvDestroy
155+
151156
return se, nil
152157
}
153158

@@ -209,7 +214,8 @@ func prepareStop(cfg *drivers.TaskConfig, taskCfg TaskConfig) syexec {
209214

210215
argv = append(argv, "stop")
211216

212-
completeName := cfg.JobName + cfg.Name + "_" + cfg.AllocID
217+
parts := strings.Split(cfg.ID, "/")
218+
completeName := parts[1] + "_" + parts[2] + "_" + parts[0]
213219

214220
argv = append(argv, completeName)
215221

@@ -229,7 +235,8 @@ func prepareDestroy(cfg *drivers.TaskConfig, taskCfg TaskConfig) syexec {
229235

230236
argv = append(argv, "destroy")
231237

232-
completeName := cfg.JobName + cfg.Name + "_" + cfg.AllocID
238+
parts := strings.Split(cfg.ID, "/")
239+
completeName := parts[1] + "_" + parts[2] + "_" + parts[0]
233240

234241
argv = append(argv, "-p", completeName)
235242

0 commit comments

Comments
 (0)