Skip to content

Commit

Permalink
Merge pull request #525 from stanford-oval/wip/workaround-kube2iam
Browse files Browse the repository at this point in the history
Workaround for jobs failing with missing S3 credentials
  • Loading branch information
gcampax authored Oct 22, 2019
2 parents 0ecfc2e + 9fc1911 commit 2de81af
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 30 deletions.
13 changes: 4 additions & 9 deletions tests/training/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,10 @@ const assert = require('assert');
//const Tp = require('thingpedia');

const db = require('../../util/db');
const sleep = require('../../util/sleep');
const trainingJobModel = require('../../model/training_job');
const TrainingServer = require('../../util/training_server');

async function delay(ms) {
return new Promise((resolve, reject) => {
setTimeout(resolve, ms);
});
}

async function waitUntilAllJobsDone() {
for (;;) {
const row = await db.withClient((dbClient) => {
Expand All @@ -37,7 +32,7 @@ async function waitUntilAllJobsDone() {
if (row.cnt === 0)
break;

await delay(10000);
await sleep(10000);
}

const failed = await db.withClient((dbClient) => {
Expand Down Expand Up @@ -74,7 +69,7 @@ async function testBasic() {
// issue a basic train command

await server.queue('en', null, 'train');
await delay(1000);
await sleep(1000);

const queue = await db.withClient((dbClient) => server.getJobQueue(dbClient));
//console.log(queue);
Expand Down Expand Up @@ -185,7 +180,7 @@ async function testForDevice() {
// issue a train command for a device that is not approved

await server.queue('en', ['org.thingpedia.builtin.test.adminonly'], 'train');
await delay(1000);
await sleep(1000);

const queue = await db.withClient((dbClient) => server.getJobQueue(dbClient));
//console.log(queue);
Expand Down
23 changes: 9 additions & 14 deletions tests/unit/test_lock.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,7 @@
const assert = require('assert');

const Lock = require('../../util/lock');

function delay(timeout) {
return new Promise((resolve, reject) => {
setTimeout(resolve, timeout);
});
}
const sleep = require('../../util/sleep');

async function withTimeout(promise, timeout = 30000) {
await Promise.race([
Expand Down Expand Up @@ -50,20 +45,20 @@ async function testInterleave() {
async function thread1() {
const release1 = await lock.acquire();
output.push(1);
await delay(1000);
await sleep(1000);
output.push(2);
release1();
await delay(1000);
await sleep(1000);
const release2 = await lock.acquire();
output.push(5);
release2();
}

async function thread2() {
await delay(500);
await sleep(500);
const release1 = await lock.acquire();
output.push(3);
await delay(500);
await sleep(500);
output.push(4);
release1();
}
Expand All @@ -82,22 +77,22 @@ async function testQueue() {
async function thread1() {
const release1 = await lock.acquire();
output.push(1);
await delay(5000);
await sleep(5000);
output.push(2);
release1();
}

async function thread2() {
await delay(500);
await sleep(500);
const release1 = await lock.acquire();
output.push(3);
await delay(500);
await sleep(500);
output.push(4);
release1();
}

async function thread3() {
await delay(1500);
await sleep(1500);
const release1 = await lock.acquire();
output.push(5);
output.push(6);
Expand Down
9 changes: 2 additions & 7 deletions tests/website/test_admin.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ const { assertHttpError, assertRedirect, assertLoginRequired, assertBlocked, ses
const { login, startSession } = require('../#');

const db = require('../../util/db');
const sleep = require('../../util/sleep');
const EngineManagerClient = require('../../almond/enginemanagerclient');

const Config = require('../../config');
Expand Down Expand Up @@ -44,12 +45,6 @@ async function testAdminUsers(root, bob, nobody) {
assert(rootUserPage2.indexOf('root@localhost') >= 0);
}

function delay(ms) {
return new Promise((resolve, reject) => {
setTimeout(resolve, ms);
});
}

async function testAdminKillRestart(root, bob, nobody) {
const emc = EngineManagerClient.get();
assert (await emc.isRunning(1)); // root
Expand All @@ -71,7 +66,7 @@ async function testAdminKillRestart(root, bob, nobody) {
assert (!await emc.isRunning(5)); // emma -or- alexa_user

// the shared processes will be restarted in 5s
await delay(10000);
await sleep(10000);

await assertLoginRequired(sessionRequest('/admin/users/start/1', 'POST', '', nobody));
await assertRedirect(sessionRequest('/admin/users/start/1', 'POST', '', root, { followRedirects: false }), '/admin/users/search?q=1');
Expand Down
14 changes: 14 additions & 0 deletions training/tasks/train.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,23 @@ const fs = require('fs');
const Genie = require('genie-toolkit');

const AbstractFS = require('../../util/abstract_fs');
const sleep = require('../../util/sleep');
const Config = require('../../config');

module.exports = async function main(task, argv) {
// on kubernetes, we might encounter a race when this pod is scheduled on a
// newly started node, where the pod is started before the kube2iam daemonset
// is ready
// in turn, this means we don't have the right credentials to access s3, and die
//
// we work around that problem with an artificial 1 minute delay when training
// we only do it for training, because other jobs are likely scheduled on existing
// general-purpose nodes (where kube2iam is already active)
// we also only do it for the kubernetes backend, because the local backend doesn't
// have that problem, and we don't want our CI to become longer
if (Config.TRAINING_TASK_BACKEND === 'kubernetes')
await sleep(60000);

const jobdir = await AbstractFS.download(task.jobDir + '/');
const datadir = path.resolve(jobdir, 'dataset');
const workdir = path.resolve(jobdir, 'workdir');
Expand Down
16 changes: 16 additions & 0 deletions util/sleep.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// -*- mode: js; indent-tabs-mode: nil; js-basic-offset: 4 -*-
//
// This file is part of ThingEngine
//
// Copyright 2019 The Board of Trustees of the Leland Stanford Junior University
//
// Author: Giovanni Campagna <gcampagn@cs.stanford.edu>
//
// See COPYING for details
"use strict";

module.exports = async function sleep(ms) {
return new Promise((resolve, reject) => {
setTimeout(resolve, ms);
});
};

0 comments on commit 2de81af

Please sign in to comment.