Skip to content

Commit 9c2353e

Browse files
committed
feat: dependency
1 parent 1440e75 commit 9c2353e

11 files changed

+929
-191
lines changed

protos/PublicDefs.proto

+25
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,23 @@ enum InteractiveTaskType {
109109
Crun = 1;
110110
}
111111

112+
// Kind of condition a task places on another task it depends on.
// NOTE(review): the value names resemble Slurm's dependency types
// (after / afterany / afterok / afternotok); the exact trigger for each
// value is not visible in this diff -- confirm against DependencyManager.
enum DependencyType {
113+
AFTER = 0;
114+
AFTER_ANY = 1;
115+
AFTER_OK = 2;
116+
AFTER_NOT_OK = 3;
117+
}
118+
119+
// A single dependency condition: a task id paired with a DependencyType.
// NOTE(review): task_id is presumably the id of the task being depended
// upon (not the dependent task) -- confirm against the submitter.
message DependencyCondition {
120+
uint32 task_id = 1;
121+
DependencyType type = 2;
122+
}
123+
124+
// The list of dependency conditions attached to a task; carried in the
// `dependencies` field of TaskToCtld.
// NOTE(review): depend_all presumably selects "all conditions must hold"
// (true) versus "any one condition suffices" (false) -- TODO confirm.
message Dependencies{
125+
repeated DependencyCondition dependencies = 1;
126+
bool depend_all = 2;
127+
}
128+
112129
message TaskToCtld {
113130
/* -------- Fields that are set at the submission time. ------- */
114131
google.protobuf.Duration time_limit = 1;
@@ -128,6 +145,8 @@ message TaskToCtld {
128145

129146
bool requeue_if_failed = 12;
130147
bool get_user_env = 13;
148+
149+
Dependencies dependencies = 14;
131150

132151
oneof payload {
133152
BatchTaskAdditionalMeta batch_meta = 21;
@@ -168,6 +187,11 @@ message RuntimeAttrOfTask {
168187

169188
bool held = 18;
170189
ResourceV2 resources = 19;
190+
bool dependency_ok = 20;
191+
// If this task depends on all of its dependencies, store the satisfied dependencies.
192+
// If this task depends on any one of its dependencies, store the unsatisfied dependencies.
193+
// Task IDs must be stored so that the dependency state can be restored.
194+
repeated uint32 dependency_ids = 21;
171195
}
172196

173197
message TaskToD {
@@ -249,6 +273,7 @@ message TaskInfo {
249273
string extra_attr = 20;
250274

251275
// Dynamic task information
276+
uint32 dependency_state = 29;
252277
bool held = 30;
253278
TaskStatus status = 31;
254279

src/CraneCtld/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ add_executable(cranectld
1212
CranedMetaContainer.cpp
1313
AccountManager.h
1414
AccountManager.cpp
15+
DependencyManager.h
16+
DependencyManager.cpp
1517
EmbeddedDbClient.cpp
1618
EmbeddedDbClient.h
1719
CraneCtld.cpp

src/CraneCtld/CraneCtld.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "CtldGrpcServer.h"
3232
#include "CtldPublicDefs.h"
3333
#include "DbClient.h"
34+
#include "DependencyManager.h"
3435
#include "EmbeddedDbClient.h"
3536
#include "TaskScheduler.h"
3637
#include "crane/Logger.h"
@@ -626,6 +627,7 @@ void DestroyCtldGlobalVariables() {
626627

627628
g_task_scheduler.reset();
628629
g_craned_keeper.reset();
630+
g_dependency_manager.reset();
629631

630632
g_plugin_client.reset();
631633

@@ -755,6 +757,8 @@ void InitializeCtldGlobalVariables() {
755757
}
756758
}
757759

760+
g_dependency_manager = std::make_unique<DependencyManager>();
761+
758762
g_task_scheduler = std::make_unique<TaskScheduler>();
759763
ok = g_task_scheduler->Init();
760764
if (!ok) {

src/CraneCtld/CtldGrpcServer.cpp

+24-13
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,14 @@ grpc::Status CraneCtldServiceImpl::SubmitBatchTask(
3636

3737
auto result = m_ctld_server_->SubmitTaskToScheduler(std::move(task));
3838
if (result.has_value()) {
39-
task_id_t id = result.value().get();
40-
if (id != 0) {
39+
auto submit_result = result.value().get();
40+
if (submit_result.has_value()) {
4141
response->set_ok(true);
42+
task_id_t id = submit_result.value();
4243
response->set_task_id(id);
4344
} else {
4445
response->set_ok(false);
45-
response->set_reason(
46-
"System error occurred or "
47-
"the number of pending tasks exceeded maximum value.");
46+
response->set_reason(std::string(CraneErrStr(submit_result.error())));
4847
}
4948
} else {
5049
response->set_ok(false);
@@ -58,7 +57,9 @@ grpc::Status CraneCtldServiceImpl::SubmitBatchTasks(
5857
grpc::ServerContext *context,
5958
const crane::grpc::SubmitBatchTasksRequest *request,
6059
crane::grpc::SubmitBatchTasksReply *response) {
61-
std::vector<result::result<std::future<task_id_t>, std::string>> results;
60+
std::vector<result::result<std::future<result::result<task_id_t, CraneErr>>,
61+
std::string>>
62+
results;
6263

6364
uint32_t task_count = request->count();
6465
const auto &task_to_ctld = request->task();
@@ -73,9 +74,14 @@ grpc::Status CraneCtldServiceImpl::SubmitBatchTasks(
7374
}
7475

7576
for (auto &res : results) {
76-
if (res.has_value())
77-
response->mutable_task_id_list()->Add(res.value().get());
78-
else
77+
if (res.has_value()) {
78+
auto submit_res = res.value().get();
79+
if (submit_res.has_value())
80+
response->mutable_task_id_list()->Add(submit_res.value());
81+
else
82+
response->mutable_reason_list()->Add(
83+
std::string(CraneErrStr(submit_res.error())));
84+
} else
7985
response->mutable_reason_list()->Add(res.error());
8086
}
8187

@@ -1086,8 +1092,13 @@ grpc::Status CraneCtldServiceImpl::CforedStream(
10861092
m_ctld_server_->SubmitTaskToScheduler(std::move(task));
10871093
result::result<task_id_t, std::string> result;
10881094
if (submit_result.has_value()) {
1089-
result = result::result<task_id_t, std::string>{
1090-
submit_result.value().get()};
1095+
auto submit_final_result = submit_result.value().get();
1096+
if (submit_final_result.has_value()) {
1097+
result = result::result<task_id_t, std::string>{
1098+
submit_final_result.value()};
1099+
} else {
1100+
result = result::fail(CraneErrStr(submit_final_result.error()));
1101+
}
10911102
} else {
10921103
result = result::fail(submit_result.error());
10931104
}
@@ -1210,7 +1221,7 @@ CtldServer::CtldServer(const Config::CraneCtldListenConf &listen_conf) {
12101221
signal(SIGINT, &CtldServer::signal_handler_func);
12111222
}
12121223

1213-
result::result<std::future<task_id_t>, std::string>
1224+
result::result<std::future<result::result<task_id_t, CraneErr>>, std::string>
12141225
CtldServer::SubmitTaskToScheduler(std::unique_ptr<TaskInCtld> task) {
12151226
CraneErr err;
12161227

@@ -1260,7 +1271,7 @@ CtldServer::SubmitTaskToScheduler(std::unique_ptr<TaskInCtld> task) {
12601271

12611272
if (err == CraneErr::kOk) {
12621273
task->SetSubmitTime(absl::Now());
1263-
std::future<task_id_t> future =
1274+
std::future<result::result<task_id_t, CraneErr>> future =
12641275
g_task_scheduler->SubmitTaskAsync(std::move(task));
12651276
return {std::move(future)};
12661277
}

src/CraneCtld/CtldGrpcServer.h

+17-10
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,8 @@ class CforedStreamWriter {
4242
crane::grpc::StreamCforedRequest> *stream)
4343
: m_stream_(stream), m_valid_(true) {}
4444

45-
bool WriteTaskIdReply(
46-
pid_t calloc_pid,
47-
result::result<task_id_t, std::string> res) {
45+
bool WriteTaskIdReply(pid_t calloc_pid,
46+
result::result<task_id_t, std::string> res) {
4847
LockGuard guard(&m_stream_mtx_);
4948
if (!m_valid_) return false;
5049

@@ -64,8 +63,11 @@ class CforedStreamWriter {
6463
return m_stream_->Write(reply);
6564
}
6665

67-
bool WriteTaskResAllocReply(task_id_t task_id,
68-
result::result<std::pair<std::string,std::list<std::string>>, std::string> res) {
66+
bool WriteTaskResAllocReply(
67+
task_id_t task_id,
68+
result::result<std::pair<std::string, std::list<std::string>>,
69+
std::string>
70+
res) {
6971
LockGuard guard(&m_stream_mtx_);
7072
if (!m_valid_) return false;
7173

@@ -76,8 +78,12 @@ class CforedStreamWriter {
7678

7779
if (res.has_value()) {
7880
task_res_alloc_reply->set_ok(true);
79-
task_res_alloc_reply->set_allocated_craned_regex(std::move(res.value().first));
80-
std::ranges::for_each(res.value().second,[&task_res_alloc_reply](const auto& craned_id){task_res_alloc_reply->add_craned_ids(craned_id);});
81+
task_res_alloc_reply->set_allocated_craned_regex(
82+
std::move(res.value().first));
83+
std::ranges::for_each(res.value().second,
84+
[&task_res_alloc_reply](const auto &craned_id) {
85+
task_res_alloc_reply->add_craned_ids(craned_id);
86+
});
8187
} else {
8288
task_res_alloc_reply->set_ok(false);
8389
task_res_alloc_reply->set_failure_reason(std::move(res.error()));
@@ -89,7 +95,8 @@ class CforedStreamWriter {
8995
bool WriteTaskCompletionAckReply(task_id_t task_id) {
9096
LockGuard guard(&m_stream_mtx_);
9197
if (!m_valid_) return false;
92-
CRANE_TRACE("Sending TaskCompletionAckReply to cfored of task id {}",task_id);
98+
CRANE_TRACE("Sending TaskCompletionAckReply to cfored of task id {}",
99+
task_id);
93100
StreamCtldReply reply;
94101
reply.set_type(StreamCtldReply::TASK_COMPLETION_ACK_REPLY);
95102

@@ -271,8 +278,8 @@ class CtldServer {
271278

272279
inline void Wait() { m_server_->Wait(); }
273280

274-
result::result<std::future<task_id_t>, std::string> SubmitTaskToScheduler(
275-
std::unique_ptr<TaskInCtld> task);
281+
result::result<std::future<result::result<task_id_t, CraneErr>>, std::string>
282+
SubmitTaskToScheduler(std::unique_ptr<TaskInCtld> task);
276283

277284
private:
278285
template <typename K, typename V,

src/CraneCtld/CtldPublicDefs.h

+25
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@ struct TaskInCtld {
281281
bool requeue_if_failed{false};
282282
bool get_user_env{false};
283283

284+
crane::grpc::Dependencies dependencies;
285+
284286
std::string cmd_line;
285287
std::unordered_map<std::string, std::string> env;
286288
std::string cwd;
@@ -308,6 +310,8 @@ struct TaskInCtld {
308310
crane::grpc::TaskStatus status;
309311
uint32_t exit_code;
310312
bool held{false};
313+
bool dependency_ok{false};
314+
std::vector<task_id_t> dependency_ids;
311315

312316
// If this task is PENDING, start_time is either not set (default constructed)
313317
// or an estimated start time.
@@ -458,6 +462,21 @@ struct TaskInCtld {
458462
resources = std::move(val);
459463
}
460464
ResourceV2 const& Resources() const { return resources; }
465+
466+
// Marks this task's dependencies as satisfied: sets the in-memory
// `dependency_ok` flag and mirrors the same value into `runtime_attr`.
void SetDependencyOK() {
467+
dependency_ok = true;
468+
runtime_attr.set_dependency_ok(true);
469+
}
470+
// Returns true when the task declared at least one dependency condition
// and `dependency_ok` has not been set yet.
bool HasDependency() const {
471+
return dependencies.dependencies_size() != 0 && !dependency_ok;
472+
}
473+
// Appends every task id in `val` to `dependency_ids` and adds the same
// ids to the `dependency_ids` repeated field of `runtime_attr`, keeping
// the two copies in step. No de-duplication is performed here.
void DependencyAdd(const std::vector<task_id_t>& val) {
474+
dependency_ids.insert(dependency_ids.end(), val.begin(), val.end());
475+
for (auto const& id : val) runtime_attr.add_dependency_ids(id);
476+
}
477+
// Returns true when the number of recorded `dependency_ids` equals the
// number of declared dependency conditions.
// NOTE(review): presumably this means every declared dependency has been
// resolved one way or the other -- confirm against DependencyManager.
// NOTE(review): this compares a size_t with the protobuf-generated
// `dependencies_size()` accessor, which returns int (signed/unsigned
// comparison); an explicit cast may be warranted.
bool NoWaitingDependency() const {
478+
return dependency_ids.size() == dependencies.dependencies_size();
479+
}
461480

462481
void SetFieldsByTaskToCtld(crane::grpc::TaskToCtld const& val) {
463482
task_to_ctld = val;
@@ -506,6 +525,8 @@ struct TaskInCtld {
506525
get_user_env = val.get_user_env();
507526

508527
extra_attr = val.extra_attr();
528+
529+
dependencies = val.dependencies();
509530
}
510531

511532
void SetFieldsByRuntimeAttr(crane::grpc::RuntimeAttrOfTask const& val) {
@@ -522,6 +543,10 @@ struct TaskInCtld {
522543
status = runtime_attr.status();
523544
held = runtime_attr.held();
524545

546+
dependency_ok = runtime_attr.dependency_ok();
547+
dependency_ids.assign(runtime_attr.dependency_ids().begin(),
548+
runtime_attr.dependency_ids().end());
549+
525550
if (status != crane::grpc::TaskStatus::Pending) {
526551
craned_ids.assign(runtime_attr.craned_ids().begin(),
527552
runtime_attr.craned_ids().end());

0 commit comments

Comments
 (0)