Skip to content

Commit

Permalink
Resolves hang when the device op queue gets full and no more commands…
Browse files Browse the repository at this point in the history
… can be submitted
  • Loading branch information
sophimao committed Sep 19, 2023
1 parent aede25e commit 568659a
Showing 1 changed file with 30 additions and 17 deletions.
47 changes: 30 additions & 17 deletions src/acl_command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -677,11 +677,13 @@ void acl_try_FastKernelRelaunch_ooo_queue_event_dependents(cl_event parent) {

// Fast Kernel Relaunch: submitting is safe even though has dependency
// Prior to submitting remove dependency
dependent->depend_on.erase(parent);
dependent_it = parent->depend_on_me.erase(dependent_it);
dependent_it--; // decrement it other wise we will skip an element
dependent->command_queue->num_commands_submitted++;
acl_submit_command(dependent);
int local_updates = acl_submit_command(dependent);
if (local_updates) {
dependent->depend_on.erase(parent);
dependent_it = parent->depend_on_me.erase(dependent_it);
dependent_it--; // decrement it otherwise we will skip an element
dependent->command_queue->num_commands_submitted++;
}
}
}

Expand All @@ -691,19 +693,26 @@ int acl_update_ooo_queue(cl_command_queue command_queue) {
// Directly submit the event if it has no dependencies
// unless it is a user_event queue which never submits events
while (!command_queue->new_commands.empty()) {
int success = 1;
cl_event event = command_queue->new_commands.front();
if (command_queue->submits_commands &&
event->execution_status == CL_QUEUED) {
if (event->depend_on.empty()) {
command_queue->num_commands_submitted++;
acl_submit_command(event);
success = acl_submit_command(event);
} else {
// This is allowed to fail, so there is no need to mark success as false:
// dependent events that fail to be fast-kernel-relaunched (FKR) will still
// be picked up when their parent event finishes.
acl_try_FastKernelRelaunch_ooo_queue_event_dependents(
*(event->depend_on.begin()));
}
}
// safe to pop as there is a master copy in command_queue->commands
command_queue->new_commands.pop_front();

if (success) {
// safe to pop as there is a master copy in command_queue->commands
command_queue->new_commands.pop_front();
}
}

// Remove dependencies on completed events, and launch any events
Expand Down Expand Up @@ -731,9 +740,10 @@ int acl_update_ooo_queue(cl_command_queue command_queue) {
if ((dependent->command_queue->properties &
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) &&
dependent->cmd.type != CL_COMMAND_USER) {
dependent->command_queue
->num_commands_submitted++; // dependent might be on another queue
num_updates += acl_submit_command(dependent);
int local_updates = acl_submit_command(dependent);
dependent->command_queue->num_commands_submitted +=
local_updates; // dependent might be on another queue
num_updates += local_updates;
}
}
}
Expand Down Expand Up @@ -879,8 +889,9 @@ int acl_update_inorder_queue(cl_command_queue command_queue) {
}

if (command_queue->num_commands_submitted == 0) {
command_queue->num_commands_submitted++;
num_updates += acl_submit_command(event);
int local_updates = acl_submit_command(event);
command_queue->num_commands_submitted += local_updates;
num_updates += local_updates;
continue; // there might be another kernel behind us that can be
// submitted as well
} else {
Expand All @@ -900,8 +911,9 @@ int acl_update_inorder_queue(cl_command_queue command_queue) {
if (submitted_event->last_device_op->status <= CL_SUBMITTED) {
// Assumption: last device_op of the submitted kernel event is a
// kernel_op
command_queue->num_commands_submitted++;
num_updates += acl_submit_command(event);
int local_updates = acl_submit_command(event);
command_queue->num_commands_submitted += local_updates;
num_updates += local_updates;
continue; // there might be another kernel behind us that can be
// submitted as well
}
Expand All @@ -915,8 +927,9 @@ int acl_update_inorder_queue(cl_command_queue command_queue) {
event->depend_on.empty()) {
// it is safe to submit: nothing else submitted AND all dependencies
// are resolved
command_queue->num_commands_submitted++;
num_updates += acl_submit_command(event);
int local_updates = acl_submit_command(event);
command_queue->num_commands_submitted += local_updates;
num_updates += local_updates;
}
break; // no more events can be submitted
}
Expand Down

0 comments on commit 568659a

Please sign in to comment.