From 9a1c965aff49a0207e2e801c6f6d8eb697ca6a73 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 18 Apr 2025 17:58:46 +0000 Subject: [PATCH 1/2] [SYCL][UR] fix use after free on queue in eventRelease There was a potential race: if getEventEndTimestamp returns false and just after that the queue finishes and is destroyed, we would call deferEventRelease on invalid queue. This patch reworks the logic for deferring event release. Instead of registering the events to be released in the queue, we just clear adjustedEventStartTimestamp and adjustedEventEndTimestamp to mark the event as not timestamped. Since events are never actually destroyed, only returned to the pool, any pending timestamp write will always succeed. Since we are only dealing with in-order queues, the timestamp can only increase. --- .../scripts/templates/queue_api.hpp.mako | 2 - .../source/adapters/level_zero/v2/event.cpp | 55 ++++++++----------- .../source/adapters/level_zero/v2/event.hpp | 7 ++- .../adapters/level_zero/v2/queue_api.hpp | 2 - .../v2/queue_immediate_in_order.cpp | 11 ---- .../v2/queue_immediate_in_order.hpp | 3 - .../level_zero/v2/event_pool_test.cpp | 29 ++-------- .../test/adapters/level_zero/ze_helpers.hpp | 10 ++++ .../enqueue/urEnqueueTimestampRecording.cpp | 55 +++++++++++++++++++ .../event/urEventGetProfilingInfo.cpp | 24 ++++++++ 10 files changed, 119 insertions(+), 79 deletions(-) diff --git a/unified-runtime/scripts/templates/queue_api.hpp.mako b/unified-runtime/scripts/templates/queue_api.hpp.mako index a587c8f2e7287..ade657b9af446 100644 --- a/unified-runtime/scripts/templates/queue_api.hpp.mako +++ b/unified-runtime/scripts/templates/queue_api.hpp.mako @@ -29,8 +29,6 @@ from templates import helper as th struct ur_queue_t_ { virtual ~ur_queue_t_(); - virtual void deferEventFree(ur_event_handle_t hEvent) = 0; - %for obj in th.get_queue_related_functions(specs, n, tags): %if not 'Release' in obj['name'] and not 'Retain' in obj['name']: virtual ${x}_result_t ${th.transform_queue_related_function_name(n, tags, obj, format=["type"])} = 0; diff --git a/unified-runtime/source/adapters/level_zero/v2/event.cpp b/unified-runtime/source/adapters/level_zero/v2/event.cpp index a26fc1a2a45dd..70075966a3ad1 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.cpp @@ -60,6 +60,21 @@ uint64_t event_profiling_data_t::getEventEndTimestamp() { return adjustedEventEndTimestamp; } +void event_profiling_data_t::reset() { + // This ensures that the event is consider as not timestamped. + // We can't touch the recordEventEndTimestamp + // as it may still be overwritten by the driver. + // In case event is resued and recordStartTimestamp + // is called again, adjustedEventEndTimestamp will always be updated correctly + // to the new value as we wait for the event to be signaled. + // If the event is reused on another queue, this means that the original + // queue must have been destroyed (and the even pool released back to the + // context) and the timstamp is already wrriten, so there's no race-condition + // possible. + adjustedEventStartTimestamp = 0; + adjustedEventEndTimestamp = 0; +} + void event_profiling_data_t::recordStartTimestamp(ur_device_handle_t hDevice) { zeTimerResolution = hDevice->ZeDeviceProperties->timerResolution; timestampMaxValue = hDevice->getTimestampMask(); @@ -98,7 +113,7 @@ void ur_event_handle_t_::resetQueueAndCommand(ur_queue_t_ *hQueue, ur_command_t commandType) { this->hQueue = hQueue; this->commandType = commandType; - profilingData = event_profiling_data_t(getZeEvent()); + profilingData.reset(); } void ur_event_handle_t_::recordStartTimestamp() { @@ -141,33 +156,17 @@ ur_result_t ur_event_handle_t_::retain() { return UR_RESULT_SUCCESS; } -ur_result_t ur_event_handle_t_::releaseDeferred() { - assert(zeEventQueryStatus(getZeEvent()) == ZE_RESULT_SUCCESS); - assert(RefCount.load() == 0); - - return this->forceRelease(); -} - ur_result_t ur_event_handle_t_::release() { if (!RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; - // Need to take a lock before checking if the event is timestamped. - std::unique_lock lock(Mutex); - - if (isTimestamped() && !getEventEndTimestamp()) { - // L0 will write end timestamp to this event some time in the future, - // so we can't release it yet. - assert(hQueue); - hQueue->deferEventFree(this); - return UR_RESULT_SUCCESS; + if (event_pool) { + event_pool->free(this); + } else { + std::get(hZeEvent).release(); + delete this; } - - // Need to unlock now, as forceRelease might deallocate memory backing - // the Mutex. - lock.unlock(); - - return this->forceRelease(); + return UR_RESULT_SUCCESS; } bool ur_event_handle_t_::isTimestamped() const { @@ -209,16 +208,6 @@ ur_event_handle_t_::ur_event_handle_t_( , nullptr) {} -ur_result_t ur_event_handle_t_::forceRelease() { - if (event_pool) { - event_pool->free(this); - } else { - std::get(hZeEvent).release(); - delete this; - } - return UR_RESULT_SUCCESS; -} - namespace ur::level_zero { ur_result_t urEventRetain(ur_event_handle_t hEvent) try { return hEvent->retain(); diff --git a/unified-runtime/source/adapters/level_zero/v2/event.hpp b/unified-runtime/source/adapters/level_zero/v2/event.hpp index 8365832224cc7..6810a3f5217dd 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.hpp @@ -35,6 +35,10 @@ struct event_profiling_data_t { bool recordingStarted() const; bool recordingEnded() const; + // clear the profiling data, allowing the event to be reused + // for a new command + void reset(); + private: ze_event_handle_t hZeEvent; @@ -64,9 +68,6 @@ struct ur_event_handle_t_ : _ur_object { // Set the queue and command that this event is associated with void resetQueueAndCommand(ur_queue_t_ *hQueue, ur_command_t commandType); - // releases event immediately - ur_result_t forceRelease(); - void reset(); ze_event_handle_t getZeEvent() const; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 1a551cfaedb21..004b51f822c7f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -21,8 +21,6 @@ struct ur_queue_t_ { virtual ~ur_queue_t_(); - virtual void deferEventFree(ur_event_handle_t hEvent) = 0; - virtual ur_result_t queueGetInfo(ur_queue_info_t, size_t, void *, size_t *) = 0; virtual ur_result_t queueGetNativeHandle(ur_queue_native_desc_t *, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index a782bd43c94c0..62f815eecc4cb 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -138,11 +138,6 @@ ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, return UR_RESULT_SUCCESS; } -void ur_queue_immediate_in_order_t::deferEventFree(ur_event_handle_t hEvent) { - auto commandListLocked = commandListManager.lock(); - deferredEvents.push_back(hEvent); -} - ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { *phNativeQueue = reinterpret_cast( @@ -160,12 +155,6 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { ZE2UR_CALL(zeCommandListHostSynchronize, (commandListLocked->getZeCommandList(), UINT64_MAX)); - // Free deferred events - for (auto &hEvent : deferredEvents) { - UR_CALL(hEvent->releaseDeferred()); - } - deferredEvents.clear(); - // Free deferred kernels for (auto &hKernel : submittedKernels) { UR_CALL(hKernel->release()); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 7ddd96fd9ff91..fb7ed9a9b43e9 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -34,7 +34,6 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ { ur_queue_flags_t flags; lockable commandListManager; - std::vector deferredEvents; std::vector submittedKernels; wait_list_view @@ -46,8 +45,6 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ { ur_event_handle_t *hUserEvent, ur_command_t commandType); - void deferEventFree(ur_event_handle_t hEvent) override; - ur_result_t enqueueGenericFillUnlocked( ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, diff --git a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp index 9dc627d687658..41b503f0c988c 100644 --- a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp @@ -272,39 +272,18 @@ TEST_P(EventPoolTestWithQueue, WithTimestamp) { &hDevice, nullptr)); ur_event_handle_t first; - ze_event_handle_t zeFirst; { ASSERT_SUCCESS( urEnqueueTimestampRecordingExp(queue, false, 1, &hEvent, &first)); - zeFirst = first->getZeEvent(); - urEventRelease(first); // should not actually release the event until // recording is completed } ur_event_handle_t second; - ze_event_handle_t zeSecond; - { - ASSERT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &second)); - zeSecond = second->getZeEvent(); - ASSERT_SUCCESS(urEventRelease(second)); - } - ASSERT_NE(first, second); - ASSERT_NE(zeFirst, zeSecond); + ASSERT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &second)); + // even if the event is reused, it should not be timestamped anymore + ASSERT_FALSE(second->isTimestamped()); + ASSERT_SUCCESS(urEventRelease(second)); ASSERT_EQ(zeEventHostSignal(zeEvent.get()), ZE_RESULT_SUCCESS); - ASSERT_SUCCESS(urQueueFinish(queue)); - - // Now, the first event should be avilable for reuse - ur_event_handle_t third; - ze_event_handle_t zeThird; - { - ASSERT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &third)); - zeThird = third->getZeEvent(); - ASSERT_SUCCESS(urEventRelease(third)); - - ASSERT_FALSE(third->isTimestamped()); - } - ASSERT_EQ(first, third); - ASSERT_EQ(zeFirst, zeThird); } diff --git a/unified-runtime/test/adapters/level_zero/ze_helpers.hpp b/unified-runtime/test/adapters/level_zero/ze_helpers.hpp index 82bab84e054d8..98517c8bb901d 100644 --- a/unified-runtime/test/adapters/level_zero/ze_helpers.hpp +++ b/unified-runtime/test/adapters/level_zero/ze_helpers.hpp @@ -10,8 +10,18 @@ #include #include +static bool ze_initialized = false; + std::unique_ptr<_ze_event_handle_t, std::function> createZeEvent(ur_context_handle_t hContext, ur_device_handle_t hDevice) { + if (!ze_initialized) { + ze_result_t result = zeInit(ZE_INIT_FLAG_GPU_ONLY); + if (result != ZE_RESULT_SUCCESS) { + return nullptr; + } + ze_initialized = true; + } + ze_event_pool_desc_t desc; desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC; desc.pNext = nullptr; diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp index 1b5a016a204f2..5f65044f66ad9 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -66,6 +66,61 @@ TEST_P(urEnqueueTimestampRecordingExpTest, SuccessBlocking) { ASSERT_SUCCESS(urEventRelease(event)); } +TEST_P(urEnqueueTimestampRecordingExpTest, + ReleaseEventWhileTimestampWritePending) { + void *ptr; + ASSERT_SUCCESS( + urUSMSharedAlloc(context, device, nullptr, nullptr, 1024 * 1024, &ptr)); + + // Enqueue an operation to keep the device busy + uint8_t pattern = 0xFF; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(uint8_t), &pattern, + 1024 * 1024, 0, nullptr, nullptr)); + + ur_event_handle_t event1 = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, &event1)); + ASSERT_SUCCESS(urEventRelease(event1)); + + ur_event_handle_t event2 = nullptr; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(uint8_t), &pattern, + 1024 * 1024, 0, nullptr, &event2)); + + // Make sure the new event does not contain profiling info (in case it's reused + // by the adapter) + ASSERT_EQ(urEventGetProfilingInfo(event2, UR_PROFILING_INFO_COMMAND_QUEUED, + sizeof(uint64_t), nullptr, nullptr), + UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE); + ASSERT_SUCCESS(urEventRelease(event2)); + ASSERT_SUCCESS(urUSMFree(context, ptr)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, ReleaseEventAfterQueueRelease) { + void *ptr; + ASSERT_SUCCESS( + urUSMSharedAlloc(context, device, nullptr, nullptr, 1024 * 1024, &ptr)); + + // Enqueue an operation to keep the device busy + uint8_t pattern = 0xFF; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(uint8_t), &pattern, + 1024 * 1024, 0, nullptr, nullptr)); + + ur_event_handle_t event1 = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, &event1)); + + ASSERT_SUCCESS(urQueueRelease(queue)); + queue = nullptr; + + uint64_t queuedTime = 0; + ASSERT_SUCCESS( + urEventGetProfilingInfo(event1, UR_PROFILING_INFO_COMMAND_QUEUED, + sizeof(uint64_t), &queuedTime, nullptr)); + + ASSERT_SUCCESS(urEventRelease(event1)); + ASSERT_SUCCESS(urUSMFree(context, ptr)); +} + TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullHandleQueue) { ur_event_handle_t event = nullptr; ASSERT_EQ_RESULT( diff --git a/unified-runtime/test/conformance/event/urEventGetProfilingInfo.cpp b/unified-runtime/test/conformance/event/urEventGetProfilingInfo.cpp index e63b6078de6cb..b8b37df7d13c8 100644 --- a/unified-runtime/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/unified-runtime/test/conformance/event/urEventGetProfilingInfo.cpp @@ -128,6 +128,30 @@ TEST_P(urEventGetProfilingInfoTest, Success) { UR_PROFILING_INFO_COMMAND_COMPLETE); } +TEST_P(urEventGetProfilingInfoTest, ReleaseEventAfterQueueRelease) { + void *ptr; + ASSERT_SUCCESS( + urUSMSharedAlloc(context, device, nullptr, nullptr, 1024 * 1024, &ptr)); + + // Enqueue an operation to keep the device busy + uint8_t pattern = 0xFF; + ur_event_handle_t event1; + ASSERT_SUCCESS(urEnqueueUSMFill(queue, ptr, sizeof(uint8_t), &pattern, + 1024 * 1024, 0, nullptr, &event1)); + + ASSERT_SUCCESS(urQueueRelease(queue)); + queue = nullptr; + + uint64_t queuedTime = 0; + auto ret = urEventGetProfilingInfo(event1, UR_PROFILING_INFO_COMMAND_QUEUED, + sizeof(uint64_t), &queuedTime, nullptr); + ASSERT_TRUE(ret == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION || + ret == UR_RESULT_SUCCESS); + + ASSERT_SUCCESS(urEventRelease(event1)); + ASSERT_SUCCESS(urUSMFree(context, ptr)); +} + TEST_P(urEventGetProfilingInfoTest, InvalidNullHandle) { UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); From e739c9c013f6f4727634fb6ae8df1bcff1790f06 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 18 Apr 2025 18:10:35 +0000 Subject: [PATCH 2/2] [SYCL][UR] do not call any function on queue in urEventGetProfilingInfo The queue can already be released. --- .../source/adapters/level_zero/v2/event.cpp | 29 ++++++++++--------- .../source/adapters/level_zero/v2/event.hpp | 4 +++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/event.cpp b/unified-runtime/source/adapters/level_zero/v2/event.cpp index 70075966a3ad1..ad6c959c20a8d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.cpp @@ -113,16 +113,22 @@ void ur_event_handle_t_::resetQueueAndCommand(ur_queue_t_ *hQueue, ur_command_t commandType) { this->hQueue = hQueue; this->commandType = commandType; + + if (hQueue) { + UR_CALL_THROWS(hQueue->queueGetInfo(UR_QUEUE_INFO_DEVICE, sizeof(hDevice), + reinterpret_cast(&hDevice), + nullptr)); + } else { + hDevice = nullptr; + } + profilingData.reset(); } void ur_event_handle_t_::recordStartTimestamp() { - assert(hQueue); // queue must be set before calling this - - ur_device_handle_t hDevice; - UR_CALL_THROWS(hQueue->queueGetInfo(UR_QUEUE_INFO_DEVICE, sizeof(hDevice), - reinterpret_cast(&hDevice), - nullptr)); + // queue and device must be set before calling this + assert(hQueue); + assert(hDevice); profilingData.recordStartTimestamp(hDevice); } @@ -188,6 +194,8 @@ ur_context_handle_t ur_event_handle_t_::getContext() const { return hContext; } ur_command_t ur_event_handle_t_::getCommandType() const { return commandType; } +ur_device_handle_t ur_event_handle_t_::getDevice() const { return hDevice; } + ur_event_handle_t_::ur_event_handle_t_( ur_context_handle_t hContext, v2::raii::cache_borrowed_event eventAllocation, v2::event_pool *pool) @@ -312,19 +320,14 @@ ur_result_t urEventGetProfilingInfo( } } - auto hQueue = hEvent->getQueue(); - if (!hQueue) { + auto hDevice = hEvent->getDevice(); + if (!hDevice) { // no command has been enqueued with this event yet return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } ze_kernel_timestamp_result_t tsResult; - ur_device_handle_t hDevice; - UR_CALL_THROWS(hQueue->queueGetInfo(UR_QUEUE_INFO_DEVICE, sizeof(hDevice), - reinterpret_cast(&hDevice), - nullptr)); - auto zeTimerResolution = hDevice->ZeDeviceProperties->timerResolution; auto timestampMaxValue = hDevice->getTimestampMask(); diff --git a/unified-runtime/source/adapters/level_zero/v2/event.hpp b/unified-runtime/source/adapters/level_zero/v2/event.hpp index 6810a3f5217dd..c09d998c9c609 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.hpp @@ -96,6 +96,9 @@ struct ur_event_handle_t_ : _ur_object { // Get the type of the command that this event is associated with ur_command_t getCommandType() const; + // Get the device associated with this event + ur_device_handle_t getDevice() const; + // Record the start timestamp of the event, to be obtained by // urEventGetProfilingInfo. resetQueueAndCommand should be // called before this. @@ -123,6 +126,7 @@ struct ur_event_handle_t_ : _ur_object { // commands ur_queue_t_ *hQueue = nullptr; ur_command_t commandType = UR_COMMAND_FORCE_UINT32; + ur_device_handle_t hDevice = nullptr; v2::event_flags_t flags; event_profiling_data_t profilingData;