-- v13: rtworkq: Release cancelled work items.
From: Yuxuan Shui yshui@codeweavers.com
queue_release_pending_item releases the work_item reference but later accesses `item->queue`, which is a potential use-after-free. --- dlls/rtworkq/queue.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/dlls/rtworkq/queue.c b/dlls/rtworkq/queue.c index eebb096ad31..00b77bf6953 100644 --- a/dlls/rtworkq/queue.c +++ b/dlls/rtworkq/queue.c @@ -734,9 +734,10 @@ static HRESULT invoke_async_callback(IRtwqAsyncResult *result) * removed from pending items when it got canceled. */ static BOOL queue_release_pending_item(struct work_item *item) { + struct queue *queue = item->queue; BOOL ret = FALSE;
- EnterCriticalSection(&item->queue->cs); + EnterCriticalSection(&queue->cs); if (item->key) { list_remove(&item->entry); @@ -744,7 +745,7 @@ static BOOL queue_release_pending_item(struct work_item *item) item->key = 0; IUnknown_Release(&item->IUnknown_iface); } - LeaveCriticalSection(&item->queue->cs); + LeaveCriticalSection(&queue->cs); return ret; }
From: Yuxuan Shui yshui@codeweavers.com
--- dlls/mfplat/tests/mfplat.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/dlls/mfplat/tests/mfplat.c b/dlls/mfplat/tests/mfplat.c index 6897036e9dd..363c240d13d 100644 --- a/dlls/mfplat/tests/mfplat.c +++ b/dlls/mfplat/tests/mfplat.c @@ -3598,6 +3598,7 @@ static void test_scheduled_items(void) IMFAsyncResult *result; MFWORKITEM_KEY key, key2; HRESULT hr; + ULONG refcount;
callback = create_test_callback(NULL);
@@ -3610,6 +3611,9 @@ static void test_scheduled_items(void) hr = MFCancelWorkItem(key); ok(hr == S_OK, "Failed to cancel item, hr %#lx.\n", hr);
+ refcount = IMFAsyncCallback_Release(&callback->IMFAsyncCallback_iface); + todo_wine ok(refcount == 0, "Unexpected refcount %lu.\n", refcount); + hr = MFCancelWorkItem(key); ok(hr == MF_E_NOT_FOUND || broken(hr == S_OK) /* < win10 */, "Unexpected hr %#lx.\n", hr);
@@ -3619,6 +3623,8 @@ static void test_scheduled_items(void) return; }
+ callback = create_test_callback(NULL); + hr = MFCreateAsyncResult(NULL, &callback->IMFAsyncCallback_iface, NULL, &result); ok(hr == S_OK, "Failed to create result, hr %#lx.\n", hr);
From: Yuxuan Shui yshui@codeweavers.com
Usually the threadpool holds a reference to the `work_item`, which is released when the `work_item`'s callback is invoked. On the other hand, `queue_cancel_item` closes the threadpool object without releasing the `work_item`. So if the callbacks don't get a chance to run - which is not guaranteed - the `work_item` will be leaked.
The fix is not as simple as adding a `IUnknown_Release` to `queue_cancel_item`, because the `work_item`'s callback can still be called after CloseThreadpoolTimer/Wait has returned. In fact its callback might currently be running. In which case the callback will access freed memory if `queue_cancel_item` frees the `work_item`.
We have to stop any further callbacks to be queued, wait for any currently running callbacks to finish, then finally we can release the `work_item` if it hasn't already been freed during the wait. --- dlls/mfplat/tests/mfplat.c | 2 +- dlls/rtworkq/queue.c | 53 +++++++++++++++++++++++++++++++------- 2 files changed, 45 insertions(+), 10 deletions(-)
diff --git a/dlls/mfplat/tests/mfplat.c b/dlls/mfplat/tests/mfplat.c index 363c240d13d..1e751b425fa 100644 --- a/dlls/mfplat/tests/mfplat.c +++ b/dlls/mfplat/tests/mfplat.c @@ -3612,7 +3612,7 @@ static void test_scheduled_items(void) ok(hr == S_OK, "Failed to cancel item, hr %#lx.\n", hr);
refcount = IMFAsyncCallback_Release(&callback->IMFAsyncCallback_iface); - todo_wine ok(refcount == 0, "Unexpected refcount %lu.\n", refcount); + ok(refcount == 0, "Unexpected refcount %lu.\n", refcount);
hr = MFCancelWorkItem(key); ok(hr == MF_E_NOT_FOUND || broken(hr == S_OK) /* < win10 */, "Unexpected hr %#lx.\n", hr); diff --git a/dlls/rtworkq/queue.c b/dlls/rtworkq/queue.c index 00b77bf6953..1fb367da03b 100644 --- a/dlls/rtworkq/queue.c +++ b/dlls/rtworkq/queue.c @@ -768,8 +768,9 @@ static void CALLBACK waiting_item_cancelable_callback(TP_CALLBACK_INSTANCE *inst
TRACE("result object %p.\n", item->result);
- if (queue_release_pending_item(item)) - invoke_async_callback(item->result); + queue_release_pending_item(item); + + invoke_async_callback(item->result);
IUnknown_Release(&item->IUnknown_iface); } @@ -791,8 +792,9 @@ static void CALLBACK scheduled_item_cancelable_callback(TP_CALLBACK_INSTANCE *in
TRACE("result object %p.\n", item->result);
- if (queue_release_pending_item(item)) - invoke_async_callback(item->result); + queue_release_pending_item(item); + + invoke_async_callback(item->result);
IUnknown_Release(&item->IUnknown_iface); } @@ -884,6 +886,8 @@ static HRESULT queue_submit_timer(struct queue *queue, IRtwqAsyncResult *result, static HRESULT queue_cancel_item(struct queue *queue, RTWQWORKITEM_KEY key) { HRESULT hr = RTWQ_E_NOT_FOUND; + TP_WAIT *wait_object; + TP_TIMER *timer_object; struct work_item *item;
EnterCriticalSection(&queue->cs); @@ -891,22 +895,53 @@ static HRESULT queue_cancel_item(struct queue *queue, RTWQWORKITEM_KEY key) { if (item->key == key) { + /* We can't immediately release the item here, because the callback could already be + * running somewhere else. And if we release it here, the callback will access freed memory. + * So instead we have to make sure the callback is really stopped, or has really finished + * running before we do that. And we can't do that in this critical section, which would be a + * deadlock. So we first keep an extra reference to it, then leave the critical section to + * wait for the thread-pool objects, finally we re-enter critical section to release it. */ key >>= 32; + IUnknown_AddRef(&item->IUnknown_iface); if ((key & WAIT_ITEM_KEY_MASK) == WAIT_ITEM_KEY_MASK) { - IRtwqAsyncResult_SetStatus(item->result, RTWQ_E_OPERATION_CANCELLED); - invoke_async_callback(item->result); - CloseThreadpoolWait(item->u.wait_object); + wait_object = item->u.wait_object; item->u.wait_object = NULL; + LeaveCriticalSection(&queue->cs); + + SetThreadpoolWait(wait_object, NULL, NULL); + WaitForThreadpoolWaitCallbacks(wait_object, TRUE); + CloseThreadpoolWait(wait_object); + + EnterCriticalSection(&queue->cs); } else if ((key & SCHEDULED_ITEM_KEY_MASK) == SCHEDULED_ITEM_KEY_MASK) { - CloseThreadpoolTimer(item->u.timer_object); + timer_object = item->u.timer_object; item->u.timer_object = NULL; + LeaveCriticalSection(&queue->cs); + + SetThreadpoolTimer(timer_object, NULL, 0, 0); + WaitForThreadpoolTimerCallbacks(timer_object, TRUE); + CloseThreadpoolTimer(timer_object); + + EnterCriticalSection(&queue->cs); } else WARN("Unknown item key mask %#I64x.\n", key); - queue_release_pending_item(item); + + if (queue_release_pending_item(item)) + { + /* This means the callback wasn't run during our wait, so we can invoke the + * callback with a canceled status, and release the work item. */ + if ((key & WAIT_ITEM_KEY_MASK) == WAIT_ITEM_KEY_MASK) + { + IRtwqAsyncResult_SetStatus(item->result, RTWQ_E_OPERATION_CANCELLED); + invoke_async_callback(item->result); + } + IUnknown_Release(&item->IUnknown_iface); + } + IUnknown_Release(&item->IUnknown_iface); hr = S_OK; break; }
Zhiyi Zhang (@zhiyi) commented about dlls/mfplat/tests/mfplat.c:
hr = MFCancelWorkItem(key);
queue_cancel_item() is called from RtwqCancelWorkItem() and RtwqRemovePeriodicCallback(). So please add tests for MFRemovePeriodicCallback() as well and confirm it has the same behavior.
Also, I would like to see some tests to show that MFCancelWorkItem() and MFRemovePeriodicCallback() are blocking and they do wait for their callbacks to finish. For example, use MFCancelWorkItem() to cancel a work item that's already started and takes ~1s to finish really costs ~1s.
Zhiyi Zhang (@zhiyi) commented about dlls/rtworkq/queue.c:
key >>= 32;
IUnknown_AddRef(&item->IUnknown_iface); if ((key & WAIT_ITEM_KEY_MASK) == WAIT_ITEM_KEY_MASK) {
IRtwqAsyncResult_SetStatus(item->result, RTWQ_E_OPERATION_CANCELLED);
invoke_async_callback(item->result);
CloseThreadpoolWait(item->u.wait_object);
wait_object = item->u.wait_object; item->u.wait_object = NULL;
LeaveCriticalSection(&queue->cs);
SetThreadpoolWait(wait_object, NULL, NULL);
WaitForThreadpoolWaitCallbacks(wait_object, TRUE);
CloseThreadpoolWait(wait_object);
EnterCriticalSection(&queue->cs);
I don't think you need to re-enter queue->cs? You should restructure it a bit. When the item is found, we break out of the loop and won't need to access queue->pending_items anymore. queue_release_pending_item() has its own Enter/LeaveCriticalSection().
Zhiyi Zhang (@zhiyi) commented about dlls/rtworkq/queue.c:
TRACE("result object %p.\n", item->result);
- if (queue_release_pending_item(item))
invoke_async_callback(item->result);
While I understand you no longer need to check the return value of queue_release_pending_item() because it always returns TRUE here, I think you can keep these codes to reduce the diff and potentially more future proof.
On Thu Mar 7 08:34:43 2024 +0000, Zhiyi Zhang wrote:
queue_cancel_item() is called from RtwqCancelWorkItem() and RtwqRemovePeriodicCallback(). So please add tests for MFRemovePeriodicCallback() as well and confirm it has the same behavior. Also, I would like to see some tests to show that MFCancelWorkItem() and MFRemovePeriodicCallback() are blocking and they do wait for their callbacks to finish. For example, use MFCancelWorkItem() to cancel a work item that's already started and takes ~1s to finish really costs ~1s.
I feel we are blocking waiting for the callbacks to finish because of how wine workqueue is implemented. Windows could be doing this in a way that doesn't require this wait, and still don't leak memory.
I think adding those tests are fine, but they might ended up being todos
On Sat Mar 9 02:55:40 2024 +0000, Yuxuan Shui wrote:
I feel we are blocking waiting for the callbacks to finish because of how wine workqueue is implemented. Windows could be doing this in a way that doesn't require this wait, and still don't leak memory. I think adding those tests are fine, but they might ended up being todos
Let's see what the tests will show and decide what to do next.
On Sat Mar 9 03:19:34 2024 +0000, Zhiyi Zhang wrote:
Let's see what the tests will show and decide what to do next.
This is difficult to trigger on Wine's side, because this particular wait only happens with a race condition: if the callback starts after we have found the item in `queue_cancel_item`, but before we close the tp object. It's probably the same on Windows' side, it would difficult to derive a unambiguous conclusion.
If we don't see `MFCancelWorkItem` block, maybe Windows does not block, maybe we just didn't hit the race condition.
I think it's not worth it to match Windows in a race condition.
On Wed Mar 13 14:39:24 2024 +0000, Yuxuan Shui wrote:
This is difficult to trigger on Wine's side, because this particular wait only happens with a race condition: if the callback starts after we have found the item in `queue_cancel_item`, but before we close the tp object. It's probably the same on Windows' side, it would difficult to derive a unambiguous conclusion. If we don't see `MFCancelWorkItem` block, maybe Windows does not block, maybe we just didn't hit the race condition. I think it's not worth it to match Windows in a race condition.
I meant to test that MFCancelWorkItem() is a blocking call. Not to test the race condition. You can create a work item that fires an event after it gets started and then Sleeps(1s). And you wait for the event to be fired in the main thread, then call MFCancelWorkItem(), and time it and check if it takes ~1s for MFCancelWorkItem() to finish.
On Wed Mar 13 14:49:51 2024 +0000, Zhiyi Zhang wrote:
I meant to test that MFCancelWorkItem() is a blocking call. Not to test the race condition. You can create a work item that fires an event after it gets started and then Sleeps(1s). And you wait for the event to be fired in the main thread, then call MFCancelWorkItem(), and time it and check if it takes ~1s for MFCancelWorkItem() to finish.
@zhiyi In that case neither windows or wine blocks.