-- v3:
   vkd3d: Delete a misleading comment.
   vkd3d: Mention the correct mutex in a comment.
From: Giovanni Mascellani <gmascellani@codeweavers.com>
The goal is to simplify CS queue handling: with this change, operations are always started by d3d12_command_queue_flush_ops(), which makes further refactoring easier.
Notice that while with this change executing an operation on an empty CS queue is a bit less efficient, it doesn't require more locking. On the other hand, this change paves the way for executing CS operations without holding the queue lock.
---
 libs/vkd3d/command.c | 142 +++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 86 deletions(-)
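The net effect is that every public entry point just packages an op and calls a single helper, and whichever thread makes the queue non-empty starts the flush. As a self-contained sketch of that shape (hypothetical types and names, pthreads in place of vkd3d_mutex, and none of the wait/blocked-queue handling of the real helper below):

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

/* Hypothetical stand-ins for struct vkd3d_cs_op_data and the CS queue. */
struct op
{
    int payload;
};

struct op_queue
{
    pthread_mutex_t mutex;
    struct op *ops;
    size_t count, capacity;
};

/* Stand-in for actually executing one op (vkQueueSubmit() etc.). */
static void execute_op(const struct op *op)
{
    (void)op;
}

/* All producers funnel through this helper. The producer that makes the
 * queue non-empty becomes the flusher; later producers only append,
 * because the ongoing flush is guaranteed to pick their ops up. */
static bool enqueue_op(struct op_queue *q, const struct op *op)
{
    size_t i;

    pthread_mutex_lock(&q->mutex);

    if (q->count == q->capacity)
    {
        size_t capacity = q->capacity ? 2 * q->capacity : 16;
        struct op *ops = realloc(q->ops, capacity * sizeof(*ops));

        if (!ops)
        {
            pthread_mutex_unlock(&q->mutex);
            return false;
        }
        q->ops = ops;
        q->capacity = capacity;
    }
    q->ops[q->count++] = *op;

    if (q->count == 1)
    {
        /* We went from empty to non-empty: flush. The real code hands
         * off to d3d12_command_queue_flush_ops_locked() here. */
        for (i = 0; i < q->count; ++i)
            execute_op(&q->ops[i]);
        q->count = 0;
    }

    pthread_mutex_unlock(&q->mutex);
    return true;
}
```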
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index 158d0d9f..23353a6f 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -27,6 +27,7 @@ static void d3d12_fence_signal_timeline_semaphore(struct d3d12_fence *fence, uin
 static HRESULT d3d12_command_queue_signal(struct d3d12_command_queue *command_queue,
         struct d3d12_fence *fence, uint64_t value);
 static bool d3d12_command_queue_flush_ops(struct d3d12_command_queue *queue, bool *flushed_any);
+static bool d3d12_command_queue_flush_ops_locked(struct d3d12_command_queue *queue, bool *flushed_any);
 
 HRESULT vkd3d_queue_create(struct d3d12_device *device, uint32_t family_index,
         const VkQueueFamilyProperties *properties, struct vkd3d_queue **queue)
@@ -6150,12 +6151,40 @@ static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queu
     vkd3d_free(buffers);
 }
 
+static HRESULT d3d12_command_queue_enqueue_op(struct d3d12_command_queue *queue, const struct vkd3d_cs_op_data *op)
+{
+    struct vkd3d_cs_op_data *new_op;
+    bool flushed_any = false;
+
+    vkd3d_mutex_lock(&queue->op_mutex);
+
+    if (!(new_op = d3d12_command_queue_require_space_locked(queue)))
+    {
+        vkd3d_mutex_unlock(&queue->op_mutex);
+        return E_OUTOFMEMORY;
+    }
+
+    *new_op = *op;
+
+    if (queue->ops_count == 1)
+    {
+        if (!d3d12_command_queue_flush_ops_locked(queue, &flushed_any))
+            return d3d12_device_add_blocked_command_queues(queue->device, &queue, 1);
+    }
+    else
+    {
+        vkd3d_mutex_unlock(&queue->op_mutex);
+    }
+
+    return S_OK;
+}
+
 static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12CommandQueue *iface,
         UINT command_list_count, ID3D12CommandList * const *command_lists)
 {
     struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface);
     struct d3d12_command_list *cmd_list;
-    struct vkd3d_cs_op_data *op;
+    struct vkd3d_cs_op_data op;
     VkCommandBuffer *buffers;
     unsigned int i;
 
@@ -6186,26 +6215,12 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm
         buffers[i] = cmd_list->vk_command_buffer;
     }
 
-    vkd3d_mutex_lock(&command_queue->op_mutex);
+    op.opcode = VKD3D_CS_OP_EXECUTE;
+    op.u.execute.buffers = buffers;
+    op.u.execute.buffer_count = command_list_count;
 
-    if (!command_queue->ops_count)
-    {
-        d3d12_command_queue_execute(command_queue, buffers, command_list_count);
-        vkd3d_mutex_unlock(&command_queue->op_mutex);
-        return;
-    }
-
-    if (!(op = d3d12_command_queue_require_space_locked(command_queue)))
-    {
-        ERR("Failed to add op.\n");
-        return;
-    }
-    op->opcode = VKD3D_CS_OP_EXECUTE;
-    op->u.execute.buffers = buffers;
-    op->u.execute.buffer_count = command_list_count;
-
-    vkd3d_mutex_unlock(&command_queue->op_mutex);
-    return;
+    if (FAILED(d3d12_command_queue_enqueue_op(command_queue, &op)))
+        ERR("Cannot enqueue in command queue %p.\n", command_queue);
 }
 
 static void STDMETHODCALLTYPE d3d12_command_queue_SetMarker(ID3D12CommandQueue *iface,
@@ -6262,33 +6277,17 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *
 {
     struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface);
     struct d3d12_fence *fence = unsafe_impl_from_ID3D12Fence(fence_iface);
-    struct vkd3d_cs_op_data *op;
-    HRESULT hr = S_OK;
+    struct vkd3d_cs_op_data op;
 
     TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value);
 
-    vkd3d_mutex_lock(&command_queue->op_mutex);
-
-    if (!command_queue->ops_count)
-    {
-        hr = d3d12_command_queue_signal(command_queue, fence, value);
-        goto done;
-    }
-
-    if (!(op = d3d12_command_queue_require_space_locked(command_queue)))
-    {
-        hr = E_OUTOFMEMORY;
-        goto done;
-    }
-    op->opcode = VKD3D_CS_OP_SIGNAL;
-    op->u.signal.fence = fence;
-    op->u.signal.value = value;
+    op.opcode = VKD3D_CS_OP_SIGNAL;
+    op.u.signal.fence = fence;
+    op.u.signal.value = value;
 
     d3d12_fence_incref(fence);
 
-done:
-    vkd3d_mutex_unlock(&command_queue->op_mutex);
-    return hr;
+    return d3d12_command_queue_enqueue_op(command_queue, &op);
 }
 
 static HRESULT d3d12_command_queue_signal(struct d3d12_command_queue *command_queue,
@@ -6604,51 +6603,17 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Wait(ID3D12CommandQueue *if
 {
     struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface);
     struct d3d12_fence *fence = unsafe_impl_from_ID3D12Fence(fence_iface);
-    struct vkd3d_cs_op_data *op;
-    HRESULT hr = S_OK;
+    struct vkd3d_cs_op_data op;
 
     TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value);
 
-    vkd3d_mutex_lock(&command_queue->op_mutex);
-    vkd3d_mutex_lock(&fence->mutex);
-
-    if (!command_queue->ops_count && value <= fence->max_pending_value)
-    {
-        hr = d3d12_command_queue_wait_locked(command_queue, fence, value);
-        goto done;
-    }
-
-    /* This is the critical part required to support out-of-order signal.
-     * Normally we would be able to submit waits and signals out of order, but
-     * we don't have virtualized queues in Vulkan, so we need to handle the case
-     * where multiple queues alias over the same physical queue, so effectively,
-     * we need to manage out-of-order submits ourselves. */
-
-    if (!(op = d3d12_command_queue_require_space_locked(command_queue)))
-    {
-        vkd3d_mutex_unlock(&fence->mutex);
-        hr = E_OUTOFMEMORY;
-        goto done;
-    }
-    op->opcode = VKD3D_CS_OP_WAIT;
-    op->u.wait.fence = fence;
-    op->u.wait.value = value;
+    op.opcode = VKD3D_CS_OP_WAIT;
+    op.u.wait.fence = fence;
+    op.u.wait.value = value;
 
     d3d12_fence_incref(fence);
 
-    /* Add the queue to the blocked list after writing the op to ensure the queue isn't
-     * removed again in another thread because it has no ops. */
-    if (command_queue->ops_count == 1)
-        hr = d3d12_device_add_blocked_command_queues(command_queue->device, &command_queue, 1);
-
-    /* The fence must remain locked until the op is created and the queue is added to the blocked list,
-     * because if an unblocking d3d12_fence_Signal() call occurs on another thread before the above
-     * work is done, flushing will be delayed until the next signal, if one occurs at all. */
-    vkd3d_mutex_unlock(&fence->mutex);
-
-done:
-    vkd3d_mutex_unlock(&command_queue->op_mutex);
-    return hr;
+    return d3d12_command_queue_enqueue_op(command_queue, &op);
 }
 
 static HRESULT STDMETHODCALLTYPE d3d12_command_queue_GetTimestampFrequency(ID3D12CommandQueue *iface,
@@ -6772,14 +6737,8 @@ static const struct ID3D12CommandQueueVtbl d3d12_command_queue_vtbl =
     d3d12_command_queue_GetDesc,
 };
 
-/* flushed_any is initialised by the caller. */
 static bool d3d12_command_queue_flush_ops(struct d3d12_command_queue *queue, bool *flushed_any)
 {
-    struct vkd3d_cs_op_data *op;
-    struct d3d12_fence *fence;
-    bool flushed_all = false;
-    unsigned int i;
-
     if (!queue->ops_count)
         return true;
 
@@ -6791,6 +6750,17 @@ static bool d3d12_command_queue_flush_ops(struct d3d12_command_queue *queue, boo
 
     vkd3d_mutex_lock(&queue->op_mutex);
 
+    return d3d12_command_queue_flush_ops_locked(queue, flushed_any);
+}
+
+/* flushed_any is initialised by the caller. */
+static bool d3d12_command_queue_flush_ops_locked(struct d3d12_command_queue *queue, bool *flushed_any)
+{
+    struct vkd3d_cs_op_data *op;
+    struct d3d12_fence *fence;
+    bool flushed_all = false;
+    unsigned int i;
+
     /* Currently only required for d3d12_command_queue_signal(), but set it here anyway. */
     queue->is_flushing = true;
From: Giovanni Mascellani <gmascellani@codeweavers.com>
d3d12_command_queue_flush_ops() can re-enter itself while processing signal events. Since we don't use recursive mutexes, we currently have to check some of the queue variables without holding the mutex, which is not safe.
This is solved by allowing the queue to release its mutex while it is processing entries: when flushing, the queue is briefly locked, the is_flushing flag is set, the queue content is copied away and the queue is unlocked again. After having processed the entries, the queue is locked again to check whether something else was added in the meantime. This is repeated until the queue is empty (or a wait operation is blocking it).
This should also reduce latency when one thread pushes to the queue while another is processing it, though I didn't try to measure the impact. While the queue mutex is expected to be locked and unlocked more frequently with this patch, it should also remain locked for shorter periods, so contention will hopefully stay low.
---
 libs/vkd3d/command.c       | 136 ++++++++++++++++++++++++++-----------
 libs/vkd3d/vkd3d_private.h |   9 +++
 2 files changed, 106 insertions(+), 39 deletions(-)
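Reduced to a self-contained sketch, the locking pattern looks like this (hypothetical types, pthreads in place of vkd3d_mutex, fixed-size arrays for brevity, and a copy where the real patch below swaps the two arrays):

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical stand-ins for the vkd3d types; only the locking pattern
 * is modelled, not the real op processing. */
struct op
{
    int payload;
};

struct op_queue
{
    pthread_mutex_t mutex;
    bool is_flushing;
    struct op ops[64];  /* protected by mutex */
    size_t count;
    struct op aux[64];  /* owned by the thread that set is_flushing */
    size_t aux_count;
};

static void execute_op(const struct op *op)
{
    (void)op;
}

static void flush(struct op_queue *q)
{
    size_t i;

    pthread_mutex_lock(&q->mutex);

    if (q->is_flushing)
    {
        /* Another thread is already draining; it will pick up our ops too. */
        pthread_mutex_unlock(&q->mutex);
        return;
    }
    q->is_flushing = true;

    while (q->count)
    {
        /* Copy the pending ops away and drop the lock, so producers are
         * not blocked while the ops execute. */
        memcpy(q->aux, q->ops, q->count * sizeof(*q->ops));
        q->aux_count = q->count;
        q->count = 0;
        pthread_mutex_unlock(&q->mutex);

        for (i = 0; i < q->aux_count; ++i)
            execute_op(&q->aux[i]);
        q->aux_count = 0;

        /* Re-lock and re-check: more ops may have arrived meanwhile. */
        pthread_mutex_lock(&q->mutex);
    }

    q->is_flushing = false;
    pthread_mutex_unlock(&q->mutex);
}
```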
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index 23353a6f..f83774cf 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -6017,6 +6017,7 @@ static ULONG STDMETHODCALLTYPE d3d12_command_queue_Release(ID3D12CommandQueue *i
 
     vkd3d_mutex_destroy(&command_queue->op_mutex);
     vkd3d_free(command_queue->ops);
+    vkd3d_free(command_queue->aux_ops);
 
     vkd3d_private_store_destroy(&command_queue->private_store);
 
@@ -6166,7 +6167,7 @@ static HRESULT d3d12_command_queue_enqueue_op(struct d3d12_command_queue *queue,
 
     *new_op = *op;
 
-    if (queue->ops_count == 1)
+    if (queue->ops_count == 1 && !queue->is_flushing)
     {
         if (!d3d12_command_queue_flush_ops_locked(queue, &flushed_any))
             return d3d12_device_add_blocked_command_queues(queue->device, &queue, 1);
@@ -6737,18 +6738,62 @@ static const struct ID3D12CommandQueueVtbl d3d12_command_queue_vtbl =
     d3d12_command_queue_GetDesc,
 };
 
+static void d3d12_command_queue_swap_queues(struct d3d12_command_queue *queue)
+{
+    size_t size;
+    struct vkd3d_cs_op_data *ops;
+
+    size = queue->ops_count;
+    queue->ops_count = queue->aux_ops_count;
+    queue->aux_ops_count = size;
+
+    size = queue->ops_size;
+    queue->ops_size = queue->aux_ops_size;
+    queue->aux_ops_size = size;
+
+    ops = queue->ops;
+    queue->ops = queue->aux_ops;
+    queue->aux_ops = ops;
+}
+
+static void d3d12_command_queue_fixup_after_flush(struct d3d12_command_queue *queue, unsigned int done_count)
+{
+    queue->aux_ops_count -= done_count;
+    memmove(queue->aux_ops, &queue->aux_ops[done_count], queue->aux_ops_count * sizeof(*queue->ops));
+
+    vkd3d_mutex_lock(&queue->op_mutex);
+
+    d3d12_command_queue_swap_queues(queue);
+
+    if (!vkd3d_array_reserve((void **)&queue->ops, &queue->ops_size,
+            queue->ops_count + queue->aux_ops_count, sizeof(*queue->ops)))
+    {
+        ERR("Cannot reallocate, dropping %zu ops.\n", queue->aux_ops_count);
+    }
+    else
+    {
+        memcpy(&queue->ops[queue->ops_count], queue->aux_ops, queue->aux_ops_count * sizeof(*queue->ops));
+        queue->ops_count += queue->aux_ops_count;
+    }
+
+    queue->aux_ops_count = 0;
+    queue->is_flushing = false;
+
+    vkd3d_mutex_unlock(&queue->op_mutex);
+}
+
 static bool d3d12_command_queue_flush_ops(struct d3d12_command_queue *queue, bool *flushed_any)
 {
-    if (!queue->ops_count)
-        return true;
+    vkd3d_mutex_lock(&queue->op_mutex);
 
     /* This function may be re-entered during a call below to d3d12_command_queue_signal().
      * We return true because the first caller is responsible for re-adding this queue to
      * the flush list if it ends up returning false. */
     if (queue->is_flushing)
+    {
+        vkd3d_mutex_unlock(&queue->op_mutex);
         return true;
-
-    vkd3d_mutex_lock(&queue->op_mutex);
+    }
 
     return d3d12_command_queue_flush_ops_locked(queue, flushed_any);
 }
@@ -6758,55 +6803,62 @@ static bool d3d12_command_queue_flush_ops_locked(struct d3d12_command_queue *que
 {
     struct vkd3d_cs_op_data *op;
     struct d3d12_fence *fence;
-    bool flushed_all = false;
     unsigned int i;
 
-    /* Currently only required for d3d12_command_queue_signal(), but set it here anyway. */
     queue->is_flushing = true;
 
-    for (i = 0; i < queue->ops_count; ++i)
+    assert(queue->aux_ops_count == 0);
+
+    while (queue->ops_count != 0)
     {
-        op = &queue->ops[i];
-        switch (op->opcode)
+        d3d12_command_queue_swap_queues(queue);
+
+        vkd3d_mutex_unlock(&queue->op_mutex);
+
+        for (i = 0; i < queue->aux_ops_count; ++i)
         {
-            case VKD3D_CS_OP_WAIT:
-                fence = op->u.wait.fence;
-                vkd3d_mutex_lock(&fence->mutex);
-                if (op->u.wait.value > fence->max_pending_value)
-                {
-                    vkd3d_mutex_unlock(&fence->mutex);
-                    queue->ops_count -= i;
-                    memmove(queue->ops, op, queue->ops_count * sizeof(*op));
-                    goto done;
-                }
-                d3d12_command_queue_wait_locked(queue, fence, op->u.wait.value);
-                d3d12_fence_decref(fence);
-                break;
+            op = &queue->aux_ops[i];
+            switch (op->opcode)
+            {
+                case VKD3D_CS_OP_WAIT:
+                    fence = op->u.wait.fence;
+                    vkd3d_mutex_lock(&fence->mutex);
+                    if (op->u.wait.value > fence->max_pending_value)
+                    {
+                        vkd3d_mutex_unlock(&fence->mutex);
+                        d3d12_command_queue_fixup_after_flush(queue, i);
+                        return false;
+                    }
+                    d3d12_command_queue_wait_locked(queue, fence, op->u.wait.value);
+                    d3d12_fence_decref(fence);
+                    break;
 
-            case VKD3D_CS_OP_SIGNAL:
-                d3d12_command_queue_signal(queue, op->u.signal.fence, op->u.signal.value);
-                d3d12_fence_decref(op->u.signal.fence);
-                break;
+                case VKD3D_CS_OP_SIGNAL:
+                    d3d12_command_queue_signal(queue, op->u.signal.fence, op->u.signal.value);
+                    d3d12_fence_decref(op->u.signal.fence);
+                    break;
 
-            case VKD3D_CS_OP_EXECUTE:
-                d3d12_command_queue_execute(queue, op->u.execute.buffers, op->u.execute.buffer_count);
-                break;
+                case VKD3D_CS_OP_EXECUTE:
+                    d3d12_command_queue_execute(queue, op->u.execute.buffers, op->u.execute.buffer_count);
+                    break;
 
-            default:
-                FIXME("Unhandled op type %u.\n", op->opcode);
-                break;
+                default:
+                    vkd3d_unreachable();
+            }
+
+            *flushed_any |= true;
         }
-        *flushed_any |= true;
-    }
 
-    queue->ops_count = 0;
-    flushed_all = true;
+        queue->aux_ops_count = 0;
+
+        vkd3d_mutex_lock(&queue->op_mutex);
+    }
 
-done:
     queue->is_flushing = false;
 
     vkd3d_mutex_unlock(&queue->op_mutex);
-    return flushed_all;
+
+    return true;
 }
 
 static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue,
@@ -6832,6 +6884,10 @@ static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue,
     queue->ops_size = 0;
     queue->is_flushing = false;
 
+    queue->aux_ops = NULL;
+    queue->aux_ops_count = 0;
+    queue->aux_ops_size = 0;
+
     if (desc->Priority == D3D12_COMMAND_QUEUE_PRIORITY_GLOBAL_REALTIME)
     {
         FIXME("Global realtime priority is not implemented.\n");
@@ -6897,6 +6953,8 @@ VkQueue vkd3d_acquire_vk_queue(ID3D12CommandQueue *queue)
 
     if (d3d12_queue->ops_count)
         WARN("Acquired command queue %p with %zu remaining ops.\n", d3d12_queue, d3d12_queue->ops_count);
+    else if (d3d12_queue->is_flushing)
+        WARN("Acquired command queue %p which is flushing.\n", d3d12_queue);
 
     return vk_queue;
 }
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 56677648..f2b519fa 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -1387,11 +1387,20 @@ struct d3d12_command_queue
     struct d3d12_device *device;
 
     struct vkd3d_mutex op_mutex;
+
+    /* These fields are protected by op_mutex. */
    struct vkd3d_cs_op_data *ops;
     size_t ops_count;
     size_t ops_size;
     bool is_flushing;
 
+    /* These fields are not protected by op_mutex, but can only be
+     * used by the thread that set is_flushing; when is_flushing is
+     * not set, aux_ops_count must be zero. */
+    struct vkd3d_cs_op_data *aux_ops;
+    size_t aux_ops_count;
+    size_t aux_ops_size;
+
     struct vkd3d_private_store private_store;
 };
From: Giovanni Mascellani <gmascellani@codeweavers.com>
---
 libs/vkd3d/command.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index f83774cf..7d9a9eeb 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -671,8 +671,9 @@ static HRESULT d3d12_device_flush_blocked_queues_once(struct d3d12_device *devic
 
     vkd3d_mutex_lock(&device->blocked_queues_mutex);
 
-    /* Flush any ops unblocked by a new pending value. These cannot be flushed
-     * with the device locked, so move the queue pointers to a local array. */
+    /* Flush any ops unblocked by a new pending value. These cannot be
+     * flushed while holding blocked_queue_mutex, so move the queue
+     * pointers to a local array. */
     blocked_queue_count = device->blocked_queue_count;
     memcpy(blocked_queues, device->blocked_queues, blocked_queue_count * sizeof(blocked_queues[0]));
     device->blocked_queue_count = 0;
From: Giovanni Mascellani <gmascellani@codeweavers.com>
Queues can be re-added to the blocked queues array, e.g. if they are simultaneously blocked and unblocked again by another thread before this one calls d3d12_device_add_blocked_command_queues(). However, it's not a problem to have the same queue twice in the blocked array and to flush it more than once (unless the blocked queues array eventually fills up).
---
 libs/vkd3d/command.c | 2 --
 1 file changed, 2 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index 7d9a9eeb..ec8755d3 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -689,8 +689,6 @@ static HRESULT d3d12_device_flush_blocked_queues_once(struct d3d12_device *devic
         ++i;
     }
 
-    /* None of these queues could have been re-added during the above loop because
-     * blocked queues always have a nonzero op count. */
     return d3d12_device_add_blocked_command_queues(device, blocked_queues, blocked_queue_count);
 }
I was referring more to the whole series, not just queues being added more than once. Sorry if that wasn't clear. The main issue is it really needs a threaded solution. For example, the ops mutex will naturally become available for locking when the worker thread sleeps on a condition variable, so workarounds become unnecessary; e.g. `is_flushing` disappears. The current implementation is little more than a hack to avoid threading, and the issue with sequencing vkQueuePresentKHR() is the only reason it was done that way.
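Roughly this shape, for concreteness (a hypothetical pthreads sketch, not vkd3d code):

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical op type and queue; bounded array for brevity. */
struct op
{
    int payload;
};

struct worker_queue
{
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    struct op ops[64];
    size_t count;
    bool stop;
};

static void execute_op(const struct op *op)
{
    (void)op;
}

static void *worker_main(void *arg)
{
    struct worker_queue *q = arg;
    struct op op;

    pthread_mutex_lock(&q->mutex);
    for (;;)
    {
        /* pthread_cond_wait() releases the mutex while sleeping, so
         * producers can always lock it; no is_flushing flag is needed. */
        while (!q->count && !q->stop)
            pthread_cond_wait(&q->cond, &q->mutex);
        if (q->stop)
            break;

        /* Pop the oldest op and execute it without holding the mutex. */
        op = q->ops[0];
        memmove(q->ops, &q->ops[1], --q->count * sizeof(*q->ops));
        pthread_mutex_unlock(&q->mutex);
        execute_op(&op);
        pthread_mutex_lock(&q->mutex);
    }
    pthread_mutex_unlock(&q->mutex);
    return NULL;
}

static void enqueue(struct worker_queue *q, const struct op *op)
{
    pthread_mutex_lock(&q->mutex);
    if (q->count < sizeof(q->ops) / sizeof(*q->ops))
        q->ops[q->count++] = *op;
    pthread_mutex_unlock(&q->mutex);
    pthread_cond_signal(&q->cond);
}
```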
Are these changes required to solve the main issue with Present sequencing? If not, then I think the Present problem should be fixed on its own, then a threaded ops queue introduced after that.
> I was referring more to the whole series, not just queues being added more than once. Sorry if that wasn't clear. The main issue is it really needs a threaded solution. For example, the ops mutex will naturally become available for locking when the worker thread sleeps on a condition variable, so workarounds become unnecessary; e.g. `is_flushing` disappears. The current implementation is little more than a hack to avoid threading, and the issue with sequencing vkQueuePresentKHR() is the only reason it was done that way.
I am not sure I agree that the current implementation is a hack. It's a solution in which the queue is flushed by whatever thread happens to find it in a flushable state, rather than by a dedicated thread, and to me that feels like a completely reasonable choice. At any rate, I am getting convinced too that one or more dedicated threads could be better, though for other reasons. And even in that case I think it's better to avoid holding the lock while processing the queue, so that a thread that wants to push something to the queue doesn't have to wait uselessly for the queue to be flushed before continuing. So I'd say that this MR is actually a change in the direction of having a dedicated thread, even though that cannot happen until DXGI no longer needs to access the Vulkan queue (I'm working on that too).
> Are these changes required to solve the main issue with Present sequencing? If not, then I think the Present problem should be fixed on its own, then a threaded ops queue introduced after that.
No, it's not immediately conducive to solving the problem with DXGI. But it still fixes a bug (accessing mutable fields without holding the lock) and should be considered anyway (and, as I said, it will make switching to a threaded CS worker easier as soon as we can).
> So I'd say that this MR is actually a change in the direction of having a dedicated thread, even though that cannot happen until DXGI no longer needs to access the Vulkan queue (I'm working on that too).
It's perhaps worth pointing out that even if Wine's DXGI were to stop using vkd3d_acquire_vk_queue(), that wouldn't make the API go away.
> It's perhaps worth pointing out that even if Wine's DXGI were to stop using vkd3d_acquire_vk_queue(), that wouldn't make the API go away.
In that case we won't be able to use a threaded implementation.
@giomasce I think the patch for preventing duplicate entries in the blocked queue list should also be included. Then I'll be able to review all of it.
> It's perhaps worth pointing out that even if Wine's DXGI were to stop using vkd3d_acquire_vk_queue(), that wouldn't make the API go away.
Is there any hope that we can deprecate it and eventually make it go away? I think it's basically a broken API, since it lets clients access internal vkd3d data at a level which is too low to be safely usable. It was probably good enough before the CS queue was introduced, but I don't think there is any reasonable way to make it coexist with the CS queue, which is essentially where the presentation order bug comes from. So I think we should deprecate it anyway, as soon as we give our clients some other way to do what they wanted to do with it.
However, that's a rather orthogonal question: this MR still fixes a bug, and if we eventually want to switch to a worker thread (either by breaking compatibility or making the worker thread optional, so that `vkd3d_acquire_vk_queue()` can still be used by clients that don't want to use the worker thread), that's made easier by this MR as well.
> Is there any hope that we can deprecate it and eventually make it go away?
We could certainly deprecate it. I don't think we can make it go away completely, other than in the sense of "Returns VK_NULL_HANDLE when VKD3D_API_VERSION_1_8 or higher is specified.", or something along those lines.
> We could certainly deprecate it. I don't think we can make it go away completely, other than in the sense of "Returns VK_NULL_HANDLE when VKD3D_API_VERSION_1_8 or higher is specified.", or something along those lines.
That still means that a client compiled with vkd3d 1.7 and running with vkd3d 1.8 would receive a valid handle, right? So even in vkd3d 1.8 we still have to support a code path without the CS thread. It requires some more code, but it should be doable.
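For concreteness, a client straddling that hypothetical version gate might do something like this (`present_via_d3d12()` is a made-up stand-in for whatever replacement API would exist; the VK_NULL_HANDLE behaviour is the hypothetical 1.8 semantics discussed above, not current vkd3d):

```c
#include <vulkan/vulkan.h>
#include <vkd3d.h>

/* Hypothetical replacement for direct queue access. */
static void present_via_d3d12(ID3D12CommandQueue *queue);

static void do_present(ID3D12CommandQueue *d3d12_queue, const VkPresentInfoKHR *present_info)
{
    VkQueue vk_queue = vkd3d_acquire_vk_queue(d3d12_queue);

    if (vk_queue == VK_NULL_HANDLE)
    {
        /* A vkd3d with the hypothetical 1.8+ semantics refuses direct
         * access, leaving it free to run a CS worker thread. */
        present_via_d3d12(d3d12_queue);
        return;
    }

    vkQueuePresentKHR(vk_queue, present_info);
    vkd3d_release_vk_queue(d3d12_queue);
}
```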