From: Giovanni Mascellani gmascellani@codeweavers.com
--- libs/vkd3d/command.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 62fc5149..45620e86 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -642,6 +642,9 @@ static HRESULT d3d12_device_add_blocked_command_queues(struct d3d12_device *devi HRESULT hr = S_OK; unsigned int i;
+ if (count == 0) + return S_OK; + vkd3d_mutex_lock(&device->mutex);
if ((i = ARRAY_SIZE(device->blocked_queues) - device->blocked_queue_count) < count) @@ -697,8 +700,6 @@ static HRESULT d3d12_device_flush_blocked_queues(struct d3d12_device *device) /* Executing an op on one queue may unblock another, so repeat until nothing is flushed. */ do { - if (!device->blocked_queue_count) - return S_OK; if (FAILED(hr = d3d12_device_flush_blocked_queues_once(device, &flushed_any))) return hr; }
From: Giovanni Mascellani gmascellani@codeweavers.com
--- libs/vkd3d/command.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 45620e86..9cf35831 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -6801,13 +6801,14 @@ static bool d3d12_command_queue_flush_ops(struct d3d12_command_queue *queue, boo { case VKD3D_CS_OP_WAIT: fence = op->u.wait.fence; + vkd3d_mutex_lock(&fence->mutex); if (op->u.wait.value > fence->max_pending_value) { + vkd3d_mutex_unlock(&fence->mutex); queue->ops_count -= i; memmove(queue->ops, op, queue->ops_count * sizeof(*op)); goto done; } - vkd3d_mutex_lock(&fence->mutex); d3d12_command_queue_wait_locked(queue, fence, op->u.wait.value); d3d12_fence_decref(fence); break;
From: Giovanni Mascellani gmascellani@codeweavers.com
--- libs/vkd3d/command.c | 8 ++++---- libs/vkd3d/device.c | 3 +++ libs/vkd3d/vkd3d_private.h | 1 + 3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 9cf35831..158d0d9f 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -645,7 +645,7 @@ static HRESULT d3d12_device_add_blocked_command_queues(struct d3d12_device *devi if (count == 0) return S_OK;
- vkd3d_mutex_lock(&device->mutex); + vkd3d_mutex_lock(&device->blocked_queues_mutex);
if ((i = ARRAY_SIZE(device->blocked_queues) - device->blocked_queue_count) < count) { @@ -657,7 +657,7 @@ static HRESULT d3d12_device_add_blocked_command_queues(struct d3d12_device *devi for (i = 0; i < count; ++i) device->blocked_queues[device->blocked_queue_count++] = command_queues[i];
- vkd3d_mutex_unlock(&device->mutex); + vkd3d_mutex_unlock(&device->blocked_queues_mutex); return hr; }
@@ -668,7 +668,7 @@ static HRESULT d3d12_device_flush_blocked_queues_once(struct d3d12_device *devic
*flushed_any = false;
- vkd3d_mutex_lock(&device->mutex); + vkd3d_mutex_lock(&device->blocked_queues_mutex);
/* Flush any ops unblocked by a new pending value. These cannot be flushed * with the device locked, so move the queue pointers to a local array. */ @@ -676,7 +676,7 @@ static HRESULT d3d12_device_flush_blocked_queues_once(struct d3d12_device *devic memcpy(blocked_queues, device->blocked_queues, blocked_queue_count * sizeof(blocked_queues[0])); device->blocked_queue_count = 0;
- vkd3d_mutex_unlock(&device->mutex); + vkd3d_mutex_unlock(&device->blocked_queues_mutex);
i = 0; while (i < blocked_queue_count) diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index 21dd17ea..d81adc55 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2462,6 +2462,8 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device *iface) { const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+ vkd3d_mutex_destroy(&device->blocked_queues_mutex); + vkd3d_private_store_destroy(&device->private_store);
vkd3d_cleanup_format_info(device); @@ -4119,6 +4121,7 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, vkd3d_time_domains_init(device);
device->blocked_queue_count = 0; + vkd3d_mutex_init(&device->blocked_queues_mutex);
for (i = 0; i < ARRAY_SIZE(device->desc_mutex); ++i) vkd3d_mutex_init(&device->desc_mutex[i]); diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 9c7bed39..56677648 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -1512,6 +1512,7 @@ struct d3d12_device unsigned int queue_family_count; VkTimeDomainEXT vk_host_time_domain;
+ struct vkd3d_mutex blocked_queues_mutex; struct d3d12_command_queue *blocked_queues[VKD3D_MAX_DEVICE_BLOCKED_QUEUES]; unsigned int blocked_queue_count;
My understanding of the problem with blocked_queue_count is this sequence:
1. thread0: Is in d3d12_command_queue_Wait() after `if (!command_queue->ops_count && value <= fence->max_pending_value)` but has not yet called d3d12_device_add_blocked_command_queues().
2. thread1: Is handling a signal which unblocks the wait, but had not yet updated max_pending_value when thread0 checked it. It executes `if (!device->blocked_queue_count)` before thread0 calls d3d12_device_add_blocked_command_queues().
3. thread0: Calls d3d12_device_add_blocked_command_queues(), but the corresponding call to d3d12_device_flush_blocked_queues() has already passed in thread1.
I don't think we need to lock the fence mutex before checking max_pending_value. All cases which update max_pending_value lead to a new call to d3d12_command_queue_flush_ops(), except for updates from vkd3d_wait_for_gpu_timeline_semaphore(), which are not relevant because higher values being waited on there have already been included in max_pending_value.
The device mutex is for protecting the cache so it should probably be renamed at some point. You're right about creating a separate one for blocked queues.
On Fri Feb 10 12:10:25 2023 +0000, Conor McCarthy wrote:
My understanding of the problem with blocked_queue_count is this sequence:
1. thread0: Is in d3d12_command_queue_Wait() after `if (!command_queue->ops_count && value <= fence->max_pending_value)` but has not yet called d3d12_device_add_blocked_command_queues().
2. thread1: Is handling a signal which unblocks the wait, but had not yet updated max_pending_value when thread0 checked it. It executes `if (!device->blocked_queue_count)` before thread0 calls d3d12_device_add_blocked_command_queues().
3. thread0: Calls d3d12_device_add_blocked_command_queues(), but the corresponding call to d3d12_device_flush_blocked_queues() has already passed in thread1.
Notice that on relaxed memory architectures, like ARM, something even worse can happen: even after `d3d12_device_add_blocked_command_queues()` has released its mutex, it is not guaranteed that another thread observes the new value of `blocked_queue_count` unless that thread has first acquired the mutex itself. The reading thread might, for example, read an earlier cached value of `blocked_queue_count` and not bother checking that the cache is still up-to-date. So I think we should really protect the read with the mutex.
On Fri Feb 10 12:17:03 2023 +0000, Conor McCarthy wrote:
The device mutex is for protecting the cache so it should probably be renamed at some point. You're right about creating a separate one for blocked queues.
Actually, while I was creating `blocked_queues_mutex` I also wondered whether the current `mutex` can be further split. In `state.c` it seems to be used to protect both `cache->render_passes` and `graphics->compiled_pipelines`, and it's not clear to me whether these two uses are independent or not. If they are, the mutex can be further split, and in principle I guess it's a good idea, when possible, to keep locking as granular as possible. But since I don't know that code very well, I didn't venture doing anything.
On Fri Feb 10 12:34:40 2023 +0000, Conor McCarthy wrote:
I don't think we need to lock the fence mutex before checking max_pending_value. All cases which update max_pending_value lead to a new call to d3d12_command_queue_flush_ops(), except for updates from vkd3d_wait_for_gpu_timeline_semaphore(), which are not relevant because higher values being waited on there have already been included in max_pending_value.
As before, I don't think we should assume strong memory models, as I guess we want our code to be portable. And on relaxed memory models you're not guaranteed that memory reads are current unless you've performed some acquire operation, like locking a mutex.
Specifically in this case, I guess this is a scenario which could go wrong: thread A waits on a fence, with a value larger than the current `max_pending_value`; it therefore marks that queue as blocked. Then thread A continues doing other stuff, until, for some unrelated reason, it begins flushing queues. In `d3d12_device_flush_blocked_queues_once()` it "steals" all the blocked queues, locks the device mutex and is about to start processing them when it finishes its time quantum and has to yield to another thread. On another core, thread B finally signals the fence with a value that would unblock the queue, and it updates `max_pending_value`. When it comes to `d3d12_device_flush_blocked_queues_once()` there is nothing to do, because thread A has temporarily stolen them, so thread B resumes doing other stuff. This should not be a problem, because as soon as thread A gets rescheduled it should see the new `max_pending_value` and act on it. But, as I said, unless there is sufficient synchronization, thread A is not guaranteed to see the `max_pending_value` that was set by thread B, and it could conclude that there is nothing to do for that fence, incorrectly missing the queue unblock.
On Fri Feb 10 12:17:03 2023 +0000, Giovanni Mascellani wrote:
Actually, while I was creating `blocked_queues_mutex` I also wondered whether the current `mutex` can be further split. In `state.c` it seems to be used to protect both `cache->render_passes` and `graphics->compiled_pipelines`, and it's not clear to me whether these two uses are independent or not. If they are, the mutex can be further split, and in principle I guess it's a good idea, when possible, to keep locking as granular as possible. But since I don't know that code very well, I didn't venture doing anything.
That makes sense too.
This merge request was approved by Conor McCarthy.
This merge request was approved by Henri Verbeet.