--
v3: vkd3d: Write Vulkan descriptors in a worker thread.
    vkd3d: Update the descriptor `next` index before getting a reference for writing.

From: Conor McCarthy <cmccarthy@codeweavers.com>

Fixes a race condition where the descriptor is modified between getting its object and resetting the `next` index. The new object would never be written. While it is invalid for the app to write descriptors used by a command list which has been submitted to a queue, unused descriptors may be written. This change also supports writing descriptors in a worker thread.
---
 libs/vkd3d/resource.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index f2fe1250..5a015d6e 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2389,20 +2389,18 @@ void d3d12_desc_flush_vk_heap_updates_locked(struct d3d12_descriptor_heap *descr
     for (; i != UINT_MAX; i = next)
     {
         src = &descriptors[i];
-        next = (int)src->next >> 1;
+        next = vkd3d_atomic_exchange(&src->next, 0);
+        next = (int)next >> 1;
+
+        /* A race exists here between updating src->next and getting the current object. The best
+         * we can do is get the object last, which may result in a harmless rewrite later. */
         u.object = d3d12_desc_get_object_ref(src, device);

         if (!u.object)
-        {
-            vkd3d_atomic_exchange(&src->next, 0);
             continue;
-        }

         writes.held_refs[writes.held_ref_count++] = u.object;
         d3d12_desc_write_vk_heap(descriptor_heap, i, &writes, u.object, device);
-
-        vkd3d_atomic_exchange(&src->next, 0);
     }
/* Avoid thunk calls wherever possible. */
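For readers unfamiliar with the dirty-list scheme, here is a minimal standalone sketch of the ordering this patch establishes, using C11 atomics instead of vkd3d's own helpers. The structure and function names are illustrative only; this is not vkd3d code.

```c
#include <stdatomic.h>
#include <stddef.h>

struct desc
{
    _Atomic(void *) object; /* payload published by the app thread */
    atomic_uint dirty;      /* stands in for the low bit of `next` */
};

/* App thread: publish the new object, then mark the descriptor dirty. */
static void desc_write(struct desc *d, void *object)
{
    atomic_store(&d->object, object);
    atomic_store(&d->dirty, 1);
}

/* Flush thread: clear the dirty mark *before* reading the object. If the
 * app writes again after the exchange, the descriptor is simply re-marked
 * and flushed once more; the worst case is a harmless duplicate write.
 * Clearing the mark after reading (the old order) could leave the newer
 * object unflushed, which is the race the patch fixes. */
static void desc_flush(struct desc *d)
{
    void *object;

    if (!atomic_exchange(&d->dirty, 0))
        return;

    object = atomic_load(&d->object);
    /* ... write `object` into the Vulkan descriptor here ... */
    (void)object;
}
```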
From: Conor McCarthy <cmccarthy@codeweavers.com>

Raises framerate in Horizon Zero Dawn by about 5-10%.
---
 libs/vkd3d/command.c       | 17 +++++++-
 libs/vkd3d/resource.c      | 83 ++++++++++++++++++++++++++++++++++++++
 libs/vkd3d/vkd3d_private.h | 11 +++++
 3 files changed, 109 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index 24fbbce9..3cccd361 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -2623,10 +2623,21 @@ static bool d3d12_command_list_update_current_framebuffer(struct d3d12_command_l
     return true;
 }

+static void command_list_signal_descriptor_heaps(struct d3d12_command_list *list)
+{
+    unsigned int i;
+
+    for (i = 0; i < list->descriptor_heap_count; ++i)
+        if (list->descriptor_heaps[i]->dirty_list_head != UINT_MAX)
+            vkd3d_cond_signal(&list->descriptor_heaps[i]->worker.cond);
+}
+
 static bool d3d12_command_list_update_compute_pipeline(struct d3d12_command_list *list)
 {
     const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs;

+    command_list_signal_descriptor_heaps(list);
+
     if (list->current_pipeline != VK_NULL_HANDLE)
         return true;

@@ -2648,6 +2659,8 @@ static bool d3d12_command_list_update_graphics_pipeline(struct d3d12_command_lis
     VkRenderPass vk_render_pass;
     VkPipeline vk_pipeline;

+    command_list_signal_descriptor_heaps(list);
+
     if (list->current_pipeline != VK_NULL_HANDLE)
         return true;

@@ -3196,9 +3209,9 @@ static void command_list_flush_vk_heap_updates(struct d3d12_command_list *list)

     for (i = 0; i < list->descriptor_heap_count; ++i)
     {
-        vkd3d_mutex_lock(&list->descriptor_heaps[i]->vk_sets_mutex);
+        vkd3d_mutex_lock(&list->descriptor_heaps[i]->worker.mutex);
         d3d12_desc_flush_vk_heap_updates_locked(list->descriptor_heaps[i], device);
-        vkd3d_mutex_unlock(&list->descriptor_heaps[i]->vk_sets_mutex);
+        vkd3d_mutex_unlock(&list->descriptor_heaps[i]->worker.mutex);
     }
 }

diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 5a015d6e..8bfbbf28 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2409,6 +2409,80 @@ void d3d12_desc_flush_vk_heap_updates_locked(struct d3d12_descriptor_heap *descr
     descriptor_writes_free_object_refs(&writes, device);
 }

+static void *descriptor_write_worker_main(void *arg)
+{
+    struct descriptor_write_worker *worker = arg;
+    struct d3d12_descriptor_heap *heap;
+    struct d3d12_device *device;
+
+    vkd3d_set_thread_name("descriptor_write");
+
+    heap = worker->heap;
+    device = heap->device;
+
+    vkd3d_mutex_lock(&worker->mutex);
+
+    for (;;)
+    {
+        while (heap->dirty_list_head == UINT_MAX && !worker->should_exit)
+            vkd3d_cond_wait(&worker->cond, &worker->mutex);
+
+        if (worker->should_exit)
+            break;
+
+        d3d12_desc_flush_vk_heap_updates_locked(heap, device);
+    }
+
+    vkd3d_mutex_unlock(&worker->mutex);
+
+    return NULL;
+}
+
+static HRESULT descriptor_write_worker_start(struct descriptor_write_worker *worker,
+        struct d3d12_descriptor_heap *descriptor_heap)
+{
+    struct d3d12_device *device = descriptor_heap->device;
+    HRESULT hr;
+
+    TRACE("worker %p.\n", worker);
+
+    worker->should_exit = false;
+    worker->heap = descriptor_heap;
+    vkd3d_mutex_init(&worker->mutex);
+    vkd3d_cond_init(&worker->cond);
+
+    if (FAILED(hr = vkd3d_create_thread(device->vkd3d_instance,
+            descriptor_write_worker_main, worker, &worker->thread)))
+    {
+        vkd3d_cond_destroy(&worker->cond);
+    }
+
+    return hr;
+}
+
+static HRESULT descriptor_write_worker_stop(struct descriptor_write_worker *worker)
+{
+    struct d3d12_device *device = worker->heap->device;
+    HRESULT hr;
+
+    TRACE("worker %p.\n", worker);
+
+    vkd3d_mutex_lock(&worker->mutex);
+
+    worker->should_exit = true;
+    vkd3d_cond_signal(&worker->cond);
+
+    vkd3d_mutex_unlock(&worker->mutex);
+
+    if (FAILED(hr = vkd3d_join_thread(device->vkd3d_instance, &worker->thread)))
+        return hr;
+
+    vkd3d_mutex_destroy(&worker->mutex);
+    vkd3d_cond_destroy(&worker->cond);
+
+    return S_OK;
+}
+
 static void d3d12_desc_mark_as_modified(struct d3d12_desc *dst, struct d3d12_descriptor_heap *descriptor_heap)
 {
     unsigned int i, head;
@@ -3734,6 +3808,9 @@ static ULONG STDMETHODCALLTYPE d3d12_descriptor_heap_Release(ID3D12DescriptorHea
     {
         struct d3d12_desc *descriptors = (struct d3d12_desc *)heap->descriptors;

+        if (heap->use_vk_heaps)
+            descriptor_write_worker_stop(&heap->worker);
+
         for (i = 0; i < heap->desc.NumDescriptors; ++i)
         {
             d3d12_desc_destroy(&descriptors[i], device);
@@ -4057,6 +4134,12 @@ HRESULT d3d12_descriptor_heap_create(struct d3d12_device *device,
             dst[i].next = 0;
         }
         object->dirty_list_head = UINT_MAX;
+
+        if (object->use_vk_heaps && FAILED(hr = descriptor_write_worker_start(&object->worker, object)))
+        {
+            vkd3d_free(object);
+            return hr;
+        }
     }
     else
     {
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 3be12be7..5d8926d1 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -1005,6 +1005,16 @@ struct d3d12_descriptor_heap_vk_set
     VkDescriptorType vk_type;
 };

+struct descriptor_write_worker
+{
+    union vkd3d_thread_handle thread;
+    struct vkd3d_mutex mutex;
+    struct vkd3d_cond cond;
+    bool should_exit;
+
+    struct d3d12_descriptor_heap *heap;
+};
+
 /* ID3D12DescriptorHeap */
 struct d3d12_descriptor_heap
 {
@@ -1022,6 +1032,7 @@ struct d3d12_descriptor_heap
     VkDescriptorPool vk_descriptor_pool;
     struct d3d12_descriptor_heap_vk_set vk_descriptor_sets[VKD3D_SET_INDEX_COUNT];
     struct vkd3d_mutex vk_sets_mutex;
+    struct descriptor_write_worker worker;

     unsigned int volatile dirty_list_head;
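As a rough model of the wait/signal protocol the patch introduces, the sketch below uses plain pthreads in place of the vkd3d_mutex/vkd3d_cond wrappers; the names and the simplified `dirty_list_head` handling are assumptions for illustration, not code from the MR.

```c
#include <pthread.h>
#include <stdbool.h>
#include <limits.h>

struct heap
{
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    unsigned int dirty_list_head; /* UINT_MAX when nothing is pending */
    bool should_exit;
};

/* Write the pending Vulkan descriptors; called with h->mutex held. */
static void flush_updates_locked(struct heap *h)
{
    h->dirty_list_head = UINT_MAX;
}

/* Worker: drain whenever work is pending, until asked to exit. */
static void *worker_main(void *arg)
{
    struct heap *h = arg;

    pthread_mutex_lock(&h->mutex);
    while (!h->should_exit)
    {
        while (h->dirty_list_head == UINT_MAX && !h->should_exit)
            pthread_cond_wait(&h->cond, &h->mutex);
        if (!h->should_exit)
            flush_updates_locked(h);
    }
    pthread_mutex_unlock(&h->mutex);
    return NULL;
}

/* Command list, at pipeline-update time: kick the worker without blocking.
 * The unlocked check mirrors the patch; a missed signal is harmless because
 * submission still drains everything below. */
static void signal_heap(struct heap *h)
{
    if (h->dirty_list_head != UINT_MAX)
        pthread_cond_signal(&h->cond);
}

/* Submission: anything the worker has not reached yet is flushed here,
 * under the same mutex, exactly as before the patch. */
static void flush_on_submit(struct heap *h)
{
    pthread_mutex_lock(&h->mutex);
    flush_updates_locked(h);
    pthread_mutex_unlock(&h->mutex);
}
```

The point of the design is in the last function: by submission time the worker has usually emptied the list already, so the flush that used to run on the submitting thread becomes close to a no-op.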
This looks fine, as far as I can tell. The only part I'm a bit concerned about is how many threads this is going to create. Is it possible that an application creates a lot of descriptor heaps, each of those spawning a new thread? That wouldn't be very gentle on the OS. Would it be possible to have one thread per device, taking care of all the heaps belonging to that device? Or, if parallel processing is advisable, a per-device pool of threads taking care of all the heaps belonging to that device?
This merge request was approved by Giovanni Mascellani.
On Fri Aug 18 13:41:27 2023 +0000, Giovanni Mascellani wrote:
This looks fine, as far as I can tell. The only part I'm a bit concerned about is how many threads this is going to create. Is it possible that an application creates a lot of descriptor heaps, each of those spawning a new thread? That wouldn't be very gentle on the OS. Would it be possible to have one thread per device, taking care of all the heaps belonging to that device? Or, if parallel processing is advisable, a per-device pool of threads taking care of all the heaps belonging to that device?
Typically few heaps are used. Only one of each type can be used per draw or compute call, and they can be shared, so there's no gain in creating more than one shader-visible heap.
On Fri Aug 18 13:41:27 2023 +0000, Conor McCarthy wrote:
Typically few heaps are used. Only one of each type can be used per draw or compute call, and they can be shared, so there's no gain in creating more than one shader-visible heap.
I'll leave it to Henri to decide what's best here, but I still think that having one thread per device should be easily doable and would avoid wasting resources if an application, for some reason, creates a lot of heaps. Apparently neither we nor native implementations really enforce which heap descriptors come from, so it could still be that some application creates many heaps, each with just a few descriptors.
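For comparison, the per-device variant suggested here could look roughly like the sketch below. This is purely hypothetical; none of these names exist in the MR, and a single device-wide lock is used only to keep the sketch short.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <limits.h>

struct heap
{
    unsigned int dirty_list_head; /* UINT_MAX when clean */
    struct heap *next;            /* device-wide list of live heaps */
};

struct device_worker
{
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    struct heap *heaps;
    bool should_exit;
};

/* Write this heap's pending Vulkan descriptors; mutex held. */
static void heap_flush_locked(struct heap *h)
{
    h->dirty_list_head = UINT_MAX;
}

/* One thread per device services every heap, however many there are. */
static void *device_worker_main(void *arg)
{
    struct device_worker *w = arg;
    struct heap *h;
    bool idle;

    pthread_mutex_lock(&w->mutex);
    while (!w->should_exit)
    {
        idle = true;
        for (h = w->heaps; h; h = h->next)
        {
            if (h->dirty_list_head != UINT_MAX)
            {
                heap_flush_locked(h);
                idle = false;
            }
        }
        /* Sleep only when every heap was clean on the last pass. */
        if (idle)
            pthread_cond_wait(&w->cond, &w->mutex);
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}
```

A real implementation would presumably keep per-heap locking so that flushing one heap does not stall writers of another.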
From 7887d56d0c76f52c7afb8a84225d138ac9922f33 Mon Sep 17 00:00:00 2001
From: Conor McCarthy <cmccarthy@codeweavers.com>
Date: Sun, 30 Jul 2023 13:34:09 +1000
Subject: [PATCH 2/2] vkd3d: Write Vulkan descriptors in a worker thread.

Raises framerate in Horizon Zero Dawn by about 5-10%.
It's great that it does, but this doesn't help much with understanding which underlying issue we're addressing, or with determining whether this MR is the most appropriate way to do that.
I'll leave it to Henri to decide what's best here, but I still think that having one thread per device should be easily doable and would avoid wasting resources if an application, for some reason, creates a lot of heaps. Apparently neither we nor native implementations really enforce which heap descriptors come from, so it could still be that some application creates many heaps, each with just a few descriptors.
Yeah, in general I'd prefer creating fewer threads rather than more, unless it either can't be avoided or there's a clear advantage to creating more threads. In fact, I wonder how hard it would be to use vkd3d_fence_worker_main() for this. Waiting for fences is a blocking operation, but it may not have to be, and in principle these waits are expected to complete quickly. That also depends on which issue we're trying to address here, of course...
Yeah, in general I'd prefer creating fewer threads rather than more, unless it either can't be avoided or there's a clear advantage to creating more threads. In fact, I wonder how hard it would be to use vkd3d_fence_worker_main() for this. Waiting for fences is a blocking operation, but it may not have to be, and in principle these waits are expected to complete quickly. That also depends on which issue we're trying to address here, of course...
As was probably evident from my comment, I'm also in favor of using as few threads as possible. Still, I'm not sure that reusing the fence worker would be a good idea, since the fence worker has to wait on either a timeline semaphore or a fence. Maybe the descriptor worker could even be rewritten to use a timeline semaphore for synchronization (though I'm not sure whether the condition variable semantics can be implemented that way), but it doesn't look like the same can be done for plain fences. And I'm not aware of any reasonable way to wait at the same time on semaphores/fences and on whatever vkd3d already abstracts for mutexes and condition variables.
(I could go on with a rant about why Vulkan has so many different blocking functions, `vkWaitSemaphores()`, `vkWaitForFences()`, `vkWaitForPresentKHR()`, `vkAcquireNextImageKHR()` and maybe others I'm not aware of, and no way to uniformly wait and multiplex on them, but I'm sure I'm preaching to the choir; and we have to deal with it anyway, see for instance https://gitlab.winehq.org/wine/wine/-/merge_requests/3262#note_38797)
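For what it's worth, the host-side mechanism hinted at above does exist: `vkSignalSemaphore()` can advance a timeline value from the CPU, and a worker blocked in `vkWaitSemaphores()` wakes when it does. The sketch below only illustrates that mechanism under assumed names; whether it can cleanly replace the condition variable (exit handling, per-heap wake-ups) is exactly the open question.

```c
#include <vulkan/vulkan.h>
#include <stdatomic.h>
#include <stdint.h>

static atomic_uint_fast64_t signal_value;

/* Producer side: publish work, then advance the timeline from the host. */
static void wake_worker(VkDevice device, VkSemaphore timeline)
{
    VkSemaphoreSignalInfo info =
    {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO,
        .semaphore = timeline,
        .value = atomic_fetch_add(&signal_value, 1) + 1,
    };

    vkSignalSemaphore(device, &info);
}

/* Worker side: block until the timeline reaches the value we expect next. */
static VkResult wait_for_work(VkDevice device, VkSemaphore timeline, uint64_t expected)
{
    VkSemaphoreWaitInfo info =
    {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
        .semaphoreCount = 1,
        .pSemaphores = &timeline,
        .pValues = &expected,
    };

    return vkWaitSemaphores(device, &info, UINT64_MAX);
}
```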
Yeah, in general I'd prefer creating fewer threads rather than more, unless it either can't be avoided or there's a clear advantage to creating more threads. In fact, I wonder how hard it would be to use vkd3d_fence_worker_main() for this. Waiting for fences is a blocking operation, but it may not have to be, and in principle these waits are expected to complete quickly. That also depends on which issue we're trying to address here, of course...
As was probably evident from my comment, I'm also in favor of using as few threads as possible. Still, I'm not sure that reusing the fence worker would be a good idea, since the fence worker has to wait on either a timeline semaphore or a fence.
Sure, but those waits don't necessarily need to be blocking waits. It may still be a bad idea, of course, depending on which issue we're trying to address, but I'd like for some serious thought to be given to whether we can make it work.
On Sat Aug 19 08:17:56 2023 +0000, Giovanni Mascellani wrote:
I'll leave it to Henri to decide what's best here, but I still think that having one thread per device should be easily doable and would avoid wasting resources if an application, for some reason, creates a lot of heaps. Apparently neither we nor native implementations really enforce which heap descriptors come from, so it could still be that some application creates many heaps, each with just a few descriptors.
We enforce one per heap type; see `d3d12_command_list_bind_descriptor_table()`. It's true this doesn't prevent an app from creating many heaps and using them in consecutive draw calls or multiple command lists. Changing heap bindings can impose a pretty big performance cost though.
The issue is, to prevent concurrent writes of the same Vulkan descriptor we must delay writing them until command list submission, so instead of descriptors being written on the fly, often by multiple threads, we write them all at the last millisecond from a single thread. In my testing of HZD, the worker thread always handled all or very nearly all writes by list submission time, so it removes this bottleneck.
I think the fence worker is unsuitable. If it has some fences to wait for, it must poll `vkWaitSemaphoresKHR()` with a zero or extremely short wait time in case some descriptor writes come along. To avoid spinning we would need to use a short wait time, not zero, which will delay descriptor handling. And when writes do occur, they may delay fence handling.
Two or four threads in `struct d3d12_device` look to me like the best option. Or we could make it dynamic, so another thread is created if too many writes remain when command lists are submitted, at the cost of greater complexity.
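To make the trade-off above concrete, a combined fence-and-descriptor loop would have to look roughly like the sketch below; the 1 ms constant, the state flags and the helper names are assumptions for illustration, not proposed code.

```c
#include <vulkan/vulkan.h>
#include <stdbool.h>
#include <stdint.h>

#define POLL_TIMEOUT_NS 1000000ull /* 1 ms: exactly the kind of magic constant at issue */

struct worker_state
{
    bool fences_pending;
    bool descriptors_pending;
};

static void flush_descriptor_writes(struct worker_state *s)   { s->descriptors_pending = false; }
static void complete_signalled_fences(struct worker_state *s) { s->fences_pending = false; }

static void combined_worker_iteration(VkDevice device, const VkSemaphoreWaitInfo *wait_info,
        struct worker_state *s)
{
    if (s->fences_pending)
    {
        /* Cannot block indefinitely: descriptor writes may arrive at any time,
         * so the fence wait degenerates into a short-timeout poll. */
        if (vkWaitSemaphores(device, wait_info, POLL_TIMEOUT_NS) == VK_SUCCESS)
            complete_signalled_fences(s);
        /* VK_TIMEOUT simply means we fall through and service descriptors. */
    }

    if (s->descriptors_pending)
        flush_descriptor_writes(s);
}
```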
I think the fence worker is unsuitable. If it has some fences to wait for, it must poll `vkWaitSemaphoresKHR()` with a zero or extremely short wait time in case some descriptor writes come along. To avoid spinning we would need to use a short wait time, not zero, which will delay descriptor handling. And when writes do occur, they may delay fence handling.
Well, it depends on the specifics, I think. In particular:
- How long does a d3d12_desc_flush_vk_heap_updates_locked() call typically take? How long does it maximally take? If the answer to the previous question is some approximation of "infinity", could we put a lower bound on that by e.g. limiting the maximum number of descriptors we process in a single d3d12_desc_flush_vk_heap_updates_locked() call?
- How long does waiting for a fence typically take once we know it has been submitted? Would it be terrible to poll fences with e.g. a 1ms timeout?
- What is the worst case behaviour? If descriptor writes were to get stuck behind a fence we'd need to wait for d3d12_command_queue_ExecuteCommandLists() to process them, but that should be no worse than what we're currently doing. The reverse might be worse, but we should be able to avoid that by polling fences inside d3d12_desc_flush_vk_heap_updates_locked() if needed.
- Are there any nasty edge cases?
The issue is, to prevent concurrent writes of the same Vulkan descriptor we must delay writing them until command list submission, so instead of descriptors being written on the fly, often by multiple threads, we write them all at the last millisecond from a single thread.
Are concurrent writes to the same descriptor really allowed in DX12? If so, wouldn't that mean every DX12 driver has to use atomics/locks to update its descriptors? That seems like a lot of burden to put on drivers for little benefit to games (since they'd still be unsure of which descriptor write overwrote which). Are you sure this isn't a leftover from pre-update-after-bind, where entire descriptor sets needed synchronization, in which case we should be able to update concurrently as long as update after bind is used?
On Fri Aug 25 14:03:52 2023 +0000, Evan Tang wrote:
The issue is, to prevent concurrent writes of the same Vulkan descriptor we must delay writing them until command list submission, so instead of descriptors being written on the fly, often by multiple threads, we write them all at the last millisecond from a single thread.

Are concurrent writes to the same descriptor really allowed in DX12? If so, wouldn't that mean every DX12 driver has to use atomics/locks to update its descriptors? That seems like a lot of burden to put on drivers for little benefit to games (since they'd still be unsure of which descriptor write overwrote which). Are you sure this isn't a leftover from pre-update-after-bind, where entire descriptor sets needed synchronization, in which case we should be able to update concurrently as long as update after bind is used?
It's unclear if "free-threaded" means concurrent writes are allowed, but they do happen surprisingly often. It's unlikely Windows drivers do any kind of locking, so most likely the concurrently written descriptors are identical, or at least produce a valid descriptor if the data is a blend of both. I have seen crashes in Vulkan from concurrent writes though, so we must prevent them. Descriptor buffers are used in vkd3d-proton to allow memcpying of Vulkan descriptors, and that seems to handle concurrency as well as Windows does.
On Mon Aug 28 09:19:20 2023 +0000, Henri Verbeet wrote:
I think the fence worker is unsuitable. If it has some fences to wait for, it must poll `vkWaitSemaphoresKHR()` with a zero or extremely short wait time in case some descriptor writes come along. To avoid spinning we would need to use a short wait time, not zero, which will delay descriptor handling. And when writes do occur, they may delay fence handling.

Well, it depends on the specifics, I think. In particular:

- How long does a d3d12_desc_flush_vk_heap_updates_locked() call typically take? How long does it maximally take? If the answer to the previous question is some approximation of "infinity", could we put a lower bound on that by e.g. limiting the maximum number of descriptors we process in a single d3d12_desc_flush_vk_heap_updates_locked() call?
- How long does waiting for a fence typically take once we know it has been submitted? Would it be terrible to poll fences with e.g. a 1ms timeout?
- What is the worst case behaviour? If descriptor writes were to get stuck behind a fence we'd need to wait for d3d12_command_queue_ExecuteCommandLists() to process them, but that should be no worse than what we're currently doing. The reverse might be worse, but we should be able to avoid that by polling fences inside d3d12_desc_flush_vk_heap_updates_locked() if needed.
- Are there any nasty edge cases?
From the Vulkan spec:
timeout is the timeout period in units of nanoseconds. timeout is adjusted to the closest value allowed by the implementation-dependent timeout accuracy, which may be substantially longer than one nanosecond, and may be longer than the requested period.
The shortest timeout in current drivers is probably much shorter than we need, but this could bite us in the future, and there's no way to query the actual minimum value.
Well, it depends on the specifics, I think. In particular:
- How long does a d3d12_desc_flush_vk_heap_updates_locked() call typically take? How long does it maximally take? If the answer to the previous question is some approximation of "infinity", could we put a lower bound on that by e.g. limiting the maximum number of descriptors we process in a single d3d12_desc_flush_vk_heap_updates_locked() call?
- How long does waiting for a fence typically take once we know it has been submitted? Would it be terrible to poll fences with e.g. a 1ms timeout?
- What is the worst case behaviour? If descriptor writes were to get stuck behind a fence we'd need to wait for d3d12_command_queue_ExecuteCommandLists() to process them, but that should be no worse than what we're currently doing. The reverse might be worse, but we should be able to avoid that by polling fences inside d3d12_desc_flush_vk_heap_updates_locked() if needed.
- Are there any nasty edge cases?
My problem with this approach is that it depends on a lot of magic constants which would require tuning, and that tuning depends on the computer, on the game, possibly on the specific scene of the game, on the game settings, and possibly many other factors. Polling at 1 ms can be nothing, if for some random reason fences are immediately ready, or it can be a lot, if you have 16 ms of budget per frame and waste 5 of them just polling for a handful of fences that manage to block every other operation. Getting into this sort of business to save on a thread per device doesn't seem ideal to me, though I'll admit you have more experience than I do.