-- v5: vkd3d: Co-locate all descriptor-related members. vkd3d: Rename the device mutex to pipeline_cache_mutex. vkd3d: Write Vulkan descriptors in a worker thread. vkd3d: Update the descriptor `next` index before getting a reference for writing.
From: Conor McCarthy cmccarthy@codeweavers.com
Fixes a race condition where the descriptor is modified between getting its object and resetting the `next` index. The new object would never be written. While it is invalid for the app to write descriptors used by a command list which has been submitted to a queue, unused descriptors may be written. This change also supports writing descriptors in a worker thread. --- libs/vkd3d/resource.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index abbdfbe20..0c9c911a2 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -2632,20 +2632,18 @@ void d3d12_desc_flush_vk_heap_updates_locked(struct d3d12_descriptor_heap *descr for (; i != UINT_MAX; i = next) { src = &descriptors[i]; - next = (int)src->next >> 1; + next = vkd3d_atomic_exchange(&src->next, 0); + next = (int)next >> 1;
+ /* A race exists here between updating src->next and getting the current object. The best + * we can do is get the object last, which may result in a harmless rewrite later. */ u.object = d3d12_desc_get_object_ref(src, device);
if (!u.object) - { - vkd3d_atomic_exchange(&src->next, 0); continue; - }
writes.held_refs[writes.held_ref_count++] = u.object; d3d12_desc_write_vk_heap(descriptor_heap, i, &writes, u.object, device); - - vkd3d_atomic_exchange(&src->next, 0); }
/* Avoid thunk calls wherever possible. */
From: Conor McCarthy cmccarthy@codeweavers.com
Raises framerate by 5-10% in games which write thousands of descriptors per frame, e.g. Horizon Zero Dawn.
The worker thread is a generic device worker which can also be used for other purposes if the need arises. --- libs/vkd3d/command.c | 4 ++ libs/vkd3d/device.c | 106 +++++++++++++++++++++++++++++++++++++ libs/vkd3d/resource.c | 9 ++++ libs/vkd3d/vkd3d_private.h | 10 ++++ 4 files changed, 129 insertions(+)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 15c8317b1..549f6a45f 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -2644,6 +2644,8 @@ static bool d3d12_command_list_update_compute_pipeline(struct d3d12_command_list { const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs;
+ vkd3d_cond_signal(&list->device->worker_cond); + if (list->current_pipeline != VK_NULL_HANDLE) return true;
@@ -2665,6 +2667,8 @@ static bool d3d12_command_list_update_graphics_pipeline(struct d3d12_command_lis VkRenderPass vk_render_pass; VkPipeline vk_pipeline;
+ vkd3d_cond_signal(&list->device->worker_cond); + if (list->current_pipeline != VK_NULL_HANDLE) return true;
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index 5c801ca46..e60e0c488 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2495,6 +2495,28 @@ static ULONG STDMETHODCALLTYPE d3d12_device_AddRef(ID3D12Device5 *iface) return refcount; }
+static HRESULT device_worker_stop(struct d3d12_device *device) +{ + HRESULT hr; + + TRACE("device %p.\n", device); + + vkd3d_mutex_lock(&device->worker_mutex); + + device->worker_should_exit = true; + vkd3d_cond_signal(&device->worker_cond); + + vkd3d_mutex_unlock(&device->worker_mutex); + + if (FAILED(hr = vkd3d_join_thread(device->vkd3d_instance, &device->worker_thread))) + return hr; + + vkd3d_mutex_destroy(&device->worker_mutex); + vkd3d_cond_destroy(&device->worker_cond); + + return S_OK; +} + static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device5 *iface) { struct d3d12_device *device = impl_from_ID3D12Device5(iface); @@ -2520,6 +2542,9 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device5 *iface) d3d12_device_destroy_vkd3d_queues(device); vkd3d_desc_object_cache_cleanup(&device->view_desc_cache); vkd3d_desc_object_cache_cleanup(&device->cbuffer_desc_cache); + if (device->use_vk_heaps) + device_worker_stop(device); + vkd3d_free(device->heaps); VK_CALL(vkDestroyDevice(device->vk_device, NULL)); if (device->parent) IUnknown_Release(device->parent); @@ -4251,6 +4276,39 @@ struct d3d12_device *unsafe_impl_from_ID3D12Device5(ID3D12Device5 *iface) return impl_from_ID3D12Device5(iface); }
+static void *device_worker_main(void *arg) +{ + struct d3d12_descriptor_heap *heap; + struct d3d12_device *device = arg; + size_t i; + + vkd3d_set_thread_name("device_worker"); + + vkd3d_mutex_lock(&device->worker_mutex); + + for (;;) + { + for (i = 0; i < device->heap_count && !device->worker_should_exit; ++i) + { + heap = device->heaps[i]; + if (heap->dirty_list_head == UINT_MAX) + continue; + vkd3d_mutex_lock(&heap->vk_sets_mutex); + d3d12_desc_flush_vk_heap_updates_locked(heap, device); + vkd3d_mutex_unlock(&heap->vk_sets_mutex); + } + + if (device->worker_should_exit) + break; + + vkd3d_cond_wait(&device->worker_cond, &device->worker_mutex); + } + + vkd3d_mutex_unlock(&device->worker_mutex); + + return NULL; +} + static HRESULT d3d12_device_init(struct d3d12_device *device, struct vkd3d_instance *instance, const struct vkd3d_device_create_info *create_info) { @@ -4270,6 +4328,14 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
device->vk_device = VK_NULL_HANDLE;
+ device->heaps = NULL; + device->heap_capacity = 0; + device->heap_count = 0; + memset(&device->worker_thread, 0, sizeof(device->worker_thread)); + device->worker_should_exit = false; + vkd3d_mutex_init(&device->worker_mutex); + vkd3d_cond_init(&device->worker_cond); + if (FAILED(hr = vkd3d_create_vk_device(device, create_info))) goto out_free_instance;
@@ -4291,6 +4357,12 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, if (FAILED(hr = vkd3d_vk_descriptor_heap_layouts_init(device))) goto out_cleanup_uav_clear_state;
+ if (device->use_vk_heaps && FAILED(hr = vkd3d_create_thread(device->vkd3d_instance, + device_worker_main, device, &device->worker_thread))) + { + WARN("Failed to create worker thread, hr %#x.\n", hr); + } + vkd3d_render_pass_cache_init(&device->render_pass_cache); vkd3d_gpu_va_allocator_init(&device->gpu_va_allocator); vkd3d_time_domains_init(device); @@ -4361,6 +4433,40 @@ void d3d12_device_mark_as_removed(struct d3d12_device *device, HRESULT reason, device->removed_reason = reason; }
+HRESULT d3d12_device_add_descriptor_heap(struct d3d12_device *device, struct d3d12_descriptor_heap *heap) +{ + vkd3d_mutex_lock(&device->worker_mutex); + + if (!vkd3d_array_reserve((void **)&device->heaps, &device->heap_capacity, device->heap_count + 1, + sizeof(*device->heaps))) + { + vkd3d_mutex_unlock(&device->worker_mutex); + return E_OUTOFMEMORY; + } + device->heaps[device->heap_count++] = heap; + + vkd3d_mutex_unlock(&device->worker_mutex); + + return S_OK; +} + +void d3d12_device_remove_descriptor_heap(struct d3d12_device *device, struct d3d12_descriptor_heap *heap) +{ + size_t i; + + vkd3d_mutex_lock(&device->worker_mutex); + + for (i = 0; i < device->heap_count; ++i) + { + if (device->heaps[i] == heap) + { + device->heaps[i] = device->heaps[--device->heap_count]; + break; + } + } + + vkd3d_mutex_unlock(&device->worker_mutex); +}
#ifdef _WIN32 struct thread_data diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index 0c9c911a2..609c67102 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -3995,6 +3995,9 @@ static ULONG STDMETHODCALLTYPE d3d12_descriptor_heap_Release(ID3D12DescriptorHea { struct d3d12_desc *descriptors = (struct d3d12_desc *)heap->descriptors;
+ if (heap->use_vk_heaps) + d3d12_device_remove_descriptor_heap(device, heap); + for (i = 0; i < heap->desc.NumDescriptors; ++i) { d3d12_desc_destroy(&descriptors[i], device); @@ -4318,6 +4321,12 @@ HRESULT d3d12_descriptor_heap_create(struct d3d12_device *device, dst[i].next = 0; } object->dirty_list_head = UINT_MAX; + + if (object->use_vk_heaps && FAILED(hr = d3d12_device_add_descriptor_heap(device, object))) + { + vkd3d_free(object); + return hr; + } } else { diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index e0eb9f3d3..025ad5348 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -1804,6 +1804,14 @@ struct d3d12_device unsigned int vk_pool_count; struct vkd3d_vk_descriptor_heap_layout vk_descriptor_heap_layouts[VKD3D_SET_INDEX_COUNT]; bool use_vk_heaps; + + struct d3d12_descriptor_heap **heaps; + size_t heap_capacity; + size_t heap_count; + union vkd3d_thread_handle worker_thread; + struct vkd3d_mutex worker_mutex; + struct vkd3d_cond worker_cond; + bool worker_should_exit; };
HRESULT d3d12_device_create(struct vkd3d_instance *instance, @@ -1813,6 +1821,8 @@ bool d3d12_device_is_uma(struct d3d12_device *device, bool *coherent); void d3d12_device_mark_as_removed(struct d3d12_device *device, HRESULT reason, const char *message, ...) VKD3D_PRINTF_FUNC(3, 4); struct d3d12_device *unsafe_impl_from_ID3D12Device5(ID3D12Device5 *iface); +HRESULT d3d12_device_add_descriptor_heap(struct d3d12_device *device, struct d3d12_descriptor_heap *heap); +void d3d12_device_remove_descriptor_heap(struct d3d12_device *device, struct d3d12_descriptor_heap *heap);
static inline HRESULT d3d12_device_query_interface(struct d3d12_device *device, REFIID iid, void **object) {
From: Conor McCarthy cmccarthy@codeweavers.com
It is used only for pipeline and render pass caching, and renaming it helps prevent inappropriate or erroneous use. --- libs/vkd3d/device.c | 4 ++-- libs/vkd3d/state.c | 12 ++++++------ libs/vkd3d/vkd3d_private.h | 3 ++- 3 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index e60e0c488..f32bc4712 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2056,7 +2056,7 @@ static HRESULT d3d12_device_init_pipeline_cache(struct d3d12_device *device) VkPipelineCacheCreateInfo cache_info; VkResult vr;
- vkd3d_mutex_init(&device->mutex); + vkd3d_mutex_init(&device->pipeline_cache_mutex);
cache_info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; cache_info.pNext = NULL; @@ -2080,7 +2080,7 @@ static void d3d12_device_destroy_pipeline_cache(struct d3d12_device *device) if (device->vk_pipeline_cache) VK_CALL(vkDestroyPipelineCache(device->vk_device, device->vk_pipeline_cache, NULL));
- vkd3d_mutex_destroy(&device->mutex); + vkd3d_mutex_destroy(&device->pipeline_cache_mutex); }
#define VKD3D_VA_FALLBACK_BASE 0x8000000000000000ull diff --git a/libs/vkd3d/state.c b/libs/vkd3d/state.c index fc3187f4b..a833cff20 100644 --- a/libs/vkd3d/state.c +++ b/libs/vkd3d/state.c @@ -1691,7 +1691,7 @@ HRESULT vkd3d_render_pass_cache_find(struct vkd3d_render_pass_cache *cache, HRESULT hr = S_OK; unsigned int i;
- vkd3d_mutex_lock(&device->mutex); + vkd3d_mutex_lock(&device->pipeline_cache_mutex);
for (i = 0; i < cache->render_pass_count; ++i) { @@ -1708,7 +1708,7 @@ HRESULT vkd3d_render_pass_cache_find(struct vkd3d_render_pass_cache *cache, if (!found) hr = vkd3d_render_pass_cache_create_pass_locked(cache, device, key, vk_render_pass);
- vkd3d_mutex_unlock(&device->mutex); + vkd3d_mutex_unlock(&device->pipeline_cache_mutex);
return hr; } @@ -3615,7 +3615,7 @@ static VkPipeline d3d12_pipeline_state_find_compiled_pipeline(const struct d3d12
*vk_render_pass = VK_NULL_HANDLE;
- vkd3d_mutex_lock(&device->mutex); + vkd3d_mutex_lock(&device->pipeline_cache_mutex);
LIST_FOR_EACH_ENTRY(current, &graphics->compiled_pipelines, struct vkd3d_compiled_pipeline, entry) { @@ -3627,7 +3627,7 @@ static VkPipeline d3d12_pipeline_state_find_compiled_pipeline(const struct d3d12 } }
- vkd3d_mutex_unlock(&device->mutex); + vkd3d_mutex_unlock(&device->pipeline_cache_mutex);
return vk_pipeline; } @@ -3646,7 +3646,7 @@ static bool d3d12_pipeline_state_put_pipeline_to_cache(struct d3d12_pipeline_sta compiled_pipeline->vk_pipeline = vk_pipeline; compiled_pipeline->vk_render_pass = vk_render_pass;
- vkd3d_mutex_lock(&device->mutex); + vkd3d_mutex_lock(&device->pipeline_cache_mutex);
LIST_FOR_EACH_ENTRY(current, &graphics->compiled_pipelines, struct vkd3d_compiled_pipeline, entry) { @@ -3661,7 +3661,7 @@ static bool d3d12_pipeline_state_put_pipeline_to_cache(struct d3d12_pipeline_sta if (compiled_pipeline) list_add_tail(&graphics->compiled_pipelines, &compiled_pipeline->entry);
- vkd3d_mutex_unlock(&device->mutex); + vkd3d_mutex_unlock(&device->pipeline_cache_mutex); return compiled_pipeline; }
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 025ad5348..4c5c039ad 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -1757,9 +1757,10 @@ struct d3d12_device
struct vkd3d_gpu_va_allocator gpu_va_allocator;
- struct vkd3d_mutex mutex; struct vkd3d_desc_object_cache view_desc_cache; struct vkd3d_desc_object_cache cbuffer_desc_cache; + + struct vkd3d_mutex pipeline_cache_mutex; struct vkd3d_render_pass_cache render_pass_cache; VkPipelineCache vk_pipeline_cache;
From: Conor McCarthy cmccarthy@codeweavers.com
To optimise cache coherence, because decriptor updates use the most performance-critical code paths. --- libs/vkd3d/vkd3d_private.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 4c5c039ad..bf32d04c2 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -1760,6 +1760,19 @@ struct d3d12_device struct vkd3d_desc_object_cache view_desc_cache; struct vkd3d_desc_object_cache cbuffer_desc_cache;
+ VkDescriptorPoolSize vk_pool_sizes[VKD3D_DESCRIPTOR_POOL_COUNT]; + unsigned int vk_pool_count; + struct vkd3d_vk_descriptor_heap_layout vk_descriptor_heap_layouts[VKD3D_SET_INDEX_COUNT]; + bool use_vk_heaps; + + struct d3d12_descriptor_heap **heaps; + size_t heap_capacity; + size_t heap_count; + union vkd3d_thread_handle worker_thread; + struct vkd3d_mutex worker_mutex; + struct vkd3d_cond worker_cond; + bool worker_should_exit; + struct vkd3d_mutex pipeline_cache_mutex; struct vkd3d_render_pass_cache render_pass_cache; VkPipelineCache vk_pipeline_cache; @@ -1800,19 +1813,6 @@ struct d3d12_device const struct vkd3d_format_compatibility_list *format_compatibility_lists; struct vkd3d_null_resources null_resources; struct vkd3d_uav_clear_state uav_clear_state; - - VkDescriptorPoolSize vk_pool_sizes[VKD3D_DESCRIPTOR_POOL_COUNT]; - unsigned int vk_pool_count; - struct vkd3d_vk_descriptor_heap_layout vk_descriptor_heap_layouts[VKD3D_SET_INDEX_COUNT]; - bool use_vk_heaps; - - struct d3d12_descriptor_heap **heaps; - size_t heap_capacity; - size_t heap_count; - union vkd3d_thread_handle worker_thread; - struct vkd3d_mutex worker_mutex; - struct vkd3d_cond worker_cond; - bool worker_should_exit; };
HRESULT d3d12_device_create(struct vkd3d_instance *instance,
On Mon Dec 11 04:02:13 2023 +0000, Conor McCarthy wrote:
There are no artifacts for the llvmpipe builds, and when testing upstream vkd3d on it locally it fails various tests. @giomasce how is this supposed to be done?
Fixed now. I see the llvmpipe tests are allowed to fail, but in this case it was hanging.
+static HRESULT device_worker_stop(struct d3d12_device *device) +{ + HRESULT hr; + + TRACE("device %p.\n", device); + + vkd3d_mutex_lock(&device->worker_mutex); + + device->worker_should_exit = true; + vkd3d_cond_signal(&device->worker_cond); + + vkd3d_mutex_unlock(&device->worker_mutex); + + if (FAILED(hr = vkd3d_join_thread(device->vkd3d_instance, &device->worker_thread))) + return hr; + + vkd3d_mutex_destroy(&device->worker_mutex); + vkd3d_cond_destroy(&device->worker_cond); + + return S_OK; +} ... +static void *device_worker_main(void *arg) +{ + struct d3d12_descriptor_heap *heap; + struct d3d12_device *device = arg; + size_t i; + + vkd3d_set_thread_name("device_worker"); + + vkd3d_mutex_lock(&device->worker_mutex); + + for (;;) + { + for (i = 0; i < device->heap_count && !device->worker_should_exit; ++i) + { + heap = device->heaps[i]; + if (heap->dirty_list_head == UINT_MAX) + continue; + vkd3d_mutex_lock(&heap->vk_sets_mutex); + d3d12_desc_flush_vk_heap_updates_locked(heap, device); + vkd3d_mutex_unlock(&heap->vk_sets_mutex); + } + + if (device->worker_should_exit) + break; + + vkd3d_cond_wait(&device->worker_cond, &device->worker_mutex); + } + + vkd3d_mutex_unlock(&device->worker_mutex); + + return NULL; +}
Does it make sense to check device->worker_should_exit on each loop iteration above? device->worker_mutex should prevent device_worker_stop() from modifying device->worker_should_exit before the vkd3d_cond_wait() call, right?
I also still think it would be helpful to explain the problem we're trying to solve (i.e., descriptor writes getting delayed until command list submission, and then potentially becoming a bottleneck) somewhere in the actual commit. Perhaps as a comment in device_worker_main().