From: Conor McCarthy cmccarthy@codeweavers.com
The PE build of vkd3d uses Windows critical sections for synchronisation, and these slow down under the very high lock/unlock rate of multithreaded descriptor copying in Shadow of the Tomb Raider. This patch speeds up the demo by about 8%. By comparison, using SRW locks in the allocators and taking read-only locks where applicable is about 4% faster.
---
 libs/vkd3d/command.c       |  19 +---
 libs/vkd3d/device.c        | 177 +------------------------------------
 libs/vkd3d/resource.c      |  34 ++++---
 libs/vkd3d/vkd3d_private.h |  30 +++----
 4 files changed, 36 insertions(+), 224 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index dabd8424..2e9ba29e 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -2802,7 +2802,6 @@ static void d3d12_command_list_prepare_descriptors(struct d3d12_command_list *li
     unsigned int variable_binding_size, unbounded_offset, table_index, heap_size, i;
     const struct d3d12_root_signature *root_signature = bindings->root_signature;
     const struct d3d12_descriptor_set_layout *layout;
-    struct d3d12_device *device = list->device;
     const struct d3d12_desc *base_descriptor;
     VkDescriptorSet vk_descriptor_set;
@@ -2833,8 +2832,7 @@ static void d3d12_command_list_prepare_descriptors(struct d3d12_command_list *li
                 /* Descriptors may not be set, eg. WoW. */
                 && (base_descriptor = bindings->descriptor_tables[table_index]))
         {
-            heap_size = vkd3d_gpu_descriptor_allocator_range_size_from_descriptor(
-                    &device->gpu_descriptor_allocator, base_descriptor);
+            heap_size = base_descriptor->heap->desc.NumDescriptors;

             if (heap_size < unbounded_offset)
                 WARN("Descriptor heap size %u is less than the offset %u of an unbounded range in table %u, "
@@ -2974,8 +2972,7 @@ static void d3d12_command_list_update_descriptor_table(struct d3d12_command_list
         descriptor_count = range->descriptor_count;
         if ((unbounded = descriptor_count == UINT_MAX))
         {
-            descriptor_count = vkd3d_gpu_descriptor_allocator_range_size_from_descriptor(
-                    &list->device->gpu_descriptor_allocator, descriptor);
+            descriptor_count = d3d12_desc_heap_range_size(descriptor);

             if (descriptor_count > range->vk_binding_count)
             {
@@ -3242,7 +3239,7 @@ static unsigned int d3d12_command_list_bind_descriptor_table(struct d3d12_comman
     /* AMD, Nvidia and Intel drivers on Windows work if SetDescriptorHeaps()
      * is not called, so we bind heaps from the tables instead. No NULL check is
      * needed here because it's checked when descriptor tables are set. */
-    heap = vkd3d_gpu_descriptor_allocator_heap_from_descriptor(&list->device->gpu_descriptor_allocator, desc);
+    heap = desc->heap;
     offset = desc - (const struct d3d12_desc *)heap->descriptors;

     if (heap->desc.Type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV)
@@ -4517,16 +4514,6 @@ static void d3d12_command_list_set_descriptor_table(struct d3d12_command_list *l
     if (bindings->descriptor_tables[index] == desc)
         return;

-    if (desc && !vkd3d_gpu_descriptor_allocator_heap_from_descriptor(&list->device->gpu_descriptor_allocator,
-            desc))
-    {
-        /* Failure to find a heap means the descriptor handle is from
-         * the wrong heap type or not a handle at all. */
-        ERR("Invalid heap for base descriptor %"PRIx64".\n", base_descriptor.ptr);
-        /* TODO: Mark list as invalid? */
-        return;
-    }
-
     bindings->descriptor_tables[index] = desc;
     bindings->descriptor_table_dirty_mask |= (uint64_t)1 << index;
     bindings->descriptor_table_active_mask |= (uint64_t)1 << index;
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 1ee6ed37..94e54c2c 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2366,175 +2366,6 @@ static void vkd3d_gpu_va_allocator_cleanup(struct vkd3d_gpu_va_allocator *alloca
     vkd3d_mutex_destroy(&allocator->mutex);
 }

-/* We could use bsearch() or recursion here, but it probably helps to omit
- * all the extra function calls. */
-static struct vkd3d_gpu_descriptor_allocation *vkd3d_gpu_descriptor_allocator_binary_search(
-        const struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *desc)
-{
-    struct vkd3d_gpu_descriptor_allocation *allocations = allocator->allocations;
-    const struct d3d12_desc *base;
-    size_t centre, count;
-
-    for (count = allocator->allocation_count; count > 1; )
-    {
-        centre = count >> 1;
-        base = allocations[centre].base;
-        if (base <= desc)
-        {
-            allocations += centre;
-            count -= centre;
-        }
-        else
-        {
-            count = centre;
-        }
-    }
-
-    return allocations;
-}
-
-bool vkd3d_gpu_descriptor_allocator_register_range(struct vkd3d_gpu_descriptor_allocator *allocator,
-        const struct d3d12_desc *base, size_t count)
-{
-    struct vkd3d_gpu_descriptor_allocation *allocation;
-    int rc;
-
-    if ((rc = vkd3d_mutex_lock(&allocator->mutex)))
-    {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return false;
-    }
-
-    if (!vkd3d_array_reserve((void **)&allocator->allocations, &allocator->allocations_size,
-            allocator->allocation_count + 1, sizeof(*allocator->allocations)))
-    {
-        vkd3d_mutex_unlock(&allocator->mutex);
-        return false;
-    }
-
-    if (allocator->allocation_count > 1)
-        allocation = vkd3d_gpu_descriptor_allocator_binary_search(allocator, base);
-    else
-        allocation = allocator->allocations;
-    allocation += allocator->allocation_count && base > allocation->base;
-    memmove(&allocation[1], allocation, (allocator->allocation_count++ - (allocation - allocator->allocations))
-            * sizeof(*allocation));
-
-    allocation->base = base;
-    allocation->count = count;
-
-    vkd3d_mutex_unlock(&allocator->mutex);
-
-    return true;
-}
-
-bool vkd3d_gpu_descriptor_allocator_unregister_range(
-        struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *base)
-{
-    bool found;
-    size_t i;
-    int rc;
-
-    if ((rc = vkd3d_mutex_lock(&allocator->mutex)))
-    {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return false;
-    }
-
-    for (i = 0, found = false; i < allocator->allocation_count; ++i)
-    {
-        if (allocator->allocations[i].base != base)
-            continue;
-
-        memmove(&allocator->allocations[i], &allocator->allocations[i + 1],
-                (--allocator->allocation_count - i) * sizeof(allocator->allocations[0]));
-
-        found = true;
-        break;
-    }
-
-    vkd3d_mutex_unlock(&allocator->mutex);
-
-    return found;
-}
-
-static inline const struct vkd3d_gpu_descriptor_allocation *vkd3d_gpu_descriptor_allocator_allocation_from_descriptor(
-        const struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *desc)
-{
-    const struct vkd3d_gpu_descriptor_allocation *allocation;
-
-    allocation = vkd3d_gpu_descriptor_allocator_binary_search(allocator, desc);
-    return (desc >= allocation->base && desc - allocation->base < allocation->count) ? allocation : NULL;
-}
-
-/* Return the available size from the specified descriptor to the heap end. */
-size_t vkd3d_gpu_descriptor_allocator_range_size_from_descriptor(
-        struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *desc)
-{
-    const struct vkd3d_gpu_descriptor_allocation *allocation;
-    size_t remaining;
-    int rc;
-
-    assert(allocator->allocation_count);
-
-    if ((rc = vkd3d_mutex_lock(&allocator->mutex)))
-    {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return 0;
-    }
-
-    remaining = 0;
-    if ((allocation = vkd3d_gpu_descriptor_allocator_allocation_from_descriptor(allocator, desc)))
-        remaining = allocation->count - (desc - allocation->base);
-
-    vkd3d_mutex_unlock(&allocator->mutex);
-
-    return remaining;
-}
-
-struct d3d12_descriptor_heap *vkd3d_gpu_descriptor_allocator_heap_from_descriptor(
-        struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *desc)
-{
-    const struct vkd3d_gpu_descriptor_allocation *allocation;
-    int rc;
-
-    if (!allocator->allocation_count)
-        return NULL;
-
-    if ((rc = vkd3d_mutex_lock(&allocator->mutex)))
-    {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return NULL;
-    }
-
-    allocation = vkd3d_gpu_descriptor_allocator_allocation_from_descriptor(allocator, desc);
-
-    vkd3d_mutex_unlock(&allocator->mutex);
-
-    return allocation ? CONTAINING_RECORD(allocation->base, struct d3d12_descriptor_heap, descriptors)
-            : NULL;
-}
-
-static bool vkd3d_gpu_descriptor_allocator_init(struct vkd3d_gpu_descriptor_allocator *allocator)
-{
-    int rc;
-
-    memset(allocator, 0, sizeof(*allocator));
-    if ((rc = vkd3d_mutex_init(&allocator->mutex)))
-    {
-        ERR("Failed to initialise mutex, error %d.\n", rc);
-        return false;
-    }
-
-    return true;
-}
-
-static void vkd3d_gpu_descriptor_allocator_cleanup(struct vkd3d_gpu_descriptor_allocator *allocator)
-{
-    vkd3d_free(allocator->allocations);
-    vkd3d_mutex_destroy(&allocator->mutex);
-}
-
 static bool have_vk_time_domain(VkTimeDomainEXT *domains, unsigned int count, VkTimeDomainEXT domain)
 {
     unsigned int i;
@@ -2671,7 +2502,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device *iface)
     vkd3d_uav_clear_state_cleanup(&device->uav_clear_state, device);
     vkd3d_destroy_null_resources(&device->null_resources, device);
     vkd3d_gpu_va_allocator_cleanup(&device->gpu_va_allocator);
-    vkd3d_gpu_descriptor_allocator_cleanup(&device->gpu_descriptor_allocator);
     vkd3d_render_pass_cache_cleanup(&device->render_pass_cache, device);
     d3d12_device_destroy_pipeline_cache(device);
     d3d12_device_destroy_vkd3d_queues(device);
@@ -3643,8 +3473,7 @@ static void d3d12_device_vk_heaps_copy_descriptors(struct d3d12_device *device,
     unsigned int dst_range_size, src_range_size;
     struct d3d12_desc *dst;

-    descriptor_heap = vkd3d_gpu_descriptor_allocator_heap_from_descriptor(&device->gpu_descriptor_allocator,
-            d3d12_desc_from_cpu_handle(dst_descriptor_range_offsets[0]));
+    descriptor_heap = d3d12_desc_from_cpu_handle(dst_descriptor_range_offsets[0])->heap;
     heap_base = (const struct d3d12_desc *)descriptor_heap->descriptors;
     heap_end = heap_base + descriptor_heap->desc.NumDescriptors;

@@ -3662,8 +3491,7 @@ static void d3d12_device_vk_heaps_copy_descriptors(struct d3d12_device *device,
         if (dst < heap_base || dst >= heap_end)
         {
             flush_desc_writes(locations, infos, descriptor_heap, device);
-            descriptor_heap = vkd3d_gpu_descriptor_allocator_heap_from_descriptor(&device->gpu_descriptor_allocator,
-                    dst);
+            descriptor_heap = dst->heap;
             heap_base = (const struct d3d12_desc *)descriptor_heap->descriptors;
             heap_end = heap_base + descriptor_heap->desc.NumDescriptors;
         }
@@ -4320,7 +4148,6 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
         goto out_cleanup_uav_clear_state;

     vkd3d_render_pass_cache_init(&device->render_pass_cache);
-    vkd3d_gpu_descriptor_allocator_init(&device->gpu_descriptor_allocator);
     vkd3d_gpu_va_allocator_init(&device->gpu_va_allocator);
     vkd3d_time_domains_init(device);
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 68c28cd1..8a66088a 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2230,7 +2230,7 @@ static void d3d12_desc_write_vk_heap(const struct d3d12_desc *dst, const struct
     const struct vkd3d_vk_device_procs *vk_procs;
     bool is_null = false;

-    descriptor_heap = vkd3d_gpu_descriptor_allocator_heap_from_descriptor(&device->gpu_descriptor_allocator, dst);
+    descriptor_heap = dst->heap;
     descriptor_set = &descriptor_heap->vk_descriptor_sets[vkd3d_vk_descriptor_set_index_from_vk_descriptor_type(
             src->vk_descriptor_type)];
     vk_procs = &device->vk_procs;
@@ -2295,13 +2295,13 @@ static void d3d12_desc_write_atomic_d3d12_only(struct d3d12_desc *dst, const str
     if (!(dst->magic & VKD3D_DESCRIPTOR_MAGIC_HAS_VIEW) || InterlockedDecrement(&dst->u.view_info.view->refcount))
     {
-        *dst = *src;
+        d3d12_desc_copy_raw(dst, src);
         vkd3d_mutex_unlock(mutex);
         return;
     }

     defunct_view = dst->u.view_info.view;
-    *dst = *src;
+    d3d12_desc_copy_raw(dst, src);
     vkd3d_mutex_unlock(mutex);

     /* Destroy the view after unlocking to reduce wait time. */
@@ -2322,7 +2322,7 @@ void d3d12_desc_write_atomic(struct d3d12_desc *dst, const struct d3d12_desc *sr
             && !InterlockedDecrement(&dst->u.view_info.view->refcount))
         defunct_view = dst->u.view_info.view;

-    *dst = *src;
+    d3d12_desc_copy_raw(dst, src);

     vkd3d_mutex_unlock(mutex);

@@ -2407,7 +2407,7 @@ void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src,
     if (src->magic & VKD3D_DESCRIPTOR_MAGIC_HAS_VIEW)
         vkd3d_view_incref(src->u.view_info.view);

-    tmp = *src;
+    d3d12_desc_copy_raw(&tmp, src);

     vkd3d_mutex_unlock(mutex);

@@ -3708,9 +3708,6 @@ static ULONG STDMETHODCALLTYPE d3d12_descriptor_heap_Release(ID3D12DescriptorHea
                 d3d12_desc_destroy(&descriptors[i], device);
             }

-            if (device->vk_info.EXT_descriptor_indexing && !vkd3d_gpu_descriptor_allocator_unregister_range(
-                    &device->gpu_descriptor_allocator, descriptors))
-                ERR("Failed to unregister descriptor range.\n");
             break;
         }

@@ -4025,6 +4022,8 @@ HRESULT d3d12_descriptor_heap_create(struct d3d12_device *device,
 {
     size_t max_descriptor_count, descriptor_size;
     struct d3d12_descriptor_heap *object;
+    struct d3d12_desc *dst;
+    unsigned int i;
     HRESULT hr;

     if (!(descriptor_size = d3d12_device_get_descriptor_handle_increment_size(device, desc->Type)))
@@ -4057,12 +4056,19 @@ HRESULT d3d12_descriptor_heap_create(struct d3d12_device *device,
         return hr;
     }

-    memset(object->descriptors, 0, descriptor_size * desc->NumDescriptors);
-
-    if ((desc->Type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV || desc->Type == D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER)
-            && device->vk_info.EXT_descriptor_indexing && !vkd3d_gpu_descriptor_allocator_register_range(
-            &device->gpu_descriptor_allocator, (struct d3d12_desc *)object->descriptors, desc->NumDescriptors))
-        ERR("Failed to register descriptor range.\n");
+    if (desc->Type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV || desc->Type == D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER)
+    {
+        dst = (struct d3d12_desc *)object->descriptors;
+        for (i = 0; i < desc->NumDescriptors; ++i)
+        {
+            memset(&dst[i], 0, offsetof(struct d3d12_desc, heap));
+            dst[i].heap = object;
+        }
+    }
+    else
+    {
+        memset(object->descriptors, 0, descriptor_size * desc->NumDescriptors);
+    }

     TRACE("Created descriptor heap %p.\n", object);
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 2da1ab5d..b3fa5d64 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -397,24 +397,6 @@ struct vkd3d_gpu_descriptor_allocation
     size_t count;
 };

-struct vkd3d_gpu_descriptor_allocator
-{
-    struct vkd3d_mutex mutex;
-
-    struct vkd3d_gpu_descriptor_allocation *allocations;
-    size_t allocations_size;
-    size_t allocation_count;
-};
-
-size_t vkd3d_gpu_descriptor_allocator_range_size_from_descriptor(
-        struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *desc);
-bool vkd3d_gpu_descriptor_allocator_register_range(struct vkd3d_gpu_descriptor_allocator *allocator,
-        const struct d3d12_desc *base, size_t count);
-bool vkd3d_gpu_descriptor_allocator_unregister_range(
-        struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *base);
-struct d3d12_descriptor_heap *vkd3d_gpu_descriptor_allocator_heap_from_descriptor(
-        struct vkd3d_gpu_descriptor_allocator *allocator, const struct d3d12_desc *desc);
-
 struct vkd3d_render_pass_key
 {
     unsigned int attachment_count;
@@ -725,6 +707,7 @@ struct d3d12_desc
         VkDescriptorBufferInfo vk_cbv_info;
         struct vkd3d_view_info view_info;
     } u;
+    struct d3d12_descriptor_heap *heap;
 };

 static inline struct d3d12_desc *d3d12_desc_from_cpu_handle(D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle)
@@ -737,6 +720,11 @@ static inline struct d3d12_desc *d3d12_desc_from_gpu_handle(D3D12_GPU_DESCRIPTOR
     return (struct d3d12_desc *)(intptr_t)gpu_handle.ptr;
 }

+static inline void d3d12_desc_copy_raw(struct d3d12_desc *dst, const struct d3d12_desc *src)
+{
+    memcpy(dst, src, offsetof(struct d3d12_desc, heap));
+}
+
 void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src,
         struct d3d12_device *device);
 void d3d12_desc_create_cbv(struct d3d12_desc *descriptor,
         struct d3d12_device *device, const D3D12_CONSTANT_BUFFER_VIEW_DESC *desc);
@@ -857,6 +845,11 @@ struct d3d12_descriptor_heap
     BYTE descriptors[];
 };

+static inline unsigned int d3d12_desc_heap_range_size(const struct d3d12_desc *desc)
+{
+    return desc->heap->desc.NumDescriptors - (desc - (const struct d3d12_desc *)desc->heap->descriptors);
+}
+
 HRESULT d3d12_descriptor_heap_create(struct d3d12_device *device,
         const D3D12_DESCRIPTOR_HEAP_DESC *desc, struct d3d12_descriptor_heap **descriptor_heap);

@@ -1465,7 +1458,6 @@ struct d3d12_device
     PFN_vkd3d_signal_event signal_event;
     size_t wchar_size;

-    struct vkd3d_gpu_descriptor_allocator gpu_descriptor_allocator;
     struct vkd3d_gpu_va_allocator gpu_va_allocator;

     struct vkd3d_mutex mutex;
@@ -2833,8 +2832,7 @@ static void d3d12_command_list_prepare_descriptors(struct d3d12_command_list *li
                 /* Descriptors may not be set, eg. WoW. */
                 && (base_descriptor = bindings->descriptor_tables[table_index]))
         {
-            heap_size = vkd3d_gpu_descriptor_allocator_range_size_from_descriptor(
-                    &device->gpu_descriptor_allocator, base_descriptor);
+            heap_size = base_descriptor->heap->desc.NumDescriptors;

             if (heap_size < unbounded_offset)
                 WARN("Descriptor heap size %u is less than the offset %u of an unbounded range in table %u, "
That doesn't do the same thing as the code it replaces. Is that intentional, or should this use d3d12_desc_heap_range_size() like the other call sites?
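To spell out the difference (a rough sketch only, using the heap field this patch adds; not a proposed change):

/* What the new code computes: the size of the whole heap. */
heap_size = base_descriptor->heap->desc.NumDescriptors;

/* What the removed helper computed, and what d3d12_desc_heap_range_size()
 * returns: the descriptors remaining from base_descriptor to the heap end. */
heap_size = base_descriptor->heap->desc.NumDescriptors
        - (base_descriptor - (const struct d3d12_desc *)base_descriptor->heap->descriptors);

The two values only agree when base_descriptor is the first descriptor in its heap.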
-struct vkd3d_gpu_descriptor_allocator
-{
-    struct vkd3d_mutex mutex;
-
-    struct vkd3d_gpu_descriptor_allocation *allocations;
-    size_t allocations_size;
-    size_t allocation_count;
-};
We should also be able to remove struct vkd3d_gpu_descriptor_allocation now, right?
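Presumably something like this on top (a sketch only; the member list is inferred from the fields the removed allocator code used):

-struct vkd3d_gpu_descriptor_allocation
-{
-    const struct d3d12_desc *base;
-    size_t count;
-};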
+static inline void d3d12_desc_copy_raw(struct d3d12_desc *dst, const struct d3d12_desc *src)
+{
+    memcpy(dst, src, offsetof(struct d3d12_desc, heap));
+}
That's a bit unfortunate; this ends up depending on the exact layout of struct d3d12_desc. Could we just wrap the descriptor "body" in a structure instead?
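Something along these lines, perhaps (a rough sketch; the body's fields and types are abbreviated from what is visible in this patch, and the inner member name "s" is only a placeholder):

struct d3d12_desc
{
    /* Everything the raw copy and the per-descriptor memset should cover. */
    struct
    {
        uint32_t magic;
        VkDescriptorType vk_descriptor_type;
        union
        {
            VkDescriptorBufferInfo vk_cbv_info;
            struct vkd3d_view_info view_info;
        } u;
    } s;
    struct d3d12_descriptor_heap *heap;
};

static inline void d3d12_desc_copy_raw(struct d3d12_desc *dst, const struct d3d12_desc *src)
{
    dst->s = src->s;
}

That would keep d3d12_desc_copy_raw() and the per-descriptor memset() in d3d12_descriptor_heap_create() from depending on the field order, at the cost of renaming every dst->magic / dst->u access.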