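Replace the shared descriptor mutexes with a per-descriptor spinlock. This greatly improves performance in various games that update or copy a large number of descriptors per frame, where the overhead of pthread_mutex_lock() and pthread_mutex_unlock() is high.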
Based on vkd3d-proton patches by Hans-Kristian Arntzen, Philip Rebohle and Georg Lehmann.
Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com> --- configure.ac | 1 + include/private/vkd3d_common.h | 33 +++++++++++++++++++++++++++++++++ libs/vkd3d/device.c | 10 +--------- libs/vkd3d/resource.c | 18 ++++++++---------- libs/vkd3d/vkd3d_private.h | 17 ++--------------- 5 files changed, 45 insertions(+), 34 deletions(-)
diff --git a/configure.ac b/configure.ac index d296dfd4..546b6c57 100644 --- a/configure.ac +++ b/configure.ac @@ -131,6 +131,7 @@ VKD3D_CHECK_FUNC([HAVE_BUILTIN_POPCOUNT], [__builtin_popcount], [__builtin_popco VKD3D_CHECK_FUNC([HAVE_BUILTIN_ADD_OVERFLOW], [__builtin_add_overflow], [__builtin_add_overflow(0, 0, (int *)0)]) VKD3D_CHECK_FUNC([HAVE_SYNC_ADD_AND_FETCH], [__sync_add_and_fetch], [__sync_add_and_fetch((int *)0, 0)]) VKD3D_CHECK_FUNC([HAVE_SYNC_SUB_AND_FETCH], [__sync_sub_and_fetch], [__sync_sub_and_fetch((int *)0, 0)]) +VKD3D_CHECK_FUNC([HAVE_SYNC_LOCK_TEST_AND_SET], [__sync_lock_test_and_set], [__sync_lock_test_and_set((int *)0, 0)])
VKD3D_CHECK_PTHREAD_SETNAME_NP
diff --git a/include/private/vkd3d_common.h b/include/private/vkd3d_common.h index 8d1ca397..1ae43d8f 100644 --- a/include/private/vkd3d_common.h +++ b/include/private/vkd3d_common.h @@ -28,6 +28,10 @@ #include <stdbool.h> #include <stdint.h>
+#ifdef __SSE2__ +#include <emmintrin.h> +#endif + #ifdef _MSC_VER #include <intrin.h> #endif @@ -211,6 +215,18 @@ static inline LONG InterlockedDecrement(LONG volatile *x) # else # error "InterlockedDecrement() not implemented for this platform" # endif + +# if HAVE_SYNC_LOCK_TEST_AND_SET +static inline LONG InterlockedExchange(LONG volatile *ptr, LONG val) +{ + return __sync_lock_test_and_set(ptr, val); +} +# define vkd3d_spinlock_unlock(lock) __sync_lock_release(lock) +# else +# error "spinlocks not implemented for this platform" +# endif /* HAVE_SYNC_LOCK_TEST_AND_SET */ +#else +# define vkd3d_spinlock_unlock(lock) InterlockedExchange(lock, 0) #endif /* _WIN32 */
#if HAVE_SYNC_ADD_AND_FETCH @@ -222,6 +238,23 @@ static inline LONG InterlockedDecrement(LONG volatile *x) # error "atomic_add_fetch() not implemented for this platform" #endif /* HAVE_SYNC_ADD_AND_FETCH */
+typedef LONG vkd3d_spinlock_t; + +static inline void vkd3d_spinlock_acquire(vkd3d_spinlock_t *lock) +{ + while (InterlockedExchange(lock, 1)) + { +#ifdef __SSE2__ + _mm_pause(); +#endif + } +} + +static inline void vkd3d_spinlock_release(vkd3d_spinlock_t *lock) +{ + vkd3d_spinlock_unlock(lock); +} + static inline void vkd3d_parse_version(const char *version, int *major, int *minor) { *major = atoi(version); diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index 0fadb521..28a01331 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2213,7 +2213,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device *iface) { struct d3d12_device *device = impl_from_ID3D12Device(iface); ULONG refcount = InterlockedDecrement(&device->refcount); - size_t i;
TRACE("%p decreasing refcount to %u.\n", device, refcount);
@@ -2231,8 +2230,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device *iface) vkd3d_fence_worker_stop(&device->fence_worker, device); d3d12_device_destroy_pipeline_cache(device); d3d12_device_destroy_vkd3d_queues(device); - for (i = 0; i < ARRAY_SIZE(device->desc_mutex); ++i) - pthread_mutex_destroy(&device->desc_mutex[i]); VK_CALL(vkDestroyDevice(device->vk_device, NULL)); if (device->parent) IUnknown_Release(device->parent); @@ -3135,8 +3132,7 @@ static void STDMETHODCALLTYPE d3d12_device_CopyDescriptors(ID3D12Device *iface, struct d3d12_device *device = impl_from_ID3D12Device(iface); unsigned int dst_range_idx, dst_idx, src_range_idx, src_idx; unsigned int dst_range_size, src_range_size; - const struct d3d12_desc *src; - struct d3d12_desc *dst; + struct d3d12_desc *src, *dst;
TRACE("iface %p, dst_descriptor_range_count %u, dst_descriptor_range_offsets %p, " "dst_descriptor_range_sizes %p, src_descriptor_range_count %u, " @@ -3692,7 +3688,6 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, { const struct vkd3d_vk_device_procs *vk_procs; HRESULT hr; - size_t i;
device->ID3D12Device_iface.lpVtbl = &d3d12_device_vtbl; device->refcount = 1; @@ -3731,9 +3726,6 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, vkd3d_render_pass_cache_init(&device->render_pass_cache); vkd3d_gpu_va_allocator_init(&device->gpu_va_allocator);
- for (i = 0; i < ARRAY_SIZE(device->desc_mutex); ++i) - pthread_mutex_init(&device->desc_mutex[i], NULL); - if ((device->parent = create_info->parent)) IUnknown_AddRef(device->parent);
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index 2c6c07c7..f3cbb684 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -2122,21 +2122,21 @@ void d3d12_desc_write_atomic(struct d3d12_desc *dst, const struct d3d12_desc *sr struct d3d12_device *device) { struct d3d12_desc destroy_desc; - pthread_mutex_t *mutex;
destroy_desc.u.view = NULL;
- mutex = d3d12_device_get_descriptor_mutex(device, dst); - pthread_mutex_lock(mutex); + vkd3d_spinlock_acquire(&dst->spinlock);
/* Nothing to do for VKD3D_DESCRIPTOR_MAGIC_CBV. */ if ((dst->magic & VKD3D_DESCRIPTOR_MAGIC_HAS_VIEW) && !InterlockedDecrement(&dst->u.view->refcount)) destroy_desc = *dst;
- *dst = *src; + dst->magic = src->magic; + dst->vk_descriptor_type = src->vk_descriptor_type; + dst->u = src->u;
- pthread_mutex_unlock(mutex); + vkd3d_spinlock_release(&dst->spinlock);
/* Destroy the view after unlocking to reduce wait time. */ if (destroy_desc.u.view) @@ -2150,12 +2150,11 @@ static void d3d12_desc_destroy(struct d3d12_desc *descriptor, struct d3d12_devic d3d12_desc_write_atomic(descriptor, &null_desc, device); }
-void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, +void d3d12_desc_copy(struct d3d12_desc *dst, struct d3d12_desc *src, struct d3d12_device *device) { bool needs_update = true; struct d3d12_desc tmp; - pthread_mutex_t *mutex;
assert(dst != src);
@@ -2181,14 +2180,13 @@ void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src,
if (needs_update) { - mutex = d3d12_device_get_descriptor_mutex(device, src); - pthread_mutex_lock(mutex); + vkd3d_spinlock_acquire(&src->spinlock);
if (src->magic & VKD3D_DESCRIPTOR_MAGIC_HAS_VIEW) vkd3d_view_incref(src->u.view); tmp = *src;
- pthread_mutex_unlock(mutex); + vkd3d_spinlock_release(&src->spinlock);
d3d12_desc_write_atomic(dst, &tmp, device); } diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 9829e0aa..78e503ae 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -517,6 +517,7 @@ struct d3d12_desc { uint32_t magic; VkDescriptorType vk_descriptor_type; + vkd3d_spinlock_t spinlock; union { VkDescriptorBufferInfo vk_cbv_info; @@ -534,7 +535,7 @@ static inline struct d3d12_desc *d3d12_desc_from_gpu_handle(D3D12_GPU_DESCRIPTOR return (struct d3d12_desc *)(intptr_t)gpu_handle.ptr; }
-void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, struct d3d12_device *device); +void d3d12_desc_copy(struct d3d12_desc *dst, struct d3d12_desc *src, struct d3d12_device *device); void d3d12_desc_create_cbv(struct d3d12_desc *descriptor, struct d3d12_device *device, const D3D12_CONSTANT_BUFFER_VIEW_DESC *desc); void d3d12_desc_create_srv(struct d3d12_desc *descriptor, @@ -1134,7 +1135,6 @@ struct d3d12_device struct vkd3d_fence_worker fence_worker;
pthread_mutex_t mutex; - pthread_mutex_t desc_mutex[8]; struct vkd3d_render_pass_cache render_pass_cache; VkPipelineCache vk_pipeline_cache;
@@ -1200,19 +1200,6 @@ static inline unsigned int d3d12_device_get_descriptor_handle_increment_size(str return ID3D12Device_GetDescriptorHandleIncrementSize(&device->ID3D12Device_iface, descriptor_type); }
-static inline pthread_mutex_t *d3d12_device_get_descriptor_mutex(struct d3d12_device *device, - const struct d3d12_desc *descriptor) -{ - STATIC_ASSERT(!(ARRAY_SIZE(device->desc_mutex) & (ARRAY_SIZE(device->desc_mutex) - 1))); - uintptr_t idx = (uintptr_t)descriptor; - - idx ^= idx >> 12; - idx ^= idx >> 6; - idx ^= idx >> 3; - - return &device->desc_mutex[idx & (ARRAY_SIZE(device->desc_mutex) - 1)]; -} - /* utils */ enum vkd3d_format_type {