From: Conor McCarthy cmccarthy@codeweavers.com
The list suffers from the ABA problem, where comparison with the head succeeds but the head's `next` pointer has changed. Occurs in Cyberpunk 2077, on NVIDIA at least. --- include/private/vkd3d_memory.h | 18 ++++++++ libs/vkd3d-common/memory.c | 21 +++++++++ libs/vkd3d/device.c | 44 +++++++++++++++--- libs/vkd3d/resource.c | 82 +++++++++++++++++++++++++++------- libs/vkd3d/vkd3d_private.h | 58 +++++++++++++++++++++++- 5 files changed, 200 insertions(+), 23 deletions(-)
diff --git a/include/private/vkd3d_memory.h b/include/private/vkd3d_memory.h index 8a2edb100..db93f1c5a 100644 --- a/include/private/vkd3d_memory.h +++ b/include/private/vkd3d_memory.h @@ -55,6 +55,24 @@ static inline void vkd3d_free(void *ptr) free(ptr); }
+static inline void *vkd3d_malloc_aligned(size_t size, size_t alignment) +{ + /* aligned_alloc() requires C11. */ + void *p = malloc(size); + + /* Systems which support double-wide CAS should return a pointer with the required alignment. */ + if ((intptr_t)p & (alignment - 1)) + { + FIXME("Alignment %zu is not supported.\n", alignment); + vkd3d_free(p); + return NULL; + } + + return p; +} + +bool are_128_bit_atomics_supported(void); + static inline char *vkd3d_strdup(const char *string) { size_t len = strlen(string) + 1; diff --git a/libs/vkd3d-common/memory.c b/libs/vkd3d-common/memory.c index 2bf8947e0..4c9dfcf16 100644 --- a/libs/vkd3d-common/memory.c +++ b/libs/vkd3d-common/memory.c @@ -46,3 +46,24 @@ bool vkd3d_array_reserve(void **elements, size_t *capacity, size_t element_count
return true; } + +#ifdef __x86_64__ +static inline void do_cpuid(uint32_t ax, uint32_t cx, uint32_t info[4]) +{ + __asm__ ("cpuid" : "=a"(info[0]), "=b" (info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(ax), "c"(cx)); +} + +bool are_128_bit_atomics_supported() +{ + unsigned int regs[4]; + + /* Get standard cpuid level and vendor name */ + do_cpuid(0, 0, regs); + /* Check for supported cpuid version */ + if (!regs[0]) + return false; + /* Get cpu features */ + do_cpuid(1, 0, regs); + return (!!(regs[2] & (1 << 13))); +} +#endif /* __x86_64__*/ diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index a2e1f13de..c0068a994 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2433,10 +2433,13 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device) device->vk_pool_count = 6; };
-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size) +static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size, + const struct d3d12_device *device) { - cache->head = NULL; + cache->head.address.u.object = NULL; + cache->head.address.serial_id = 0; cache->size = size; + vkd3d_mutex_init(&cache->mutex); }
static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache) @@ -2444,11 +2447,12 @@ static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cach union d3d12_desc_object u; void *next;
- for (u.object = cache->head; u.object; u.object = next) + for (u = cache->head.address.u; u.object; u.object = next) { next = u.header->next; vkd3d_free(u.object); } + vkd3d_mutex_destroy(&cache->mutex); }
/* ID3D12Device */ @@ -4024,8 +4028,38 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, device->blocked_queue_count = 0; vkd3d_mutex_init(&device->blocked_queues_mutex);
- vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view)); - vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc)); +#ifdef __x86_64__ + if (are_128_bit_atomics_supported()) + { + STATIC_ASSERT(sizeof(union desc_object_address) == 16); + STATIC_ASSERT(__alignof__(union desc_object_address) == 16); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push; + } + else + { + /* Use of mutexes has a significant cost in games with heavy multithreaded view creation/destruction. + * For example, the framerate is halved in some segments of the Shadow of the Tomb Raider benchmark, + * and drops about a third overall in the Horizon Zero Dawn benchmark. + * x86-64 CPUs which lack CAS128 support are old and probably rare. */ + WARN("128-bit atomics are not supported by the CPU.\n"); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get_with_mutex; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push_with_mutex; + } +#elif defined(__i386__) + STATIC_ASSERT(sizeof(union desc_object_address) == 8); + STATIC_ASSERT(__alignof__(union desc_object_address) == 8); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push; +#else + /* TODO: implement atomic pointer compare+swap with ABA safety. */ + WARN("Atomic linked list is not supported.\n"); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get_with_mutex; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push_with_mutex; +#endif + + vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view), device); + vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc), device);
device_init_descriptor_pool_sizes(device);
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index cd3856c29..8bd6df52f 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -2282,43 +2282,91 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource) return d3d12_resource_decref(impl_from_ID3D12Resource(resource)); }
+static inline bool atomic_compare_exchange_desc_object_address(union desc_object_address volatile *head, + union desc_object_address cmp, union desc_object_address xchg, struct vkd3d_desc_object_cache *cache) +{ +#ifdef __x86_64__ + return vkd3d_atomic_compare_exchange_128(&head->value, cmp.value, xchg.value); +#elif defined(__i386__) + return vkd3d_atomic_compare_exchange_64(&head->value, cmp.value, xchg.value); +#else +# error "atomic linked list not implemented for this platform" +#endif /* __x86_64__ */ +} + /* Objects are cached so that vkd3d_view_incref() can safely check the refcount * of an object freed by another thread. */ -static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache) +void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache) { - union d3d12_desc_object u; - void *next; + union desc_object_address head, new_head;
do { - u.object = cache->head; - if (!u.object) - return vkd3d_malloc(cache->size); - next = u.header->next; + head = cache->head; + if (!head.address.u.object) + return vkd3d_malloc_aligned(cache->size, sizeof(head)); + /* This `next` value may be incorrect if other threads pop the head, use it, then push it + * back, but the head's serial_id will have been incremented in vkd3d_desc_object_cache_push(), + * so the comparison will return false. */ + new_head.address.u.object = head.address.u.header->next; + /* The object pointer and current serial_id combined form the object's comparison value. */ + new_head.address.serial_id = head.address.serial_id; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, u.object, next)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, new_head, cache)); + + return head.address.u.object; +} + +void *vkd3d_desc_object_cache_get_with_mutex(struct vkd3d_desc_object_cache *cache) +{ + union d3d12_desc_object u; + + vkd3d_mutex_lock(&cache->mutex);
+ u = cache->head.address.u; + if (!u.object) + u.object = vkd3d_malloc(cache->size); + else + cache->head.address.u.object = u.header->next; + + vkd3d_mutex_unlock(&cache->mutex); return u.object; }
-static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object) +void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object) { - union d3d12_desc_object u = {object}; - void *head; + union desc_object_address head, new_head; + + new_head.address.u.object = object;
do { head = cache->head; - u.header->next = head; + /* Increment the serial_id when pushing a new head. */ + new_head.address.serial_id = head.address.serial_id + 1; + new_head.address.u.header->next = head.address.u.object; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, head, u.object)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, new_head, cache)); +} + +void vkd3d_desc_object_cache_push_with_mutex(struct vkd3d_desc_object_cache *cache, void *object) +{ + union d3d12_desc_object u; + + vkd3d_mutex_lock(&cache->mutex); + + u.object = object; + u.header->next = cache->head.address.u.object; + cache->head.address.u = u; + + vkd3d_mutex_unlock(&cache->mutex); }
static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device) { struct vkd3d_cbuffer_desc *desc;
- if (!(desc = vkd3d_desc_object_cache_get(&device->cbuffer_desc_cache))) + if (!(desc = device->cache_ops.desc_object_cache_get(&device->cbuffer_desc_cache))) return NULL;
desc->h.magic = VKD3D_DESCRIPTOR_MAGIC_CBV; @@ -2335,7 +2383,7 @@ static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_
assert(magic);
- if (!(view = vkd3d_desc_object_cache_get(&device->view_desc_cache))) + if (!(view = device->cache_ops.desc_object_cache_get(&device->view_desc_cache))) { ERR("Failed to allocate descriptor object.\n"); return NULL; @@ -2374,7 +2422,7 @@ static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *dev if (view->v.vk_counter_view) VK_CALL(vkDestroyBufferView(device->vk_device, view->v.vk_counter_view, NULL));
- vkd3d_desc_object_cache_push(&device->view_desc_cache, view); + device->cache_ops.desc_object_cache_push(&device->view_desc_cache, view); }
void vkd3d_view_decref(void *view, struct d3d12_device *device) @@ -2387,7 +2435,7 @@ void vkd3d_view_decref(void *view, struct d3d12_device *device) if (u.header->magic != VKD3D_DESCRIPTOR_MAGIC_CBV) vkd3d_view_destroy(u.view, device); else - vkd3d_desc_object_cache_push(&device->cbuffer_desc_cache, u.object); + device->cache_ops.desc_object_cache_push(&device->cbuffer_desc_cache, u.object); }
static inline void d3d12_desc_replace(struct d3d12_desc *dst, void *view, struct d3d12_device *device) diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 4bd6812b1..cb1395433 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -268,6 +268,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return InterlockedCompareExchange((LONG volatile *)x, xchg, cmp) == cmp; }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return InterlockedCompareExchange64((LONG64 volatile *)x, xchg, cmp) == cmp; +} + static inline unsigned int vkd3d_atomic_exchange(unsigned int volatile *x, unsigned int val) { return InterlockedExchange((LONG volatile *)x, val); @@ -409,6 +414,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return __sync_bool_compare_and_swap(x, cmp, xchg); }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return __sync_bool_compare_and_swap(x, cmp, xchg); +} + static inline bool vkd3d_atomic_compare_exchange_pointer(void * volatile *x, void *cmp, void *xchg) { return __sync_bool_compare_and_swap(x, cmp, xchg); @@ -453,6 +463,25 @@ static inline void *vkd3d_atomic_exchange_pointer(void * volatile *x, void *val)
#endif /* _WIN32 */
+#ifdef __x86_64__ +/* Avoid complications with compiler support. Based on the Wine implementation. */ +static inline bool vkd3d_atomic_compare_exchange_128(__uint128_t volatile *x, __uint128_t cmp, __uint128_t xchg) +{ + uint64_t xchg_high = xchg >> 64, xchg_low = xchg; + uint64_t volatile *dest = (uint64_t volatile *)x; + uint64_t cmp_high = cmp >> 64, cmp_low = cmp; + unsigned char ret; + + __asm__ __volatile__( "lock cmpxchg16b %0; setz %b2" + : "=m" (dest[0]), "=m" (dest[1]), "=r" (ret), + "=a" (cmp_low), "=d" (cmp_high) + : "m" (dest[0]), "m" (dest[1]), "3" (cmp_low), "4" (cmp_high), + "b" (xchg_low), "c" (xchg_high) + ); + return ret; +} +#endif /* __x86_64__ */ + HRESULT vkd3d_create_thread(struct vkd3d_instance *instance, PFN_vkd3d_thread thread_main, void *data, union vkd3d_thread_handle *thread); HRESULT vkd3d_join_thread(struct vkd3d_instance *instance, union vkd3d_thread_handle *thread); @@ -1690,14 +1719,35 @@ struct vkd3d_uav_clear_state HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device); void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
+union desc_object_address +{ +#ifdef __x86_64__ + __uint128_t value; +#else + uint64_t value; +#endif + struct + { + union d3d12_desc_object u; + size_t serial_id; + } address; +}; + struct vkd3d_desc_object_cache { - void * volatile head; + union desc_object_address volatile head; size_t size; + struct vkd3d_mutex mutex; };
#define VKD3D_DESCRIPTOR_POOL_COUNT 6
+struct desc_object_cache_ops +{ + void *(*desc_object_cache_get)(struct vkd3d_desc_object_cache *cache); + void (*desc_object_cache_push)(struct vkd3d_desc_object_cache *cache, void *object); +}; + /* ID3D12Device */ struct d3d12_device { @@ -1712,6 +1762,7 @@ struct d3d12_device
struct vkd3d_gpu_va_allocator gpu_va_allocator;
+ struct desc_object_cache_ops cache_ops; struct vkd3d_mutex mutex; struct vkd3d_desc_object_cache view_desc_cache; struct vkd3d_desc_object_cache cbuffer_desc_cache; @@ -1790,6 +1841,11 @@ static inline unsigned int d3d12_device_get_descriptor_handle_increment_size(str return ID3D12Device_GetDescriptorHandleIncrementSize(&device->ID3D12Device_iface, descriptor_type); }
+void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache); +void *vkd3d_desc_object_cache_get_with_mutex(struct vkd3d_desc_object_cache *cache); +void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object); +void vkd3d_desc_object_cache_push_with_mutex(struct vkd3d_desc_object_cache *cache, void *object); + /* utils */ enum vkd3d_format_type {