From: Conor McCarthy cmccarthy@codeweavers.com
The list suffers from the ABA problem, where comparison with the head succeeds but the head's `next` pointer has changed. Occurs in Cyberpunk 2077, on NVIDIA at least. --- include/private/vkd3d_memory.h | 18 +++++++++++ libs/vkd3d-common/memory.c | 31 +++++++++++++++++++ libs/vkd3d/device.c | 20 +++++++++---- libs/vkd3d/resource.c | 55 ++++++++++++++++++++++++++-------- libs/vkd3d/vkd3d_private.h | 52 +++++++++++++++++++++++++++++++- 5 files changed, 158 insertions(+), 18 deletions(-)
diff --git a/include/private/vkd3d_memory.h b/include/private/vkd3d_memory.h index 8a2edb100..db93f1c5a 100644 --- a/include/private/vkd3d_memory.h +++ b/include/private/vkd3d_memory.h @@ -55,6 +55,24 @@ static inline void vkd3d_free(void *ptr) free(ptr); }
+static inline void *vkd3d_malloc_aligned(size_t size, size_t alignment) +{ + /* aligned_alloc() requires C11. */ + void *p = malloc(size); + + /* Systems which support double-wide CAS should return a pointer with the required alignment. */ + if ((intptr_t)p & (alignment - 1)) + { + FIXME("Alignment %zu is not supported.\n", alignment); + vkd3d_free(p); + return NULL; + } + + return p; +} + +bool are_128_bit_atomics_supported(void); + static inline char *vkd3d_strdup(const char *string) { size_t len = strlen(string) + 1; diff --git a/libs/vkd3d-common/memory.c b/libs/vkd3d-common/memory.c index 2bf8947e0..9852532a3 100644 --- a/libs/vkd3d-common/memory.c +++ b/libs/vkd3d-common/memory.c @@ -46,3 +46,34 @@ bool vkd3d_array_reserve(void **elements, size_t *capacity, size_t element_count
return true; } + +#ifdef __x86_64__ +static void do_cpuid(unsigned int ax, unsigned int cx, unsigned int *p) +{ + __asm__( "pushq %rbx\n\t" + "movl %edi,%eax\n\t" + "movl %esi,%ecx\n\t" + "movq %rdx,%r8\n\t" + "cpuid\n\t" + "movl %eax,(%r8)\n\t" + "movl %ebx,4(%r8)\n\t" + "movl %ecx,8(%r8)\n\t" + "movl %edx,12(%r8)\n\t" + "popq %rbx\n\t" + ); +} + +bool are_128_bit_atomics_supported() +{ + unsigned int regs[4]; + + /* Get standard cpuid level and vendor name */ + do_cpuid(0, 0, regs); + /* Check for supported cpuid version */ + if (!regs[0]) + return false; + /* Get cpu features */ + do_cpuid(1, 0, regs); + return (!!(regs[2] & (1 << 13))); +} +#endif /* __x86_64__*/ diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index a2e1f13de..84c776cd8 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2433,10 +2433,15 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device) device->vk_pool_count = 6; };
-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size) +static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size, + const struct d3d12_device *device) { - cache->head = NULL; + cache->head.address.u.object = NULL; + cache->head.address.tag = 0; cache->size = size; +#ifdef __x86_64__ + cache->have_128_bit_atomics = device->have_128_bit_atomics; +#endif }
static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache) @@ -2444,7 +2449,7 @@ static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cach union d3d12_desc_object u; void *next;
- for (u.object = cache->head; u.object; u.object = next) + for (u = cache->head.address.u; u.object; u.object = next) { next = u.header->next; vkd3d_free(u.object); @@ -4024,8 +4029,13 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, device->blocked_queue_count = 0; vkd3d_mutex_init(&device->blocked_queues_mutex);
- vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view)); - vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc)); +#ifdef __x86_64__ + if (!(device->have_128_bit_atomics = are_128_bit_atomics_supported())) + WARN("128-bit atomics are not supported.\n"); +#endif + + vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view), device); + vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc), device);
device_init_descriptor_pool_sizes(device);
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index cd3856c29..0d6b5123c 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -2282,36 +2282,67 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource) return d3d12_resource_decref(impl_from_ID3D12Resource(resource)); }
+static inline bool atomic_compare_exchange_desc_object_address(union desc_object_address volatile *head, + union desc_object_address cmp, union desc_object_address xchg, struct vkd3d_desc_object_cache *cache) +{ +#ifdef __x86_64__ + if (cache->have_128_bit_atomics) + return vkd3d_atomic_compare_exchange_128(&head->value, cmp.value, xchg.value); + else + return vkd3d_atomic_compare_exchange_pointer(&head->address.u.object, cmp.address.u.object, + xchg.address.u.object); +#elif defined(__APPLE__) + /* TODO: solve ABA problem. */ + return vkd3d_atomic_compare_exchange_pointer(&head->address.u.object, cmp.address.u.object, + xchg.address.u.object); +#else + return vkd3d_atomic_compare_exchange_64(&head->value, cmp.value, xchg.value); +#endif /* __x86_64__ */ +} + /* Objects are cached so that vkd3d_view_incref() can safely check the refcount * of an object freed by another thread. */ static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache) { - union d3d12_desc_object u; - void *next; + union desc_object_address head, next;
do { - u.object = cache->head; - if (!u.object) - return vkd3d_malloc(cache->size); - next = u.header->next; + head = cache->head; + if (!head.address.u.object) + { + union d3d12_desc_object u; + if ((u.object = vkd3d_malloc_aligned(cache->size, sizeof(head)))) + u.header->tag = 0; + return u.object; + } + /* This next value may be incorrect if other threads pop the head, use it, then push it + * back, but the head's tag will have been incremented in vkd3d_desc_object_cache_push(), + * so the comparison will return false. */ + next.address.u.object = head.address.u.header->next; + /* The object pointer and its tag combined form the object's comparison value. */ + next.address.tag = next.address.u.object ? next.address.u.header->tag : 0; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, u.object, next)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, next, cache));
- return u.object; + return head.address.u.object; }
static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object) { - union d3d12_desc_object u = {object}; - void *head; + union desc_object_address head, push; + + push.address.u.object = object; + /* An object's tag is incremented when it is pushed as the new head. Storing a + * unique tag for each object reduces the risk of wrap on 32-bit systems. */ + push.address.tag = ++push.address.u.header->tag;
do { head = cache->head; - u.header->next = head; + push.address.u.header->next = head.address.u.object; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, head, u.object)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, push, cache)); }
static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device) diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 4bd6812b1..814071883 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -268,6 +268,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return InterlockedCompareExchange((LONG volatile *)x, xchg, cmp) == cmp; }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return InterlockedCompareExchange64((LONG64 volatile *)x, xchg, cmp) == cmp; +} + static inline unsigned int vkd3d_atomic_exchange(unsigned int volatile *x, unsigned int val) { return InterlockedExchange((LONG volatile *)x, val); @@ -409,6 +414,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return __sync_bool_compare_and_swap(x, cmp, xchg); }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return __sync_bool_compare_and_swap(x, cmp, xchg); +} + static inline bool vkd3d_atomic_compare_exchange_pointer(void * volatile *x, void *cmp, void *xchg) { return __sync_bool_compare_and_swap(x, cmp, xchg); @@ -453,6 +463,25 @@ static inline void *vkd3d_atomic_exchange_pointer(void * volatile *x, void *val)
#endif /* _WIN32 */
+#ifdef __x86_64__ +/* Avoid complications with compiler support. Based on the Wine implementation. */ +static inline bool vkd3d_atomic_compare_exchange_128(__uint128_t volatile *x, __uint128_t cmp, __uint128_t xchg) +{ + uint64_t xchg_high = xchg >> 64, xchg_low = xchg; + uint64_t volatile *dest = (uint64_t volatile *)x; + uint64_t cmp_high = cmp >> 64, cmp_low = cmp; + unsigned char ret; + + __asm__ __volatile__( "lock cmpxchg16b %0; setz %b2" + : "=m" (dest[0]), "=m" (dest[1]), "=r" (ret), + "=a" (cmp_low), "=d" (cmp_high) + : "m" (dest[0]), "m" (dest[1]), "3" (cmp_low), "4" (cmp_high), + "b" (xchg_low), "c" (xchg_high) + ); + return ret; +} +#endif /* __x86_64__ */ + HRESULT vkd3d_create_thread(struct vkd3d_instance *instance, PFN_vkd3d_thread thread_main, void *data, union vkd3d_thread_handle *thread); HRESULT vkd3d_join_thread(struct vkd3d_instance *instance, union vkd3d_thread_handle *thread); @@ -842,6 +871,7 @@ struct vkd3d_desc_header uint32_t magic; unsigned int volatile refcount; void *next; + size_t tag; VkDescriptorType vk_descriptor_type; };
@@ -1690,10 +1720,27 @@ struct vkd3d_uav_clear_state HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device); void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
+union desc_object_address +{ +#ifdef __x86_64__ + __uint128_t value; +#else + uint64_t value; +#endif + struct + { + union d3d12_desc_object u; + size_t tag; + } address; +}; + struct vkd3d_desc_object_cache { - void * volatile head; + union desc_object_address volatile head; size_t size; +#ifdef __x86_64__ + bool have_128_bit_atomics; +#endif };
#define VKD3D_DESCRIPTOR_POOL_COUNT 6 @@ -1712,6 +1759,9 @@ struct d3d12_device
struct vkd3d_gpu_va_allocator gpu_va_allocator;
+#ifdef __x86_64__ + bool have_128_bit_atomics; +#endif struct vkd3d_mutex mutex; struct vkd3d_desc_object_cache view_desc_cache; struct vkd3d_desc_object_cache cbuffer_desc_cache;