From: Conor McCarthy cmccarthy@codeweavers.com
The list suffers from the ABA problem, where comparison with the head succeeds but the head's `next` pointer has changed. Occurs in Cyberpunk 2077, on NVIDIA at least. --- include/private/vkd3d_memory.h | 18 +++++++++++ libs/vkd3d-common/memory.c | 31 +++++++++++++++++++ libs/vkd3d/device.c | 20 +++++++++---- libs/vkd3d/resource.c | 55 ++++++++++++++++++++++++++-------- libs/vkd3d/vkd3d_private.h | 52 +++++++++++++++++++++++++++++++- 5 files changed, 158 insertions(+), 18 deletions(-)
diff --git a/include/private/vkd3d_memory.h b/include/private/vkd3d_memory.h index 8a2edb100..db93f1c5a 100644 --- a/include/private/vkd3d_memory.h +++ b/include/private/vkd3d_memory.h @@ -55,6 +55,24 @@ static inline void vkd3d_free(void *ptr) free(ptr); }
+static inline void *vkd3d_malloc_aligned(size_t size, size_t alignment) +{ + /* aligned_alloc() requires C11. */ + void *p = malloc(size); + + /* Systems which support double-wide CAS should return a pointer with the required alignment. */ + if ((intptr_t)p & (alignment - 1)) + { + FIXME("Alignment %zu is not supported.\n", alignment); + vkd3d_free(p); + return NULL; + } + + return p; +} + +bool are_128_bit_atomics_supported(void); + static inline char *vkd3d_strdup(const char *string) { size_t len = strlen(string) + 1; diff --git a/libs/vkd3d-common/memory.c b/libs/vkd3d-common/memory.c index 2bf8947e0..9852532a3 100644 --- a/libs/vkd3d-common/memory.c +++ b/libs/vkd3d-common/memory.c @@ -46,3 +46,34 @@ bool vkd3d_array_reserve(void **elements, size_t *capacity, size_t element_count
return true; } + +#ifdef __x86_64__ +static void do_cpuid(unsigned int ax, unsigned int cx, unsigned int *p) +{ + __asm__( "pushq %rbx\n\t" + "movl %edi,%eax\n\t" + "movl %esi,%ecx\n\t" + "movq %rdx,%r8\n\t" + "cpuid\n\t" + "movl %eax,(%r8)\n\t" + "movl %ebx,4(%r8)\n\t" + "movl %ecx,8(%r8)\n\t" + "movl %edx,12(%r8)\n\t" + "popq %rbx\n\t" + ); +} + +bool are_128_bit_atomics_supported() +{ + unsigned int regs[4]; + + /* Get standard cpuid level and vendor name */ + do_cpuid(0, 0, regs); + /* Check for supported cpuid version */ + if (!regs[0]) + return false; + /* Get cpu features */ + do_cpuid(1, 0, regs); + return (!!(regs[2] & (1 << 13))); +} +#endif /* __x86_64__*/ diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index a2e1f13de..84c776cd8 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2433,10 +2433,15 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device) device->vk_pool_count = 6; };
-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size) +static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size, + const struct d3d12_device *device) { - cache->head = NULL; + cache->head.address.u.object = NULL; + cache->head.address.tag = 0; cache->size = size; +#ifdef __x86_64__ + cache->have_128_bit_atomics = device->have_128_bit_atomics; +#endif }
static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache) @@ -2444,7 +2449,7 @@ static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cach union d3d12_desc_object u; void *next;
- for (u.object = cache->head; u.object; u.object = next) + for (u = cache->head.address.u; u.object; u.object = next) { next = u.header->next; vkd3d_free(u.object); @@ -4024,8 +4029,13 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, device->blocked_queue_count = 0; vkd3d_mutex_init(&device->blocked_queues_mutex);
- vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view)); - vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc)); +#ifdef __x86_64__ + if (!(device->have_128_bit_atomics = are_128_bit_atomics_supported())) + WARN("128-bit atomics are not supported.\n"); +#endif + + vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view), device); + vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc), device);
device_init_descriptor_pool_sizes(device);
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index cd3856c29..0d6b5123c 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -2282,36 +2282,67 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource) return d3d12_resource_decref(impl_from_ID3D12Resource(resource)); }
+static inline bool atomic_compare_exchange_desc_object_address(union desc_object_address volatile *head, + union desc_object_address cmp, union desc_object_address xchg, struct vkd3d_desc_object_cache *cache) +{ +#ifdef __x86_64__ + if (cache->have_128_bit_atomics) + return vkd3d_atomic_compare_exchange_128(&head->value, cmp.value, xchg.value); + else + return vkd3d_atomic_compare_exchange_pointer(&head->address.u.object, cmp.address.u.object, + xchg.address.u.object); +#elif defined(__APPLE__) + /* TODO: solve ABA problem. */ + return vkd3d_atomic_compare_exchange_pointer(&head->address.u.object, cmp.address.u.object, + xchg.address.u.object); +#else + return vkd3d_atomic_compare_exchange_64(&head->value, cmp.value, xchg.value); +#endif /* __x86_64__ */ +} + /* Objects are cached so that vkd3d_view_incref() can safely check the refcount * of an object freed by another thread. */ static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache) { - union d3d12_desc_object u; - void *next; + union desc_object_address head, next;
do { - u.object = cache->head; - if (!u.object) - return vkd3d_malloc(cache->size); - next = u.header->next; + head = cache->head; + if (!head.address.u.object) + { + union d3d12_desc_object u; + if ((u.object = vkd3d_malloc_aligned(cache->size, sizeof(head)))) + u.header->tag = 0; + return u.object; + } + /* This next value may be incorrect if other threads pop the head, use it, then push it + * back, but the head's tag will have been incremented in vkd3d_desc_object_cache_push(), + * so the comparison will return false. */ + next.address.u.object = head.address.u.header->next; + /* The object pointer and its tag combined form the object's comparison value. */ + next.address.tag = next.address.u.object ? next.address.u.header->tag : 0; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, u.object, next)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, next, cache));
- return u.object; + return head.address.u.object; }
static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object) { - union d3d12_desc_object u = {object}; - void *head; + union desc_object_address head, push; + + push.address.u.object = object; + /* An object's tag is incremented when it is pushed as the new head. Storing a + * unique tag for each object reduces the risk of wrap on 32-bit systems. */ + push.address.tag = ++push.address.u.header->tag;
do { head = cache->head; - u.header->next = head; + push.address.u.header->next = head.address.u.object; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, head, u.object)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, push, cache)); }
static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device) diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 4bd6812b1..814071883 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -268,6 +268,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return InterlockedCompareExchange((LONG volatile *)x, xchg, cmp) == cmp; }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return InterlockedCompareExchange64((LONG64 volatile *)x, xchg, cmp) == cmp; +} + static inline unsigned int vkd3d_atomic_exchange(unsigned int volatile *x, unsigned int val) { return InterlockedExchange((LONG volatile *)x, val); @@ -409,6 +414,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return __sync_bool_compare_and_swap(x, cmp, xchg); }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return __sync_bool_compare_and_swap(x, cmp, xchg); +} + static inline bool vkd3d_atomic_compare_exchange_pointer(void * volatile *x, void *cmp, void *xchg) { return __sync_bool_compare_and_swap(x, cmp, xchg); @@ -453,6 +463,25 @@ static inline void *vkd3d_atomic_exchange_pointer(void * volatile *x, void *val)
#endif /* _WIN32 */
+#ifdef __x86_64__ +/* Avoid complications with compiler support. Based on the Wine implementation. */ +static inline bool vkd3d_atomic_compare_exchange_128(__uint128_t volatile *x, __uint128_t cmp, __uint128_t xchg) +{ + uint64_t xchg_high = xchg >> 64, xchg_low = xchg; + uint64_t volatile *dest = (uint64_t volatile *)x; + uint64_t cmp_high = cmp >> 64, cmp_low = cmp; + unsigned char ret; + + __asm__ __volatile__( "lock cmpxchg16b %0; setz %b2" + : "=m" (dest[0]), "=m" (dest[1]), "=r" (ret), + "=a" (cmp_low), "=d" (cmp_high) + : "m" (dest[0]), "m" (dest[1]), "3" (cmp_low), "4" (cmp_high), + "b" (xchg_low), "c" (xchg_high) + ); + return ret; +} +#endif /* __x86_64__ */ + HRESULT vkd3d_create_thread(struct vkd3d_instance *instance, PFN_vkd3d_thread thread_main, void *data, union vkd3d_thread_handle *thread); HRESULT vkd3d_join_thread(struct vkd3d_instance *instance, union vkd3d_thread_handle *thread); @@ -842,6 +871,7 @@ struct vkd3d_desc_header uint32_t magic; unsigned int volatile refcount; void *next; + size_t tag; VkDescriptorType vk_descriptor_type; };
@@ -1690,10 +1720,27 @@ struct vkd3d_uav_clear_state HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device); void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
+union desc_object_address +{ +#ifdef __x86_64__ + __uint128_t value; +#else + uint64_t value; +#endif + struct + { + union d3d12_desc_object u; + size_t tag; + } address; +}; + struct vkd3d_desc_object_cache { - void * volatile head; + union desc_object_address volatile head; size_t size; +#ifdef __x86_64__ + bool have_128_bit_atomics; +#endif };
#define VKD3D_DESCRIPTOR_POOL_COUNT 6 @@ -1712,6 +1759,9 @@ struct d3d12_device
struct vkd3d_gpu_va_allocator gpu_va_allocator;
+#ifdef __x86_64__ + bool have_128_bit_atomics; +#endif struct vkd3d_mutex mutex; struct vkd3d_desc_object_cache view_desc_cache; struct vkd3d_desc_object_cache cbuffer_desc_cache;
I designed it to minimise the risk of wrap, relevant on 32-bit, at the cost of reading from the next object in vkd3d_desc_object_cache_get(). Perhaps we don't care that much about unlikely problems on 32-bit though.
Once this is upstream it will need to be fixed on Mac.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d-common/memory.c:
return true;
}
+#ifdef __x86_64__ +static void do_cpuid(unsigned int ax, unsigned int cx, unsigned int *p)
I don't think you can do that: either you declare the function with attribute `naked` (see https://gcc.gnu.org/onlinedocs/gcc/x86-Function-Attributes.html) and use a basic ASM block, so that you're in charge to interpret calling conventions and ABI in general, or you don't declare the function `naked` and use an extended ASM block with the usual input, output and clobber register declarations.
Between the two, I think that in general using the extended ASM block is better, because it allows the compiler to inline the function call and allocate registers optimally. Not that it should matter that much, in this specific case.
Another minor thing: I would use `uint32_t` instead of `unsigned int`, given that we're talking very low level with a fixed bit width,
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/vkd3d_private.h:
HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device); void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
+union desc_object_address +{ +#ifdef __x86_64__
- __uint128_t value;
+#else
- uint64_t value;
+#endif
- struct
- {
union d3d12_desc_object u;
size_t tag;
- } address;
+};
Can you statically assert that on x86_64 this has size and alignment of 16 bytes?
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/resource.c:
+static inline bool atomic_compare_exchange_desc_object_address(union desc_object_address volatile *head,
union desc_object_address cmp, union desc_object_address xchg, struct vkd3d_desc_object_cache *cache)
+{ +#ifdef __x86_64__
- if (cache->have_128_bit_atomics)
return vkd3d_atomic_compare_exchange_128(&head->value, cmp.value, xchg.value);
- else
return vkd3d_atomic_compare_exchange_pointer(&head->address.u.object, cmp.address.u.object,
xchg.address.u.object);
+#elif defined(__APPLE__)
- /* TODO: solve ABA problem. */
- return vkd3d_atomic_compare_exchange_pointer(&head->address.u.object, cmp.address.u.object,
xchg.address.u.object);
+#else
- return vkd3d_atomic_compare_exchange_64(&head->value, cmp.value, xchg.value);
+#endif /* __x86_64__ */
Hmm, I think this leaves all non-x86_64 architectures broken, and x86_64 too in case `cmpxchg16b` is not supported. I think for these cases we should fallback to guarding the linked list with a mutex.
There was some discussion on whether this object cache is on such a hot path that it requires arch-specific optimizations (instead of just protecting the object cache with a mutex).
After a few tests with Cyberpunk 2077 (using Conor's branch `sm6_rebase`), it seems that yes, there is some performance loss if we just use a mutex instead of the lock-free implementation: specifically, the game calls `CreateConstantBufferView()` a lot of times and from many threads, causing a lot of mutex contention; also, while for other view types creation and destruction involve some Vulkan call, for CBVs there is only some little arithmetic and a few atomic operations, so it is conceivable that a contended mutex impacts performances significantly.