-- v2: vkd3d: Fix invalid atomic behaviour in the view cache linked list.
From: Conor McCarthy cmccarthy@codeweavers.com
The list suffers from the ABA problem, where comparison with the head succeeds but the head's `next` pointer has changed. Occurs in Cyberpunk 2077, on NVIDIA at least. --- include/private/vkd3d_memory.h | 18 ++++++++ libs/vkd3d-common/memory.c | 21 +++++++++ libs/vkd3d/device.c | 44 +++++++++++++++--- libs/vkd3d/resource.c | 82 +++++++++++++++++++++++++++------- libs/vkd3d/vkd3d_private.h | 58 +++++++++++++++++++++++- 5 files changed, 200 insertions(+), 23 deletions(-)
diff --git a/include/private/vkd3d_memory.h b/include/private/vkd3d_memory.h index 8a2edb100..db93f1c5a 100644 --- a/include/private/vkd3d_memory.h +++ b/include/private/vkd3d_memory.h @@ -55,6 +55,24 @@ static inline void vkd3d_free(void *ptr) free(ptr); }
+static inline void *vkd3d_malloc_aligned(size_t size, size_t alignment) +{ + /* aligned_alloc() requires C11. */ + void *p = malloc(size); + + /* Systems which support double-wide CAS should return a pointer with the required alignment. */ + if ((intptr_t)p & (alignment - 1)) + { + FIXME("Alignment %zu is not supported.\n", alignment); + vkd3d_free(p); + return NULL; + } + + return p; +} + +bool are_128_bit_atomics_supported(void); + static inline char *vkd3d_strdup(const char *string) { size_t len = strlen(string) + 1; diff --git a/libs/vkd3d-common/memory.c b/libs/vkd3d-common/memory.c index 2bf8947e0..4c9dfcf16 100644 --- a/libs/vkd3d-common/memory.c +++ b/libs/vkd3d-common/memory.c @@ -46,3 +46,24 @@ bool vkd3d_array_reserve(void **elements, size_t *capacity, size_t element_count
return true; } + +#ifdef __x86_64__ +static inline void do_cpuid(uint32_t ax, uint32_t cx, uint32_t info[4]) +{ + __asm__ ("cpuid" : "=a"(info[0]), "=b" (info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(ax), "c"(cx)); +} + +bool are_128_bit_atomics_supported() +{ + unsigned int regs[4]; + + /* Get standard cpuid level and vendor name */ + do_cpuid(0, 0, regs); + /* Check for supported cpuid version */ + if (!regs[0]) + return false; + /* Get cpu features */ + do_cpuid(1, 0, regs); + return (!!(regs[2] & (1 << 13))); +} +#endif /* __x86_64__*/ diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index a2e1f13de..c0068a994 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2433,10 +2433,13 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device) device->vk_pool_count = 6; };
-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size) +static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size, + const struct d3d12_device *device) { - cache->head = NULL; + cache->head.address.u.object = NULL; + cache->head.address.serial_id = 0; cache->size = size; + vkd3d_mutex_init(&cache->mutex); }
static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache) @@ -2444,11 +2447,12 @@ static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cach union d3d12_desc_object u; void *next;
- for (u.object = cache->head; u.object; u.object = next) + for (u = cache->head.address.u; u.object; u.object = next) { next = u.header->next; vkd3d_free(u.object); } + vkd3d_mutex_destroy(&cache->mutex); }
/* ID3D12Device */ @@ -4024,8 +4028,38 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, device->blocked_queue_count = 0; vkd3d_mutex_init(&device->blocked_queues_mutex);
- vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view)); - vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc)); +#ifdef __x86_64__ + if (are_128_bit_atomics_supported()) + { + STATIC_ASSERT(sizeof(union desc_object_address) == 16); + STATIC_ASSERT(__alignof__(union desc_object_address) == 16); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push; + } + else + { + /* Use of mutexes has a significant cost in games with heavy multithreaded view creation/destruction. + * For example, the framerate is halved in some segments of the Shadow of the Tomb Raider benchmark, + * and drops about a third overall in the Horizon Zero Dawn benchmark. + * x86-64 CPUs which lack CAS128 support are old and probably rare. */ + WARN("128-bit atomics are not supported by the CPU.\n"); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get_with_mutex; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push_with_mutex; + } +#elif defined(__i386__) + STATIC_ASSERT(sizeof(union desc_object_address) == 8); + STATIC_ASSERT(__alignof__(union desc_object_address) == 8); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push; +#else + /* TODO: implement atomic pointer compare+swap with ABA safety. */ + WARN("Atomic linked list is not supported.\n"); + device->cache_ops.desc_object_cache_get = vkd3d_desc_object_cache_get_with_mutex; + device->cache_ops.desc_object_cache_push = vkd3d_desc_object_cache_push_with_mutex; +#endif + + vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view), device); + vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc), device);
device_init_descriptor_pool_sizes(device);
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index cd3856c29..8bd6df52f 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -2282,43 +2282,91 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource) return d3d12_resource_decref(impl_from_ID3D12Resource(resource)); }
+static inline bool atomic_compare_exchange_desc_object_address(union desc_object_address volatile *head, + union desc_object_address cmp, union desc_object_address xchg, struct vkd3d_desc_object_cache *cache) +{ +#ifdef __x86_64__ + return vkd3d_atomic_compare_exchange_128(&head->value, cmp.value, xchg.value); +#elif defined(__i386__) + return vkd3d_atomic_compare_exchange_64(&head->value, cmp.value, xchg.value); +#else +# error "atomic linked list not implemented for this platform" +#endif /* __x86_64__ */ +} + /* Objects are cached so that vkd3d_view_incref() can safely check the refcount * of an object freed by another thread. */ -static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache) +void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache) { - union d3d12_desc_object u; - void *next; + union desc_object_address head, new_head;
do { - u.object = cache->head; - if (!u.object) - return vkd3d_malloc(cache->size); - next = u.header->next; + head = cache->head; + if (!head.address.u.object) + return vkd3d_malloc_aligned(cache->size, sizeof(head)); + /* This `next` value may be incorrect if other threads pop the head, use it, then push it + * back, but the head's serial_id will have been incremented in vkd3d_desc_object_cache_push(), + * so the comparison will return false. */ + new_head.address.u.object = head.address.u.header->next; + /* The object pointer and current serial_id combined form the object's comparison value. */ + new_head.address.serial_id = head.address.serial_id; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, u.object, next)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, new_head, cache)); + + return head.address.u.object; +} + +void *vkd3d_desc_object_cache_get_with_mutex(struct vkd3d_desc_object_cache *cache) +{ + union d3d12_desc_object u; + + vkd3d_mutex_lock(&cache->mutex);
+ u = cache->head.address.u; + if (!u.object) + u.object = vkd3d_malloc(cache->size); + else + cache->head.address.u.object = u.header->next; + + vkd3d_mutex_unlock(&cache->mutex); return u.object; }
-static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object) +void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object) { - union d3d12_desc_object u = {object}; - void *head; + union desc_object_address head, new_head; + + new_head.address.u.object = object;
do { head = cache->head; - u.header->next = head; + /* Increment the serial_id when pushing a new head. */ + new_head.address.serial_id = head.address.serial_id + 1; + new_head.address.u.header->next = head.address.u.object; } - while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, head, u.object)); + while (!atomic_compare_exchange_desc_object_address(&cache->head, head, new_head, cache)); +} + +void vkd3d_desc_object_cache_push_with_mutex(struct vkd3d_desc_object_cache *cache, void *object) +{ + union d3d12_desc_object u; + + vkd3d_mutex_lock(&cache->mutex); + + u.object = object; + u.header->next = cache->head.address.u.object; + cache->head.address.u = u; + + vkd3d_mutex_unlock(&cache->mutex); }
static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device) { struct vkd3d_cbuffer_desc *desc;
- if (!(desc = vkd3d_desc_object_cache_get(&device->cbuffer_desc_cache))) + if (!(desc = device->cache_ops.desc_object_cache_get(&device->cbuffer_desc_cache))) return NULL;
desc->h.magic = VKD3D_DESCRIPTOR_MAGIC_CBV; @@ -2335,7 +2383,7 @@ static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_
assert(magic);
- if (!(view = vkd3d_desc_object_cache_get(&device->view_desc_cache))) + if (!(view = device->cache_ops.desc_object_cache_get(&device->view_desc_cache))) { ERR("Failed to allocate descriptor object.\n"); return NULL; @@ -2374,7 +2422,7 @@ static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *dev if (view->v.vk_counter_view) VK_CALL(vkDestroyBufferView(device->vk_device, view->v.vk_counter_view, NULL));
- vkd3d_desc_object_cache_push(&device->view_desc_cache, view); + device->cache_ops.desc_object_cache_push(&device->view_desc_cache, view); }
void vkd3d_view_decref(void *view, struct d3d12_device *device) @@ -2387,7 +2435,7 @@ void vkd3d_view_decref(void *view, struct d3d12_device *device) if (u.header->magic != VKD3D_DESCRIPTOR_MAGIC_CBV) vkd3d_view_destroy(u.view, device); else - vkd3d_desc_object_cache_push(&device->cbuffer_desc_cache, u.object); + device->cache_ops.desc_object_cache_push(&device->cbuffer_desc_cache, u.object); }
static inline void d3d12_desc_replace(struct d3d12_desc *dst, void *view, struct d3d12_device *device) diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 4bd6812b1..cb1395433 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -268,6 +268,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return InterlockedCompareExchange((LONG volatile *)x, xchg, cmp) == cmp; }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return InterlockedCompareExchange64((LONG64 volatile *)x, xchg, cmp) == cmp; +} + static inline unsigned int vkd3d_atomic_exchange(unsigned int volatile *x, unsigned int val) { return InterlockedExchange((LONG volatile *)x, val); @@ -409,6 +414,11 @@ static inline bool vkd3d_atomic_compare_exchange(unsigned int volatile *x, unsig return __sync_bool_compare_and_swap(x, cmp, xchg); }
+static inline bool vkd3d_atomic_compare_exchange_64(uint64_t volatile *x, uint64_t cmp, uint64_t xchg) +{ + return __sync_bool_compare_and_swap(x, cmp, xchg); +} + static inline bool vkd3d_atomic_compare_exchange_pointer(void * volatile *x, void *cmp, void *xchg) { return __sync_bool_compare_and_swap(x, cmp, xchg); @@ -453,6 +463,25 @@ static inline void *vkd3d_atomic_exchange_pointer(void * volatile *x, void *val)
#endif /* _WIN32 */
+#ifdef __x86_64__ +/* Avoid complications with compiler support. Based on the Wine implementation. */ +static inline bool vkd3d_atomic_compare_exchange_128(__uint128_t volatile *x, __uint128_t cmp, __uint128_t xchg) +{ + uint64_t xchg_high = xchg >> 64, xchg_low = xchg; + uint64_t volatile *dest = (uint64_t volatile *)x; + uint64_t cmp_high = cmp >> 64, cmp_low = cmp; + unsigned char ret; + + __asm__ __volatile__( "lock cmpxchg16b %0; setz %b2" + : "=m" (dest[0]), "=m" (dest[1]), "=r" (ret), + "=a" (cmp_low), "=d" (cmp_high) + : "m" (dest[0]), "m" (dest[1]), "3" (cmp_low), "4" (cmp_high), + "b" (xchg_low), "c" (xchg_high) + ); + return ret; +} +#endif /* __x86_64__ */ + HRESULT vkd3d_create_thread(struct vkd3d_instance *instance, PFN_vkd3d_thread thread_main, void *data, union vkd3d_thread_handle *thread); HRESULT vkd3d_join_thread(struct vkd3d_instance *instance, union vkd3d_thread_handle *thread); @@ -1690,14 +1719,35 @@ struct vkd3d_uav_clear_state HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device); void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
+union desc_object_address +{ +#ifdef __x86_64__ + __uint128_t value; +#else + uint64_t value; +#endif + struct + { + union d3d12_desc_object u; + size_t serial_id; + } address; +}; + struct vkd3d_desc_object_cache { - void * volatile head; + union desc_object_address volatile head; size_t size; + struct vkd3d_mutex mutex; };
#define VKD3D_DESCRIPTOR_POOL_COUNT 6
+struct desc_object_cache_ops +{ + void *(*desc_object_cache_get)(struct vkd3d_desc_object_cache *cache); + void (*desc_object_cache_push)(struct vkd3d_desc_object_cache *cache, void *object); +}; + /* ID3D12Device */ struct d3d12_device { @@ -1712,6 +1762,7 @@ struct d3d12_device
struct vkd3d_gpu_va_allocator gpu_va_allocator;
+ struct desc_object_cache_ops cache_ops; struct vkd3d_mutex mutex; struct vkd3d_desc_object_cache view_desc_cache; struct vkd3d_desc_object_cache cbuffer_desc_cache; @@ -1790,6 +1841,11 @@ static inline unsigned int d3d12_device_get_descriptor_handle_increment_size(str return ID3D12Device_GetDescriptorHandleIncrementSize(&device->ID3D12Device_iface, descriptor_type); }
+void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache); +void *vkd3d_desc_object_cache_get_with_mutex(struct vkd3d_desc_object_cache *cache); +void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object); +void vkd3d_desc_object_cache_push_with_mutex(struct vkd3d_desc_object_cache *cache, void *object); + /* utils */ enum vkd3d_format_type {
The performance difference is larger in Horizon Zero Dawn.
I concluded it's unsafe to use tag values stored in each object because we could load a value which becomes stale before the swap. A single tag value is more likely to overflow on 32-bit, but 32-bit use is likely very uncommon and the issue is rare enough that it's unlikely to be coincident with an overflow.
I previously talked about parts of this with Giovanni on IRC, but the productive thing to do is probably to continue that conversation here.
I have a fairly strong dislike for the direction this is going in. In particular, in no specific order:
- Implementing lock-free data structures correctly is notoriously hard, and we're probably seeing an example of that here. Perhaps more importantly, while implementing these correctly may be hard, reviewing the code is generally even harder.
- Adding inline assembly and architecture specific code doesn't help.
- Neither does inlining the linked list implementation in the device and object cache code.
- If the issue with using a regular mutex or even a spinlock is contention, perhaps we should try to address that, instead of attempting to make the synchronisation primitives faster. (Do these caches need to be global to the device? Could we e.g. make them local to the CPU core or thread accessing them?)
- In the case of CBVs in particular, given the number of them that applications appear to create and destroy per frame, as well as the fact that these are fairly small structures, allocating them individually using vkd3d_malloc() seems less than ideal. (I.e., I imagine we'd want to use slab allocation for these in order to improve both locality and allocation overhead.)
Chris Robinson (@kcat) commented about include/private/vkd3d_memory.h:
free(ptr);
}
+static inline void *vkd3d_malloc_aligned(size_t size, size_t alignment) +{
- /* aligned_alloc() requires C11. */
- void *p = malloc(size);
- /* Systems which support double-wide CAS should return a pointer with the required alignment. */
- if ((intptr_t)p & (alignment - 1))
This doesn't look safe. `malloc` is only required to return an alignment suitable for built-in POD types, but it may have a larger alignment depending on where it's able to find available memory, resulting in this spuriously succeeding or failing. If you can't use win32's `_aligned_malloc`/`_aligned_free`, or POSIX's `posix_memalign`, you may need to manually align with over-allocation.