-- v3: vkd3d: Fix invalid atomic behaviour in the view cache linked list.
From: Conor McCarthy <cmccarthy@codeweavers.com>
The list suffers from the ABA problem, where comparison with the head succeeds
but the head's `next` pointer has changed. Occurs in Cyberpunk 2077, on NVIDIA
at least.
---
 libs/vkd3d/device.c        | 12 ++++++----
 libs/vkd3d/resource.c      | 48 ++++++++++++++++++++++++++++----------
 libs/vkd3d/vkd3d_private.h | 10 +++++++-
 3 files changed, 53 insertions(+), 17 deletions(-)
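To make the failure mode concrete, here is a minimal sketch of the vulnerable pattern (illustrative code only, not the vkd3d source; the type and function names are made up):

```c
/* A lock-free pop whose CAS only checks that the head pointer is unchanged. */
struct node
{
    struct node *next;
};

static struct node *unsafe_pop(struct node **head)
{
    struct node *old_head, *next;

    do
    {
        old_head = *head;       /* (1) thread A reads head == X            */
        if (!old_head)
            return NULL;
        next = old_head->next;  /* (2) A reads X->next == Y                */
        /* (3) Meanwhile thread B pops X and Y and then pushes X back:
         *     head == X again, but X->next is no longer Y.                */
    }
    while (!__atomic_compare_exchange_n(head, &old_head, next, false,
            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
    /* (4) A's compare-exchange still succeeds because it only compares the
     *     head pointer, so the stale Y becomes the new head.              */

    return old_head;
}
```

The per-head spinlocks in the patch below avoid this because the head pointer and its `next` link are read and updated under the same lock.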
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index a2e1f13d..19ecb0eb 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2435,19 +2435,23 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device)
 static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size)
 {
-    cache->head = NULL;
+    memset(cache, 0, sizeof(*cache));
     cache->size = size;
 }
 
 static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache)
 {
     union d3d12_desc_object u;
+    unsigned int i;
     void *next;
 
-    for (u.object = cache->head; u.object; u.object = next)
+    for (i = 0; i < ARRAY_SIZE(cache->heads); ++i)
     {
-        next = u.header->next;
-        vkd3d_free(u.object);
+        for (u.object = cache->heads[i].head; u.object; u.object = next)
+        {
+            next = u.header->next;
+            vkd3d_free(u.object);
+        }
     }
 }
 
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index cd3856c2..3702bc1c 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2282,38 +2282,62 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource)
     return d3d12_resource_decref(impl_from_ID3D12Resource(resource));
 }
 
+#define HEAD_INDEX_MASK (ARRAY_SIZE(cache->heads) - 1)
+
 /* Objects are cached so that vkd3d_view_incref() can safely check the refcount
  * of an object freed by another thread. */
 static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache)
 {
     union d3d12_desc_object u;
-    void *next;
+    unsigned int i;
 
-    do
+    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
+    for (;;)
     {
-        u.object = cache->head;
-        if (!u.object)
+        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
+        {
+            if ((u.object = cache->heads[i].head))
+            {
+                vkd3d_atomic_decrement(&cache->free_count);
+                cache->heads[i].head = u.header->next;
+                vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
+                return u.object;
+            }
+            vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
+        }
+
+        if (cache->free_count < ARRAY_SIZE(cache->heads))
             return vkd3d_malloc(cache->size);
-        next = u.header->next;
-    }
-    while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, u.object, next));
 
-    return u.object;
+        i = (i + 1) & HEAD_INDEX_MASK;
+    }
 }
 
 static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object)
 {
     union d3d12_desc_object u = {object};
+    unsigned int i;
     void *head;
 
-    do
+    /* Using the same index as above may result in a somewhat uneven distribution,
+     * but the main objective is to avoid costly spinlock contention. */
+    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
+    for (;;)
     {
-        head = cache->head;
-        u.header->next = head;
+        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
+            break;
+        i = (i + 1) & HEAD_INDEX_MASK;
     }
-    while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, head, u.object));
+
+    head = cache->heads[i].head;
+    u.header->next = head;
+    cache->heads[i].head = u.object;
+    vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
+    vkd3d_atomic_increment(&cache->free_count);
 }
 
+#undef HEAD_INDEX_MASK
+
 static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device)
 {
     struct vkd3d_cbuffer_desc *desc;
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 4bd6812b..7c930df2 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -1690,9 +1690,17 @@ struct vkd3d_uav_clear_state
 HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 
+struct desc_object_cache_head
+{
+    void *head;
+    unsigned int spinlock;
+};
+
 struct vkd3d_desc_object_cache
 {
-    void * volatile head;
+    struct desc_object_cache_head heads[8];
+    unsigned int next_index;
+    unsigned int free_count;
     size_t size;
 };
On Tue Aug 8 15:06:39 2023 +0000, Henri Verbeet wrote:
> I previously talked about parts of this with Giovanni on IRC, but the
> productive thing to do is probably to continue that conversation here. I
> have a fairly strong dislike for the direction this is going in. In
> particular, in no specific order:
> - Implementing lock-free data structures correctly is notoriously hard, and
>   we're probably seeing an example of that here. Perhaps more importantly,
>   while implementing these correctly may be hard, reviewing the code is
>   generally even harder.
> - Adding inline assembly and architecture-specific code doesn't help.
> - Neither does inlining the linked list implementation in the device and
>   object cache code.
> - If the issue with using a regular mutex or even a spinlock is contention,
>   perhaps we should try to address that, instead of attempting to make the
>   synchronisation primitives faster. (Do these caches need to be global to
>   the device? Could we e.g. make them local to the CPU core or thread
>   accessing them?)
> - In the case of CBVs in particular, given the number of them that
>   applications appear to create and destroy per frame, as well as the fact
>   that these are fairly small structures, allocating them individually
>   using vkd3d_malloc() seems less than ideal. (I.e., I imagine we'd want to
>   use slab allocation for these in order to improve both locality and
>   allocation overhead.)
Making caches local to the thread has the potential for some thorny edge cases, where one thread creates more objects than it frees and another does the opposite (on copied descriptors), so the freeing thread's cache grows until the system is out of memory.
The new implementation is much simpler than the 128-bit CAS version and has about the same performance. It's somewhat similar to the old mutex array scheme.
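For context, the usual lock-free fix for ABA, which is presumably what the 128-bit CAS version did, pairs the head pointer with a generation counter and swaps both in a single double-width compare-exchange, so a recycled pointer no longer matches the expected value. A rough C11 sketch of that idea (illustrative only, not the earlier patch revision; a 16-byte compare-exchange generally needs cmpxchg16b/-mcx16 or libatomic on x86-64):

```c
#include <stdatomic.h>
#include <stdint.h>

struct node
{
    struct node *next;
};

struct tagged_head
{
    void *ptr;
    uintptr_t tag;  /* bumped on every successful pop */
};

static void *tagged_pop(_Atomic struct tagged_head *head)
{
    struct tagged_head old, desired;

    old = atomic_load(head);
    do
    {
        if (!old.ptr)
            return NULL;
        desired.ptr = ((struct node *)old.ptr)->next;
        desired.tag = old.tag + 1;  /* a recycled pointer no longer matches */
    }
    while (!atomic_compare_exchange_weak(head, &old, desired));

    return old.ptr;
}
```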
I tried a slab allocator a while back and it had no effect on performance, but it would use less memory, so it may be worth revisiting.
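In case it helps that discussion, a slab scheme would look roughly like the sketch below: descriptors are carved out of larger blocks and recycled through an intrusive free list, so the per-descriptor vkd3d_malloc()/vkd3d_free() calls disappear and objects from one slab stay close together. The names and slab size are made up, it is not the experiment referred to above, and it omits the cross-thread synchronisation and slab cleanup a real version would need:

```c
#include <stdlib.h>

#define SLAB_OBJECT_COUNT 1024  /* arbitrary; tune for the descriptor types */

struct slab_entry
{
    struct slab_entry *next;
};

struct slab_allocator
{
    struct slab_entry *free_list;  /* recycled objects */
    size_t object_size;            /* must be >= sizeof(struct slab_entry) */
};

static void *slab_alloc(struct slab_allocator *allocator)
{
    struct slab_entry *entry;
    char *block;
    size_t i;

    /* Fast path: reuse a previously freed object. */
    if ((entry = allocator->free_list))
    {
        allocator->free_list = entry->next;
        return entry;
    }

    /* Slow path: allocate a whole slab and thread all but the first
     * object onto the free list. */
    if (!(block = malloc(SLAB_OBJECT_COUNT * allocator->object_size)))
        return NULL;
    for (i = 1; i < SLAB_OBJECT_COUNT; ++i)
    {
        entry = (struct slab_entry *)(block + i * allocator->object_size);
        entry->next = allocator->free_list;
        allocator->free_list = entry;
    }
    return block;
}

static void slab_free(struct slab_allocator *allocator, void *object)
{
    struct slab_entry *entry = object;

    entry->next = allocator->free_list;
    allocator->free_list = entry;
}
```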