-- v4: vkd3d: Fix invalid atomic behaviour in the view cache linked list.
From: Conor McCarthy <cmccarthy@codeweavers.com>

The list suffers from the ABA problem, where comparison with the head succeeds but the head's `next` pointer has changed. Occurs in Cyberpunk 2077, on NVIDIA at least.
---
 libs/vkd3d/device.c        | 12 +++++---
 libs/vkd3d/resource.c      | 57 ++++++++++++++++++++++++++++----------
 libs/vkd3d/vkd3d_private.h | 10 ++++++-
 3 files changed, 60 insertions(+), 19 deletions(-)
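For illustration, a minimal sketch of the race being described; the names (`struct entry`, `broken_pop()`) are made up for this example and are not vkd3d code:

```c
#include <stdatomic.h>
#include <stddef.h>

struct entry
{
    struct entry *next;
};

/* Thread A reads head == X and next == Y, then stalls. Thread B pops X and Y
 * and later pushes X back, so head == X again, but X->next no longer points
 * to Y. Thread A's compare-exchange still succeeds and installs the stale Y. */
static struct entry *broken_pop(_Atomic(struct entry *) *head)
{
    struct entry *old_head, *next;

    do
    {
        if (!(old_head = atomic_load(head)))
            return NULL;
        next = old_head->next; /* May already be stale when the CAS below runs. */
    }
    while (!atomic_compare_exchange_weak(head, &old_head, next)); /* ABA: succeeds anyway. */

    return old_head;
}
```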
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index a2e1f13d..19ecb0eb 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2435,19 +2435,23 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device)
 
 static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size)
 {
-    cache->head = NULL;
+    memset(cache, 0, sizeof(*cache));
     cache->size = size;
 }
 
 static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache)
 {
     union d3d12_desc_object u;
+    unsigned int i;
     void *next;
 
-    for (u.object = cache->head; u.object; u.object = next)
+    for (i = 0; i < ARRAY_SIZE(cache->heads); ++i)
     {
-        next = u.header->next;
-        vkd3d_free(u.object);
+        for (u.object = cache->heads[i].head; u.object; u.object = next)
+        {
+            next = u.header->next;
+            vkd3d_free(u.object);
+        }
     }
 }
 
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index cd3856c2..f4158eba 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2282,38 +2282,67 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource)
     return d3d12_resource_decref(impl_from_ID3D12Resource(resource));
 }
 
-/* Objects are cached so that vkd3d_view_incref() can safely check the refcount
- * of an object freed by another thread. */
+#define HEAD_INDEX_MASK (ARRAY_SIZE(cache->heads) - 1)
+
+/* Objects are cached so that vkd3d_view_incref() can safely check the refcount of an
+ * object freed by another thread. This could be implemented as a single atomic linked
+ * list, but it requires handling the ABA problem, which brings issues with cross-platform
+ * support, compiler support, and non-universal x86-64 support for 128-bit CAS. */
 static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache)
 {
     union d3d12_desc_object u;
-    void *next;
+    unsigned int i;
 
-    do
+    STATIC_ASSERT(!(ARRAY_SIZE(cache->heads) & HEAD_INDEX_MASK));
+
+    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
+    for (;;)
     {
-        u.object = cache->head;
-        if (!u.object)
+        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
+        {
+            if ((u.object = cache->heads[i].head))
+            {
+                vkd3d_atomic_decrement(&cache->free_count);
+                cache->heads[i].head = u.header->next;
+                vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
+                return u.object;
+            }
+            vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
+        }
+        /* Keeping a free count avoids uncertainty over when this loop should terminate,
+         * which could result in excess allocations gradually increasing without limit. */
+        if (cache->free_count < ARRAY_SIZE(cache->heads))
             return vkd3d_malloc(cache->size);
-        next = u.header->next;
-    }
-    while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, u.object, next));
 
-    return u.object;
+        i = (i + 1) & HEAD_INDEX_MASK;
+    }
 }
 
 static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object)
 {
     union d3d12_desc_object u = {object};
+    unsigned int i;
     void *head;
 
-    do
+    /* Using the same index as above may result in a somewhat uneven distribution,
+     * but the main objective is to avoid costly spinlock contention. */
+    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
+    for (;;)
     {
-        head = cache->head;
-        u.header->next = head;
+        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
+            break;
+        i = (i + 1) & HEAD_INDEX_MASK;
     }
-    while (!vkd3d_atomic_compare_exchange_pointer(&cache->head, head, u.object));
+
+    head = cache->heads[i].head;
+    u.header->next = head;
+    cache->heads[i].head = u.object;
+    vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
+    vkd3d_atomic_increment(&cache->free_count);
 }
 
+#undef HEAD_INDEX_MASK
+
 static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device)
 {
     struct vkd3d_cbuffer_desc *desc;
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 4bd6812b..de8072f6 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -1690,9 +1690,17 @@ struct vkd3d_uav_clear_state
 HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 
+struct desc_object_cache_head
+{
+    void *head;
+    unsigned int spinlock;
+};
+
 struct vkd3d_desc_object_cache
 {
-    void * volatile head;
+    struct desc_object_cache_head heads[16];
+    unsigned int next_index;
+    unsigned int free_count;
     size_t size;
 };
Horizon Zero Dawn benefits a bit from using 16 heads instead of 8, which is also more future-proof. Like Cyberpunk it creates many CBVs per frame, as does SotTR.
It's possible on x86_64 to eliminate view objects for all buffer types by using the `buffer_device_address` extension, which reduces the buffer info to a 64-bit address and a 64-bit size that can be written to the descriptor with a single `_mm_stream_si128` or `_mm_store_si128` instruction. But this would also require descriptor buffers, because they are the only way to write a Vulkan buffer descriptor with a device address. It doesn't seem worthwhile at the moment.
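Roughly what such a write might look like; the 16-byte layout here is hypothetical, not an existing vkd3d structure:

```c
#include <emmintrin.h>
#include <stdint.h>

/* Hypothetical 16-byte buffer descriptor: a device address plus a size. */
struct buffer_descriptor
{
    uint64_t va;
    uint64_t size;
};

static void write_buffer_descriptor(struct buffer_descriptor *dst, uint64_t va, uint64_t size)
{
    /* _mm_set_epi64x() takes the high element first, so `va` ends up in the low 64 bits. */
    __m128i v = _mm_set_epi64x((int64_t)size, (int64_t)va);

    /* Non-temporal 16-byte store; `dst` must be 16-byte aligned. */
    _mm_stream_si128((__m128i *)dst, v);
}
```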
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/vkd3d_private.h:
 HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 
+struct desc_object_cache_head
+{
+    void *head;
+    unsigned int spinlock;
+};
In theory it is advisable to avoid putting more than one spinlock on the same cache line, because the line would be contended by different cores even when they mean to operate on different spinlocks. I guess that would amount to ensuring that `struct desc_object_cache_head` is padded and aligned to the size of a cache line. On Intel architectures a cache line is 64 bytes, so you are currently putting four spinlocks in the same line.
That could, of course, turn out to be the usual theoretical concern that doesn't matter at all in practice, but it's probably worth a try.
Unfortunately C doesn't (as far as I know) offer a portable way to query the cache line size at compilation time ([as C++17 does](https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_s...)). [Experimenting a little bit with the compiler explorer](https://godbolt.org/z/fEe4rK74s) it seems that most architectures are either 32 or 64 bytes, with PowerPC being 128 bytes and ARM64 possibly even 256 bytes. Given that we mostly care about Intel and ARM, I guess that we can just settle for 64, but 256 for ARM64.
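A minimal sketch of that suggestion, assuming C11 `_Alignas` and a hard-coded 64-byte line size (how to pick that constant portably is discussed further down):

```c
#include <stdalign.h>

#define CACHE_LINE_SIZE 64 /* Assumed here; see the discussion below about querying this properly. */

/* Aligning the first member to the line size makes the whole struct 64-byte
 * aligned, and sizeof() rounds up to 64, so each heads[] element gets its own
 * cache line and the spinlocks no longer share one. */
struct desc_object_cache_head
{
    alignas(CACHE_LINE_SIZE) void *head;
    unsigned int spinlock;
};
```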
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/resource.c:
-/* Objects are cached so that vkd3d_view_incref() can safely check the refcount
- * of an object freed by another thread. */
+#define HEAD_INDEX_MASK (ARRAY_SIZE(cache->heads) - 1)
+
+/* Objects are cached so that vkd3d_view_incref() can safely check the refcount of an
+ * object freed by another thread. This could be implemented as a single atomic linked
+ * list, but it requires handling the ABA problem, which brings issues with cross-platform
+ * support, compiler support, and non-universal x86-64 support for 128-bit CAS. */
 static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache)
 {
     union d3d12_desc_object u;
-    void *next;
+    unsigned int i;
 
-    do
+    STATIC_ASSERT(!(ARRAY_SIZE(cache->heads) & HEAD_INDEX_MASK));
I guess this assertion belongs just after the definition of `HEAD_INDEX_MASK`.
> Making caches local to the thread has the potential for some thorny edge cases where one thread creates more than it frees, and another does the opposite (on copied descriptors), so its cache grows until the system is out of memory.
Right, thread local caches would need occasional rebalancing against a global cache. The nice thing about them is that they're essentially wait-free though, and even with such worst case behaviour you'd have less contention than with just the global cache.
In principle we could actually get per-CPU caches on Linux using RSEQs (restartable sequences), but unfortunately I'm not aware of any equivalent Win32 mechanism.
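As a rough sketch of the idea being discussed, a thread-local free list that rebalances against a shared pool might look something like this (entirely hypothetical names, using the GCC-style `__thread` keyword and a plain mutex for the shared part):

```c
#include <pthread.h>
#include <stdlib.h>

#define LOCAL_CACHE_MAX 64 /* Hypothetical per-thread limit before rebalancing. */

struct cache_entry
{
    struct cache_entry *next;
};

/* Shared fallback pool, protected by a plain mutex. */
static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cache_entry *global_head;

/* Per-thread free list; the fast path touches no shared state at all. */
static __thread struct cache_entry *local_head;
static __thread unsigned int local_count;

/* Objects must be at least sizeof(struct cache_entry) bytes. */
static void *cache_get(size_t size)
{
    struct cache_entry *e;

    if ((e = local_head))
    {
        local_head = e->next;
        --local_count;
        return e;
    }

    pthread_mutex_lock(&global_lock);
    if ((e = global_head))
        global_head = e->next;
    pthread_mutex_unlock(&global_lock);

    return e ? (void *)e : malloc(size);
}

static void cache_put(void *object)
{
    struct cache_entry *e = object;

    if (local_count < LOCAL_CACHE_MAX)
    {
        e->next = local_head;
        local_head = e;
        ++local_count;
        return;
    }

    /* Rebalance: hand the object back to the shared pool so that a thread
     * which frees more than it allocates cannot grow its cache without bound. */
    pthread_mutex_lock(&global_lock);
    e->next = global_head;
    global_head = e;
    pthread_mutex_unlock(&global_lock);
}
```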
> The new implementation is much simpler than the 128-bit CAS version and has about the same performance. It's somewhat similar to the old mutex array scheme.
Yeah, conceptually I like this much better. (And fwiw, that's a fairly standard scheme usually referred to as mutex/lock striping.) This does still have quite a number of atomic operations in the hot path though.
> Unfortunately C doesn't (as far as I know) offer a portable way to query the cache line size at compilation time ([as C++17 does](https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_s...)). [Experimenting a little bit with the compiler explorer](https://godbolt.org/z/fEe4rK74s) it seems that most architectures are either 32 or 64 bytes, with PowerPC being 128 bytes and ARM64 possibly even 256 bytes. Given that we mostly care about Intel and ARM, I guess that we can just settle for 64, but 256 for ARM64.
Recent gcc versions have `__GCC_DESTRUCTIVE_SIZE`. That's not portable of course, but we could easily do something along the lines of:
```c
#ifdef __GCC_DESTRUCTIVE_SIZE
# define VKD3D_DESTRUCTIVE_SIZE __GCC_DESTRUCTIVE_SIZE
#elif ...
# define VKD3D_DESTRUCTIVE_SIZE ...
#else
# define VKD3D_DESTRUCTIVE_SIZE 64
#endif
```
On Tue Aug 15 14:02:23 2023 +0000, Giovanni Mascellani wrote:
> I guess this assertion belongs just after the definition of `HEAD_INDEX_MASK`.
We could define `HEAD_INDEX_MASK` as `ARRAY_SIZE(((struct vkd3d_desc_object_cache *)NULL)->heads) - 1` to make this possible, but I see no compelling need for it.
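For reference, that alternative would look something like this (sketch only):

```c
/* Define the mask without referencing a local `cache` variable, so the
 * power-of-two assertion can sit directly below the definition. */
#define HEAD_INDEX_MASK (ARRAY_SIZE(((struct vkd3d_desc_object_cache *)NULL)->heads) - 1)
STATIC_ASSERT(!((HEAD_INDEX_MASK + 1) & HEAD_INDEX_MASK));
```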
On Wed Aug 16 02:00:17 2023 +0000, Henri Verbeet wrote:
> Making caches local to the thread has the potential for some thorny edge cases where one thread creates more than it frees, and another does the opposite (on copied descriptors), so its cache grows until the system is out of memory.
>
> Right, thread local caches would need occasional rebalancing against a global cache. The nice thing about them is that they're essentially wait-free though, and even with such worst case behaviour you'd have less contention than with just the global cache.
>
> In principle we could actually get per-CPU caches on Linux using RSEQs (restartable sequences), but unfortunately I'm not aware of any equivalent Win32 mechanism.
> The new implementation is much simpler than the 128-bit CAS version and has about the same performance. It's somewhat similar to the old mutex array scheme.
>
> Yeah, conceptually I like this much better. (And fwiw, that's a fairly standard scheme usually referred to as mutex/lock striping.) This does still have quite a number of atomic operations in the hot path though.
> Unfortunately C doesn't (as far as I know) offer a portable way to query the cache line size at compilation time ([as C++17 does](https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_s...)). [Experimenting a little bit with the compiler explorer](https://godbolt.org/z/fEe4rK74s) it seems that most architectures are either 32 or 64 bytes, with PowerPC being 128 bytes and ARM64 possibly even 256 bytes. Given that we mostly care about Intel and ARM, I guess that we can just settle for 64, but 256 for ARM64.
>
> Recent gcc versions have `__GCC_DESTRUCTIVE_SIZE`. That's not portable of course, but we could easily do something along the lines of:
>
>     #ifdef __GCC_DESTRUCTIVE_SIZE
>     # define VKD3D_DESTRUCTIVE_SIZE __GCC_DESTRUCTIVE_SIZE
>     #elif ...
>     # define VKD3D_DESTRUCTIVE_SIZE ...
>     #else
>     # define VKD3D_DESTRUCTIVE_SIZE 64
>     #endif
Spinning is the big performance killer. That seems to be the case for mutexes too because entry uses spinlocking. I see no measurable performance gain from a 64-byte alignment, but there is always the chance of gains on other hardware. FWIW the old 128-bit CAS implementation was only very slightly slower than this despite using a single atomic value.
I did some measurements with Cyberpunk 2077 to see how many times we need to spin (i.e., execute the `for` loop) on average for each call to `vkd3d_desc_object_cache_get()`. The results seem good: the ratio never reaches 2. It starts at 1, grows a bit towards 1.5-1.6, then decreases back, seemingly converging to 1. That means that after some initial transient we basically never spin more than once per call to `vkd3d_desc_object_cache_get()`.
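A sketch of the sort of instrumentation that yields such a ratio (hypothetical counters; not part of the MR):

```c
#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical counters: bump cache_get_calls once per call to
 * vkd3d_desc_object_cache_get(), and cache_get_iterations once per pass
 * through its for (;;) loop, then sample the ratio periodically. */
static atomic_ulong cache_get_calls;
static atomic_ulong cache_get_iterations;

static void dump_spin_ratio(void)
{
    unsigned long calls = atomic_load(&cache_get_calls);
    unsigned long iterations = atomic_load(&cache_get_iterations);

    if (calls)
        fprintf(stderr, "cache_get: %.2f loop iterations per call\n", (double)iterations / calls);
}
```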
I think the MR is already good enough to be accepted. Further optimizations, like tuning the cache size or adding thread-local caches, could be considered in the future if more performance has to be squeezed out (though I wouldn't oppose having them immediately if anybody wants to implement them right away).
This merge request was approved by Giovanni Mascellani.
> Spinning is the big performance killer. That seems to be the case for mutexes too because entry uses spinlocking. I see no measurable performance gain from a 64-byte alignment,
I imagine at least part of that is due to the atomic operations on `cache->next_index` and `cache->free_count` in `vkd3d_desc_object_cache_get()` and `vkd3d_desc_object_cache_push()`.
> I did some measurements with Cyberpunk 2077 to see how many times we need to spin (i.e., execute the `for` loop) on average for each call to `vkd3d_desc_object_cache_get()`. The results seem good: the ratio never reaches 2. It starts at 1, grows a bit towards 1.5-1.6, then decreases back, seemingly converging to 1. That means that after some initial transient we basically never spin more than once per call to `vkd3d_desc_object_cache_get()`.
So it essentially gets rid of the contention; that's great to know.
> I think the MR is already good enough to be accepted. Further optimizations, like tuning the cache size or adding thread-local caches, could be considered in the future if more performance has to be squeezed out (though I wouldn't oppose having them immediately if anybody wants to implement them right away).
I think it's an improvement too, so I'll approve this. I do think there's further room for improvement though, both in terms of performance and in terms of code quality, and I'd prefer seeing those sooner rather than later. (E.g., I don't like the magic "16"; I don't like that we're rolling our own spinlocks here; I don't like the number of atomic operations in what's supposed to be a hot path.)
This merge request was approved by Henri Verbeet.