The PE build uses FlsAlloc(), which for our purposes behaves the same as TlsAlloc() but additionally allows the use of a destruction callback, so each thread's caches can be freed when the thread exits.
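
For illustration, a minimal standalone sketch (not part of the patch) of the destructor mechanism this relies on: pthread_key_create() registers a per-thread destructor on POSIX, and FlsAlloc() provides the equivalent callback on Windows, which plain TlsAlloc() lacks. Names here are hypothetical; build with -lpthread.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_key_t key;

/* Runs at thread exit for every thread that set a non-NULL value. */
static void destroy_cache(void *value)
{
    printf("freeing thread-local cache %p\n", value);
    free(value);
}

static void *thread_main(void *arg)
{
    (void)arg;
    /* Lazily created per-thread state, analogous to desc_object_caches. */
    pthread_setspecific(key, malloc(64));
    return NULL;
}

int main(void)
{
    pthread_t thread;

    pthread_key_create(&key, destroy_cache);
    pthread_create(&thread, NULL, thread_main, NULL);
    pthread_join(thread, NULL);
    return 0;
}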
--
v8: vkd3d: Replace the descriptor object cache with a thread-local implementation.
From: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/device.c        | 102 ++++++++++++++++++++++++++++-----
 libs/vkd3d/resource.c      | 114 ++++++++++++++++++++++---------------
 libs/vkd3d/vkd3d_private.h | 100 +++++++++++++++++++++++++++++---
 3 files changed, 245 insertions(+), 71 deletions(-)
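
As context for the resource.c changes below, a simplified single-threaded model of the chunked rebalancing scheme (not part of the patch): fixed-size arrays stand in for vkd3d_array_reserve(), the shared array's mutex is omitted, and the threshold expansion is a constant, whereas the patch scales it by (1 + threshold / REBALANCE_DIVISOR). All names are illustrative.

#include <stdlib.h>
#include <string.h>

#define CHUNK 64 /* REBALANCE_SIZE in the patch. */

struct shared { void *data[1024]; size_t count; };  /* Mutex-protected in the patch. */
struct local  { void *data[1024]; size_t count, threshold, expansion; };

static void cache_push(struct local *l, struct shared *s, void *object)
{
    l->data[l->count++] = object;
    /* Fast path: no lock taken until a full chunk accumulates beyond the threshold. */
    if (l->count < l->threshold + CHUNK)
        return;
    /* Spill one chunk to the shared array; arrange to raise the threshold
     * if this spill later turns out to have starved the local cache. */
    l->count -= CHUNK;
    l->expansion = CHUNK;
    memcpy(&s->data[s->count], &l->data[l->count], CHUNK * sizeof(void *));
    s->count += CHUNK;
}

static void *cache_get(struct local *l, struct shared *s, size_t size)
{
    if (l->count)
        return l->data[--l->count];  /* Fast path: no lock taken. */
    /* The cache ran dry, so the last spilled chunk should have been kept:
     * raise the threshold for future spills. */
    l->threshold += l->expansion;
    l->expansion = 0;
    if (s->count >= CHUNK)
    {
        /* Refill a whole chunk at once from the shared array. */
        s->count -= CHUNK;
        memcpy(l->data, &s->data[s->count], CHUNK * sizeof(void *));
        l->count = CHUNK - 1;
        return l->data[CHUNK - 1];
    }
    return malloc(size);  /* Nothing cached anywhere: allocate. */
}

In the patch itself, the shared mutex is taken only when the local cache is empty or has just spilled a chunk; everything else touches thread-local state only.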
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index cf8b506fd..ae3e74100 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2475,28 +2475,96 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device)
     device->vk_pool_count = 6;
 };
 
-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size)
+static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size,
+        struct desc_rebalance *rebalance)
 {
-    memset(cache, 0, sizeof(*cache));
     cache->size = size;
+    cache->rebalance = rebalance;
 }
 
 static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
-    void *next;
+    size_t i;
+
+    for (i = 0; i < cache->count; ++i)
+        vkd3d_free(cache->data[i]);
+    vkd3d_free(cache->data);
+}
+
+static void desc_rebalance_init(struct desc_rebalance *rebalance)
+{
+    vkd3d_mutex_init(&rebalance->mutex);
+}
+
+static void
+#ifdef _WIN32
+CALLBACK
+#endif
+global_tls_key_destroy(void *value)
+{
+    struct desc_object_caches *caches = value;
 
-    for (i = 0; i < ARRAY_SIZE(cache->heads); ++i)
+    if (caches)
     {
-        for (u.object = cache->heads[i].head; u.object; u.object = next)
-        {
-            next = u.header->next;
-            vkd3d_free(u.object);
-        }
+        vkd3d_desc_object_cache_cleanup(&caches->view_desc_cache);
+        vkd3d_desc_object_cache_cleanup(&caches->cbuffer_desc_cache);
+        vkd3d_free(caches);
     }
 }
 
+static struct desc_rebalance view_desc_rebalance;
+static struct desc_rebalance cbuffer_desc_rebalance;
+
+static bool global_tls_key_get(struct vkd3d_tls_key *key)
+{
+    static struct vkd3d_mutex key_cache_mutex = VKD3D_MUTEX_INITIALIZER;
+    static struct vkd3d_tls_key tls_key;
+    static bool tls_initialised;
+    bool ret = true;
+
+    vkd3d_mutex_lock(&key_cache_mutex);
+
+    if (!tls_initialised)
+    {
+        ret = vkd3d_tls_key_create(&tls_key, global_tls_key_destroy);
+        desc_rebalance_init(&view_desc_rebalance);
+        desc_rebalance_init(&cbuffer_desc_rebalance);
+        tls_initialised = true;
+    }
+
+    vkd3d_mutex_unlock(&key_cache_mutex);
+
+    *key = tls_key;
+    return ret;
+}
+
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device)
+{
+    struct desc_object_caches *caches = vkd3d_tls_key_get_value(&device->tls_key);
+
+    if (caches)
+        return caches;
+
+    if (!(caches = vkd3d_calloc(1, sizeof(*caches))))
+    {
+        ERR("Failed to allocate caches.\n");
+        /* Returning null here is not worth the cost of handling it in callers. */
+        abort();
+    }
+
+    vkd3d_desc_object_cache_init(&caches->view_desc_cache, sizeof(struct vkd3d_view), &view_desc_rebalance);
+    vkd3d_desc_object_cache_init(&caches->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc),
+            &cbuffer_desc_rebalance);
+
+    if (!vkd3d_tls_key_set_value(&device->tls_key, caches))
+    {
+        ERR("Failed to store descriptor object cache pointer.\n");
+        abort();
+    }
+
+    return caches;
+}
+
 /* ID3D12Device */
 static inline struct d3d12_device *impl_from_ID3D12Device7(ID3D12Device7 *iface)
 {
@@ -2585,8 +2653,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device7 *iface)
         vkd3d_render_pass_cache_cleanup(&device->render_pass_cache, device);
         d3d12_device_destroy_pipeline_cache(device);
         d3d12_device_destroy_vkd3d_queues(device);
-        vkd3d_desc_object_cache_cleanup(&device->view_desc_cache);
-        vkd3d_desc_object_cache_cleanup(&device->cbuffer_desc_cache);
         if (device->use_vk_heaps)
             device_worker_stop(device);
         vkd3d_free(device->heaps);
@@ -4481,8 +4547,14 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
     device->blocked_queue_count = 0;
     vkd3d_mutex_init(&device->blocked_queues_mutex);
 
-    vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view));
-    vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc));
+    /* Store a copy of the TLS key in the device object. The device object is often
+     * accessed for other reasons after the key is read, so reading it here is less
+     * likely to cause an additional cache miss than reading the global value would be. */
+    if (!global_tls_key_get(&device->tls_key))
+    {
+        hr = E_FAIL;
+        goto out_cleanup_descriptor_heap_layouts;
+    }
 
     device_init_descriptor_pool_sizes(device);
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index c04f48702..aae09d7a6 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2301,72 +2301,87 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource)
     return d3d12_resource_decref(impl_from_ID3D12Resource(resource));
 }
 
-#define HEAD_INDEX_MASK (ARRAY_SIZE(cache->heads) - 1)
+#define REBALANCE_SIZE 64u
+/* The rebalance threshold expands by one extra REBALANCE_SIZE per (threshold / REBALANCE_DIVISOR). */
+#define REBALANCE_DIVISOR 512u
+/* Avoid some initial reallocations. */
+#define REBALANCE_MIN_ALLOCATION (REBALANCE_SIZE * 16u)
 
 /* Objects are cached so that vkd3d_view_incref() can safely check the refcount of an
- * object freed by another thread. This could be implemented as a single atomic linked
- * list, but it requires handling the ABA problem, which brings issues with cross-platform
- * support, compiler support, and non-universal x86-64 support for 128-bit CAS. */
+ * object freed by another thread. */
 static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
+    struct desc_rebalance *rebalance;
+    void *object;
 
-    STATIC_ASSERT(!(ARRAY_SIZE(cache->heads) & HEAD_INDEX_MASK));
+    if (cache->count)
+        return cache->data[--cache->count];
 
-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
-    {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-        {
-            if ((u.object = cache->heads[i].head))
-            {
-                vkd3d_atomic_decrement(&cache->free_count);
-                cache->heads[i].head = u.header->next;
-                vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-                return u.object;
-            }
-            vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-        }
-        /* Keeping a free count avoids uncertainty over when this loop should terminate,
-         * which could result in excess allocations gradually increasing without limit. */
-        if (cache->free_count < ARRAY_SIZE(cache->heads))
-            return vkd3d_malloc(cache->size);
+    /* It is critical for performance to minimise rebalancing activity, because it
+     * occurs while a single mutex is locked and can therefore bottleneck descriptor
+     * updates. The goal is to raise rebalance_threshold until the cache no longer
+     * runs out of objects after sending a chunk to the rebalance array. Arriving
+     * here with an empty cache means the last chunk should not have been sent, so
+     * raise the threshold for sending chunks. */
+    cache->rebalance_threshold += cache->threshold_expansion;
+    cache->threshold_expansion = 0;
 
-        i = (i + 1) & HEAD_INDEX_MASK;
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);
+
+    if (rebalance->count)
+    {
+        assert(rebalance->count >= REBALANCE_SIZE);
+        rebalance->count -= REBALANCE_SIZE;
+        vkd3d_array_reserve((void **)&cache->data, &cache->capacity, REBALANCE_SIZE, sizeof(*cache->data));
+        memcpy(cache->data, &rebalance->data[rebalance->count], REBALANCE_SIZE * sizeof(*cache->data));
+        cache->count = REBALANCE_SIZE - 1;
+        object = cache->data[REBALANCE_SIZE - 1];
+    }
+    else
+    {
+        object = vkd3d_malloc(cache->size);
     }
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
+    return object;
 }
 
 static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object)
 {
-    union d3d12_desc_object u = {object};
-    unsigned int i;
-    void *head;
+    struct desc_rebalance *rebalance;
 
-    /* Using the same index as above may result in a somewhat uneven distribution,
-     * but the main objective is to avoid costly spinlock contention. */
-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
+    if (!vkd3d_array_reserve((void **)&cache->data, &cache->capacity, max(cache->count + 1, REBALANCE_SIZE),
+            sizeof(*cache->data)))
     {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-            break;
-        i = (i + 1) & HEAD_INDEX_MASK;
+        ERR("Failed to allocate cache array.\n");
+        vkd3d_free(object);
+        return;
     }
+    cache->data[cache->count++] = object;
 
-    head = cache->heads[i].head;
-    u.header->next = head;
-    cache->heads[i].head = u.object;
-    vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-    vkd3d_atomic_increment(&cache->free_count);
-}
+    if (cache->count < cache->rebalance_threshold + REBALANCE_SIZE)
+        return;
 
-#undef HEAD_INDEX_MASK
+    cache->count -= REBALANCE_SIZE;
+    cache->threshold_expansion = REBALANCE_SIZE * (1 + cache->rebalance_threshold / REBALANCE_DIVISOR);
+
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);
+
+    vkd3d_array_reserve((void **)&rebalance->data, &rebalance->capacity,
+            max(rebalance->count + REBALANCE_SIZE, REBALANCE_MIN_ALLOCATION), sizeof(*rebalance->data));
+    memcpy(&rebalance->data[rebalance->count], &cache->data[cache->count], REBALANCE_SIZE * sizeof(*rebalance->data));
+    rebalance->count += REBALANCE_SIZE;
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
+}
 
 static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_cbuffer_desc *desc;
 
-    if (!(desc = vkd3d_desc_object_cache_get(&device->cbuffer_desc_cache)))
+    if (!(desc = vkd3d_desc_object_cache_get(&caches->cbuffer_desc_cache)))
         return NULL;
 
     desc->h.magic = VKD3D_DESCRIPTOR_MAGIC_CBV;
@@ -2379,11 +2394,12 @@ static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device
 static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_descriptor_type,
         enum vkd3d_view_type type, struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_view *view;
 
     assert(magic);
 
-    if (!(view = vkd3d_desc_object_cache_get(&device->view_desc_cache)))
+    if (!(view = vkd3d_desc_object_cache_get(&caches->view_desc_cache)))
     {
         ERR("Failed to allocate descriptor object.\n");
         return NULL;
@@ -2401,6 +2417,7 @@ static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_
 static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *device)
 {
     const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
 
     TRACE("Destroying view %p.\n", view);
@@ -2422,7 +2439,7 @@ static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *dev
     if (view->v.vk_counter_view)
         VK_CALL(vkDestroyBufferView(device->vk_device, view->v.vk_counter_view, NULL));
 
-    vkd3d_desc_object_cache_push(&device->view_desc_cache, view);
+    vkd3d_desc_object_cache_push(&caches->view_desc_cache, view);
 }
 
 void vkd3d_view_decref(void *view, struct d3d12_device *device)
@@ -2435,7 +2452,10 @@ void vkd3d_view_decref(void *view, struct d3d12_device *device)
     if (u.header->magic != VKD3D_DESCRIPTOR_MAGIC_CBV)
         vkd3d_view_destroy(u.view, device);
     else
-        vkd3d_desc_object_cache_push(&device->cbuffer_desc_cache, u.object);
+    {
+        struct desc_object_caches *caches = device_get_desc_object_caches(device);
+        vkd3d_desc_object_cache_push(&caches->cbuffer_desc_cache, u.object);
+    }
 }
 
 static inline void d3d12_desc_replace(struct d3d12_desc *dst, void *view, struct d3d12_device *device)
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index bb78ba777..3d749f3cd 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -207,11 +207,18 @@ struct vkd3d_mutex
     CRITICAL_SECTION lock;
 };
 
+#define VKD3D_MUTEX_INITIALIZER {{NULL, -1, 0, 0, 0, 0}}
+
 struct vkd3d_cond
 {
     CONDITION_VARIABLE cond;
 };
 
+struct vkd3d_tls_key
+{
+    unsigned int key;
+};
+
 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
     InitializeCriticalSection(&lock->lock);
@@ -257,6 +264,35 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
 {
 }
 
+typedef PFLS_CALLBACK_FUNCTION vkd3d_tls_destructor_fn;
+
+static inline bool vkd3d_tls_key_create(struct vkd3d_tls_key *key, vkd3d_tls_destructor_fn destructor)
+{
+    if ((key->key = FlsAlloc(destructor)) == FLS_OUT_OF_INDEXES)
+    {
+        ERR("Failed to allocate key.\n");
+        return false;
+    }
+    return true;
+}
+
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    if (!FlsSetValue(key->key, value))
+    {
+        ERR("Failed to set value, error %lu.\n", GetLastError());
+        return false;
+    }
+    return true;
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    /* This is used by the descriptor object cache, where performance is too
+     * critical to allow the use of SetLastError() and GetLastError(). */
+    return FlsGetValue(key->key);
+}
+
 static inline unsigned int vkd3d_atomic_increment(unsigned int volatile *x)
 {
     return InterlockedIncrement((LONG volatile *)x);
@@ -302,11 +338,17 @@ struct vkd3d_mutex
     pthread_mutex_t lock;
 };
 
+#define VKD3D_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+
 struct vkd3d_cond
 {
     pthread_cond_t cond;
 };
 
+struct vkd3d_tls_key
+{
+    pthread_key_t key;
+};
+
 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
@@ -389,6 +431,35 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
         ERR("Could not destroy the condition variable, error %d.\n", ret);
 }
 
+typedef void (*vkd3d_tls_destructor_fn)(void *);
+
+static inline bool vkd3d_tls_key_create(struct vkd3d_tls_key *key, vkd3d_tls_destructor_fn destructor)
+{
+    int rc = pthread_key_create(&key->key, destructor);
+    if (rc)
+    {
+        ERR("Failed to create key, error %d.\n", rc);
+        return false;
+    }
+    return true;
+}
+
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    int rc = pthread_setspecific(key->key, value);
+    if (rc)
+    {
+        ERR("Failed to set value, error %d.\n", rc);
+        return false;
+    }
+    return true;
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    return pthread_getspecific(key->key);
+}
+
 # if HAVE_SYNC_SUB_AND_FETCH
 static inline unsigned int vkd3d_atomic_decrement(unsigned int volatile *x)
 {
@@ -1736,20 +1807,31 @@ struct vkd3d_uav_clear_state
 HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 
-struct desc_object_cache_head
+struct desc_rebalance
 {
-    void *head;
-    unsigned int spinlock;
+    void **data;
+    size_t capacity;
+    size_t count;
+    struct vkd3d_mutex mutex;
 };
 
 struct vkd3d_desc_object_cache
 {
-    struct desc_object_cache_head heads[16];
-    unsigned int next_index;
-    unsigned int free_count;
+    void **data;
+    size_t capacity;
+    size_t count;
+    size_t rebalance_threshold;
+    size_t threshold_expansion;
+    struct desc_rebalance *rebalance;
     size_t size;
 };
 
+struct desc_object_caches
+{
+    struct vkd3d_desc_object_cache view_desc_cache;
+    struct vkd3d_desc_object_cache cbuffer_desc_cache;
+};
+
 #define VKD3D_DESCRIPTOR_POOL_COUNT 6
 
 /* ID3D12Device */
@@ -1764,10 +1846,9 @@ struct d3d12_device
     PFN_vkd3d_signal_event signal_event;
     size_t wchar_size;
 
-    struct vkd3d_gpu_va_allocator gpu_va_allocator;
+    struct vkd3d_tls_key tls_key;
 
-    struct vkd3d_desc_object_cache view_desc_cache;
-    struct vkd3d_desc_object_cache cbuffer_desc_cache;
+    struct vkd3d_gpu_va_allocator gpu_va_allocator;
 
     VkDescriptorPoolSize vk_pool_sizes[VKD3D_DESCRIPTOR_POOL_COUNT];
     unsigned int vk_pool_count;
@@ -1830,6 +1911,7 @@ struct vkd3d_queue *d3d12_device_get_vkd3d_queue(struct d3d12_device *device, D3
 bool d3d12_device_is_uma(struct d3d12_device *device, bool *coherent);
 void d3d12_device_mark_as_removed(struct d3d12_device *device, HRESULT reason, const char *message, ...)
         VKD3D_PRINTF_FUNC(3, 4);
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device);
 struct d3d12_device *unsafe_impl_from_ID3D12Device7(ID3D12Device7 *iface);
 HRESULT d3d12_device_add_descriptor_heap(struct d3d12_device *device, struct d3d12_descriptor_heap *heap);
 void d3d12_device_remove_descriptor_heap(struct d3d12_device *device, struct d3d12_descriptor_heap *heap);
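
The once-only key creation in global_tls_key_get() is what the new VKD3D_MUTEX_INITIALIZER enables: no initialisation call is needed before the first descriptor write on any thread. A standalone sketch of that pattern (pthread variant, hypothetical names, error handling omitted):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_key_t global_key;
static bool initialised;

/* Create the process-wide key on first use; a statically initialised
 * mutex means no separate module-init function is required. */
static pthread_key_t get_global_key(void)
{
    pthread_mutex_lock(&init_mutex);
    if (!initialised)
    {
        pthread_key_create(&global_key, NULL);
        initialised = true;
    }
    pthread_mutex_unlock(&init_mutex);
    return global_key;
}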