From: Conor McCarthy <cmccarthy@codeweavers.com>

---
 libs/vkd3d/device.c        | 87 +++++++++++++++++++++++++-------
 libs/vkd3d/resource.c      | 100 +++++++++++++++++++------------------
 libs/vkd3d/vkd3d_private.h | 75 +++++++++++++++++++++++++---
 tests/d3d12.c              | 2 +
 4 files changed, 191 insertions(+), 73 deletions(-)

diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index d8c94fbf..c49847f0 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2434,26 +2434,65 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device)
     device->vk_pool_count = 6;
 };

-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size)
+struct global_tls_key_cache
 {
-    memset(cache, 0, sizeof(*cache));
-    cache->size = size;
+    struct vkd3d_mutex mutex;
+    struct vkd3d_tls_key *keys;
+    size_t key_capacity;
+    size_t key_count;
+};
+
+static struct global_tls_key_cache key_cache = {{VKD3D_MUTEX_INITIALIZER}, NULL, 0, 0};
+
+static int global_tls_key_cache_get(struct vkd3d_tls_key *key)
+{
+    int rc = 0;
+
+    vkd3d_mutex_lock(&key_cache.mutex);
+
+    if (key_cache.key_count)
+        *key = key_cache.keys[--key_cache.key_count];
+    else
+        rc = vkd3d_tls_key_create(key);
+
+    vkd3d_mutex_unlock(&key_cache.mutex);
+
+    return rc;
 }

-static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache)
+static void global_tls_key_cache_put(struct vkd3d_tls_key *key)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
-    void *next;
+    vkd3d_mutex_lock(&key_cache.mutex);

-    for (i = 0; i < ARRAY_SIZE(cache->heads); ++i)
+    if (!vkd3d_array_reserve((void **)&key_cache.keys, &key_cache.key_capacity, key_cache.key_count + 1,
+            sizeof(*key_cache.keys)))
     {
-        for (u.object = cache->heads[i].head; u.object; u.object = next)
-        {
-            next = u.header->next;
-            vkd3d_free(u.object);
-        }
+        ERR("Failed to cache TLS key for reuse.\n");
+    }
+    else
+    {
+        vkd3d_tls_key_set_value(key, NULL);
+        key_cache.keys[key_cache.key_count++] = *key;
     }
+
+    vkd3d_mutex_unlock(&key_cache.mutex);
+}
+
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device)
+{
+    struct desc_object_caches *caches = vkd3d_tls_key_get_value(&device->tls_key);
+
+    if (caches)
+        return caches;
+
+    caches = vkd3d_calloc(1, sizeof(*caches));
+    vkd3d_tls_key_set_value(&device->tls_key, caches);
+    caches->view_desc_cache.size = sizeof(struct vkd3d_view);
+    caches->view_desc_cache.rebalance = &device->view_desc_rebalance;
+    caches->cbuffer_desc_cache.size = sizeof(struct vkd3d_cbuffer_desc);
+    caches->cbuffer_desc_cache.rebalance = &device->cbuffer_desc_rebalance;
+
+    return caches;
 }

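device_get_desc_object_caches() above is the lazy-initialisation half of the scheme: the first call on a given thread allocates that thread's cache block and stores it under the device's TLS key; every later call on the same thread returns that block without taking a lock. Below is a minimal standalone sketch of the same idiom, written against raw pthreads rather than the vkd3d_tls_key_* wrappers - the names are illustrative only, and unlike the patch it registers a key destructor so each thread's block is freed at thread exit.

/* Sketch only: lazy per-thread state keyed off a TLS key (illustrative names). */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct thread_cache
{
    size_t count;   /* stands in for the real per-thread cache contents */
};

static pthread_key_t cache_key;

static struct thread_cache *get_thread_cache(void)
{
    struct thread_cache *cache = pthread_getspecific(cache_key);

    if (cache)
        return cache;   /* fast path: no locking after the first call */

    cache = calloc(1, sizeof(*cache));
    pthread_setspecific(cache_key, cache);
    return cache;
}

static void *worker(void *arg)
{
    (void)arg;
    /* Each thread lazily creates, then keeps reusing, its own block. */
    printf("thread cache %p\n", (void *)get_thread_cache());
    return NULL;
}

int main(void)
{
    pthread_t threads[2];

    /* Unlike the patch, pass a destructor so blocks are freed at thread exit. */
    pthread_key_create(&cache_key, free);
    pthread_create(&threads[0], NULL, worker, NULL);
    pthread_create(&threads[1], NULL, worker, NULL);
    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    pthread_key_delete(cache_key);
    return 0;
}

In the patch itself the key is not deleted when the device is released; it is pushed back into the process-wide key_cache above for reuse by the next device.
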
 /* ID3D12Device */
@@ -2520,12 +2559,11 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device5 *iface)
         vkd3d_render_pass_cache_cleanup(&device->render_pass_cache, device);
         d3d12_device_destroy_pipeline_cache(device);
         d3d12_device_destroy_vkd3d_queues(device);
-        vkd3d_desc_object_cache_cleanup(&device->view_desc_cache);
-        vkd3d_desc_object_cache_cleanup(&device->cbuffer_desc_cache);
         VK_CALL(vkDestroyDevice(device->vk_device, NULL));
         if (device->parent)
             IUnknown_Release(device->parent);
         vkd3d_instance_decref(device->vkd3d_instance);
+        global_tls_key_cache_put(&device->tls_key);

         vkd3d_free(device);
     }
@@ -4208,11 +4246,18 @@ struct d3d12_device *unsafe_impl_from_ID3D12Device5(ID3D12Device5 *iface)
     return impl_from_ID3D12Device5(iface);
 }

+static void desc_rebalance_init(struct desc_rebalance *rebalance)
+{
+    memset(rebalance, 0, sizeof(*rebalance));
+    vkd3d_mutex_init(&rebalance->mutex);
+}
+
 static HRESULT d3d12_device_init(struct d3d12_device *device,
         struct vkd3d_instance *instance, const struct vkd3d_device_create_info *create_info)
 {
     const struct vkd3d_vk_device_procs *vk_procs;
     HRESULT hr;
+    int rc;

     device->ID3D12Device5_iface.lpVtbl = &d3d12_device_vtbl;
     device->refcount = 1;
@@ -4255,8 +4300,14 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
     device->blocked_queue_count = 0;
     vkd3d_mutex_init(&device->blocked_queues_mutex);

-    vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view));
-    vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc));
+    if ((rc = global_tls_key_cache_get(&device->tls_key)))
+    {
+        ERR("Failed to allocate TLS key, rc %d.\n", rc);
+        hr = E_FAIL;
+        goto out_cleanup_descriptor_heap_layouts;
+    }
+    desc_rebalance_init(&device->view_desc_rebalance);
+    desc_rebalance_init(&device->cbuffer_desc_rebalance);

     device_init_descriptor_pool_sizes(device);

@@ -4265,6 +4316,8 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,

     return S_OK;

+out_cleanup_descriptor_heap_layouts:
+    vkd3d_vk_descriptor_heap_layouts_cleanup(device);
 out_cleanup_uav_clear_state:
     vkd3d_uav_clear_state_cleanup(&device->uav_clear_state, device);
 out_destroy_null_resources:
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 14fb24a9..a1430dc0 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2282,72 +2282,71 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource)
     return d3d12_resource_decref(impl_from_ID3D12Resource(resource));
 }

-#define HEAD_INDEX_MASK (ARRAY_SIZE(cache->heads) - 1)
+static const unsigned int REBALANCE_SIZE = 64;

-/* Objects are cached so that vkd3d_view_incref() can safely check the refcount of an
- * object freed by another thread. This could be implemented as a single atomic linked
- * list, but it requires handling the ABA problem, which brings issues with cross-platform
- * support, compiler support, and non-universal x86-64 support for 128-bit CAS. */
 static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
+    struct desc_rebalance *rebalance;
+    void *object;

-    STATIC_ASSERT(!(ARRAY_SIZE(cache->heads) & HEAD_INDEX_MASK));
+    if (cache->count)
+        return cache->data[--cache->count];

-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
-    {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-        {
-            if ((u.object = cache->heads[i].head))
-            {
-                vkd3d_atomic_decrement(&cache->free_count);
-                cache->heads[i].head = u.header->next;
-                vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-                return u.object;
-            }
-            vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-        }
-        /* Keeping a free count avoids uncertainty over when this loop should terminate,
-         * which could result in excess allocations gradually increasing without limit. */
-        if (cache->free_count < ARRAY_SIZE(cache->heads))
-            return vkd3d_malloc(cache->size);
+    cache->max_count += cache->prev_rebalance_count;
+    cache->prev_rebalance_count = 0;

-        i = (i + 1) & HEAD_INDEX_MASK;
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);
+
+    if (rebalance->count)
+    {
+        rebalance->count -= REBALANCE_SIZE;
+        vkd3d_array_reserve((void **)&cache->data, &cache->capacity, REBALANCE_SIZE, sizeof(*cache->data));
+        memcpy(cache->data, &rebalance->data[rebalance->count], REBALANCE_SIZE * sizeof(*cache->data));
+        cache->count = REBALANCE_SIZE;
+        object = cache->data[--cache->count];
+    }
+    else
+    {
+        object = vkd3d_malloc(cache->size);
     }
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
+    return object;
 }

 static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object)
 {
-    union d3d12_desc_object u = {object};
-    unsigned int i;
-    void *head;
+    struct desc_rebalance *rebalance;

-    /* Using the same index as above may result in a somewhat uneven distribution,
-     * but the main objective is to avoid costly spinlock contention. */
-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
-    {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-            break;
-        i = (i + 1) & HEAD_INDEX_MASK;
-    }
+    vkd3d_array_reserve((void **)&cache->data, &cache->capacity, max(cache->count + 1, 64u), sizeof(*cache->data));
+    cache->data[cache->count++] = object;
+
+    if (cache->count < cache->max_count + REBALANCE_SIZE)
+        return;
+
+    cache->count -= REBALANCE_SIZE;
+    cache->prev_rebalance_count = REBALANCE_SIZE;
+
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);

-    head = cache->heads[i].head;
-    u.header->next = head;
-    cache->heads[i].head = u.object;
-    vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-    vkd3d_atomic_increment(&cache->free_count);
+    vkd3d_array_reserve((void **)&rebalance->data, &rebalance->capacity, max(rebalance->count + REBALANCE_SIZE, 1024u),
+            sizeof(*rebalance->data));
+    memcpy(&rebalance->data[rebalance->count], &cache->data[cache->count], REBALANCE_SIZE * sizeof(*rebalance->data));
+    rebalance->count += REBALANCE_SIZE;
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
 }

 #undef HEAD_INDEX_MASK

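The counters in vkd3d_desc_object_cache_get()/_push() above are easier to follow in isolation, so here is a small single-threaded model of the same batching scheme (fixed-size arrays instead of vkd3d_array_reserve(), no mutex, illustrative names - not the vkd3d code): a cache that only pushes spills batches of REBALANCE_SIZE objects to the shared pool, a cache that only pops refills from it a batch at a time, and max_count grows only when a cache has to refill after having spilled, which keeps a thread's steady-state working set from being bounced through the pool repeatedly.

/* Standalone model of the batched rebalancing above; simplified, not the vkd3d code. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define REBALANCE_SIZE 64

struct pool          /* models struct desc_rebalance (mutex omitted: single thread) */
{
    void *data[1024];
    size_t count;
};

struct local_cache   /* models struct vkd3d_desc_object_cache */
{
    void *data[1024]; /* fixed-size arrays for brevity */
    size_t count, max_count, prev_rebalance_count;
    struct pool *pool;
};

static void *cache_get(struct local_cache *cache)
{
    if (cache->count)
        return cache->data[--cache->count];

    /* A refill after a spill means the spill was premature: raise the local limit. */
    cache->max_count += cache->prev_rebalance_count;
    cache->prev_rebalance_count = 0;

    if (cache->pool->count)      /* take a whole batch from the shared pool */
    {
        cache->pool->count -= REBALANCE_SIZE;
        memcpy(cache->data, &cache->pool->data[cache->pool->count],
                REBALANCE_SIZE * sizeof(*cache->data));
        cache->count = REBALANCE_SIZE;
        return cache->data[--cache->count];
    }
    return malloc(16);           /* pool empty: fall back to the allocator */
}

static void cache_push(struct local_cache *cache, void *object)
{
    cache->data[cache->count++] = object;
    if (cache->count < cache->max_count + REBALANCE_SIZE)
        return;

    /* Local surplus: move the top batch to the shared pool. */
    cache->count -= REBALANCE_SIZE;
    cache->prev_rebalance_count = REBALANCE_SIZE;
    memcpy(&cache->pool->data[cache->pool->count], &cache->data[cache->count],
            REBALANCE_SIZE * sizeof(*cache->data));
    cache->pool->count += REBALANCE_SIZE;
}

int main(void)
{
    struct pool pool = {0};
    struct local_cache freeing = {0}, allocating = {0};
    unsigned int i;

    freeing.pool = &pool;
    allocating.pool = &pool;

    for (i = 0; i < 200; ++i)               /* one thread only frees objects... */
        cache_push(&freeing, malloc(16));
    printf("shared pool now holds %zu objects\n", pool.count);

    for (i = 0; i < 200; ++i)               /* ...another only allocates them */
        free(cache_get(&allocating));
    printf("shared pool now holds %zu objects\n", pool.count);
    return 0;
}

With batches of 64, the run above spills three batches to the pool (printing 192) and then drains them again (printing 0).
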
 static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_cbuffer_desc *desc;

-    if (!(desc = vkd3d_desc_object_cache_get(&device->cbuffer_desc_cache)))
+    if (!(desc = vkd3d_desc_object_cache_get(&caches->cbuffer_desc_cache)))
         return NULL;

     desc->h.magic = VKD3D_DESCRIPTOR_MAGIC_CBV;
@@ -2360,11 +2359,12 @@ static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device
 static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_descriptor_type,
         enum vkd3d_view_type type, struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_view *view;

     assert(magic);

-    if (!(view = vkd3d_desc_object_cache_get(&device->view_desc_cache)))
+    if (!(view = vkd3d_desc_object_cache_get(&caches->view_desc_cache)))
     {
         ERR("Failed to allocate descriptor object.\n");
         return NULL;
@@ -2382,6 +2382,7 @@ static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_
 static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *device)
 {
     const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);

     TRACE("Destroying view %p.\n", view);

@@ -2403,7 +2404,7 @@ static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *dev
     if (view->v.vk_counter_view)
         VK_CALL(vkDestroyBufferView(device->vk_device, view->v.vk_counter_view, NULL));

-    vkd3d_desc_object_cache_push(&device->view_desc_cache, view);
+    vkd3d_desc_object_cache_push(&caches->view_desc_cache, view);
 }

 void vkd3d_view_decref(void *view, struct d3d12_device *device)
@@ -2416,7 +2417,10 @@ void vkd3d_view_decref(void *view, struct d3d12_device *device)
     if (u.header->magic != VKD3D_DESCRIPTOR_MAGIC_CBV)
         vkd3d_view_destroy(u.view, device);
     else
-        vkd3d_desc_object_cache_push(&device->cbuffer_desc_cache, u.object);
+    {
+        struct desc_object_caches *caches = device_get_desc_object_caches(device);
+        vkd3d_desc_object_cache_push(&caches->cbuffer_desc_cache, u.object);
+    }
 }

 static inline void d3d12_desc_replace(struct d3d12_desc *dst, void *view, struct d3d12_device *device)
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 89f8b15e..f261a8af 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -204,11 +204,18 @@ struct vkd3d_mutex
     CRITICAL_SECTION lock;
 };

+#define VKD3D_MUTEX_INITIALIZER {NULL, -1, 0, 0, 0, 0}
+
 struct vkd3d_cond
 {
     CONDITION_VARIABLE cond;
 };

+struct vkd3d_tls_key
+{
+    unsigned int key;
+};
+
 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
     InitializeCriticalSection(&lock->lock);
@@ -254,6 +261,24 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
 {
 }

+static inline int vkd3d_tls_key_create(struct vkd3d_tls_key *key)
+{
+    if ((key->key = TlsAlloc()) == TLS_OUT_OF_INDEXES)
+        return -1;
+    return 0;
+}
+
+static inline int vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    TlsSetValue(key->key, value);
+    return 0;
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    return TlsGetValue(key->key);
+}
+
 static inline unsigned int vkd3d_atomic_increment(unsigned int volatile *x)
 {
     return InterlockedIncrement((LONG volatile *)x);
@@ -299,11 +324,17 @@ struct vkd3d_mutex
     pthread_mutex_t lock;
 };

+#define VKD3D_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+
 struct vkd3d_cond
 {
     pthread_cond_t cond;
 };

+struct vkd3d_tls_key
+{
+    pthread_key_t key;
+};

 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
@@ -386,6 +417,21 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
     ERR("Could not destroy the condition variable, error %d.\n", ret);
 }

+static inline int vkd3d_tls_key_create(struct vkd3d_tls_key *key)
+{
+    return pthread_key_create(&key->key, NULL);
+}
+
+static inline int vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    return pthread_setspecific(key->key, value);
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    return pthread_getspecific(key->key);
+}
+
 # if HAVE_SYNC_SUB_AND_FETCH
 static inline unsigned int vkd3d_atomic_decrement(unsigned int volatile *x)
 {
@@ -1692,20 +1738,31 @@ struct vkd3d_uav_clear_state
 HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);

-struct desc_object_cache_head
+struct desc_rebalance
 {
-    void *head;
-    unsigned int spinlock;
+    void **data;
+    size_t capacity;
+    size_t count;
+    struct vkd3d_mutex mutex;
 };

 struct vkd3d_desc_object_cache
 {
-    struct desc_object_cache_head heads[16];
-    unsigned int next_index;
-    unsigned int free_count;
+    void **data;
+    size_t capacity;
+    size_t count;
+    size_t max_count;
+    size_t prev_rebalance_count;
+    struct desc_rebalance *rebalance;
     size_t size;
 };

+struct desc_object_caches
+{
+    struct vkd3d_desc_object_cache view_desc_cache;
+    struct vkd3d_desc_object_cache cbuffer_desc_cache;
+};
+
 #define VKD3D_DESCRIPTOR_POOL_COUNT 6

 /* ID3D12Device */
@@ -1723,8 +1780,9 @@ struct d3d12_device
     struct vkd3d_gpu_va_allocator gpu_va_allocator;

     struct vkd3d_mutex mutex;
-    struct vkd3d_desc_object_cache view_desc_cache;
-    struct vkd3d_desc_object_cache cbuffer_desc_cache;
+    struct vkd3d_tls_key tls_key;
+    struct desc_rebalance view_desc_rebalance;
+    struct desc_rebalance cbuffer_desc_rebalance;
     struct vkd3d_render_pass_cache render_pass_cache;
     VkPipelineCache vk_pipeline_cache;

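One way to picture how the new pieces relate (a reader's summary, not part of the diff):

/*
 *   struct d3d12_device                   one per device, shared by all threads
 *     .tls_key                            identifies this device's per-thread caches
 *     .view_desc_rebalance                mutex-protected overflow pool for views
 *     .cbuffer_desc_rebalance             mutex-protected overflow pool for CBV descriptors
 *
 *   struct desc_object_caches             one per thread per device, reached via tls_key
 *     .view_desc_cache.rebalance     -->  &device->view_desc_rebalance
 *     .cbuffer_desc_cache.rebalance  -->  &device->cbuffer_desc_rebalance
 *
 * Only the desc_rebalance pools are touched under a mutex; the per-thread
 * vkd3d_desc_object_cache arrays need no synchronisation.
 */
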
@@ -1777,6 +1835,7 @@ struct vkd3d_queue *d3d12_device_get_vkd3d_queue(struct d3d12_device *device, D3
 bool d3d12_device_is_uma(struct d3d12_device *device, bool *coherent);
 void d3d12_device_mark_as_removed(struct d3d12_device *device, HRESULT reason,
         const char *message, ...) VKD3D_PRINTF_FUNC(3, 4);
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device);
 struct d3d12_device *unsafe_impl_from_ID3D12Device5(ID3D12Device5 *iface);

 static inline HRESULT d3d12_device_query_interface(struct d3d12_device *device, REFIID iid, void **object)
diff --git a/tests/d3d12.c b/tests/d3d12.c
index 86e8a8b1..d61309e9 100644
--- a/tests/d3d12.c
+++ b/tests/d3d12.c
@@ -36343,6 +36343,8 @@ static void test_readback_map_stability(void)
     ID3D12Resource_Unmap(buffer, 0, NULL);

     ID3D12Resource_Release(buffer);
+
+    destroy_test_context(&context);
 }

 static void test_vs_ps_relative_addressing(void)