A TLS index is allocated for each device object. We could use a single global TLS index instead, but that would share caches across all devices, and I'm not sure whether that's a problem. One index per device means freed indices must be recycled, which requires a global mutex, but a global TLS index would also need a global mutex.
This currently leaks cache memory when a device is freed. Fixing that requires tracking the allocations. A global cache would be even more difficult to free, which is not normally an issue, but it is when running under Valgrind.
From: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/device.c        |  87 +++++++++++++++++++++++++-------
 libs/vkd3d/resource.c      | 100 +++++++++++++++++++------------------
 libs/vkd3d/vkd3d_private.h |  75 +++++++++++++++++++++++++---
 tests/d3d12.c              |   2 +
 4 files changed, 191 insertions(+), 73 deletions(-)
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index d8c94fbf..c49847f0 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2434,26 +2434,65 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device)
     device->vk_pool_count = 6;
 };

-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size)
+struct global_tls_key_cache
 {
-    memset(cache, 0, sizeof(*cache));
-    cache->size = size;
+    struct vkd3d_mutex mutex;
+    struct vkd3d_tls_key *keys;
+    size_t key_capacity;
+    size_t key_count;
+};
+
+static struct global_tls_key_cache key_cache = {{VKD3D_MUTEX_INITIALIZER}, NULL, 0, 0};
+
+static int global_tls_key_cache_get(struct vkd3d_tls_key *key)
+{
+    int rc = 0;
+
+    vkd3d_mutex_lock(&key_cache.mutex);
+
+    if (key_cache.key_count)
+        *key = key_cache.keys[--key_cache.key_count];
+    else
+        rc = vkd3d_tls_key_create(key);
+
+    vkd3d_mutex_unlock(&key_cache.mutex);
+
+    return rc;
 }

-static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache)
+static void global_tls_key_cache_put(struct vkd3d_tls_key *key)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
-    void *next;
+    vkd3d_mutex_lock(&key_cache.mutex);

-    for (i = 0; i < ARRAY_SIZE(cache->heads); ++i)
+    if (!vkd3d_array_reserve((void **)&key_cache.keys, &key_cache.key_capacity, key_cache.key_count + 1,
+            sizeof(*key_cache.keys)))
     {
-        for (u.object = cache->heads[i].head; u.object; u.object = next)
-        {
-            next = u.header->next;
-            vkd3d_free(u.object);
-        }
+        ERR("Failed to cache TLS key for reuse.\n");
+    }
+    else
+    {
+        vkd3d_tls_key_set_value(key, NULL);
+        key_cache.keys[key_cache.key_count++] = *key;
     }
+
+    vkd3d_mutex_unlock(&key_cache.mutex);
+}
+
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device)
+{
+    struct desc_object_caches *caches = vkd3d_tls_key_get_value(&device->tls_key);
+
+    if (caches)
+        return caches;
+
+    caches = vkd3d_calloc(1, sizeof(*caches));
+    vkd3d_tls_key_set_value(&device->tls_key, caches);
+    caches->view_desc_cache.size = sizeof(struct vkd3d_view);
+    caches->view_desc_cache.rebalance = &device->view_desc_rebalance;
+    caches->cbuffer_desc_cache.size = sizeof(struct vkd3d_cbuffer_desc);
+    caches->cbuffer_desc_cache.rebalance = &device->cbuffer_desc_rebalance;
+
+    return caches;
 }

 /* ID3D12Device */
@@ -2520,12 +2559,11 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device5 *iface)
         vkd3d_render_pass_cache_cleanup(&device->render_pass_cache, device);
         d3d12_device_destroy_pipeline_cache(device);
         d3d12_device_destroy_vkd3d_queues(device);
-        vkd3d_desc_object_cache_cleanup(&device->view_desc_cache);
-        vkd3d_desc_object_cache_cleanup(&device->cbuffer_desc_cache);
         VK_CALL(vkDestroyDevice(device->vk_device, NULL));
         if (device->parent)
             IUnknown_Release(device->parent);
         vkd3d_instance_decref(device->vkd3d_instance);
+        global_tls_key_cache_put(&device->tls_key);

         vkd3d_free(device);
     }
@@ -4208,11 +4246,18 @@ struct d3d12_device *unsafe_impl_from_ID3D12Device5(ID3D12Device5 *iface)
     return impl_from_ID3D12Device5(iface);
 }

+static void desc_rebalance_init(struct desc_rebalance *rebalance)
+{
+    memset(rebalance, 0, sizeof(*rebalance));
+    vkd3d_mutex_init(&rebalance->mutex);
+}
+
 static HRESULT d3d12_device_init(struct d3d12_device *device,
         struct vkd3d_instance *instance, const struct vkd3d_device_create_info *create_info)
 {
     const struct vkd3d_vk_device_procs *vk_procs;
     HRESULT hr;
+    int rc;

     device->ID3D12Device5_iface.lpVtbl = &d3d12_device_vtbl;
     device->refcount = 1;
@@ -4255,8 +4300,14 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
     device->blocked_queue_count = 0;
     vkd3d_mutex_init(&device->blocked_queues_mutex);
-    vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view));
-    vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc));
+    if ((rc = global_tls_key_cache_get(&device->tls_key)) < 0)
+    {
+        ERR("Failed to allocate TLS key, rc %d.\n", rc);
+        hr = E_FAIL;
+        goto out_cleanup_descriptor_heap_layouts;
+    }
+    desc_rebalance_init(&device->view_desc_rebalance);
+    desc_rebalance_init(&device->cbuffer_desc_rebalance);
     device_init_descriptor_pool_sizes(device);

@@ -4265,6 +4316,8 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,

     return S_OK;

+out_cleanup_descriptor_heap_layouts:
+    vkd3d_vk_descriptor_heap_layouts_cleanup(device);
 out_cleanup_uav_clear_state:
     vkd3d_uav_clear_state_cleanup(&device->uav_clear_state, device);
 out_destroy_null_resources:
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 14fb24a9..a1430dc0 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2282,72 +2282,71 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource)
     return d3d12_resource_decref(impl_from_ID3D12Resource(resource));
 }

-#define HEAD_INDEX_MASK (ARRAY_SIZE(cache->heads) - 1)
+static const unsigned int REBALANCE_SIZE = 64;

-/* Objects are cached so that vkd3d_view_incref() can safely check the refcount of an
- * object freed by another thread. This could be implemented as a single atomic linked
- * list, but it requires handling the ABA problem, which brings issues with cross-platform
- * support, compiler support, and non-universal x86-64 support for 128-bit CAS. */
 static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
+    struct desc_rebalance *rebalance;
+    void *object;

-    STATIC_ASSERT(!(ARRAY_SIZE(cache->heads) & HEAD_INDEX_MASK));
+    if (cache->count)
+        return cache->data[--cache->count];

-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
-    {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-        {
-            if ((u.object = cache->heads[i].head))
-            {
-                vkd3d_atomic_decrement(&cache->free_count);
-                cache->heads[i].head = u.header->next;
-                vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-                return u.object;
-            }
-            vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-        }
-        /* Keeping a free count avoids uncertainty over when this loop should terminate,
-         * which could result in excess allocations gradually increasing without limit. */
-        if (cache->free_count < ARRAY_SIZE(cache->heads))
-            return vkd3d_malloc(cache->size);
+    cache->max_count += cache->prev_rebalance_count;
+    cache->prev_rebalance_count = 0;

-        i = (i + 1) & HEAD_INDEX_MASK;
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);
+
+    if (rebalance->count)
+    {
+        rebalance->count -= REBALANCE_SIZE;
+        vkd3d_array_reserve((void **)&cache->data, &cache->capacity, REBALANCE_SIZE, sizeof(*cache->data));
+        memcpy(cache->data, &rebalance->data[rebalance->count], REBALANCE_SIZE * sizeof(*cache->data));
+        cache->count = REBALANCE_SIZE;
+        object = cache->data[--cache->count];
+    }
+    else
+    {
+        object = vkd3d_malloc(cache->size);
     }
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
+    return object;
 }

 static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object)
 {
-    union d3d12_desc_object u = {object};
-    unsigned int i;
-    void *head;
+    struct desc_rebalance *rebalance;

-    /* Using the same index as above may result in a somewhat uneven distribution,
-     * but the main objective is to avoid costly spinlock contention. */
-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
-    {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-            break;
-        i = (i + 1) & HEAD_INDEX_MASK;
-    }
+    vkd3d_array_reserve((void **)&cache->data, &cache->capacity, max(cache->count + 1, 64u), sizeof(*cache->data));
+    cache->data[cache->count++] = object;
+
+    if (cache->count < cache->max_count + REBALANCE_SIZE)
+        return;
+
+    cache->count -= REBALANCE_SIZE;
+    cache->prev_rebalance_count = REBALANCE_SIZE;
+
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);

-    head = cache->heads[i].head;
-    u.header->next = head;
-    cache->heads[i].head = u.object;
-    vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-    vkd3d_atomic_increment(&cache->free_count);
+    vkd3d_array_reserve((void **)&rebalance->data, &rebalance->capacity, max(rebalance->count + REBALANCE_SIZE, 1024u),
+            sizeof(*rebalance->data));
+    memcpy(&rebalance->data[rebalance->count], &cache->data[cache->count], REBALANCE_SIZE * sizeof(*rebalance->data));
+    rebalance->count += REBALANCE_SIZE;
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
 }

 #undef HEAD_INDEX_MASK
 static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_cbuffer_desc *desc;

-    if (!(desc = vkd3d_desc_object_cache_get(&device->cbuffer_desc_cache)))
+    if (!(desc = vkd3d_desc_object_cache_get(&caches->cbuffer_desc_cache)))
         return NULL;

     desc->h.magic = VKD3D_DESCRIPTOR_MAGIC_CBV;
@@ -2360,11 +2359,12 @@ static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device
 static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_descriptor_type,
         enum vkd3d_view_type type, struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_view *view;

     assert(magic);

-    if (!(view = vkd3d_desc_object_cache_get(&device->view_desc_cache)))
+    if (!(view = vkd3d_desc_object_cache_get(&caches->view_desc_cache)))
     {
         ERR("Failed to allocate descriptor object.\n");
         return NULL;
@@ -2382,6 +2382,7 @@ static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_
 static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *device)
 {
     const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);

     TRACE("Destroying view %p.\n", view);

@@ -2403,7 +2404,7 @@ static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *dev
     if (view->v.vk_counter_view)
         VK_CALL(vkDestroyBufferView(device->vk_device, view->v.vk_counter_view, NULL));

-    vkd3d_desc_object_cache_push(&device->view_desc_cache, view);
+    vkd3d_desc_object_cache_push(&caches->view_desc_cache, view);
 }

 void vkd3d_view_decref(void *view, struct d3d12_device *device)
@@ -2416,7 +2417,10 @@ void vkd3d_view_decref(void *view, struct d3d12_device *device)
     if (u.header->magic != VKD3D_DESCRIPTOR_MAGIC_CBV)
         vkd3d_view_destroy(u.view, device);
     else
-        vkd3d_desc_object_cache_push(&device->cbuffer_desc_cache, u.object);
+    {
+        struct desc_object_caches *caches = device_get_desc_object_caches(device);
+        vkd3d_desc_object_cache_push(&caches->cbuffer_desc_cache, u.object);
+    }
 }

 static inline void d3d12_desc_replace(struct d3d12_desc *dst, void *view, struct d3d12_device *device)
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 89f8b15e..f261a8af 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -204,11 +204,18 @@ struct vkd3d_mutex
     CRITICAL_SECTION lock;
 };
+#define VKD3D_MUTEX_INITIALIZER {NULL, -1, 0, 0, 0, 0}
+
 struct vkd3d_cond
 {
     CONDITION_VARIABLE cond;
 };

+struct vkd3d_tls_key
+{
+    unsigned int key;
+};
+
 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
     InitializeCriticalSection(&lock->lock);
@@ -254,6 +261,24 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
 {
 }

+static inline int vkd3d_tls_key_create(struct vkd3d_tls_key *key)
+{
+    if ((key->key = TlsAlloc()) == TLS_OUT_OF_INDEXES)
+        return -1;
+    return 0;
+}
+
+static inline int vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    TlsSetValue(key->key, value);
+    return 0;
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    return TlsGetValue(key->key);
+}
+
 static inline unsigned int vkd3d_atomic_increment(unsigned int volatile *x)
 {
     return InterlockedIncrement((LONG volatile *)x);
@@ -299,11 +324,17 @@ struct vkd3d_mutex
     pthread_mutex_t lock;
 };

+#define VKD3D_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+
 struct vkd3d_cond
 {
     pthread_cond_t cond;
 };

+struct vkd3d_tls_key
+{
+    pthread_key_t key;
+};
+
 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
@@ -386,6 +417,21 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
         ERR("Could not destroy the condition variable, error %d.\n", ret);
 }

+static inline int vkd3d_tls_key_create(struct vkd3d_tls_key *key)
+{
+    return pthread_key_create(&key->key, NULL);
+}
+
+static inline int vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    return pthread_setspecific(key->key, value);
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    return pthread_getspecific(key->key);
+}
+
 # if HAVE_SYNC_SUB_AND_FETCH
 static inline unsigned int vkd3d_atomic_decrement(unsigned int volatile *x)
 {
@@ -1692,20 +1738,31 @@ struct vkd3d_uav_clear_state

 HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
-struct desc_object_cache_head
+struct desc_rebalance
 {
-    void *head;
-    unsigned int spinlock;
+    void **data;
+    size_t capacity;
+    size_t count;
+    struct vkd3d_mutex mutex;
 };

 struct vkd3d_desc_object_cache
 {
-    struct desc_object_cache_head heads[16];
-    unsigned int next_index;
-    unsigned int free_count;
+    void **data;
+    size_t capacity;
+    size_t count;
+    size_t max_count;
+    size_t prev_rebalance_count;
+    struct desc_rebalance *rebalance;
     size_t size;
 };

+struct desc_object_caches
+{
+    struct vkd3d_desc_object_cache view_desc_cache;
+    struct vkd3d_desc_object_cache cbuffer_desc_cache;
+};
+
 #define VKD3D_DESCRIPTOR_POOL_COUNT 6

 /* ID3D12Device */
@@ -1723,8 +1780,9 @@ struct d3d12_device
     struct vkd3d_gpu_va_allocator gpu_va_allocator;

     struct vkd3d_mutex mutex;
-    struct vkd3d_desc_object_cache view_desc_cache;
-    struct vkd3d_desc_object_cache cbuffer_desc_cache;
+    struct vkd3d_tls_key tls_key;
+    struct desc_rebalance view_desc_rebalance;
+    struct desc_rebalance cbuffer_desc_rebalance;
     struct vkd3d_render_pass_cache render_pass_cache;
     VkPipelineCache vk_pipeline_cache;

@@ -1777,6 +1835,7 @@ struct vkd3d_queue *d3d12_device_get_vkd3d_queue(struct d3d12_device *device, D3
 bool d3d12_device_is_uma(struct d3d12_device *device, bool *coherent);
 void d3d12_device_mark_as_removed(struct d3d12_device *device, HRESULT reason,
         const char *message, ...) VKD3D_PRINTF_FUNC(3, 4);
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device);
 struct d3d12_device *unsafe_impl_from_ID3D12Device5(ID3D12Device5 *iface);

 static inline HRESULT d3d12_device_query_interface(struct d3d12_device *device, REFIID iid, void **object)
diff --git a/tests/d3d12.c b/tests/d3d12.c
index 86e8a8b1..d61309e9 100644
--- a/tests/d3d12.c
+++ b/tests/d3d12.c
@@ -36343,6 +36343,8 @@ static void test_readback_map_stability(void)
     ID3D12Resource_Unmap(buffer, 0, NULL);

     ID3D12Resource_Release(buffer);
+
+    destroy_test_context(&context);
 }

 static void test_vs_ps_relative_addressing(void)
Mainly looking for comment on global vs per device. Windows lacks an equivalent of pthread's callback for freeing memory, so we would need to track devices and free cache memory when the last is released.
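For reference, the pthread callback in question is the key destructor; a minimal sketch of the POSIX mechanism, not part of this patch:

```c
#include <pthread.h>
#include <stdlib.h>

static pthread_key_t cache_key;

/* Called by the runtime on thread exit with the thread's non-NULL
 * value; this is where a per-thread cache is freed on POSIX. */
static void cache_destructor(void *value)
{
    free(value);
}

static int cache_key_init(void)
{
    return pthread_key_create(&cache_key, cache_destructor);
}
```

TlsAlloc() on Windows has no such callback, which is what forces the device tracking described above.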
I don't think there's necessarily a problem with using per-device caches, although it seems slightly harder than the global option at first sight. If the issue is purely getting notified about thread exit on Windows: DllMain() will get called with DLL_THREAD_DETACH on thread exit.
On Tue Oct 3 05:32:10 2023 +0000, Henri Verbeet wrote:
Mainly looking for comment on global vs per device. Windows lacks an equivalent of pthread's callback for freeing memory, so we would need to track devices and free cache memory when the last is released.

I don't think there's necessarily a problem with using per-device caches, although it seems slightly harder than the global option at first sight. If the issue is purely getting notified about thread exit on Windows: DllMain() will get called with DLL_THREAD_DETACH on thread exit.
I figured it best to avoid adding platform-specific handling of thread detachment. The PE build is statically linked into wined3d, so it would need to go in wined3d's `DllMain()`, right?
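For illustration, the detach hook under discussion would look roughly like this; `vkd3d_free_thread_caches()` is a hypothetical helper that would walk the live devices and free the exiting thread's caches, and in the statically linked PE case this would indeed have to sit in the host module's `DllMain()`:

```c
#include <windows.h>

/* Hypothetical helper: frees this thread's desc_object_caches for
 * every live device. Not part of the patch. */
extern void vkd3d_free_thread_caches(void);

BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved)
{
    (void)instance;
    (void)reserved;

    if (reason == DLL_THREAD_DETACH)
        vkd3d_free_thread_caches();

    return TRUE;
}
```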
We could perhaps also just use FlsAlloc(). Technically that would make this a per-fiber cache on Windows, but I don't think we particularly care about the distinction for our usage.
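A sketch of what `vkd3d_tls_key_create()` might look like on top of FlsAlloc(); unlike TlsAlloc(), FlsAlloc() accepts a callback that the system invokes with the slot's non-NULL value on fiber (or thread) exit, much like a pthread key destructor. The callback name here is illustrative:

```c
#include <windows.h>

struct vkd3d_tls_key
{
    DWORD key;
};

/* Invoked by the system with the slot's non-NULL value when the
 * fiber, or the thread for non-fiber threads, exits. */
static void WINAPI vkd3d_desc_caches_cleanup(void *value)
{
    HeapFree(GetProcessHeap(), 0, value);
}

static inline int vkd3d_tls_key_create(struct vkd3d_tls_key *key)
{
    if ((key->key = FlsAlloc(vkd3d_desc_caches_cleanup)) == FLS_OUT_OF_INDEXES)
        return -1;
    return 0;
}
```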
On Thu Oct 5 09:16:57 2023 +0000, Conor McCarthy wrote:
Mainly looking for comment on global vs per device. Windows lacks an equivalent of pthread's callback for freeing memory, so we would need to track devices and free cache memory when the last is released.
I guess that `thread_local`, even if we agreed to jump to C11, would have the same problem?
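For reference: plain C11 `_Thread_local` has no destructor hook, so it would share the problem. C11's optional `<threads.h>` does provide `tss_create()` with a destructor, mirroring pthread keys, though toolchain support for it is uneven. A minimal sketch:

```c
#include <threads.h>
#include <stdlib.h>

static tss_t cache_key;

/* Run with the thread's non-NULL value on thread exit, like a
 * pthread key destructor. */
static void cache_dtor(void *value)
{
    free(value);
}

static int cache_key_init(void)
{
    return tss_create(&cache_key, cache_dtor) == thrd_success ? 0 : -1;
}
```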