The PE build uses FlsAlloc(), which for our purposes makes no difference compared with TlsAlloc(), but additionally allows the use of a destruction callback.
-- v5: vkd3d: Replace the descriptor object cache with a thread-local implementation.
From: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/device.c        | 104 +++++++++++++++++++++++++++++-----
 libs/vkd3d/resource.c      | 113 +++++++++++++++++++++----------------
 libs/vkd3d/vkd3d_private.h | 101 ++++++++++++++++++++++++++++++---
 3 files changed, 245 insertions(+), 73 deletions(-)
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 2d7051f3..f55cdaa6 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2434,28 +2434,96 @@ static void device_init_descriptor_pool_sizes(struct d3d12_device *device)
     device->vk_pool_count = 6;
 };

-static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size)
+static void vkd3d_desc_object_cache_init(struct vkd3d_desc_object_cache *cache, size_t size,
+        struct desc_rebalance *rebalance)
 {
-    memset(cache, 0, sizeof(*cache));
     cache->size = size;
+    cache->rebalance = rebalance;
 }

 static void vkd3d_desc_object_cache_cleanup(struct vkd3d_desc_object_cache *cache)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
-    void *next;
+    size_t i;
+
+    for (i = 0; i < cache->count; ++i)
+        vkd3d_free(cache->data[i]);
+    vkd3d_free(cache->data);
+}
+
+static void desc_rebalance_init(struct desc_rebalance *rebalance)
+{
+    vkd3d_mutex_init(&rebalance->mutex);
+}
+
+static
+#ifdef _WIN32
+CALLBACK
+#endif
+void global_tls_key_destroy(void *value)
+{
+    struct desc_object_caches *caches = value;

-    for (i = 0; i < ARRAY_SIZE(cache->heads); ++i)
+    if (caches)
     {
-        for (u.object = cache->heads[i].head; u.object; u.object = next)
-        {
-            next = u.header->next;
-            vkd3d_free(u.object);
-        }
+        vkd3d_desc_object_cache_cleanup(&caches->view_desc_cache);
+        vkd3d_desc_object_cache_cleanup(&caches->cbuffer_desc_cache);
+        vkd3d_free(caches);
     }
 }

+static struct desc_rebalance view_desc_rebalance;
+static struct desc_rebalance cbuffer_desc_rebalance;
+
+static bool global_tls_key_get(struct vkd3d_tls_key *key)
+{
+    static struct vkd3d_mutex key_cache_mutex = {VKD3D_MUTEX_INITIALIZER};
+    static struct vkd3d_tls_key tls_key;
+    static bool tls_initialised;
+    bool ret = true;
+
+    vkd3d_mutex_lock(&key_cache_mutex);
+
+    if (!tls_initialised)
+    {
+        ret = vkd3d_tls_key_create(&tls_key, global_tls_key_destroy);
+        desc_rebalance_init(&view_desc_rebalance);
+        desc_rebalance_init(&cbuffer_desc_rebalance);
+        tls_initialised = true;
+    }
+
+    vkd3d_mutex_unlock(&key_cache_mutex);
+
+    *key = tls_key;
+    return ret;
+}
+
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device)
+{
+    struct desc_object_caches *caches = vkd3d_tls_key_get_value(&device->tls_key);
+
+    if (caches)
+        return caches;
+
+    if (!(caches = vkd3d_calloc(1, sizeof(*caches))))
+    {
+        ERR("Failed to allocate caches.\n");
+        /* Returning null here is not worth the cost of handling it in callers. */
+        abort();
+    }
+
+    vkd3d_desc_object_cache_init(&caches->view_desc_cache, sizeof(struct vkd3d_view), &view_desc_rebalance);
+    vkd3d_desc_object_cache_init(&caches->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc),
+            &cbuffer_desc_rebalance);
+
+    if (!vkd3d_tls_key_set_value(&device->tls_key, caches))
+    {
+        ERR("Failed to store descriptor object cache pointer.\n");
+        abort();
+    }
+
+    return caches;
+}
+
 /* ID3D12Device */
 static inline struct d3d12_device *impl_from_ID3D12Device5(ID3D12Device5 *iface)
 {
@@ -2520,8 +2588,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device5 *iface)
         vkd3d_render_pass_cache_cleanup(&device->render_pass_cache, device);
         d3d12_device_destroy_pipeline_cache(device);
         d3d12_device_destroy_vkd3d_queues(device);
-        vkd3d_desc_object_cache_cleanup(&device->view_desc_cache);
-        vkd3d_desc_object_cache_cleanup(&device->cbuffer_desc_cache);
         VK_CALL(vkDestroyDevice(device->vk_device, NULL));
         if (device->parent)
             IUnknown_Release(device->parent);
@@ -4276,8 +4342,14 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
     device->blocked_queue_count = 0;
     vkd3d_mutex_init(&device->blocked_queues_mutex);

-    vkd3d_desc_object_cache_init(&device->view_desc_cache, sizeof(struct vkd3d_view));
-    vkd3d_desc_object_cache_init(&device->cbuffer_desc_cache, sizeof(struct vkd3d_cbuffer_desc));
+    /* Store a copy of the TLS key in the device object. The object is often accessed
+     * for other reasons after the key is read, so reading it is less likely to cause
+     * an additional cache miss than reading the global value would be. */
+    if (!global_tls_key_get(&device->tls_key))
+    {
+        hr = E_FAIL;
+        goto out_cleanup_descriptor_heap_layouts;
+    }

     device_init_descriptor_pool_sizes(device);

@@ -4286,6 +4358,8 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,

     return S_OK;

+out_cleanup_descriptor_heap_layouts:
+    vkd3d_vk_descriptor_heap_layouts_cleanup(device);
 out_cleanup_uav_clear_state:
     vkd3d_uav_clear_state_cleanup(&device->uav_clear_state, device);
 out_destroy_null_resources:
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 8898c247..b6023d24 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -2290,72 +2290,80 @@ ULONG vkd3d_resource_decref(ID3D12Resource *resource)
     return d3d12_resource_decref(impl_from_ID3D12Resource(resource));
 }

-#define HEAD_INDEX_MASK (ARRAY_SIZE(cache->heads) - 1)
+static const unsigned int REBALANCE_SIZE = 64;
+/* Rebalance threshold expands by one extra REBALANCE_SIZE per (threshold / REBALANCE_DIVISOR). */
+static const unsigned int REBALANCE_DIVISOR = 512;
+/* Avoid some initial reallocations. */
+static const unsigned int REBALANCE_MIN_ALLOCATION = REBALANCE_SIZE * 16;

-/* Objects are cached so that vkd3d_view_incref() can safely check the refcount of an
- * object freed by another thread. This could be implemented as a single atomic linked
- * list, but it requires handling the ABA problem, which brings issues with cross-platform
- * support, compiler support, and non-universal x86-64 support for 128-bit CAS. */
 static void *vkd3d_desc_object_cache_get(struct vkd3d_desc_object_cache *cache)
 {
-    union d3d12_desc_object u;
-    unsigned int i;
+    struct desc_rebalance *rebalance;
+    void *object;

-    STATIC_ASSERT(!(ARRAY_SIZE(cache->heads) & HEAD_INDEX_MASK));
+    if (cache->count)
+        return cache->data[--cache->count];

-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
-    {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-        {
-            if ((u.object = cache->heads[i].head))
-            {
-                vkd3d_atomic_decrement(&cache->free_count);
-                cache->heads[i].head = u.header->next;
-                vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-                return u.object;
-            }
-            vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-        }
-        /* Keeping a free count avoids uncertainty over when this loop should terminate,
-         * which could result in excess allocations gradually increasing without limit. */
-        if (cache->free_count < ARRAY_SIZE(cache->heads))
-            return vkd3d_malloc(cache->size);
+    /* For performance reasons it is critical to minimise rebalancing activity because
+     * it occurs while a single mutex is locked, and therefore can bottleneck descriptor updates.
+     * The goal is to raise rebalance_threshold until the cache no longer runs out of objects
+     * after sending a chunk to the rebalance array. If the cache is empty then a previously
+     * sent rebalance chunk should not have been sent. Raise the threshold for sending chunks. */
+    cache->rebalance_threshold += cache->threshold_expansion;
+    cache->threshold_expansion = 0;

-        i = (i + 1) & HEAD_INDEX_MASK;
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);
+
+    if (rebalance->count)
+    {
+        assert(rebalance->count >= REBALANCE_SIZE);
+        rebalance->count -= REBALANCE_SIZE;
+        vkd3d_array_reserve((void **)&cache->data, &cache->capacity, REBALANCE_SIZE, sizeof(*cache->data));
+        memcpy(cache->data, &rebalance->data[rebalance->count], REBALANCE_SIZE * sizeof(*cache->data));
+        cache->count = REBALANCE_SIZE - 1;
+        object = cache->data[REBALANCE_SIZE - 1];
+    }
+    else
+    {
+        object = vkd3d_malloc(cache->size);
     }
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
+    return object;
 }

 static void vkd3d_desc_object_cache_push(struct vkd3d_desc_object_cache *cache, void *object)
 {
-    union d3d12_desc_object u = {object};
-    unsigned int i;
-    void *head;
+    struct desc_rebalance *rebalance;

-    /* Using the same index as above may result in a somewhat uneven distribution,
-     * but the main objective is to avoid costly spinlock contention. */
-    i = (vkd3d_atomic_increment(&cache->next_index)) & HEAD_INDEX_MASK;
-    for (;;)
-    {
-        if (vkd3d_atomic_compare_exchange(&cache->heads[i].spinlock, 0, 1))
-            break;
-        i = (i + 1) & HEAD_INDEX_MASK;
-    }
+    vkd3d_array_reserve((void **)&cache->data, &cache->capacity, max(cache->count + 1, REBALANCE_SIZE),
+            sizeof(*cache->data));
+    cache->data[cache->count++] = object;

-    head = cache->heads[i].head;
-    u.header->next = head;
-    cache->heads[i].head = u.object;
-    vkd3d_atomic_exchange(&cache->heads[i].spinlock, 0);
-    vkd3d_atomic_increment(&cache->free_count);
-}
+    if (cache->count < cache->rebalance_threshold + REBALANCE_SIZE)
+        return;

-#undef HEAD_INDEX_MASK
+    cache->count -= REBALANCE_SIZE;
+    cache->threshold_expansion = REBALANCE_SIZE * (1 + cache->rebalance_threshold / REBALANCE_DIVISOR);
+
+    rebalance = cache->rebalance;
+    vkd3d_mutex_lock(&rebalance->mutex);
+
+    vkd3d_array_reserve((void **)&rebalance->data, &rebalance->capacity,
+            max(rebalance->count + REBALANCE_SIZE, REBALANCE_MIN_ALLOCATION), sizeof(*rebalance->data));
+    memcpy(&rebalance->data[rebalance->count], &cache->data[cache->count], REBALANCE_SIZE * sizeof(*rebalance->data));
+    rebalance->count += REBALANCE_SIZE;
+
+    vkd3d_mutex_unlock(&rebalance->mutex);
+}

 static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_cbuffer_desc *desc;

-    if (!(desc = vkd3d_desc_object_cache_get(&device->cbuffer_desc_cache)))
+    if (!(desc = vkd3d_desc_object_cache_get(&caches->cbuffer_desc_cache)))
         return NULL;

     desc->h.magic = VKD3D_DESCRIPTOR_MAGIC_CBV;
@@ -2368,11 +2376,12 @@ static struct vkd3d_cbuffer_desc *vkd3d_cbuffer_desc_create(struct d3d12_device
 static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_descriptor_type,
         enum vkd3d_view_type type, struct d3d12_device *device)
 {
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);
     struct vkd3d_view *view;

     assert(magic);

-    if (!(view = vkd3d_desc_object_cache_get(&device->view_desc_cache)))
+    if (!(view = vkd3d_desc_object_cache_get(&caches->view_desc_cache)))
     {
         ERR("Failed to allocate descriptor object.\n");
         return NULL;
@@ -2390,6 +2399,7 @@ static struct vkd3d_view *vkd3d_view_create(uint32_t magic, VkDescriptorType vk_
 static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *device)
 {
     const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+    struct desc_object_caches *caches = device_get_desc_object_caches(device);

     TRACE("Destroying view %p.\n", view);

@@ -2411,7 +2421,7 @@ static void vkd3d_view_destroy(struct vkd3d_view *view, struct d3d12_device *dev
     if (view->v.vk_counter_view)
         VK_CALL(vkDestroyBufferView(device->vk_device, view->v.vk_counter_view, NULL));

-    vkd3d_desc_object_cache_push(&device->view_desc_cache, view);
+    vkd3d_desc_object_cache_push(&caches->view_desc_cache, view);
 }

 void vkd3d_view_decref(void *view, struct d3d12_device *device)
@@ -2424,7 +2434,10 @@ void vkd3d_view_decref(void *view, struct d3d12_device *device)
     if (u.header->magic != VKD3D_DESCRIPTOR_MAGIC_CBV)
         vkd3d_view_destroy(u.view, device);
     else
-        vkd3d_desc_object_cache_push(&device->cbuffer_desc_cache, u.object);
+    {
+        struct desc_object_caches *caches = device_get_desc_object_caches(device);
+        vkd3d_desc_object_cache_push(&caches->cbuffer_desc_cache, u.object);
+    }
 }

 static inline void d3d12_desc_replace(struct d3d12_desc *dst, void *view, struct d3d12_device *device)
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 2e9845df..85d2df0e 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -205,11 +205,18 @@ struct vkd3d_mutex
     CRITICAL_SECTION lock;
 };

+#define VKD3D_MUTEX_INITIALIZER {NULL, -1, 0, 0, 0, 0}
+
 struct vkd3d_cond
 {
     CONDITION_VARIABLE cond;
 };

+struct vkd3d_tls_key
+{
+    unsigned int key;
+};
+
 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
     InitializeCriticalSection(&lock->lock);
@@ -255,6 +262,35 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
 {
 }

+typedef PFLS_CALLBACK_FUNCTION vkd3d_tls_destructor_fn;
+
+static inline bool vkd3d_tls_key_create(struct vkd3d_tls_key *key, vkd3d_tls_destructor_fn destructor)
+{
+    if ((key->key = FlsAlloc(destructor)) == FLS_OUT_OF_INDEXES)
+    {
+        ERR("Failed to allocate TLS key.\n.");
+        return false;
+    }
+    return true;
+}
+
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    if (!FlsSetValue(key->key, value))
+    {
+        ERR("Failed to allocate TLS key, err %u.\n.", GetLastError());
+        return false;
+    }
+    return true;
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    /* The descriptor object cache uses this function, which means performance is too
+     * critical to allow use of SetLastError() and GetLastError(). */
+    return FlsGetValue(key->key);
+}
+
 static inline unsigned int vkd3d_atomic_increment(unsigned int volatile *x)
 {
     return InterlockedIncrement((LONG volatile *)x);
@@ -300,11 +336,17 @@ struct vkd3d_mutex
     pthread_mutex_t lock;
 };

+#define VKD3D_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+
 struct vkd3d_cond
 {
     pthread_cond_t cond;
 };

+struct vkd3d_tls_key
+{
+    pthread_key_t key;
+};
+
 static inline void vkd3d_mutex_init(struct vkd3d_mutex *lock)
 {
@@ -387,6 +429,37 @@ static inline void vkd3d_cond_destroy(struct vkd3d_cond *cond)
         ERR("Could not destroy the condition variable, error %d.\n", ret);
 }

+typedef void (*vkd3d_tls_destructor_fn)(void *);
+
+static inline bool vkd3d_tls_key_create(struct vkd3d_tls_key *key, vkd3d_tls_destructor_fn destructor)
+{
+    int rc = pthread_key_create(&key->key, destructor);
+    if (rc < 0)
+    {
+        ERR("Failed to allocate TLS key, rc %d.\n.", rc);
+        return false;
+    }
+    return true;
+}
+
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+{
+    int rc = pthread_setspecific(key->key, value);
+    if (rc < 0)
+    {
+        ERR("Failed to set TLS key value, rc %d.\n.", rc);
+        return false;
+    }
+    return true;
+}
+
+static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
+{
+    /* The descriptor object cache uses this function, which means performance is too
+     * critical to allow use of SetLastError() and GetLastError(). */
+    return pthread_getspecific(key->key);
+}
+
 # if HAVE_SYNC_SUB_AND_FETCH
 static inline unsigned int vkd3d_atomic_decrement(unsigned int volatile *x)
 {
@@ -1694,20 +1767,31 @@ struct vkd3d_uav_clear_state
 HRESULT vkd3d_uav_clear_state_init(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);
 void vkd3d_uav_clear_state_cleanup(struct vkd3d_uav_clear_state *state, struct d3d12_device *device);

-struct desc_object_cache_head
+struct desc_rebalance
 {
-    void *head;
-    unsigned int spinlock;
+    void **data;
+    size_t capacity;
+    size_t count;
+    struct vkd3d_mutex mutex;
 };

 struct vkd3d_desc_object_cache
 {
-    struct desc_object_cache_head heads[16];
-    unsigned int next_index;
-    unsigned int free_count;
+    void **data;
+    size_t capacity;
+    size_t count;
+    size_t rebalance_threshold;
+    size_t threshold_expansion;
+    struct desc_rebalance *rebalance;
     size_t size;
 };

+struct desc_object_caches
+{
+    struct vkd3d_desc_object_cache view_desc_cache;
+    struct vkd3d_desc_object_cache cbuffer_desc_cache;
+};
+
 #define VKD3D_DESCRIPTOR_POOL_COUNT 6

 /* ID3D12Device */
@@ -1722,11 +1806,11 @@ struct d3d12_device
     PFN_vkd3d_signal_event signal_event;
     size_t wchar_size;

+    struct vkd3d_tls_key tls_key;
+
     struct vkd3d_gpu_va_allocator gpu_va_allocator;

     struct vkd3d_mutex mutex;
-    struct vkd3d_desc_object_cache view_desc_cache;
-    struct vkd3d_desc_object_cache cbuffer_desc_cache;
     struct vkd3d_render_pass_cache render_pass_cache;
     VkPipelineCache vk_pipeline_cache;

@@ -1779,6 +1863,7 @@ struct vkd3d_queue *d3d12_device_get_vkd3d_queue(struct d3d12_device *device, D3
 bool d3d12_device_is_uma(struct d3d12_device *device, bool *coherent);
 void d3d12_device_mark_as_removed(struct d3d12_device *device, HRESULT reason,
         const char *message, ...) VKD3D_PRINTF_FUNC(3, 4);
+struct desc_object_caches *device_get_desc_object_caches(struct d3d12_device *device);
 struct d3d12_device *unsafe_impl_from_ID3D12Device5(ID3D12Device5 *iface);

 static inline HRESULT d3d12_device_query_interface(struct d3d12_device *device, REFIID iid, void **object)
I added comments to clarify where performance affects design.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/vkd3d_private.h:
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
 {
- return pthread_setspecific(key->key, value);
- int rc = pthread_setspecific(key->key, value);
- if (rc < 0)
- {
ERR("Failed to set TLS key value, rc %d.\n.", rc);
return false;
- }
- return true;
}
static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
{
- /* The descriptor object cache uses this function, which means performance is too
* critical to allow use of SetLastError() and GetLastError(). */
That's wrong; as far as I can tell, `pthread_getspecific()` simply doesn't offer an interface to report whether the key was valid or not.
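For reference, a minimal standalone demo of that behaviour (hypothetical, not part of the patch or the review): the POSIX getter has no error channel at all, so there is nothing comparable to `GetLastError()` to avoid in the first place.

```c
/* pthread_getspecific() cannot distinguish "no value set" from an error; it
 * just returns NULL, and passing an invalid key is undefined behaviour per
 * POSIX. Only the setter reports errors, via its return value. */
#include <pthread.h>
#include <stdio.h>

int main(void)
{
    static int object;
    pthread_key_t key;

    if (pthread_key_create(&key, NULL))
        return 1;

    /* No value set yet: NULL is returned, and no error code is available. */
    printf("before set: %p\n", pthread_getspecific(key));

    if (pthread_setspecific(key, &object)) /* Only the setter reports errors. */
        return 1;
    printf("after set:  %p\n", pthread_getspecific(key));

    pthread_key_delete(key);
    return 0;
}
```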
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/vkd3d_private.h:
- return 0;
- {
ERR("Failed to allocate TLS key.\n.");
return false;
- }
- return true;
}
-static inline int vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
 {
- FlsSetValue(key->key, value);
- return 0;
- if (!FlsSetValue(key->key, value))
- {
ERR("Failed to allocate TLS key, err %u.\n.", GetLastError());
Wrong error message.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/vkd3d_private.h:
- if (rc < 0)
- {
ERR("Failed to allocate TLS key, rc %d.\n.", rc);
return false;
- }
- return true;
}
-static inline int vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
 {
- return pthread_setspecific(key->key, value);
- int rc = pthread_setspecific(key->key, value);
- if (rc < 0)
- {
ERR("Failed to set TLS key value, rc %d.\n.", rc);
Wrong error message.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/vkd3d_private.h:
+static inline bool vkd3d_tls_key_set_value(const struct vkd3d_tls_key *key, void *value)
 {
- FlsSetValue(key->key, value);
- return 0;
- if (!FlsSetValue(key->key, value))
- {
ERR("Failed to allocate TLS key, err %u.\n.", GetLastError());
return false;
- }
- return true;
}
static inline void *vkd3d_tls_key_get_value(const struct vkd3d_tls_key *key)
{
- /* The descriptor object cache uses this function, which means performance is too
* critical to allow use of SetLastError() and GetLastError(). */
It's a bit annoying that `GetLastError()` and `SetLastError()` resolve to a function call instead of just getting and setting a field on the TEB. Their overhead is still quite small, I guess, so I can pass on this.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/vkd3d_private.h:
CRITICAL_SECTION lock;
};
+#define VKD3D_MUTEX_INITIALIZER {NULL, -1, 0, 0, 0, 0}
This should probably have another pair of braces around it: the outer brace pair initializes the `vkd3d_mutex` and the inner one initializes the `CRITICAL_SECTION` member.
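For illustration, the doubly-braced form being suggested would look like this sketch (same field values as in the patch):

```c
/* Outer braces initialize struct vkd3d_mutex; the inner braces initialize its
 * CRITICAL_SECTION member. */
#define VKD3D_MUTEX_INITIALIZER {{NULL, -1, 0, 0, 0, 0}}
```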
Giovanni Mascellani (@giomasce) commented about libs/vkd3d/resource.c:
i = (i + 1) & HEAD_INDEX_MASK;
- rebalance = cache->rebalance;
- vkd3d_mutex_lock(&rebalance->mutex);
- if (rebalance->count)
- {
assert(rebalance->count >= REBALANCE_SIZE);
rebalance->count -= REBALANCE_SIZE;
vkd3d_array_reserve((void **)&cache->data, &cache->capacity, REBALANCE_SIZE, sizeof(*cache->data));
memcpy(cache->data, &rebalance->data[rebalance->count], REBALANCE_SIZE * sizeof(*cache->data));
cache->count = REBALANCE_SIZE - 1;
object = cache->data[REBALANCE_SIZE - 1];
- }
- else
- {
object = vkd3d_malloc(cache->size);
Given that you already have a thread-local cache, maybe it makes sense to allocate more than a single object each time you need one?
I'm not yet convinced by the rebalancing algorithm, particularly by the fact that `rebalance_threshold` can theoretically grow arbitrarily large and the fact that, independently of how large `rebalance_threshold` might become, you still transfer no more than `REBALANCE_SIZE` objects at each operation. Also, if you transfer objects in packets of `REBALANCE_SIZE` anyway, you could directly allocate them in packets of `REBALANCE_SIZE` and store in the rebalance just one pointer instead of many for each packet (so you waste less time in the critical `memcpy()`'s).
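For illustration, a rough sketch of that packet-based variant (the `desc_packet` type and helper names below are hypothetical and not part of the patch; `vkd3d_mutex_*` and `vkd3d_array_reserve()` are assumed from the existing code):

```c
/* Hypothetical sketch of the "one pointer per packet" idea: a thread fills a
 * packet outside the critical section, and only the packet pointer crosses
 * the mutex. Nothing here appears in the patch. */
#define PACKET_SIZE 64

struct desc_packet
{
    void *objects[PACKET_SIZE];
};

struct desc_rebalance_packets
{
    struct desc_packet **packets;
    size_t capacity;
    size_t count;
    struct vkd3d_mutex mutex;
};

static void rebalance_push_packet(struct desc_rebalance_packets *rebalance, struct desc_packet *packet)
{
    vkd3d_mutex_lock(&rebalance->mutex);
    /* Only a single pointer is appended while the mutex is held. */
    vkd3d_array_reserve((void **)&rebalance->packets, &rebalance->capacity,
            rebalance->count + 1, sizeof(*rebalance->packets));
    rebalance->packets[rebalance->count++] = packet;
    vkd3d_mutex_unlock(&rebalance->mutex);
}

static struct desc_packet *rebalance_pop_packet(struct desc_rebalance_packets *rebalance)
{
    struct desc_packet *packet = NULL;

    vkd3d_mutex_lock(&rebalance->mutex);
    if (rebalance->count)
        packet = rebalance->packets[--rebalance->count];
    vkd3d_mutex_unlock(&rebalance->mutex);

    /* The caller unpacks the objects into its thread-local cache and then
     * reuses or frees the packet allocation itself. */
    return packet;
}
```

The trade-off, as noted further down in the thread, is that each packet is a separate allocation, which the current flat-array scheme avoids.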
On Mon Oct 16 14:34:36 2023 +0000, Giovanni Mascellani wrote:
I'm not yet convinced by the rebalancing algorithm, particularly by the fact that `rebalance_threshold` can theoretically grow arbitrarily large and the fact that, independently of how large `rebalance_threshold` might become, you still transfer no more than `REBALANCE_SIZE` objects at each operation. Also, if you transfer objects in packets of `REBALANCE_SIZE` anyway, you could directly allocate them in packets of `REBALANCE_SIZE` and store in the rebalance just one pointer instead of many for each packet (so you waste less time in the critical `memcpy()`'s).
I looked at this during development. When an object is freed it must be marked as free within its `REBALANCE_SIZE` packet, and the only way to make that object available for reuse is to add the packet to the cache. So we end up adding packets containing only one free object. But we can't send packets to the rebalancing array unless they are completely free, otherwise multiple threads can allocate and free objects within the same packet, and we are back to square one.
It may be slightly advantageous to transfer more than `REBALANCE_SIZE` because of cache coherence and the overhead of mutex locking/unlocking. Do you see any other advantages?
On Mon Oct 16 14:34:36 2023 +0000, Conor McCarthy wrote:
I looked at this during development. When an object is freed it must be marked as free within its `REBALANCE_SIZE` packet, and the only way to make that object available for reuse is to add the packet to the cache. So we end up adding packets containing only one free object. But we can't send packets to the rebalancing array unless they are completely free, otherwise multiple threads can allocate and free objects within the same packet, and we are back to square one. It may be slightly advantageous to transfer more than `REBALANCE_SIZE` because of cache coherence and the overhead of mutex locking/unlocking. Do you see any other advantages?
I wouldn't say arbitrarily large. It grows only if its current size proved insufficient to meet the client's demands, so it can't grow larger than the maximum descriptor allocation made by the client. If the client does silly things with threads, many threads could each have a large allocation, but the total is unlikely to grow exceptionally large.
> I looked at this during development. When an object is freed it must be marked as free within its `REBALANCE_SIZE` packet, and the only way to make that object available for reuse is to add the packet to the cache. So we end up adding packets containing only one free object. But we can't send packets to the rebalancing array unless they are completely free, otherwise multiple threads can allocate and free objects within the same packet, and we are back to square one.
Yeah, I think I wrote that in a confusing way: what I meant is that, instead of storing an array of objects in the rebalance object, you can store an array of pointers to arrays of `REBALANCE_SIZE` objects. When you have `REBALANCE_SIZE` objects to push, you first create an array with all of them outside of the critical region, then inside the critical region you only have to copy one pointer instead of `REBALANCE_SIZE`. But I just realized that means you have to allocate another thing (the array itself), and maybe we don't want to handle that.
> It may be slightly advantageous to transfer more than `REBALANCE_SIZE` because of cache coherence and the overhead of mutex locking/unlocking. Do you see any other advantages?
Not that I am aware of. OTOH it's also true that transferring more than `REBALANCE_SIZE` objects means that the lock can remain held for a longer time, which might be a source of jitter. It's really not clear what the best thing to do here is.