The GPU VA allocator used to allocate memory such that dereferencing a GPU VA required taking a lock and doing a bsearch to find the right VA range.
Rather than going this route, we turn the common case into an O(1), lockless lookup by creating a slab allocator, which lets us map a GPU VA directly to a pointer with (VA - Base) / PageSize.
The number of allocations in the fast path must be limited, since we cannot trivially grow the allocator while keeping dereferences lock-free.
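In essence, the fast path boils down to the following (a sketch only, written against the constants and fields introduced by the patch below; the helper name is made up):

    /* Lockless fast path: a VA in the slab region maps to a fixed-size slot,
     * so dereferencing is a subtract, a shift and an array index, with no
     * lock and no bsearch. */
    static inline void *va_slab_deref_sketch(const struct vkd3d_gpu_va_allocator *allocator,
            D3D12_GPU_VIRTUAL_ADDRESS address)
    {
        uint64_t index = (address - VKD3D_BASE_VA_SLAB) >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
        return allocator->slab_mem_allocations[index].ptr;
    }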
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
---
 libs/vkd3d/device.c        | 240 +++++++++++++++++++++++++++++++------
 libs/vkd3d/resource.c      |   2 +-
 libs/vkd3d/vkd3d_private.h |  31 +++--
 3 files changed, 227 insertions(+), 46 deletions(-)

diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 3da4273..0ecac9a 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -1822,42 +1822,106 @@ static void d3d12_device_destroy_pipeline_cache(struct d3d12_device *device)
     pthread_mutex_destroy(&device->mutex);
 }

-D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
-        size_t size, void *ptr)
+#define VKD3D_MAX_VA_SLAB_ALLOCATIONS (64 * 1024)
+#define VKD3D_BASE_VA_SLAB (0x1000000000ull)
+#define VKD3D_BASE_VA_FALLBACK (0x8000000000000000ull)
+#define VKD3D_SLAB_ALLOCATION_SIZE (0x100000000ull)
+#define VKD3D_SLAB_ALLOCATION_SIZE_LOG2 32
+
+static D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate_fallback(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
 {
     D3D12_GPU_VIRTUAL_ADDRESS ceiling = ~(D3D12_GPU_VIRTUAL_ADDRESS)0;
     struct vkd3d_gpu_va_allocation *allocation;
-    int rc;

-    if ((rc = pthread_mutex_lock(&allocator->mutex)))
+    if (!vkd3d_array_reserve((void **)&allocator->fallback_mem_allocations, &allocator->fallback_mem_allocations_size,
+            allocator->fallback_mem_allocation_count + 1, sizeof(*allocator->fallback_mem_allocations)))
     {
-        ERR("Failed to lock mutex, error %d.\n", rc);
         return 0;
     }

-    if (!vkd3d_array_reserve((void **)&allocator->allocations, &allocator->allocations_size,
-            allocator->allocation_count + 1, sizeof(*allocator->allocations)))
+    allocator->fallback_mem_floor = (allocator->fallback_mem_floor + alignment - 1) & ~((D3D12_GPU_VIRTUAL_ADDRESS)alignment - 1);
+
+    if (size > ceiling || ceiling - size < allocator->fallback_mem_floor)
     {
-        pthread_mutex_unlock(&allocator->mutex);
         return 0;
     }

-    if (size > ceiling || ceiling - size < allocator->floor)
+    allocation = &allocator->fallback_mem_allocations[allocator->fallback_mem_allocation_count++];
+    allocation->base = allocator->fallback_mem_floor;
+    allocation->size = size;
+    allocation->ptr = ptr;
+
+    /* This pointer is bumped and never lowered on a free.
+     * However, this will only fail once we have exhausted 63 bits of address space. */
+    allocator->fallback_mem_floor += size;
+
+    return allocation->base;
+}
+
+static D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate_slab(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
+{
+    int rc;
+    unsigned vacant_index;
+    D3D12_GPU_VIRTUAL_ADDRESS virtual_address = 0;
+
+    if ((rc = pthread_mutex_lock(&allocator->mutex)))
     {
-        pthread_mutex_unlock(&allocator->mutex);
+        ERR("Failed to lock mutex, error %d.\n", rc);
         return 0;
     }

-    allocation = &allocator->allocations[allocator->allocation_count++];
-    allocation->base = allocator->floor;
-    allocation->size = size;
-    allocation->ptr = ptr;
+    TRACE("Allocating %zu bytes (%zu align) of VA from slab allocator.\n", size, alignment);
+    if (allocator->mem_vacant_count > 0)
+    {
+        vacant_index = allocator->mem_vacant[--allocator->mem_vacant_count];
+
+        /* It is critical that the multiplication happens in 64-bit to not overflow. */
+        virtual_address = VKD3D_BASE_VA_SLAB + vacant_index * VKD3D_SLAB_ALLOCATION_SIZE;
+        TRACE("Allocating VA: 0x%llx: vacant index %u from slab.\n",
+                (unsigned long long)virtual_address, vacant_index);
+        assert(!allocator->slab_mem_allocations[vacant_index].ptr);
+        allocator->slab_mem_allocations[vacant_index].ptr = ptr;
+        allocator->slab_mem_allocations[vacant_index].size = size;
+    }

-    allocator->floor += size;
+    if (virtual_address == 0)
+    {
+        TRACE("Slab allocator is empty, allocating %zu bytes (%zu align) of VA from fallback allocator.\n",
+                size, alignment);
+        /* Fall back to slow allocator. */
+        virtual_address = vkd3d_gpu_va_allocator_allocate_fallback(allocator, size, alignment, ptr);
+    }

     pthread_mutex_unlock(&allocator->mutex);
+    return virtual_address;
+}

-    return allocation->base;
+D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS virtual_address;
+    int rc;
+    size_t aligned_size;
+
+    aligned_size = size > alignment ? size : alignment;
+
+    if (aligned_size > VKD3D_SLAB_ALLOCATION_SIZE)
+    {
+        /* For massive VA allocations, go straight to high-mem with a slower allocator. */
+        if ((rc = pthread_mutex_lock(&allocator->mutex)))
+        {
+            ERR("Failed to lock mutex, error %d.\n", rc);
+            return 0;
+        }
+        virtual_address = vkd3d_gpu_va_allocator_allocate_fallback(allocator, size, alignment, ptr);
+        pthread_mutex_unlock(&allocator->mutex);
+    }
+    else
+        virtual_address = vkd3d_gpu_va_allocator_allocate_slab(allocator, size, alignment, ptr);
+
+    return virtual_address;
 }

 static int vkd3d_gpu_va_allocation_compare(const void *k, const void *e)
@@ -1872,24 +1936,93 @@ static int vkd3d_gpu_va_allocation_compare(const void *k, const void *e)
     return 0;
 }

+static void *vkd3d_gpu_va_allocator_dereference_slab(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base_offset;
+    uint64_t base_index;
+    const struct vkd3d_gpu_va_slab_entry *slab;
+
+    base_offset = address - VKD3D_BASE_VA_SLAB;
+    base_index = base_offset >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
+    if (base_index >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Accessed slab size class out of range.\n");
+        return NULL;
+    }
+
+    slab = &allocator->slab_mem_allocations[base_index];
+    base_offset -= base_index * VKD3D_SLAB_ALLOCATION_SIZE;
+    if (base_offset >= slab->size)
+    {
+        ERR("Accessed slab out of range.\n");
+        return NULL;
+    }
+    return slab->ptr;
+}
+
+static void vkd3d_gpu_va_allocator_free_slab(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base_offset;
+    unsigned base_index;
+    struct vkd3d_gpu_va_slab_entry *slab;
+
+    base_offset = address - VKD3D_BASE_VA_SLAB;
+    base_index = base_offset >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
+
+    if (base_index >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Accessed slab size class out of range.\n");
+        return;
+    }
+
+    slab = &allocator->slab_mem_allocations[base_index];
+    if (slab->ptr == NULL)
+    {
+        ERR("Attempting to free NULL VA.\n");
+        return;
+    }
+
+    if (allocator->mem_vacant_count >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Invalid free, slab size class is fully freed.\n");
+        return;
+    }
+
+    TRACE("Freeing VA: 0x%llx: index %u from slab.\n",
+            (unsigned long long)address, base_index);
+
+    slab->ptr = NULL;
+    allocator->mem_vacant[allocator->mem_vacant_count++] = base_index;
+}
+
 void *vkd3d_gpu_va_allocator_dereference(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
 {
     struct vkd3d_gpu_va_allocation *allocation;
     int rc;

-    if ((rc = pthread_mutex_lock(&allocator->mutex)))
+    /* If we land in the non-fallback region, dereferencing VA is lockless. The base pointer is immutable,
+     * and only way we can have a data race is if some other thread is poking into the slab_mem_allocation[class][base_index] block.
+     * This can only happen if someone is trying to free the entry while we're dereferencing, which would be a serious app bug. */
+    if (address < VKD3D_BASE_VA_FALLBACK)
     {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return NULL;
+        return vkd3d_gpu_va_allocator_dereference_slab(allocator, address);
     }
+    else
+    {
+        /* Slow fallback. */
+        if ((rc = pthread_mutex_lock(&allocator->mutex)))
+        {
+            ERR("Failed to lock mutex, error %d.\n", rc);
+            return NULL;
+        }

-    allocation = bsearch(&address, allocator->allocations, allocator->allocation_count,
-            sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
-
-    pthread_mutex_unlock(&allocator->mutex);
+        allocation = bsearch(&address, allocator->fallback_mem_allocations, allocator->fallback_mem_allocation_count,
+                sizeof(*allocation), vkd3d_gpu_va_allocation_compare);

-    return allocation ? allocation->ptr : NULL;
+        pthread_mutex_unlock(&allocator->mutex);
+        return allocation ? allocation->ptr : NULL;
+    }
 }

 void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
@@ -1904,16 +2037,23 @@ void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12
         return;
     }

-    allocation = bsearch(&address, allocator->allocations, allocator->allocation_count,
-            sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
-    if (allocation && allocation->base == address)
+    if (address < VKD3D_BASE_VA_FALLBACK)
     {
-        index = allocation - allocator->allocations;
-        --allocator->allocation_count;
-        if (index != allocator->allocation_count)
+        vkd3d_gpu_va_allocator_free_slab(allocator, address);
+    }
+    else
+    {
+        allocation = bsearch(&address, allocator->fallback_mem_allocations, allocator->fallback_mem_allocation_count,
+                sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
+        if (allocation && allocation->base == address)
         {
-            memmove(&allocator->allocations[index], &allocator->allocations[index + 1],
-                    (allocator->allocation_count - index) * sizeof(*allocation));
+            index = allocation - allocator->fallback_mem_allocations;
+            --allocator->fallback_mem_allocation_count;
+            if (index != allocator->fallback_mem_allocation_count)
+            {
+                memmove(&allocator->fallback_mem_allocations[index], &allocator->fallback_mem_allocations[index + 1],
+                        (allocator->fallback_mem_allocation_count - index) * sizeof(*allocation));
+            }
         }
     }

@@ -1923,29 +2063,59 @@ void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12
 static bool vkd3d_gpu_va_allocator_init(struct vkd3d_gpu_va_allocator *allocator)
 {
     int rc;
+    int i;

     memset(allocator, 0, sizeof(*allocator));
-    allocator->floor = 0x1000;
+    allocator->fallback_mem_floor = VKD3D_BASE_VA_FALLBACK;
+
+    /* To remain lock-less, we cannot grow these lists after the fact. If we commit to a maximum number of allocations
+     * here, we can dereference without taking a lock as the base pointer never changes.
+     * We would be able to grow more seamlessly using an array of pointers,
+     * but would make dereferencing slightly less efficient. */
+    allocator->slab_mem_allocations = vkd3d_calloc(VKD3D_MAX_VA_SLAB_ALLOCATIONS, sizeof(*allocator->slab_mem_allocations));
+    if (!allocator->slab_mem_allocations)
+        goto error;
+
+    /* Otherwise we need 32-bit indices. */
+    assert(VKD3D_MAX_VA_SLAB_ALLOCATIONS <= 64 * 1024);
+
+    allocator->mem_vacant = vkd3d_malloc(VKD3D_MAX_VA_SLAB_ALLOCATIONS * sizeof(uint16_t));
+    if (!allocator->mem_vacant)
+        goto error;
+
+    /* Build a stack of which slab indices are available for allocation.
+     * Place lowest indices last (first to be popped off stack). */
+    for (i = 0; i < VKD3D_MAX_VA_SLAB_ALLOCATIONS; i++)
+        allocator->mem_vacant[i] = (VKD3D_MAX_VA_SLAB_ALLOCATIONS - 1) - i;
+    allocator->mem_vacant_count = VKD3D_MAX_VA_SLAB_ALLOCATIONS;

     if ((rc = pthread_mutex_init(&allocator->mutex, NULL)))
     {
         ERR("Failed to initialize mutex, error %d.\n", rc);
-        return false;
+        goto error;
     }

     return true;
+
+error:
+    vkd3d_free(allocator->slab_mem_allocations);
+    vkd3d_free(allocator->mem_vacant);
+    return false;
 }

 static void vkd3d_gpu_va_allocator_cleanup(struct vkd3d_gpu_va_allocator *allocator)
 {
     int rc;

+    vkd3d_free(allocator->slab_mem_allocations);
+    vkd3d_free(allocator->mem_vacant);
+
     if ((rc = pthread_mutex_lock(&allocator->mutex)))
     {
         ERR("Failed to lock mutex, error %d.\n", rc);
         return;
     }
-    vkd3d_free(allocator->allocations);
+    vkd3d_free(allocator->fallback_mem_allocations);
     pthread_mutex_unlock(&allocator->mutex);
     pthread_mutex_destroy(&allocator->mutex);
 }
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index ccd1230..6c9564b 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -1710,7 +1710,7 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
             &resource->desc, &resource->u.vk_buffer)))
         return hr;
     if (!(resource->gpu_address = vkd3d_gpu_va_allocator_allocate(&device->gpu_va_allocator,
-            desc->Width, resource)))
+            desc->Width, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT, resource)))
     {
         ERR("Failed to allocate GPU VA.\n");
         d3d12_resource_destroy(resource, device);
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 59f0eac..a5f7c81 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -202,24 +202,35 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker,
 HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker,
         struct d3d12_device *device) DECLSPEC_HIDDEN;

+struct vkd3d_gpu_va_allocation
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base;
+    SIZE_T size;
+    void *ptr;
+};
+
+struct vkd3d_gpu_va_slab_entry
+{
+    void *ptr;
+    SIZE_T size;
+};
+
 struct vkd3d_gpu_va_allocator
 {
     pthread_mutex_t mutex;

-    D3D12_GPU_VIRTUAL_ADDRESS floor;
+    struct vkd3d_gpu_va_slab_entry *slab_mem_allocations;
+    uint16_t *mem_vacant;
+    size_t mem_vacant_count;

-    struct vkd3d_gpu_va_allocation
-    {
-        D3D12_GPU_VIRTUAL_ADDRESS base;
-        SIZE_T size;
-        void *ptr;
-    } *allocations;
-    size_t allocations_size;
-    size_t allocation_count;
+    struct vkd3d_gpu_va_allocation *fallback_mem_allocations;
+    size_t fallback_mem_allocations_size;
+    size_t fallback_mem_allocation_count;
+    D3D12_GPU_VIRTUAL_ADDRESS fallback_mem_floor;
 };

 D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
-        size_t size, void *ptr) DECLSPEC_HIDDEN;
+        size_t size, size_t alignment, void *ptr) DECLSPEC_HIDDEN;
 void *vkd3d_gpu_va_allocator_dereference(struct vkd3d_gpu_va_allocator *allocator,
         D3D12_GPU_VIRTUAL_ADDRESS address) DECLSPEC_HIDDEN;
 void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator,
Greatly reduces the number of VA allocations we have to make, makes the returned VAs more sensible, and better matches the VAs we see returned by native drivers.
D3D12 usage flags for buffers seem generic enough that there is no obvious benefit to placing smaller VkBuffers on top of VkDeviceMemory.
Ideally, physical_buffer_address would be used here, but this also works as a good fallback if that path is added later.
With this patch and the previous VA optimization, I'm observing a 2.0-2.5% FPS uplift in SOTTR when CPU-bound.
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
---
 libs/vkd3d/command.c       |   9 +--
 libs/vkd3d/device.c        |   2 +
 libs/vkd3d/resource.c      | 135 ++++++++++++++++++++++++++++++++-----
 libs/vkd3d/vkd3d_private.h |   2 +
 4 files changed, 128 insertions(+), 20 deletions(-)

diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index d420863..dabdbb5 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -3031,8 +3031,8 @@ static void STDMETHODCALLTYPE d3d12_command_list_CopyBufferRegion(ID3D12Graphics
     d3d12_command_list_end_current_render_pass(list);

-    buffer_copy.srcOffset = src_offset;
-    buffer_copy.dstOffset = dst_offset;
+    buffer_copy.srcOffset = src_offset + src_resource->heap_offset;
+    buffer_copy.dstOffset = dst_offset + dst_resource->heap_offset;
     buffer_copy.size = byte_count;

     VK_CALL(vkCmdCopyBuffer(list->vk_command_buffer,
@@ -3450,8 +3450,8 @@ static void STDMETHODCALLTYPE d3d12_command_list_CopyResource(ID3D12GraphicsComm
     assert(d3d12_resource_is_buffer(src_resource));
     assert(src_resource->desc.Width == dst_resource->desc.Width);

-    vk_buffer_copy.srcOffset = 0;
-    vk_buffer_copy.dstOffset = 0;
+    vk_buffer_copy.srcOffset = src_resource->heap_offset;
+    vk_buffer_copy.dstOffset = dst_resource->heap_offset;
     vk_buffer_copy.size = dst_resource->desc.Width;
     VK_CALL(vkCmdCopyBuffer(list->vk_command_buffer,
             src_resource->u.vk_buffer, dst_resource->u.vk_buffer, 1, &vk_buffer_copy));
@@ -4076,6 +4076,7 @@ static void d3d12_command_list_set_root_cbv(struct d3d12_command_list *list,
     resource = vkd3d_gpu_va_allocator_dereference(&list->device->gpu_va_allocator, gpu_address);
     buffer_info.buffer = resource->u.vk_buffer;
+    buffer_info.offset = gpu_address - resource->gpu_address;
     buffer_info.range = resource->desc.Width - buffer_info.offset;
     buffer_info.range = min(buffer_info.range, vk_info->device_limits.maxUniformBufferRange);
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 0ecac9a..47e90e6 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -1334,6 +1334,8 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
     device->feature_options.CrossAdapterRowMajorTextureSupported = FALSE;
     /* SPV_EXT_shader_viewport_index_layer */
     device->feature_options.VPAndRTArrayIndexFromAnyShaderFeedingRasterizerSupportedWithoutGSEmulation = FALSE;
+
+    /* FIXME: Does this actually work on NV which has 64k bufferImage alignment quirks with VkDeviceMemory? */
     device->feature_options.ResourceHeapTier = D3D12_RESOURCE_HEAP_TIER_2;

     if ((vr = VK_CALL(vkEnumerateDeviceExtensionProperties(physical_device, NULL, &count, NULL))) < 0)
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 6c9564b..557d5e1 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -292,6 +292,8 @@ static ULONG STDMETHODCALLTYPE d3d12_heap_AddRef(ID3D12Heap *iface)
     return refcount;
 }

+static ULONG d3d12_resource_decref(struct d3d12_resource *resource);
+
 static void d3d12_heap_destroy(struct d3d12_heap *heap)
 {
     struct d3d12_device *device = heap->device;
@@ -299,6 +301,9 @@ static void d3d12_heap_destroy(struct d3d12_heap *heap)
     TRACE("Destroying heap %p.\n", heap);

+    if (heap->buffer_resource)
+        d3d12_resource_decref(heap->buffer_resource);
+
     vkd3d_private_store_destroy(&heap->private_store);

     VK_CALL(vkFreeMemory(device->vk_device, heap->vk_memory, NULL));
@@ -539,6 +544,12 @@ static HRESULT validate_heap_desc(const D3D12_HEAP_DESC *desc, const struct d3d1
     return S_OK;
 }

+static HRESULT d3d12_resource_create(struct d3d12_device *device,
+        const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags,
+        const D3D12_RESOURCE_DESC *desc, D3D12_RESOURCE_STATES initial_state,
+        const D3D12_CLEAR_VALUE *optimized_clear_value, bool placed,
+        struct d3d12_resource **resource);
+
 static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
         struct d3d12_device *device, const D3D12_HEAP_DESC *desc, const struct d3d12_resource *resource)
 {
@@ -546,6 +557,10 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     VkDeviceSize vk_memory_size;
     HRESULT hr;
     int rc;
+    bool buffers_allowed;
+    D3D12_RESOURCE_DESC resource_desc;
+    D3D12_RESOURCE_STATES initial_resource_state;
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;

     heap->ID3D12Heap_iface.lpVtbl = &d3d12_heap_vtbl;
     heap->refcount = 1;
@@ -556,6 +571,7 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     heap->map_ptr = NULL;
     heap->map_count = 0;
+    heap->buffer_resource = NULL;

     if (!heap->desc.Properties.CreationNodeMask)
         heap->desc.Properties.CreationNodeMask = 1;
@@ -583,6 +599,53 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
         return hr;
     }

+    buffers_allowed = !(heap->desc.Flags & D3D12_HEAP_FLAG_DENY_BUFFERS);
+    if (buffers_allowed && !resource)
+    {
+        /* Create a single omnipotent buffer which fills the entire heap.
+         * Whenever we place buffer resources on this heap, we'll just offset this VkBuffer.
+         * This allows us to keep VA space somewhat sane, and keeps number of (limited) VA allocations down.
+         * One possible downside is that the buffer might be slightly slower to access,
+         * but D3D12 has very lenient usage flags for buffers. */
+
+        memset(&resource_desc, 0, sizeof(resource_desc));
+        resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+        resource_desc.Width = desc->SizeInBytes;
+        resource_desc.Height = 1;
+        resource_desc.DepthOrArraySize = 1;
+        resource_desc.MipLevels = 1;
+        resource_desc.SampleDesc.Count = 1;
+        resource_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+
+        switch (desc->Properties.Type)
+        {
+            case D3D12_HEAP_TYPE_UPLOAD:
+                initial_resource_state = D3D12_RESOURCE_STATE_GENERIC_READ;
+                break;
+
+            case D3D12_HEAP_TYPE_READBACK:
+                initial_resource_state = D3D12_RESOURCE_STATE_COPY_DEST;
+                break;
+
+            default:
+                /* Upload and readback heaps do not allow UAV access, only enable this flag for other heaps. */
+                resource_desc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
+                initial_resource_state = D3D12_RESOURCE_STATE_COMMON;
+                break;
+        }
+
+        if (FAILED(hr = d3d12_resource_create(device, &desc->Properties, desc->Flags,
+                &resource_desc, initial_resource_state,
+                NULL, false, &heap->buffer_resource)))
+        {
+            heap->buffer_resource = NULL;
+            return hr;
+        }
+        /* This internal resource should not own a reference on the device.
+         * d3d12_resource_create takes a reference on the device. */
+        d3d12_device_release(device);
+    }
+
     if (resource)
     {
         if (d3d12_resource_is_buffer(resource))
@@ -600,12 +663,19 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
         heap->desc.SizeInBytes = vk_memory_size;
     }
+    else if (heap->buffer_resource)
+    {
+        hr = vkd3d_allocate_buffer_memory(device, heap->buffer_resource->u.vk_buffer,
+                &heap->desc.Properties, heap->desc.Flags,
+                &heap->vk_memory, &heap->vk_memory_type, &vk_memory_size);
+    }
     else
     {
+        /* Allocate generic memory which should hopefully match up with whatever resources
+         * we want to place here. */
         memory_requirements.size = heap->desc.SizeInBytes;
         memory_requirements.alignment = heap->desc.Alignment;
         memory_requirements.memoryTypeBits = ~(uint32_t)0;
-
         hr = vkd3d_allocate_device_memory(device, &heap->desc.Properties,
                 heap->desc.Flags, &memory_requirements, NULL,
                 &heap->vk_memory, &heap->vk_memory_type);
@@ -614,6 +684,11 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     {
         vkd3d_private_store_destroy(&heap->private_store);
         pthread_mutex_destroy(&heap->mutex);
+        if (heap->buffer_resource)
+        {
+            d3d12_resource_decref(heap->buffer_resource);
+            heap->buffer_resource = NULL;
+        }
         return hr;
     }

@@ -1003,13 +1078,16 @@ static void d3d12_resource_destroy(struct d3d12_resource *resource, struct d3d12
     if (resource->flags & VKD3D_RESOURCE_EXTERNAL)
         return;

-    if (resource->gpu_address)
-        vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);
+    if (!(resource->flags & VKD3D_RESOURCE_PLACED_BUFFER))
+    {
+        if (resource->gpu_address)
+            vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);

-    if (d3d12_resource_is_buffer(resource))
-        VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
-    else
-        VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+        if (d3d12_resource_is_buffer(resource))
+            VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
+        else
+            VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+    }

     if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
         d3d12_heap_destroy(resource->heap);
@@ -1669,7 +1747,7 @@ static bool d3d12_resource_validate_heap_properties(const struct d3d12_resource
 static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12_device *device,
         const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags,
         const D3D12_RESOURCE_DESC *desc, D3D12_RESOURCE_STATES initial_state,
-        const D3D12_CLEAR_VALUE *optimized_clear_value)
+        const D3D12_CLEAR_VALUE *optimized_clear_value, bool placed)
 {
     HRESULT hr;

@@ -1699,6 +1777,8 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
     resource->gpu_address = 0;
     resource->flags = 0;
+    if (placed && desc->Dimension == D3D12_RESOURCE_DIMENSION_BUFFER)
+        resource->flags |= VKD3D_RESOURCE_PLACED_BUFFER;

     if (FAILED(hr = d3d12_resource_validate_desc(&resource->desc)))
         return hr;
@@ -1706,6 +1786,13 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
     switch (desc->Dimension)
     {
         case D3D12_RESOURCE_DIMENSION_BUFFER:
+            /* We'll inherit a VkBuffer reference from the heap with an implied offset. */
+            if (placed)
+            {
+                resource->u.vk_buffer = VK_NULL_HANDLE;
+                break;
+            }
+
             if (FAILED(hr = vkd3d_create_buffer(device, heap_properties, heap_flags,
                     &resource->desc, &resource->u.vk_buffer)))
                 return hr;
@@ -1755,7 +1842,7 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
 static HRESULT d3d12_resource_create(struct d3d12_device *device,
         const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags,
         const D3D12_RESOURCE_DESC *desc, D3D12_RESOURCE_STATES initial_state,
-        const D3D12_CLEAR_VALUE *optimized_clear_value, struct d3d12_resource **resource)
+        const D3D12_CLEAR_VALUE *optimized_clear_value, bool placed, struct d3d12_resource **resource)
 {
     struct d3d12_resource *object;
     HRESULT hr;
@@ -1764,7 +1851,7 @@ static HRESULT d3d12_resource_create(struct d3d12_device *device,
         return E_OUTOFMEMORY;

     if (FAILED(hr = d3d12_resource_init(object, device, heap_properties, heap_flags,
-            desc, initial_state, optimized_clear_value)))
+            desc, initial_state, optimized_clear_value, placed)))
     {
         vkd3d_free(object);
         return hr;
@@ -1806,7 +1893,7 @@ HRESULT d3d12_committed_resource_create(struct d3d12_device *device,
     }

     if (FAILED(hr = d3d12_resource_create(device, heap_properties, heap_flags,
-            desc, initial_state, optimized_clear_value, &object)))
+            desc, initial_state, optimized_clear_value, false, &object)))
         return hr;

     if (FAILED(hr = vkd3d_allocate_resource_memory(device, object, heap_properties, heap_flags)))
@@ -1830,6 +1917,16 @@ static HRESULT vkd3d_bind_heap_memory(struct d3d12_device *device,
     VkMemoryRequirements requirements;
     VkResult vr;

+    if (resource->flags & VKD3D_RESOURCE_PLACED_BUFFER)
+    {
+        /* Just inherit the buffer from the heap. */
+        resource->u.vk_buffer = heap->buffer_resource->u.vk_buffer;
+        resource->heap = heap;
+        resource->heap_offset = heap_offset;
+        resource->gpu_address = heap->buffer_resource->gpu_address + heap_offset;
+        return S_OK;
+    }
+
     if (d3d12_resource_is_buffer(resource))
         VK_CALL(vkGetBufferMemoryRequirements(vk_device, resource->u.vk_buffer, &requirements));
     else
@@ -1879,7 +1976,7 @@ HRESULT d3d12_placed_resource_create(struct d3d12_device *device, struct d3d12_h
     HRESULT hr;

     if (FAILED(hr = d3d12_resource_create(device, &heap->desc.Properties, heap->desc.Flags,
-            desc, initial_state, optimized_clear_value, &object)))
+            desc, initial_state, optimized_clear_value, true, &object)))
         return hr;

     if (FAILED(hr = vkd3d_bind_heap_memory(device, object, heap, heap_offset)))
@@ -1903,7 +2000,7 @@ HRESULT d3d12_reserved_resource_create(struct d3d12_device *device,
     HRESULT hr;

     if (FAILED(hr = d3d12_resource_create(device, NULL, 0,
-            desc, initial_state, optimized_clear_value, &object)))
+            desc, initial_state, optimized_clear_value, false, &object)))
         return hr;

     TRACE("Created reserved resource %p.\n", object);
@@ -2205,7 +2302,7 @@ static bool vkd3d_create_buffer_view_for_resource(struct d3d12_device *device,
     assert(d3d12_resource_is_buffer(resource));

     return vkd3d_create_buffer_view(device, resource->u.vk_buffer,
-            format, offset * element_size, size * element_size, view);
+            format, resource->heap_offset + offset * element_size, size * element_size, view);
 }

 static void vkd3d_set_view_swizzle_for_format(VkComponentMapping *components,
@@ -2807,7 +2904,7 @@ static void vkd3d_create_buffer_uav(struct d3d12_desc *descriptor, struct d3d12_
     format = vkd3d_get_format(device, DXGI_FORMAT_R32_UINT, false);
     if (!vkd3d_create_vk_buffer_view(device, counter_resource->u.vk_buffer, format,
-            desc->u.Buffer.CounterOffsetInBytes, sizeof(uint32_t), &view->vk_counter_view))
+            desc->u.Buffer.CounterOffsetInBytes + resource->heap_offset, sizeof(uint32_t), &view->vk_counter_view))
     {
         WARN("Failed to create counter buffer view.\n");
         view->vk_counter_view = VK_NULL_HANDLE;
@@ -2913,12 +3010,18 @@ bool vkd3d_create_raw_buffer_view(struct d3d12_device *device,
 {
     const struct vkd3d_format *format;
     struct d3d12_resource *resource;
+    uint64_t range;
+    uint64_t offset;

     format = vkd3d_get_format(device, DXGI_FORMAT_R32_UINT, false);
     resource = vkd3d_gpu_va_allocator_dereference(&device->gpu_va_allocator, gpu_address);
     assert(d3d12_resource_is_buffer(resource));
+
+    offset = gpu_address - resource->gpu_address;
+    range = min(resource->desc.Width - offset, device->vk_info.device_limits.maxStorageBufferRange);
+
     return vkd3d_create_vk_buffer_view(device, resource->u.vk_buffer, format,
-            gpu_address - resource->gpu_address, VK_WHOLE_SIZE, vk_buffer_view);
+            offset, range, vk_buffer_view);
 }

 /* samplers */
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index a5f7c81..2b1ae30 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -380,6 +380,7 @@ struct d3d12_heap
     unsigned int map_count;
     uint32_t vk_memory_type;

+    struct d3d12_resource *buffer_resource;
     struct d3d12_device *device;

     struct vkd3d_private_store private_store;
@@ -394,6 +395,7 @@ struct d3d12_heap *unsafe_impl_from_ID3D12Heap(ID3D12Heap *iface) DECLSPEC_HIDDE
 #define VKD3D_RESOURCE_EXTERNAL       0x00000004
 #define VKD3D_RESOURCE_DEDICATED_HEAP 0x00000008
 #define VKD3D_RESOURCE_LINEAR_TILING  0x00000010
+#define VKD3D_RESOURCE_PLACED_BUFFER  0x00000020

 /* ID3D12Resource */
 struct d3d12_resource
On Tue, 8 Oct 2019 at 12:00, Hans-Kristian Arntzen post@arntzen-software.no wrote:
@@ -4076,6 +4076,7 @@ static void d3d12_command_list_set_root_cbv(struct d3d12_command_list *list,
     resource = vkd3d_gpu_va_allocator_dereference(&list->device->gpu_va_allocator, gpu_address);
     buffer_info.buffer = resource->u.vk_buffer;
+    buffer_info.offset = gpu_address - resource->gpu_address;
     buffer_info.range = resource->desc.Width - buffer_info.offset;
     buffer_info.range = min(buffer_info.range, vk_info->device_limits.maxUniformBufferRange);
Stray whitespace change.
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
@@ -1334,6 +1334,8 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
     device->feature_options.CrossAdapterRowMajorTextureSupported = FALSE;
     /* SPV_EXT_shader_viewport_index_layer */
     device->feature_options.VPAndRTArrayIndexFromAnyShaderFeedingRasterizerSupportedWithoutGSEmulation = FALSE;
+
+    /* FIXME: Does this actually work on NV which has 64k bufferImage alignment quirks with VkDeviceMemory? */
     device->feature_options.ResourceHeapTier = D3D12_RESOURCE_HEAP_TIER_2;
Well, does it?
@@ -546,6 +557,10 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     VkDeviceSize vk_memory_size;
     HRESULT hr;
     int rc;
+    bool buffers_allowed;
+    D3D12_RESOURCE_DESC resource_desc;
+    D3D12_RESOURCE_STATES initial_resource_state;
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
"vk_procs" is unused.
@@ -1003,13 +1078,16 @@ static void d3d12_resource_destroy(struct d3d12_resource *resource, struct d3d12
     if (resource->flags & VKD3D_RESOURCE_EXTERNAL)
         return;

-    if (resource->gpu_address)
-        vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);
+    if (!(resource->flags & VKD3D_RESOURCE_PLACED_BUFFER))
+    {
+        if (resource->gpu_address)
+            vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);

-    if (d3d12_resource_is_buffer(resource))
-        VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
-    else
-        VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+        if (d3d12_resource_is_buffer(resource))
+            VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
+        else
+            VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+    }

     if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
         d3d12_heap_destroy(resource->heap);
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
I pushed a new patchset.
On 10/22/19 3:04 PM, Henri Verbeet wrote:
On Tue, 8 Oct 2019 at 12:00, Hans-Kristian Arntzen post@arntzen-software.no wrote:
@@ -4076,6 +4076,7 @@ static void d3d12_command_list_set_root_cbv(struct d3d12_command_list *list,
     resource = vkd3d_gpu_va_allocator_dereference(&list->device->gpu_va_allocator, gpu_address);
     buffer_info.buffer = resource->u.vk_buffer;
+    buffer_info.offset = gpu_address - resource->gpu_address;
     buffer_info.range = resource->desc.Width - buffer_info.offset;
     buffer_info.range = min(buffer_info.range, vk_info->device_limits.maxUniformBufferRange);
Stray whitespace change.
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
I fixed the copy/clear commands, but I'm a bit concerned the test suite did not catch this. I changed the UAV clear test to use placed resources at least in a follow-up patch.
@@ -1334,6 +1334,8 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
     device->feature_options.CrossAdapterRowMajorTextureSupported = FALSE;
     /* SPV_EXT_shader_viewport_index_layer */
     device->feature_options.VPAndRTArrayIndexFromAnyShaderFeedingRasterizerSupportedWithoutGSEmulation = FALSE;
+
+    /* FIXME: Does this actually work on NV which has 64k bufferImage alignment quirks with VkDeviceMemory? */
     device->feature_options.ResourceHeapTier = D3D12_RESOURCE_HEAP_TIER_2;
Well, does it?
Removed the comment, but there should be some rationale for why vkd3d does this. NV has a bufferImageGranularity of 64k, and if apps place buffers and images within that alignment, there is implicit memory aliasing. Then again, even if TIER_1 is advertised, apps don't really seem to care and create heaps with all uses enabled anyway. I'll just leave it until I see a real app having problems with this.
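For reference, the padding a TIER_1-style placement scheme would otherwise need looks roughly like this (a sketch of the general idea only; the helper is hypothetical and not code from vkd3d):

    /* Round a placement up to the next bufferImageGranularity boundary
     * (64k on NV) whenever a linear resource would otherwise share a
     * granularity page with a non-linear one, which is what causes the
     * implicit aliasing described above. */
    static VkDeviceSize pad_for_buffer_image_granularity(VkDeviceSize offset,
            VkDeviceSize granularity, bool prev_is_linear, bool next_is_linear)
    {
        if (prev_is_linear != next_is_linear)
            offset = (offset + granularity - 1) & ~(granularity - 1);
        return offset;
    }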
@@ -546,6 +557,10 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     VkDeviceSize vk_memory_size;
     HRESULT hr;
     int rc;
+    bool buffers_allowed;
+    D3D12_RESOURCE_DESC resource_desc;
+    D3D12_RESOURCE_STATES initial_resource_state;
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
"vk_procs" is unused.
@@ -1003,13 +1078,16 @@ static void d3d12_resource_destroy(struct d3d12_resource *resource, struct d3d12
     if (resource->flags & VKD3D_RESOURCE_EXTERNAL)
         return;

-    if (resource->gpu_address)
-        vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);
+    if (!(resource->flags & VKD3D_RESOURCE_PLACED_BUFFER))
+    {
+        if (resource->gpu_address)
+            vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);

-    if (d3d12_resource_is_buffer(resource))
-        VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
-    else
-        VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+        if (d3d12_resource_is_buffer(resource))
+            VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
+        else
+            VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+    }

     if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
         d3d12_heap_destroy(resource->heap);
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
Cheers,
Hans-Kristian
On Wed, 23 Oct 2019 at 13:47, Hans-Kristian Arntzen post@arntzen-software.no wrote:
On 10/22/19 3:04 PM, Henri Verbeet wrote:
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
Would it? For non-placed resources the heap offset should always be 0. Put a different way, after this patch the invariant is that the VkBuffer always starts at offset 0 of the underlying VkDeviceMemory allocation.
I fixed the copy/clear commands, but I'm a bit concerned the test suite did not catch this. I changed the UAV clear test to use placed resources at least in a follow-up patch.
Support for placed resources is a relatively recent addition to vkd3d, so most of the tests currently use committed resources. It would be good to increase the coverage, but that's something to be aware of, yes.
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
What is the exact issue there?
On 10/23/19 12:46 PM, Henri Verbeet wrote:
On Wed, 23 Oct 2019 at 13:47, Hans-Kristian Arntzen post@arntzen-software.no wrote:
On 10/22/19 3:04 PM, Henri Verbeet wrote:
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
Would it? For non-placed resources the heap offset should always be 0. Put a different way, after this patch the invariant is that the VkBuffer always starts at offset 0 of the underlying VkDeviceMemory allocation.
For placed resources before the patch, heap_offset was set to non-zero (see vkd3d_bind_heap_memory). The reason heap_offset was not needed then was that each placed resource had its own VK handle and its own VA allocation. It's a similar story now, except that only heap VkBuffer allocations can be looked up from the VA allocator, but it's basically the exact same principle: GPU VA minus the looked-up resource's base VA always gives the correct offset.
If placed buffers have their own VA space and handle (like before this patch), you'll end up with something like:
- Heap is allocated
- Placed buffer is created at offset = 0x1000, heap_offset is also set to 0x1000. VA = 0x10000 is allocated for the buffer.
- App tries to bind with VA = 0x10100
- Offset = VA - BaseVA = 0x100, this would be correct.
- If heap_offset is added, it's now wrong (0x10000 + 0x100 = 0x10100).
So, well, adding heap_offset in this patch will technically work (because it will always be 0), but it still feels kinda wrong to actually add it. Wouldn't just adding an assert be good enough?
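Concretely, that could be something like this at the dereference sites (hypothetical, not part of any posted patch):

    /* Anything that can be looked up through the VA allocator owns its
     * VkBuffer, so its heap offset is expected to be zero. */
    assert(resource->heap_offset == 0);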
I fixed the copy/clear commands, but I'm a bit concerned the test suite did not catch this. I changed the UAV clear test to use placed resources at least in a follow-up patch.
Support for placed resources is a relatively recent addition to vkd3d, so most of the tests currently use committed resources. It would be good to increase the coverage, but that's something to be aware of, yes.
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
What is the exact issue there?
    if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
        d3d12_heap_destroy(resource->heap);

is called twice. The heap thinks it owns itself, and the resource thinks it owns the heap, because it's marked as owning the heap as well.
Cheers,
Hans-Kristian
Ref. commit: https://source.winehq.org/git/vkd3d.git/commitdiff/c002aee119b638d30eeb7cdc9...
This patch breaks World of Warcraft (crash) with a
fixme:d3d12_device_CheckFeatureSupport: Unhandled feature 0x15.
Reverting this patch fixes it.
WoW is still broken when zoning into Nazjatar though (ref. https://bugs.winehq.org/show_bug.cgi?id=47471), but that is a different bug.
Sveinar Søpler
There appears to be a complete implementation of RS 1.1 already, so enable this feature.
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
---
 libs/vkd3d/device.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 691a23a..d248726 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2720,8 +2720,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_device_CheckFeatureSupport(ID3D12Device *
             return E_INVALIDARG;
         }

-        FIXME("Root signature version 1_1 not supported yet.\n");
-        data->HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_0;
+        data->HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_1;

         TRACE("Root signature version %#x.\n", data->HighestVersion);
         return S_OK;
--
2.23.0
October 23, 2019 3:09 PM, "Sveinar Søpler" cybermax@dexter.no wrote:
fixme:d3d12_device_CheckFeatureSupport: Unhandled feature 0x15.
That's D3D12_FEATURE_D3D12_OPTIONS3.
typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS3
{
    BOOL CopyQueueTimestampQueriesSupported;
    BOOL CastingFullyTypedFormatSupported;
    D3D12_COMMAND_LIST_SUPPORT_FLAGS WriteBufferImmediateSupportFlags; /* VK_AMD_buffer_marker */
    D3D12_VIEW_INSTANCING_TIER ViewInstancingTier; /* VK_KHR_multiview */
    BOOL BarycentricsSupported; /* VK_NV_fragment_shader_barycentric, VK_AMD_shader_explicit_vertex_parameter */
} D3D12_FEATURE_DATA_D3D12_OPTIONS3;
The right thing to do, then, is to support this option--if only to say we don't really support anything in it. Does this patch fix it for you?
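(The patch itself is not reproduced in this thread; a minimal sketch of such a handler, modelled on the existing cases in d3d12_device_CheckFeatureSupport() and with the conservative values being assumptions, might look like this:)

        case D3D12_FEATURE_D3D12_OPTIONS3:
        {
            D3D12_FEATURE_DATA_D3D12_OPTIONS3 *data = feature_data;

            if (feature_data_size != sizeof(*data))
            {
                WARN("Invalid size %u.\n", feature_data_size);
                return E_INVALIDARG;
            }

            /* Report the most conservative capabilities until the
             * corresponding Vulkan features are wired up. */
            data->CopyQueueTimestampQueriesSupported = FALSE;
            data->CastingFullyTypedFormatSupported = FALSE;
            data->WriteBufferImmediateSupportFlags = D3D12_COMMAND_LIST_SUPPORT_FLAG_NONE;
            data->ViewInstancingTier = D3D12_VIEW_INSTANCING_TIER_NOT_SUPPORTED;
            data->BarycentricsSupported = FALSE;

            return S_OK;
        }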
Chip
On Wed, 23 Oct 2019 at 14:51, Hans-Kristian Arntzen post@arntzen-software.no wrote:
On 10/23/19 12:46 PM, Henri Verbeet wrote:
On Wed, 23 Oct 2019 at 13:47, Hans-Kristian Arntzen post@arntzen-software.no wrote:
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
Would it? For non-placed resources the heap offset should always be 0. Put a different way, after this patch the invariant is that the VkBuffer always starts at offset 0 of the underlying VkDeviceMemory allocation.
For placed resources before the patch, heap_offset was set to non-zero (see vkd3d_bind_heap_memory). The reason heap_offset was not needed then was that each placed resource had its own VK handle and its own VA allocation. It's a similar story now, except that only heap VkBuffer allocations can be looked up from the VA allocator, but it's basically the exact same principle: GPU VA minus the looked-up resource's base VA always gives the correct offset.
If placed buffers have their own VA space and handle (like before this patch), you'll end up with something like:
Well yes, but isn't the entire point of this patch that they shouldn't? You'll also have issues with placed resources that alias each other.
- Heap is allocated
- Placed buffer is created at offset = 0x1000, heap_offset is also set to 0x1000. VA = 0x10000 is allocated for the buffer.
- App tries to bind with VA = 0x10100
- Offset = VA - BaseVA = 0x100, this would be correct.
- If heap_offset is added, it's now wrong (0x10000 + 0x100 = 0x10100).
So, well, adding heap_offset in this patch will technically work (because it will always be 0), but it still feels kinda wrong to actually add it. Wouldn't just adding an assert be good enough?
I'd argue that adding the offset is the *correct* thing to do, and omitting it in cases where we know it will always be zero is an optimisation. That may still be ok though if we add a helper function with an appropriate comment to get the VkBuffer and offset from a D3D12_GPU_VIRTUAL_ADDRESS.
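For instance (a sketch only; the helper name and exact shape are assumptions, not something from this patchset):

    /* Resolve a D3D12_GPU_VIRTUAL_ADDRESS to the underlying VkBuffer plus an
     * offset into it. Relies on the invariant that a looked-up VkBuffer always
     * starts at offset 0 of its VkDeviceMemory allocation, so the offset is
     * simply the distance from the resource's base VA. */
    static VkBuffer vkd3d_gpu_address_to_buffer(struct d3d12_device *device,
            D3D12_GPU_VIRTUAL_ADDRESS address, VkDeviceSize *offset)
    {
        struct d3d12_resource *resource;

        resource = vkd3d_gpu_va_allocator_dereference(&device->gpu_va_allocator, address);
        assert(d3d12_resource_is_buffer(resource));
        *offset = address - resource->gpu_address;
        return resource->u.vk_buffer;
    }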
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
What is the exact issue there?
    if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
        d3d12_heap_destroy(resource->heap);

is called twice. The heap thinks it owns itself, and the resource thinks it owns the heap, because it's marked as owning the heap as well.
One way to solve that is to use "heap->is_private" in d3d12_resource_destroy(), instead of the VKD3D_RESOURCE_DEDICATED_HEAP flag. Another, slightly uglier, option is to do something like the following in d3d12_heap_destroy():
    if ((resource = heap->buffer_resource))
    {
        heap->buffer_resource = NULL;
        d3d12_resource_decref(resource);
    }
I think either would be cleaner than introducing an extra resource flag.