The GPU VA allocator used to allocate memory such that dereferencing a GPU VA required taking a lock and doing a bsearch to find the right VA range.
Rather than going this route, we turn the common case into an O(1), lockless lookup by creating a slab allocator, which lets us map a GPU VA directly to a pointer with (VA - Base) / PageSize.
The number of allocations in the fast path must be limited, since we cannot trivially grow the allocator while keeping dereferences lock-free.
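In essence, the fast path boils down to the following (a sketch only, written against the constants and fields introduced by the patch below; the helper name is made up):

    /* Lockless fast path: a VA in the slab region maps to a fixed-size slot,
     * so dereferencing is a subtract, a shift and an array index, with no
     * lock and no bsearch. */
    static inline void *va_slab_deref_sketch(const struct vkd3d_gpu_va_allocator *allocator,
            D3D12_GPU_VIRTUAL_ADDRESS address)
    {
        uint64_t index = (address - VKD3D_BASE_VA_SLAB) >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
        return allocator->slab_mem_allocations[index].ptr;
    }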
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
---
 libs/vkd3d/device.c        | 240 +++++++++++++++++++++++++++++++------
 libs/vkd3d/resource.c      |   2 +-
 libs/vkd3d/vkd3d_private.h |  31 +++--
 3 files changed, 227 insertions(+), 46 deletions(-)

diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 3da4273..0ecac9a 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -1822,42 +1822,106 @@ static void d3d12_device_destroy_pipeline_cache(struct d3d12_device *device)
     pthread_mutex_destroy(&device->mutex);
 }

-D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
-        size_t size, void *ptr)
+#define VKD3D_MAX_VA_SLAB_ALLOCATIONS (64 * 1024)
+#define VKD3D_BASE_VA_SLAB (0x1000000000ull)
+#define VKD3D_BASE_VA_FALLBACK (0x8000000000000000ull)
+#define VKD3D_SLAB_ALLOCATION_SIZE (0x100000000ull)
+#define VKD3D_SLAB_ALLOCATION_SIZE_LOG2 32
+
+static D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate_fallback(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
 {
     D3D12_GPU_VIRTUAL_ADDRESS ceiling = ~(D3D12_GPU_VIRTUAL_ADDRESS)0;
     struct vkd3d_gpu_va_allocation *allocation;
-    int rc;

-    if ((rc = pthread_mutex_lock(&allocator->mutex)))
+    if (!vkd3d_array_reserve((void **)&allocator->fallback_mem_allocations, &allocator->fallback_mem_allocations_size,
+            allocator->fallback_mem_allocation_count + 1, sizeof(*allocator->fallback_mem_allocations)))
     {
-        ERR("Failed to lock mutex, error %d.\n", rc);
         return 0;
     }

-    if (!vkd3d_array_reserve((void **)&allocator->allocations, &allocator->allocations_size,
-            allocator->allocation_count + 1, sizeof(*allocator->allocations)))
+    allocator->fallback_mem_floor = (allocator->fallback_mem_floor + alignment - 1) & ~((D3D12_GPU_VIRTUAL_ADDRESS)alignment - 1);
+
+    if (size > ceiling || ceiling - size < allocator->fallback_mem_floor)
     {
-        pthread_mutex_unlock(&allocator->mutex);
         return 0;
     }

-    if (size > ceiling || ceiling - size < allocator->floor)
+    allocation = &allocator->fallback_mem_allocations[allocator->fallback_mem_allocation_count++];
+    allocation->base = allocator->fallback_mem_floor;
+    allocation->size = size;
+    allocation->ptr = ptr;
+
+    /* This pointer is bumped and never lowered on a free.
+     * However, this will only fail once we have exhausted 63 bits of address space. */
+    allocator->fallback_mem_floor += size;
+
+    return allocation->base;
+}
+
+static D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate_slab(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
+{
+    int rc;
+    unsigned vacant_index;
+    D3D12_GPU_VIRTUAL_ADDRESS virtual_address = 0;
+
+    if ((rc = pthread_mutex_lock(&allocator->mutex)))
     {
-        pthread_mutex_unlock(&allocator->mutex);
+        ERR("Failed to lock mutex, error %d.\n", rc);
         return 0;
     }

-    allocation = &allocator->allocations[allocator->allocation_count++];
-    allocation->base = allocator->floor;
-    allocation->size = size;
-    allocation->ptr = ptr;
+    TRACE("Allocating %zu bytes (%zu align) of VA from slab allocator.\n", size, alignment);
+    if (allocator->mem_vacant_count > 0)
+    {
+        vacant_index = allocator->mem_vacant[--allocator->mem_vacant_count];
+
+        /* It is critical that the multiplication happens in 64-bit to not overflow. */
+        virtual_address = VKD3D_BASE_VA_SLAB + vacant_index * VKD3D_SLAB_ALLOCATION_SIZE;
+        TRACE("Allocating VA: 0x%llx: vacant index %u from slab.\n",
+                (unsigned long long)virtual_address, vacant_index);
+        assert(!allocator->slab_mem_allocations[vacant_index].ptr);
+        allocator->slab_mem_allocations[vacant_index].ptr = ptr;
+        allocator->slab_mem_allocations[vacant_index].size = size;
+    }

-    allocator->floor += size;
+    if (virtual_address == 0)
+    {
+        TRACE("Slab allocator is empty, allocating %zu bytes (%zu align) of VA from fallback allocator.\n",
+                size, alignment);
+        /* Fall back to slow allocator. */
+        virtual_address = vkd3d_gpu_va_allocator_allocate_fallback(allocator, size, alignment, ptr);
+    }

     pthread_mutex_unlock(&allocator->mutex);
+    return virtual_address;
+}

-    return allocation->base;
+D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS virtual_address;
+    int rc;
+    size_t aligned_size;
+
+    aligned_size = size > alignment ? size : alignment;
+
+    if (aligned_size > VKD3D_SLAB_ALLOCATION_SIZE)
+    {
+        /* For massive VA allocations, go straight to high-mem with a slower allocator. */
+        if ((rc = pthread_mutex_lock(&allocator->mutex)))
+        {
+            ERR("Failed to lock mutex, error %d.\n", rc);
+            return 0;
+        }
+        virtual_address = vkd3d_gpu_va_allocator_allocate_fallback(allocator, size, alignment, ptr);
+        pthread_mutex_unlock(&allocator->mutex);
+    }
+    else
+        virtual_address = vkd3d_gpu_va_allocator_allocate_slab(allocator, size, alignment, ptr);
+
+    return virtual_address;
 }

 static int vkd3d_gpu_va_allocation_compare(const void *k, const void *e)
@@ -1872,24 +1936,93 @@ static int vkd3d_gpu_va_allocation_compare(const void *k, const void *e)
     return 0;
 }

+static void *vkd3d_gpu_va_allocator_dereference_slab(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base_offset;
+    uint64_t base_index;
+    const struct vkd3d_gpu_va_slab_entry *slab;
+
+    base_offset = address - VKD3D_BASE_VA_SLAB;
+    base_index = base_offset >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
+    if (base_index >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Accessed slab size class out of range.\n");
+        return NULL;
+    }
+
+    slab = &allocator->slab_mem_allocations[base_index];
+    base_offset -= base_index * VKD3D_SLAB_ALLOCATION_SIZE;
+    if (base_offset >= slab->size)
+    {
+        ERR("Accessed slab out of range.\n");
+        return NULL;
+    }
+    return slab->ptr;
+}
+
+static void vkd3d_gpu_va_allocator_free_slab(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base_offset;
+    unsigned base_index;
+    struct vkd3d_gpu_va_slab_entry *slab;
+
+    base_offset = address - VKD3D_BASE_VA_SLAB;
+    base_index = base_offset >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
+
+    if (base_index >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Accessed slab size class out of range.\n");
+        return;
+    }
+
+    slab = &allocator->slab_mem_allocations[base_index];
+    if (slab->ptr == NULL)
+    {
+        ERR("Attempting to free NULL VA.\n");
+        return;
+    }
+
+    if (allocator->mem_vacant_count >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Invalid free, slab size class is fully freed.\n");
+        return;
+    }
+
+    TRACE("Freeing VA: 0x%llx: index %u from slab.\n",
+            (unsigned long long)address, base_index);
+
+    slab->ptr = NULL;
+    allocator->mem_vacant[allocator->mem_vacant_count++] = base_index;
+}
+
 void *vkd3d_gpu_va_allocator_dereference(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
 {
     struct vkd3d_gpu_va_allocation *allocation;
     int rc;

-    if ((rc = pthread_mutex_lock(&allocator->mutex)))
+    /* If we land in the non-fallback region, dereferencing VA is lockless. The base pointer is immutable,
+     * and only way we can have a data race is if some other thread is poking into the slab_mem_allocation[class][base_index] block.
+     * This can only happen if someone is trying to free the entry while we're dereferencing, which would be a serious app bug. */
+    if (address < VKD3D_BASE_VA_FALLBACK)
     {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return NULL;
+        return vkd3d_gpu_va_allocator_dereference_slab(allocator, address);
     }
+    else
+    {
+        /* Slow fallback. */
+        if ((rc = pthread_mutex_lock(&allocator->mutex)))
+        {
+            ERR("Failed to lock mutex, error %d.\n", rc);
+            return NULL;
+        }

-    allocation = bsearch(&address, allocator->allocations, allocator->allocation_count,
-            sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
-
-    pthread_mutex_unlock(&allocator->mutex);
+        allocation = bsearch(&address, allocator->fallback_mem_allocations, allocator->fallback_mem_allocation_count,
+                sizeof(*allocation), vkd3d_gpu_va_allocation_compare);

-    return allocation ? allocation->ptr : NULL;
+        pthread_mutex_unlock(&allocator->mutex);
+        return allocation ? allocation->ptr : NULL;
+    }
 }

 void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
@@ -1904,16 +2037,23 @@ void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12
         return;
     }

-    allocation = bsearch(&address, allocator->allocations, allocator->allocation_count,
-            sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
-    if (allocation && allocation->base == address)
+    if (address < VKD3D_BASE_VA_FALLBACK)
     {
-        index = allocation - allocator->allocations;
-        --allocator->allocation_count;
-        if (index != allocator->allocation_count)
+        vkd3d_gpu_va_allocator_free_slab(allocator, address);
+    }
+    else
+    {
+        allocation = bsearch(&address, allocator->fallback_mem_allocations, allocator->fallback_mem_allocation_count,
+                sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
+        if (allocation && allocation->base == address)
         {
-            memmove(&allocator->allocations[index], &allocator->allocations[index + 1],
-                    (allocator->allocation_count - index) * sizeof(*allocation));
+            index = allocation - allocator->fallback_mem_allocations;
+            --allocator->fallback_mem_allocation_count;
+            if (index != allocator->fallback_mem_allocation_count)
+            {
+                memmove(&allocator->fallback_mem_allocations[index], &allocator->fallback_mem_allocations[index + 1],
+                        (allocator->fallback_mem_allocation_count - index) * sizeof(*allocation));
+            }
         }
     }

@@ -1923,29 +2063,59 @@ void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12
 static bool vkd3d_gpu_va_allocator_init(struct vkd3d_gpu_va_allocator *allocator)
 {
     int rc;
+    int i;

     memset(allocator, 0, sizeof(*allocator));
-    allocator->floor = 0x1000;
+    allocator->fallback_mem_floor = VKD3D_BASE_VA_FALLBACK;
+
+    /* To remain lock-less, we cannot grow these lists after the fact. If we commit to a maximum number of allocations
+     * here, we can dereference without taking a lock as the base pointer never changes.
+     * We would be able to grow more seamlessly using an array of pointers,
+     * but would make dereferencing slightly less efficient. */
+    allocator->slab_mem_allocations = vkd3d_calloc(VKD3D_MAX_VA_SLAB_ALLOCATIONS, sizeof(*allocator->slab_mem_allocations));
+    if (!allocator->slab_mem_allocations)
+        goto error;
+
+    /* Otherwise we need 32-bit indices. */
+    assert(VKD3D_MAX_VA_SLAB_ALLOCATIONS <= 64 * 1024);
+
+    allocator->mem_vacant = vkd3d_malloc(VKD3D_MAX_VA_SLAB_ALLOCATIONS * sizeof(uint16_t));
+    if (!allocator->mem_vacant)
+        goto error;
+
+    /* Build a stack of which slab indices are available for allocation.
+     * Place lowest indices last (first to be popped off stack). */
+    for (i = 0; i < VKD3D_MAX_VA_SLAB_ALLOCATIONS; i++)
+        allocator->mem_vacant[i] = (VKD3D_MAX_VA_SLAB_ALLOCATIONS - 1) - i;
+    allocator->mem_vacant_count = VKD3D_MAX_VA_SLAB_ALLOCATIONS;

     if ((rc = pthread_mutex_init(&allocator->mutex, NULL)))
     {
         ERR("Failed to initialize mutex, error %d.\n", rc);
-        return false;
+        goto error;
     }

     return true;
+
+error:
+    vkd3d_free(allocator->slab_mem_allocations);
+    vkd3d_free(allocator->mem_vacant);
+    return false;
 }

 static void vkd3d_gpu_va_allocator_cleanup(struct vkd3d_gpu_va_allocator *allocator)
 {
     int rc;

+    vkd3d_free(allocator->slab_mem_allocations);
+    vkd3d_free(allocator->mem_vacant);
+
     if ((rc = pthread_mutex_lock(&allocator->mutex)))
     {
         ERR("Failed to lock mutex, error %d.\n", rc);
         return;
     }
-    vkd3d_free(allocator->allocations);
+    vkd3d_free(allocator->fallback_mem_allocations);
     pthread_mutex_unlock(&allocator->mutex);
     pthread_mutex_destroy(&allocator->mutex);
 }
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index ccd1230..6c9564b 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -1710,7 +1710,7 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
             &resource->desc, &resource->u.vk_buffer)))
         return hr;
     if (!(resource->gpu_address = vkd3d_gpu_va_allocator_allocate(&device->gpu_va_allocator,
-            desc->Width, resource)))
+            desc->Width, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT, resource)))
     {
         ERR("Failed to allocate GPU VA.\n");
         d3d12_resource_destroy(resource, device);
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 59f0eac..a5f7c81 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -202,24 +202,35 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker,
 HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker,
         struct d3d12_device *device) DECLSPEC_HIDDEN;

+struct vkd3d_gpu_va_allocation
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base;
+    SIZE_T size;
+    void *ptr;
+};
+
+struct vkd3d_gpu_va_slab_entry
+{
+    void *ptr;
+    SIZE_T size;
+};
+
 struct vkd3d_gpu_va_allocator
 {
     pthread_mutex_t mutex;

-    D3D12_GPU_VIRTUAL_ADDRESS floor;
+    struct vkd3d_gpu_va_slab_entry *slab_mem_allocations;
+    uint16_t *mem_vacant;
+    size_t mem_vacant_count;

-    struct vkd3d_gpu_va_allocation
-    {
-        D3D12_GPU_VIRTUAL_ADDRESS base;
-        SIZE_T size;
-        void *ptr;
-    } *allocations;
-    size_t allocations_size;
-    size_t allocation_count;
+    struct vkd3d_gpu_va_allocation *fallback_mem_allocations;
+    size_t fallback_mem_allocations_size;
+    size_t fallback_mem_allocation_count;
+    D3D12_GPU_VIRTUAL_ADDRESS fallback_mem_floor;
 };

 D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
-        size_t size, void *ptr) DECLSPEC_HIDDEN;
+        size_t size, size_t alignment, void *ptr) DECLSPEC_HIDDEN;
 void *vkd3d_gpu_va_allocator_dereference(struct vkd3d_gpu_va_allocator *allocator,
         D3D12_GPU_VIRTUAL_ADDRESS address) DECLSPEC_HIDDEN;
 void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator,
Greatly reduces the number of VA allocations we have to make, makes the returned VAs more sensible, and better matches the VAs we see returned by native drivers.
D3D12 usage flags for buffers seem generic enough that there is no obvious benefit to placing smaller VkBuffers on top of VkDeviceMemory.
Ideally, physical_buffer_address would be used here, but this also works as a good fallback if that path is added later.
With this patch and the previous VA optimization, I'm observing a 2.0-2.5% FPS uplift in SOTTR when CPU-bound.
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
---
 libs/vkd3d/command.c       |   9 +--
 libs/vkd3d/device.c        |   2 +
 libs/vkd3d/resource.c      | 135 ++++++++++++++++++++++++++++++++-----
 libs/vkd3d/vkd3d_private.h |   2 +
 4 files changed, 128 insertions(+), 20 deletions(-)

diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index d420863..dabdbb5 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -3031,8 +3031,8 @@ static void STDMETHODCALLTYPE d3d12_command_list_CopyBufferRegion(ID3D12Graphics
     d3d12_command_list_end_current_render_pass(list);

-    buffer_copy.srcOffset = src_offset;
-    buffer_copy.dstOffset = dst_offset;
+    buffer_copy.srcOffset = src_offset + src_resource->heap_offset;
+    buffer_copy.dstOffset = dst_offset + dst_resource->heap_offset;
     buffer_copy.size = byte_count;

     VK_CALL(vkCmdCopyBuffer(list->vk_command_buffer,
@@ -3450,8 +3450,8 @@ static void STDMETHODCALLTYPE d3d12_command_list_CopyResource(ID3D12GraphicsComm
     assert(d3d12_resource_is_buffer(src_resource));
     assert(src_resource->desc.Width == dst_resource->desc.Width);

-    vk_buffer_copy.srcOffset = 0;
-    vk_buffer_copy.dstOffset = 0;
+    vk_buffer_copy.srcOffset = src_resource->heap_offset;
+    vk_buffer_copy.dstOffset = dst_resource->heap_offset;
     vk_buffer_copy.size = dst_resource->desc.Width;
     VK_CALL(vkCmdCopyBuffer(list->vk_command_buffer,
             src_resource->u.vk_buffer, dst_resource->u.vk_buffer, 1, &vk_buffer_copy));
@@ -4076,6 +4076,7 @@ static void d3d12_command_list_set_root_cbv(struct d3d12_command_list *list,
     resource = vkd3d_gpu_va_allocator_dereference(&list->device->gpu_va_allocator, gpu_address);
     buffer_info.buffer = resource->u.vk_buffer;
+    buffer_info.offset = gpu_address - resource->gpu_address;
     buffer_info.range = resource->desc.Width - buffer_info.offset;
     buffer_info.range = min(buffer_info.range, vk_info->device_limits.maxUniformBufferRange);
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 0ecac9a..47e90e6 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -1334,6 +1334,8 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
     device->feature_options.CrossAdapterRowMajorTextureSupported = FALSE;
     /* SPV_EXT_shader_viewport_index_layer */
     device->feature_options.VPAndRTArrayIndexFromAnyShaderFeedingRasterizerSupportedWithoutGSEmulation = FALSE;
+
+    /* FIXME: Does this actually work on NV which has 64k bufferImage alignment quirks with VkDeviceMemory? */
     device->feature_options.ResourceHeapTier = D3D12_RESOURCE_HEAP_TIER_2;

     if ((vr = VK_CALL(vkEnumerateDeviceExtensionProperties(physical_device, NULL, &count, NULL))) < 0)
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index 6c9564b..557d5e1 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -292,6 +292,8 @@ static ULONG STDMETHODCALLTYPE d3d12_heap_AddRef(ID3D12Heap *iface)
     return refcount;
 }

+static ULONG d3d12_resource_decref(struct d3d12_resource *resource);
+
 static void d3d12_heap_destroy(struct d3d12_heap *heap)
 {
     struct d3d12_device *device = heap->device;
@@ -299,6 +301,9 @@ static void d3d12_heap_destroy(struct d3d12_heap *heap)
     TRACE("Destroying heap %p.\n", heap);

+    if (heap->buffer_resource)
+        d3d12_resource_decref(heap->buffer_resource);
+
     vkd3d_private_store_destroy(&heap->private_store);

     VK_CALL(vkFreeMemory(device->vk_device, heap->vk_memory, NULL));
@@ -539,6 +544,12 @@ static HRESULT validate_heap_desc(const D3D12_HEAP_DESC *desc, const struct d3d1
     return S_OK;
 }

+static HRESULT d3d12_resource_create(struct d3d12_device *device,
+        const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags,
+        const D3D12_RESOURCE_DESC *desc, D3D12_RESOURCE_STATES initial_state,
+        const D3D12_CLEAR_VALUE *optimized_clear_value, bool placed,
+        struct d3d12_resource **resource);
+
 static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
         struct d3d12_device *device, const D3D12_HEAP_DESC *desc, const struct d3d12_resource *resource)
 {
@@ -546,6 +557,10 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     VkDeviceSize vk_memory_size;
     HRESULT hr;
     int rc;
+    bool buffers_allowed;
+    D3D12_RESOURCE_DESC resource_desc;
+    D3D12_RESOURCE_STATES initial_resource_state;
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;

     heap->ID3D12Heap_iface.lpVtbl = &d3d12_heap_vtbl;
     heap->refcount = 1;
@@ -556,6 +571,7 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     heap->map_ptr = NULL;
     heap->map_count = 0;
+    heap->buffer_resource = NULL;

     if (!heap->desc.Properties.CreationNodeMask)
         heap->desc.Properties.CreationNodeMask = 1;
@@ -583,6 +599,53 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
         return hr;
     }

+    buffers_allowed = !(heap->desc.Flags & D3D12_HEAP_FLAG_DENY_BUFFERS);
+    if (buffers_allowed && !resource)
+    {
+        /* Create a single omnipotent buffer which fills the entire heap.
+         * Whenever we place buffer resources on this heap, we'll just offset this VkBuffer.
+         * This allows us to keep VA space somewhat sane, and keeps number of (limited) VA allocations down.
+         * One possible downside is that the buffer might be slightly slower to access,
+         * but D3D12 has very lenient usage flags for buffers. */
+
+        memset(&resource_desc, 0, sizeof(resource_desc));
+        resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+        resource_desc.Width = desc->SizeInBytes;
+        resource_desc.Height = 1;
+        resource_desc.DepthOrArraySize = 1;
+        resource_desc.MipLevels = 1;
+        resource_desc.SampleDesc.Count = 1;
+        resource_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+
+        switch (desc->Properties.Type)
+        {
+            case D3D12_HEAP_TYPE_UPLOAD:
+                initial_resource_state = D3D12_RESOURCE_STATE_GENERIC_READ;
+                break;
+
+            case D3D12_HEAP_TYPE_READBACK:
+                initial_resource_state = D3D12_RESOURCE_STATE_COPY_DEST;
+                break;
+
+            default:
+                /* Upload and readback heaps do not allow UAV access, only enable this flag for other heaps. */
+                resource_desc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
+                initial_resource_state = D3D12_RESOURCE_STATE_COMMON;
+                break;
+        }
+
+        if (FAILED(hr = d3d12_resource_create(device, &desc->Properties, desc->Flags,
+                &resource_desc, initial_resource_state,
+                NULL, false, &heap->buffer_resource)))
+        {
+            heap->buffer_resource = NULL;
+            return hr;
+        }
+        /* This internal resource should not own a reference on the device.
+         * d3d12_resource_create takes a reference on the device. */
+        d3d12_device_release(device);
+    }
+
     if (resource)
     {
         if (d3d12_resource_is_buffer(resource))
@@ -600,12 +663,19 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
         heap->desc.SizeInBytes = vk_memory_size;
     }
+    else if (heap->buffer_resource)
+    {
+        hr = vkd3d_allocate_buffer_memory(device, heap->buffer_resource->u.vk_buffer,
+                &heap->desc.Properties, heap->desc.Flags,
+                &heap->vk_memory, &heap->vk_memory_type, &vk_memory_size);
+    }
     else
     {
+        /* Allocate generic memory which should hopefully match up with whatever resources
+         * we want to place here. */
         memory_requirements.size = heap->desc.SizeInBytes;
         memory_requirements.alignment = heap->desc.Alignment;
         memory_requirements.memoryTypeBits = ~(uint32_t)0;
-
         hr = vkd3d_allocate_device_memory(device, &heap->desc.Properties,
                 heap->desc.Flags, &memory_requirements, NULL,
                 &heap->vk_memory, &heap->vk_memory_type);
@@ -614,6 +684,11 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     {
         vkd3d_private_store_destroy(&heap->private_store);
         pthread_mutex_destroy(&heap->mutex);
+        if (heap->buffer_resource)
+        {
+            d3d12_resource_decref(heap->buffer_resource);
+            heap->buffer_resource = NULL;
+        }
         return hr;
     }

@@ -1003,13 +1078,16 @@ static void d3d12_resource_destroy(struct d3d12_resource *resource, struct d3d12
     if (resource->flags & VKD3D_RESOURCE_EXTERNAL)
         return;

-    if (resource->gpu_address)
-        vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);
+    if (!(resource->flags & VKD3D_RESOURCE_PLACED_BUFFER))
+    {
+        if (resource->gpu_address)
+            vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);

-    if (d3d12_resource_is_buffer(resource))
-        VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
-    else
-        VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+        if (d3d12_resource_is_buffer(resource))
+            VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
+        else
+            VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+    }

     if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
         d3d12_heap_destroy(resource->heap);
@@ -1669,7 +1747,7 @@ static bool d3d12_resource_validate_heap_properties(const struct d3d12_resource
 static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12_device *device,
         const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags,
         const D3D12_RESOURCE_DESC *desc, D3D12_RESOURCE_STATES initial_state,
-        const D3D12_CLEAR_VALUE *optimized_clear_value)
+        const D3D12_CLEAR_VALUE *optimized_clear_value, bool placed)
 {
     HRESULT hr;

@@ -1699,6 +1777,8 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
     resource->gpu_address = 0;
     resource->flags = 0;
+    if (placed && desc->Dimension == D3D12_RESOURCE_DIMENSION_BUFFER)
+        resource->flags |= VKD3D_RESOURCE_PLACED_BUFFER;

     if (FAILED(hr = d3d12_resource_validate_desc(&resource->desc)))
         return hr;
@@ -1706,6 +1786,13 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
     switch (desc->Dimension)
     {
         case D3D12_RESOURCE_DIMENSION_BUFFER:
+            /* We'll inherit a VkBuffer reference from the heap with an implied offset. */
+            if (placed)
+            {
+                resource->u.vk_buffer = VK_NULL_HANDLE;
+                break;
+            }
+
             if (FAILED(hr = vkd3d_create_buffer(device, heap_properties, heap_flags,
                     &resource->desc, &resource->u.vk_buffer)))
                 return hr;
@@ -1755,7 +1842,7 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
 static HRESULT d3d12_resource_create(struct d3d12_device *device,
         const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags,
         const D3D12_RESOURCE_DESC *desc, D3D12_RESOURCE_STATES initial_state,
-        const D3D12_CLEAR_VALUE *optimized_clear_value, struct d3d12_resource **resource)
+        const D3D12_CLEAR_VALUE *optimized_clear_value, bool placed, struct d3d12_resource **resource)
 {
     struct d3d12_resource *object;
     HRESULT hr;
@@ -1764,7 +1851,7 @@ static HRESULT d3d12_resource_create(struct d3d12_device *device,
         return E_OUTOFMEMORY;

     if (FAILED(hr = d3d12_resource_init(object, device, heap_properties, heap_flags,
-            desc, initial_state, optimized_clear_value)))
+            desc, initial_state, optimized_clear_value, placed)))
     {
         vkd3d_free(object);
         return hr;
@@ -1806,7 +1893,7 @@ HRESULT d3d12_committed_resource_create(struct d3d12_device *device,
     }

     if (FAILED(hr = d3d12_resource_create(device, heap_properties, heap_flags,
-            desc, initial_state, optimized_clear_value, &object)))
+            desc, initial_state, optimized_clear_value, false, &object)))
         return hr;

     if (FAILED(hr = vkd3d_allocate_resource_memory(device, object, heap_properties, heap_flags)))
@@ -1830,6 +1917,16 @@ static HRESULT vkd3d_bind_heap_memory(struct d3d12_device *device,
     VkMemoryRequirements requirements;
     VkResult vr;

+    if (resource->flags & VKD3D_RESOURCE_PLACED_BUFFER)
+    {
+        /* Just inherit the buffer from the heap. */
+        resource->u.vk_buffer = heap->buffer_resource->u.vk_buffer;
+        resource->heap = heap;
+        resource->heap_offset = heap_offset;
+        resource->gpu_address = heap->buffer_resource->gpu_address + heap_offset;
+        return S_OK;
+    }
+
     if (d3d12_resource_is_buffer(resource))
         VK_CALL(vkGetBufferMemoryRequirements(vk_device, resource->u.vk_buffer, &requirements));
     else
@@ -1879,7 +1976,7 @@ HRESULT d3d12_placed_resource_create(struct d3d12_device *device, struct d3d12_h
     HRESULT hr;

     if (FAILED(hr = d3d12_resource_create(device, &heap->desc.Properties, heap->desc.Flags,
-            desc, initial_state, optimized_clear_value, &object)))
+            desc, initial_state, optimized_clear_value, true, &object)))
         return hr;

     if (FAILED(hr = vkd3d_bind_heap_memory(device, object, heap, heap_offset)))
@@ -1903,7 +2000,7 @@ HRESULT d3d12_reserved_resource_create(struct d3d12_device *device,
     HRESULT hr;

     if (FAILED(hr = d3d12_resource_create(device, NULL, 0,
-            desc, initial_state, optimized_clear_value, &object)))
+            desc, initial_state, optimized_clear_value, false, &object)))
         return hr;

     TRACE("Created reserved resource %p.\n", object);
@@ -2205,7 +2302,7 @@ static bool vkd3d_create_buffer_view_for_resource(struct d3d12_device *device,
     assert(d3d12_resource_is_buffer(resource));

     return vkd3d_create_buffer_view(device, resource->u.vk_buffer,
-            format, offset * element_size, size * element_size, view);
+            format, resource->heap_offset + offset * element_size, size * element_size, view);
 }

 static void vkd3d_set_view_swizzle_for_format(VkComponentMapping *components,
@@ -2807,7 +2904,7 @@ static void vkd3d_create_buffer_uav(struct d3d12_desc *descriptor, struct d3d12_
     format = vkd3d_get_format(device, DXGI_FORMAT_R32_UINT, false);
     if (!vkd3d_create_vk_buffer_view(device, counter_resource->u.vk_buffer, format,
-            desc->u.Buffer.CounterOffsetInBytes, sizeof(uint32_t), &view->vk_counter_view))
+            desc->u.Buffer.CounterOffsetInBytes + resource->heap_offset, sizeof(uint32_t), &view->vk_counter_view))
     {
         WARN("Failed to create counter buffer view.\n");
         view->vk_counter_view = VK_NULL_HANDLE;
@@ -2913,12 +3010,18 @@ bool vkd3d_create_raw_buffer_view(struct d3d12_device *device,
 {
     const struct vkd3d_format *format;
     struct d3d12_resource *resource;
+    uint64_t range;
+    uint64_t offset;

     format = vkd3d_get_format(device, DXGI_FORMAT_R32_UINT, false);
     resource = vkd3d_gpu_va_allocator_dereference(&device->gpu_va_allocator, gpu_address);
     assert(d3d12_resource_is_buffer(resource));
+
+    offset = gpu_address - resource->gpu_address;
+    range = min(resource->desc.Width - offset, device->vk_info.device_limits.maxStorageBufferRange);
+
     return vkd3d_create_vk_buffer_view(device, resource->u.vk_buffer, format,
-            gpu_address - resource->gpu_address, VK_WHOLE_SIZE, vk_buffer_view);
+            offset, range, vk_buffer_view);
 }

 /* samplers */
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index a5f7c81..2b1ae30 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -380,6 +380,7 @@ struct d3d12_heap
     unsigned int map_count;
     uint32_t vk_memory_type;

+    struct d3d12_resource *buffer_resource;
     struct d3d12_device *device;

     struct vkd3d_private_store private_store;
@@ -394,6 +395,7 @@ struct d3d12_heap *unsafe_impl_from_ID3D12Heap(ID3D12Heap *iface) DECLSPEC_HIDDE
 #define VKD3D_RESOURCE_EXTERNAL       0x00000004
 #define VKD3D_RESOURCE_DEDICATED_HEAP 0x00000008
 #define VKD3D_RESOURCE_LINEAR_TILING  0x00000010
+#define VKD3D_RESOURCE_PLACED_BUFFER  0x00000020

 /* ID3D12Resource */
 struct d3d12_resource
On Tue, 8 Oct 2019 at 12:00, Hans-Kristian Arntzen post@arntzen-software.no wrote:
@@ -4076,6 +4076,7 @@ static void d3d12_command_list_set_root_cbv(struct d3d12_command_list *list,
     resource = vkd3d_gpu_va_allocator_dereference(&list->device->gpu_va_allocator, gpu_address);
     buffer_info.buffer = resource->u.vk_buffer;
+    buffer_info.offset = gpu_address - resource->gpu_address;
     buffer_info.range = resource->desc.Width - buffer_info.offset;
     buffer_info.range = min(buffer_info.range, vk_info->device_limits.maxUniformBufferRange);
Stray whitespace change.
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
@@ -1334,6 +1334,8 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
     device->feature_options.CrossAdapterRowMajorTextureSupported = FALSE;
     /* SPV_EXT_shader_viewport_index_layer */
     device->feature_options.VPAndRTArrayIndexFromAnyShaderFeedingRasterizerSupportedWithoutGSEmulation = FALSE;
+
+    /* FIXME: Does this actually work on NV which has 64k bufferImage alignment quirks with VkDeviceMemory? */
     device->feature_options.ResourceHeapTier = D3D12_RESOURCE_HEAP_TIER_2;
Well, does it?
@@ -546,6 +557,10 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     VkDeviceSize vk_memory_size;
     HRESULT hr;
     int rc;
+    bool buffers_allowed;
+    D3D12_RESOURCE_DESC resource_desc;
+    D3D12_RESOURCE_STATES initial_resource_state;
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
"vk_procs" is unused.
@@ -1003,13 +1078,16 @@ static void d3d12_resource_destroy(struct d3d12_resource *resource, struct d3d12
     if (resource->flags & VKD3D_RESOURCE_EXTERNAL)
         return;

-    if (resource->gpu_address)
-        vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);
+    if (!(resource->flags & VKD3D_RESOURCE_PLACED_BUFFER))
+    {
+        if (resource->gpu_address)
+            vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);

-    if (d3d12_resource_is_buffer(resource))
-        VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
-    else
-        VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+        if (d3d12_resource_is_buffer(resource))
+            VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
+        else
+            VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+    }

     if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
         d3d12_heap_destroy(resource->heap);
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
I pushed a new patchset.
On 10/22/19 3:04 PM, Henri Verbeet wrote:
On Tue, 8 Oct 2019 at 12:00, Hans-Kristian Arntzen post@arntzen-software.no wrote:
@@ -4076,6 +4076,7 @@ static void d3d12_command_list_set_root_cbv(struct d3d12_command_list *list,
     resource = vkd3d_gpu_va_allocator_dereference(&list->device->gpu_va_allocator, gpu_address);
     buffer_info.buffer = resource->u.vk_buffer;
+    buffer_info.offset = gpu_address - resource->gpu_address;
     buffer_info.range = resource->desc.Width - buffer_info.offset;
     buffer_info.range = min(buffer_info.range, vk_info->device_limits.maxUniformBufferRange);
Stray whitespace change.
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
I fixed the copy/clear commands, but I'm a bit concerned the test suite did not catch this. I changed the UAV clear test to use placed resources at least in a follow-up patch.
@@ -1334,6 +1334,8 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
     device->feature_options.CrossAdapterRowMajorTextureSupported = FALSE;
     /* SPV_EXT_shader_viewport_index_layer */
     device->feature_options.VPAndRTArrayIndexFromAnyShaderFeedingRasterizerSupportedWithoutGSEmulation = FALSE;
+
+    /* FIXME: Does this actually work on NV which has 64k bufferImage alignment quirks with VkDeviceMemory? */
     device->feature_options.ResourceHeapTier = D3D12_RESOURCE_HEAP_TIER_2;
Well, does it?
Removed the comment, but there should be some rationale for why vkd3d does this. NV has a bufferImageGranularity of 64k, and if apps place buffers and images within that alignment, there is implicit memory aliasing. Then again, even if TIER_1 is advertised, apps don't really seem to care and create heaps with all uses enabled anyway. I'll just leave it until I see a real app having problems with this.
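For reference, the padding a TIER_1-style placement scheme would otherwise need looks roughly like this (a sketch of the general idea only; the helper is hypothetical and not code from vkd3d):

    /* Round a placement up to the next bufferImageGranularity boundary
     * (64k on NV) whenever a linear resource would otherwise share a
     * granularity page with a non-linear one, which is what causes the
     * implicit aliasing described above. */
    static VkDeviceSize pad_for_buffer_image_granularity(VkDeviceSize offset,
            VkDeviceSize granularity, bool prev_is_linear, bool next_is_linear)
    {
        if (prev_is_linear != next_is_linear)
            offset = (offset + granularity - 1) & ~(granularity - 1);
        return offset;
    }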
@@ -546,6 +557,10 @@ static HRESULT d3d12_heap_init(struct d3d12_heap *heap,
     VkDeviceSize vk_memory_size;
     HRESULT hr;
     int rc;
+    bool buffers_allowed;
+    D3D12_RESOURCE_DESC resource_desc;
+    D3D12_RESOURCE_STATES initial_resource_state;
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
"vk_procs" is unused.
@@ -1003,13 +1078,16 @@ static void d3d12_resource_destroy(struct d3d12_resource *resource, struct d3d12
     if (resource->flags & VKD3D_RESOURCE_EXTERNAL)
         return;

-    if (resource->gpu_address)
-        vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);
+    if (!(resource->flags & VKD3D_RESOURCE_PLACED_BUFFER))
+    {
+        if (resource->gpu_address)
+            vkd3d_gpu_va_allocator_free(&device->gpu_va_allocator, resource->gpu_address);

-    if (d3d12_resource_is_buffer(resource))
-        VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
-    else
-        VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+        if (d3d12_resource_is_buffer(resource))
+            VK_CALL(vkDestroyBuffer(device->vk_device, resource->u.vk_buffer, NULL));
+        else
+            VK_CALL(vkDestroyImage(device->vk_device, resource->u.vk_image, NULL));
+    }

     if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
         d3d12_heap_destroy(resource->heap);
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
Cheers,
Hans-Kristian
On Wed, 23 Oct 2019 at 13:47, Hans-Kristian Arntzen post@arntzen-software.no wrote:
On 10/22/19 3:04 PM, Henri Verbeet wrote:
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
Would it? For non-placed resources the heap offset should always be 0. Put a different way, after this patch the invariant is that the VkBuffer always starts at offset 0 of the underlying VkDeviceMemory allocation.
I fixed the copy/clear commands, but I'm a bit concerned the test suite did not catch this. I changed the UAV clear test to use placed resources at least in a follow-up patch.
Support for placed resources is a relatively recent addition to vkd3d, so most of the tests currently use committed resources. It would be good to increase the coverage, but that's something to be aware of, yes.
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
What is the exact issue there?
On 10/23/19 12:46 PM, Henri Verbeet wrote:
On Wed, 23 Oct 2019 at 13:47, Hans-Kristian Arntzen post@arntzen-software.no wrote:
On 10/22/19 3:04 PM, Henri Verbeet wrote:
What I'm guessing happened here is that you changed the code to account for resource->heap_offset, but then realised it's always 0 for anything that has its own VA. That's convenient here, but also a little annoying to have to reason about; I'd probably prefer to consistently apply the heap offset anywhere we use a buffer offset. Worse, I think there are a few places that do need the heap offset applied, but don't in this patch. E.g., d3d12_command_list_CopyTextureRegion(), d3d12_command_list_ResourceBarrier(), d3d12_command_list_ClearUnorderedAccessViewUint().
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
Would it? For non-placed resources the heap offset should always be 0. Put a different way, after this patch the invariant is that the VkBuffer always starts at offset 0 of the underlying VkDeviceMemory allocation.
For placed resources before the patch, heap_offset was set to non-zero (see vkd3d_bind_heap_memory). The reason heap_offset was not needed then was that each placed resource had its own VK handle and its own VA allocation. It's a similar story now, except that only heap VkBuffer allocations can be looked up from the VA allocator, but it's basically the exact same principle: GPU VA minus the looked-up resource's base VA always gives the correct offset.
If placed buffers have their own VA space and handle (like before this patch), you'll end up with something like:
- Heap is allocated
- Placed buffer is created at offset = 0x1000, heap_offset is also set to 0x1000. VA = 0x10000 is allocated for the buffer.
- App tries to bind with VA = 0x10100
- Offset = VA - BaseVA = 0x100, this would be correct.
- If heap_offset is added, it's now wrong (0x10000 + 0x100 = 0x10100).
So, well, adding heap_offset in this patch will technically work (because it will always be 0), but it still feels kinda wrong to actually add it. Wouldn't just adding an assert be good enough?
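Concretely, that could be something like this at the dereference sites (hypothetical, not part of any posted patch):

    /* Anything that can be looked up through the VA allocator owns its
     * VkBuffer, so its heap offset is expected to be zero. */
    assert(resource->heap_offset == 0);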
I fixed the copy/clear commands, but I'm a bit concerned the test suite did not catch this. I changed the UAV clear test to use placed resources at least in a follow-up patch.
Support for placed resources is a relatively recent addition to vkd3d, so most of the tests currently use committed resources. It would be good to increase the coverage, but that's something to be aware of, yes.
Do we really need VKD3D_RESOURCE_PLACED_BUFFER? It seems largely equivalent to "!(resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP) && d3d12_resource_is_buffer(resource)".
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
What is the exact issue there?
    if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
        d3d12_heap_destroy(resource->heap);

is called twice. The heap thinks it owns itself, and the resource thinks it owns the heap, because it's marked as owning the heap as well.
Cheers,
Hans-Kristian
Ref. commit: https://source.winehq.org/git/vkd3d.git/commitdiff/c002aee119b638d30eeb7cdc9...
This patch breaks World of Warcraft (crash) with a
fixme:d3d12_device_CheckFeatureSupport: Unhandled feature 0x15.
Reverting this patch fixes it.
WoW is still broken when zoning into Nazjatar though (ref. https://bugs.winehq.org/show_bug.cgi?id=47471), but that is a different bug.
Sveinar Søpler
There appears to be a complete implementation of RS 1.1 already, so enable this feature.
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
---
 libs/vkd3d/device.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 691a23a..d248726 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -2720,8 +2720,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_device_CheckFeatureSupport(ID3D12Device *
             return E_INVALIDARG;
         }

-        FIXME("Root signature version 1_1 not supported yet.\n");
-        data->HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_0;
+        data->HighestVersion = D3D_ROOT_SIGNATURE_VERSION_1_1;

         TRACE("Root signature version %#x.\n", data->HighestVersion);
         return S_OK;
--
2.23.0
October 23, 2019 3:09 PM, "Sveinar Søpler" cybermax@dexter.no wrote:
fixme:d3d12_device_CheckFeatureSupport: Unhandled feature 0x15.
That's D3D12_FEATURE_D3D12_OPTIONS3.
typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS3
{
    BOOL CopyQueueTimestampQueriesSupported;
    BOOL CastingFullyTypedFormatSupported;
    D3D12_COMMAND_LIST_SUPPORT_FLAGS WriteBufferImmediateSupportFlags; /* VK_AMD_buffer_marker */
    D3D12_VIEW_INSTANCING_TIER ViewInstancingTier; /* VK_KHR_multiview */
    BOOL BarycentricsSupported; /* VK_NV_fragment_shader_barycentric, VK_AMD_shader_explicit_vertex_parameter */
} D3D12_FEATURE_DATA_D3D12_OPTIONS3;
The right thing to do, then, is to support this option--if only to say we don't really support anything in it. Does this patch fix it for you?
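(The patch itself is not reproduced in this thread; a minimal sketch of such a handler, modelled on the existing cases in d3d12_device_CheckFeatureSupport() and with the conservative values being assumptions, might look like this:)

        case D3D12_FEATURE_D3D12_OPTIONS3:
        {
            D3D12_FEATURE_DATA_D3D12_OPTIONS3 *data = feature_data;

            if (feature_data_size != sizeof(*data))
            {
                WARN("Invalid size %u.\n", feature_data_size);
                return E_INVALIDARG;
            }

            /* Report the most conservative capabilities until the
             * corresponding Vulkan features are wired up. */
            data->CopyQueueTimestampQueriesSupported = FALSE;
            data->CastingFullyTypedFormatSupported = FALSE;
            data->WriteBufferImmediateSupportFlags = D3D12_COMMAND_LIST_SUPPORT_FLAG_NONE;
            data->ViewInstancingTier = D3D12_VIEW_INSTANCING_TIER_NOT_SUPPORTED;
            data->BarycentricsSupported = FALSE;

            return S_OK;
        }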
Chip
On Wed, 23 Oct 2019 at 14:51, Hans-Kristian Arntzen post@arntzen-software.no wrote:
On 10/23/19 12:46 PM, Henri Verbeet wrote:
On Wed, 23 Oct 2019 at 13:47, Hans-Kristian Arntzen post@arntzen-software.no wrote:
Consistently applying heap_offset would be worse though, because it would be wrong in the case where placed buffers are not used. When looking up VA, you get the buffer for the heap.
Would it? For non-placed resources the heap offset should always be 0. Put a different way, after this patch the invariant is that the VkBuffer always starts at offset 0 of the underlying VkDeviceMemory allocation.
For placed resources before the patch, heap_offset was set to non-zero (see vkd3d_bind_heap_memory). The reason heap_offset was not needed then was that each placed resource had its own VK handle and its own VA allocation. It's a similar story now, except that only heap VkBuffer allocations can be looked up from the VA allocator, but it's basically the exact same principle: GPU VA minus the looked-up resource's base VA always gives the correct offset.
If placed buffers have their own VA space and handle (like before this patch), you'll end up with something like:
Well yes, but isn't the entire point of this patch that they shouldn't? You'll also have issues with placed resources that alias each other.
- Heap is allocated
- Placed buffer is created at offset = 0x1000, heap_offset is also set to 0x1000. VA = 0x10000 is allocated for the buffer.
- App tries to bind with VA = 0x10100
- Offset = VA - BaseVA = 0x100, this would be correct.
- If heap_offset is added, it's now wrong (0x10000 + 0x100 = 0x10100).
So, well, adding heap_offset in this patch will technically work (because it will always be 0), but it still feels kinda wrong to actually add it. Wouldn't just adding an assert be good enough?
I'd argue that adding the offset is the *correct* thing to do, and omitting it in cases where we know it will always be zero is an optimisation. That may still be ok though if we add a helper function with an appropriate comment to get the VkBuffer and offset from a D3D12_GPU_VIRTUAL_ADDRESS.
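For instance (a sketch only; the helper name and exact shape are assumptions, not something from this patchset):

    /* Resolve a D3D12_GPU_VIRTUAL_ADDRESS to the underlying VkBuffer plus an
     * offset into it. Relies on the invariant that a looked-up VkBuffer always
     * starts at offset 0 of its VkDeviceMemory allocation, so the offset is
     * simply the distance from the resource's base VA. */
    static VkBuffer vkd3d_gpu_address_to_buffer(struct d3d12_device *device,
            D3D12_GPU_VIRTUAL_ADDRESS address, VkDeviceSize *offset)
    {
        struct d3d12_resource *resource;

        resource = vkd3d_gpu_va_allocator_dereference(&device->gpu_va_allocator, address);
        assert(d3d12_resource_is_buffer(resource));
        *offset = address - resource->gpu_address;
        return resource->u.vk_buffer;
    }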
Yes, there is some confusion about who owns the heap if DEDICATED_HEAP is used. The heap creates the resource in this case, and not the other way around. FWIW, I tried reverting back, but ended up with issues which would likely need some awkward flag to work around anyway, so PLACED_BUFFER is the best option, I think.
What is the exact issue there?
    if (resource->flags & VKD3D_RESOURCE_DEDICATED_HEAP)
        d3d12_heap_destroy(resource->heap);

is called twice. The heap thinks it owns itself, and the resource thinks it owns the heap, because it's marked as owning the heap as well.
One way to solve that is to use "heap->is_private" in d3d12_resource_destroy(), instead of the VKD3D_RESOURCE_DEDICATED_HEAP flag. Another, slightly uglier, option is to do something like the following in d3d12_heap_destroy():
    if ((resource = heap->buffer_resource))
    {
        heap->buffer_resource = NULL;
        d3d12_resource_decref(resource);
    }
I think either would be cleaner than introducing an extra resource flag.