Shader visibility is currently ignored, but we don't want to create Vulkan descriptor sets for CPU heaps.
-- v2: vkd3d: Do not create Vulkan descriptor sets for non-shader-visible heaps. vkd3d: Return a null handle from GetGPUDescriptorHandleForHeapStart() for non-shader-visible heaps. tests: Test GetGPUDescriptorHandleForHeapStart() on a non-shader-visible heap. vkd3d: Enable Vulkan-backed heaps for each heap instead of per device.
From: Conor McCarthy cmccarthy@codeweavers.com
Provides a simple way to disable Vulkan writes for non-shader-visible heaps. Also there is a chance of avoiding access to the d3d12_device object which helps memory cache performance. --- libs/vkd3d/device.c | 4 +++- libs/vkd3d/resource.c | 25 ++++++++++++++++--------- libs/vkd3d/vkd3d_private.h | 6 +++++- 3 files changed, 24 insertions(+), 11 deletions(-)
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index b9a8943c..025a60fc 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -3414,6 +3414,7 @@ static void STDMETHODCALLTYPE d3d12_device_CopyDescriptors(ID3D12Device *iface, struct d3d12_device *device = impl_from_ID3D12Device(iface); unsigned int dst_range_idx, dst_idx, src_range_idx, src_idx; unsigned int dst_range_size, src_range_size; + struct d3d12_descriptor_heap *dst_heap; const struct d3d12_desc *src; struct d3d12_desc *dst;
@@ -3443,13 +3444,14 @@ static void STDMETHODCALLTYPE d3d12_device_CopyDescriptors(ID3D12Device *iface, src_range_size = src_descriptor_range_sizes ? src_descriptor_range_sizes[src_range_idx] : 1;
dst = d3d12_desc_from_cpu_handle(dst_descriptor_range_offsets[dst_range_idx]); + dst_heap = d3d12_desc_get_descriptor_heap(dst); src = d3d12_desc_from_cpu_handle(src_descriptor_range_offsets[src_range_idx]);
for (; dst_idx < dst_range_size && src_idx < src_range_size; ++dst_idx, ++src_idx) { if (dst[dst_idx].s.u.object == src[src_idx].s.u.object) continue; - d3d12_desc_copy(&dst[dst_idx], &src[src_idx], device); + d3d12_desc_copy(&dst[dst_idx], &src[src_idx], dst_heap, device); }
if (dst_idx >= dst_range_size) diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index 4c07d326..72663afe 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -2411,13 +2411,11 @@ void d3d12_desc_flush_vk_heap_updates_locked(struct d3d12_descriptor_heap *descr descriptor_writes_free_object_refs(&writes, device); }
-static void d3d12_desc_mark_as_modified(struct d3d12_desc *dst) +static void d3d12_desc_mark_as_modified(struct d3d12_desc *dst, struct d3d12_descriptor_heap *descriptor_heap) { - struct d3d12_descriptor_heap *descriptor_heap; unsigned int i, head;
i = dst->index; - descriptor_heap = d3d12_desc_get_descriptor_heap(dst); head = descriptor_heap->dirty_list_head;
/* Only one thread can swap the value away from zero. */ @@ -2431,14 +2429,20 @@ static void d3d12_desc_mark_as_modified(struct d3d12_desc *dst) } }
-void d3d12_desc_write_atomic(struct d3d12_desc *dst, const struct d3d12_desc *src, - struct d3d12_device *device) +static inline void descriptor_heap_write_atomic(struct d3d12_descriptor_heap *descriptor_heap, struct d3d12_desc *dst, + const struct d3d12_desc *src, struct d3d12_device *device) { void *object = src->s.u.object;
d3d12_desc_replace(dst, object, device); - if (device->use_vk_heaps && object && !dst->next) - d3d12_desc_mark_as_modified(dst); + if (descriptor_heap->use_vk_heaps && object && !dst->next) + d3d12_desc_mark_as_modified(dst, descriptor_heap); +} + +void d3d12_desc_write_atomic(struct d3d12_desc *dst, const struct d3d12_desc *src, + struct d3d12_device *device) +{ + descriptor_heap_write_atomic(d3d12_desc_get_descriptor_heap(dst), dst, src, device); }
static void d3d12_desc_destroy(struct d3d12_desc *descriptor, struct d3d12_device *device) @@ -2446,7 +2450,9 @@ static void d3d12_desc_destroy(struct d3d12_desc *descriptor, struct d3d12_devic d3d12_desc_replace(descriptor, NULL, device); }
-void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, +/* This is a major performance bottleneck for some games, so do not load the device + * pointer from dst_heap. In some cases device will not be used. */ +void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, struct d3d12_descriptor_heap *dst_heap, struct d3d12_device *device) { struct d3d12_desc tmp; @@ -2454,7 +2460,7 @@ void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, assert(dst != src);
tmp.s.u.object = d3d12_desc_get_object_ref(src, device); - d3d12_desc_write_atomic(dst, &tmp, device); + descriptor_heap_write_atomic(dst_heap, dst, &tmp, device); }
static VkDeviceSize vkd3d_get_required_texel_buffer_alignment(const struct d3d12_device *device, @@ -3987,6 +3993,7 @@ static HRESULT d3d12_descriptor_heap_init(struct d3d12_descriptor_heap *descript if (FAILED(hr = vkd3d_private_store_init(&descriptor_heap->private_store))) return hr;
+ descriptor_heap->use_vk_heaps = device->use_vk_heaps; d3d12_descriptor_heap_vk_descriptor_sets_init(descriptor_heap, device, desc); vkd3d_mutex_init(&descriptor_heap->vk_sets_mutex);
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index c5259420..fa5ee808 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -895,7 +895,10 @@ static inline void d3d12_desc_copy_raw(struct d3d12_desc *dst, const struct d3d1 dst->s = src->s; }
-void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, struct d3d12_device *device); +struct d3d12_descriptor_heap; + +void d3d12_desc_copy(struct d3d12_desc *dst, const struct d3d12_desc *src, struct d3d12_descriptor_heap *dst_heap, + struct d3d12_device *device); void d3d12_desc_create_cbv(struct d3d12_desc *descriptor, struct d3d12_device *device, const D3D12_CONSTANT_BUFFER_VIEW_DESC *desc); void d3d12_desc_create_srv(struct d3d12_desc *descriptor, @@ -1004,6 +1007,7 @@ struct d3d12_descriptor_heap VkDescriptorPool vk_descriptor_pool; struct d3d12_descriptor_heap_vk_set vk_descriptor_sets[VKD3D_SET_INDEX_COUNT]; struct vkd3d_mutex vk_sets_mutex; + bool use_vk_heaps;
unsigned int volatile dirty_list_head;
From: Conor McCarthy cmccarthy@codeweavers.com
--- tests/d3d12.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/tests/d3d12.c b/tests/d3d12.c index 2f1c905f..2184e825 100644 --- a/tests/d3d12.c +++ b/tests/d3d12.c @@ -2351,6 +2351,7 @@ done:
static void test_create_descriptor_heap(void) { + D3D12_GPU_DESCRIPTOR_HANDLE gpu_handle; D3D12_DESCRIPTOR_HEAP_DESC heap_desc; ID3D12Device *device, *tmp_device; ID3D12DescriptorHeap *heap; @@ -2370,6 +2371,10 @@ static void test_create_descriptor_heap(void) hr = ID3D12Device_CreateDescriptorHeap(device, &heap_desc, &IID_ID3D12DescriptorHeap, (void **)&heap); ok(hr == S_OK, "Failed to create descriptor heap, hr %#x.\n", hr);
+ gpu_handle = ID3D12DescriptorHeap_GetGPUDescriptorHandleForHeapStart(heap); + todo + ok(!gpu_handle.ptr, "Got unexpected ptr %"PRIx64".\n", gpu_handle.ptr); + refcount = get_refcount(device); ok(refcount == 2, "Got unexpected refcount %u.\n", (unsigned int)refcount); hr = ID3D12DescriptorHeap_GetDevice(heap, &IID_ID3D12Device, (void **)&tmp_device);
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d/resource.c | 10 +++++++++- tests/d3d12.c | 1 - 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index 72663afe..00614ddd 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -3859,7 +3859,15 @@ static D3D12_GPU_DESCRIPTOR_HANDLE * STDMETHODCALLTYPE d3d12_descriptor_heap_Get
TRACE("iface %p, descriptor %p.\n", iface, descriptor);
- descriptor->ptr = (uint64_t)(intptr_t)heap->descriptors; + if (heap->desc.Flags & D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE) + { + descriptor->ptr = (uint64_t)(intptr_t)heap->descriptors; + } + else + { + WARN("Heap %p is not shader-visible.\n", iface); + descriptor->ptr = 0; + }
return descriptor; } diff --git a/tests/d3d12.c b/tests/d3d12.c index 2184e825..b2d409e5 100644 --- a/tests/d3d12.c +++ b/tests/d3d12.c @@ -2372,7 +2372,6 @@ static void test_create_descriptor_heap(void) ok(hr == S_OK, "Failed to create descriptor heap, hr %#x.\n", hr);
gpu_handle = ID3D12DescriptorHeap_GetGPUDescriptorHandleForHeapStart(heap); - todo ok(!gpu_handle.ptr, "Got unexpected ptr %"PRIx64".\n", gpu_handle.ptr);
refcount = get_refcount(device);
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d/command.c | 10 ++++++++++ libs/vkd3d/resource.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 53cb5d95..1aeae451 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -4425,6 +4425,7 @@ static void d3d12_command_list_set_descriptor_table(struct d3d12_command_list *l { struct vkd3d_pipeline_bindings *bindings = &list->pipeline_bindings[bind_point]; const struct d3d12_root_signature *root_signature = bindings->root_signature; + struct d3d12_descriptor_heap *descriptor_heap; struct d3d12_desc *desc;
assert(root_signature_get_descriptor_table(root_signature, index)); @@ -4435,6 +4436,15 @@ static void d3d12_command_list_set_descriptor_table(struct d3d12_command_list *l if (bindings->descriptor_tables[index] == desc) return;
+ descriptor_heap = d3d12_desc_get_descriptor_heap(desc); + if (!(descriptor_heap->desc.Flags & D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE)) + { + /* GetGPUDescriptorHandleForHeapStart() returns a null handle in this case, + * but a CPU handle could be passed. */ + WARN("Descriptor heap %p is not shader visible.\n", descriptor_heap); + return; + } + bindings->descriptor_tables[index] = desc; bindings->descriptor_table_dirty_mask |= (uint64_t)1 << index; bindings->descriptor_table_active_mask |= (uint64_t)1 << index; diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c index 00614ddd..f2fe1250 100644 --- a/libs/vkd3d/resource.c +++ b/libs/vkd3d/resource.c @@ -3970,7 +3970,7 @@ static HRESULT d3d12_descriptor_heap_vk_descriptor_sets_init(struct d3d12_descri descriptor_heap->vk_descriptor_pool = VK_NULL_HANDLE; memset(descriptor_heap->vk_descriptor_sets, 0, sizeof(descriptor_heap->vk_descriptor_sets));
- if (!device->use_vk_heaps || (desc->Type != D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV + if (!descriptor_heap->use_vk_heaps || (desc->Type != D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV && desc->Type != D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER)) return S_OK;
@@ -4001,7 +4001,7 @@ static HRESULT d3d12_descriptor_heap_init(struct d3d12_descriptor_heap *descript if (FAILED(hr = vkd3d_private_store_init(&descriptor_heap->private_store))) return hr;
- descriptor_heap->use_vk_heaps = device->use_vk_heaps; + descriptor_heap->use_vk_heaps = device->use_vk_heaps && (desc->Flags & D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE); d3d12_descriptor_heap_vk_descriptor_sets_init(descriptor_heap, device, desc); vkd3d_mutex_init(&descriptor_heap->vk_sets_mutex);
On Wed Jul 19 02:02:04 2023 +0000, Henri Verbeet wrote:
- d3d12_desc_copy(&dst[dst_idx], &src[src_idx], device); + d3d12_desc_copy(&dst[dst_idx], &src[src_idx], dst_heap, device);
Do we really need to pass "device" around if we're already passing "dst_heap"? As you mention in the commit message, we don't end up accessing "device" in a lot of places any more, and it seems like we could use "dst_heap->device" when we do.
This function is the main reason for much lower performance vs Windows in HZD at least. Not loading device from dst_heap when we can have it in a register is an advantage. I've added a comment to clarify.
This function is the main reason for much lower performance vs Windows in HZD at least. Not loading device from dst_heap when we can have it in a register is an advantage. I've added a comment to clarify.
In that case, should we be calling d3d12_desc_copy() in a loop from device.c at all? Or would it be better to e.g. introduce a "d3d12_descriptor_heap_copy()" function that takes care of that loop and inlines d3d12_desc_copy() (and quite possibly d3d12_desc_write_atomic())? We may also want to consider placing "device" in the same cacheline as "use_vk_heaps" inside struct d3d12_descriptor_heap in that case.
On Thu Jul 20 12:55:28 2023 +0000, Henri Verbeet wrote:
This function is the main reason for much lower performance vs Windows
in HZD at least. Not loading device from dst_heap when we can have it in a register is an advantage. I've added a comment to clarify. In that case, should we be calling d3d12_desc_copy() in a loop from device.c at all? Or would it be better to e.g. introduce a "d3d12_descriptor_heap_copy()" function that takes care of that loop and inlines d3d12_desc_copy() (and quite possibly d3d12_desc_write_atomic())? We may also want to consider placing "device" in the same cacheline as "use_vk_heaps" inside struct d3d12_descriptor_heap in that case.
Yes I think it will benefit from another patch series to optimise copying. It may also be worth storing texture views in their respective resources to eliminate refcounting of texture views.