This simplifies the preservation of fence objects until worker threads are done with them, and will be needed when threaded queue submission is added.
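To illustrate the idea (a minimal sketch only, not the actual vkd3d code, which uses the Interlocked* helpers and the d3d12_fence type): the public COM refcount is layered on top of an internal refcount, and worker threads take internal references so the fence survives the application's final Release().

#include <stdatomic.h>
#include <stdlib.h>

struct fence
{
    atomic_uint refcount;          /* public (ID3D12Fence) references */
    atomic_uint internal_refcount; /* public references + worker references */
};

static void fence_incref(struct fence *fence)
{
    atomic_fetch_add(&fence->internal_refcount, 1);
}

static void fence_decref(struct fence *fence)
{
    /* The last internal reference destroys the object. */
    if (atomic_fetch_sub(&fence->internal_refcount, 1) == 1)
        free(fence);
}

static unsigned int fence_release(struct fence *fence)
{
    unsigned int refcount = atomic_fetch_sub(&fence->refcount, 1) - 1;

    /* Dropping the last public reference only drops the internal reference
     * held on behalf of the application; workers which called fence_incref()
     * keep the object alive until they are done with it. */
    if (!refcount)
        fence_decref(fence);
    return refcount;
}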
Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/command.c       | 72 ++++++++++++-------------------------
 libs/vkd3d/vkd3d_private.h |  4 +--
 2 files changed, 24 insertions(+), 52 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 09171fe4..7a373b34 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -20,6 +20,8 @@
#include "vkd3d_private.h"
+static void d3d12_fence_incref(struct d3d12_fence *fence); +static void d3d12_fence_decref(struct d3d12_fence *fence); static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence);
HRESULT vkd3d_queue_create(struct d3d12_device *device, @@ -295,7 +297,7 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, waiting_fence->queue_sequence_number = queue_sequence_number; ++worker->enqueued_fence_count;
- InterlockedIncrement(&fence->pending_worker_operation_count); + d3d12_fence_incref(fence);
vkd3d_cond_signal(&worker->cond); vkd3d_mutex_unlock(&worker->mutex); @@ -303,37 +305,6 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, return S_OK; }
-static void vkd3d_fence_worker_remove_fence(struct vkd3d_fence_worker *worker, struct d3d12_fence *fence) -{ - LONG count; - int rc; - - if (!(count = InterlockedAdd(&fence->pending_worker_operation_count, 0))) - return; - - WARN("Waiting for %u pending fence operations (fence %p).\n", count, fence); - - if ((rc = vkd3d_mutex_lock(&worker->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - return; - } - - while ((count = InterlockedAdd(&fence->pending_worker_operation_count, 0))) - { - TRACE("Still waiting for %u pending fence operations (fence %p).\n", count, fence); - - worker->pending_fence_destruction = true; - vkd3d_cond_signal(&worker->cond); - - vkd3d_cond_wait(&worker->fence_destruction_cond, &worker->mutex); - } - - TRACE("Removed fence %p.\n", fence); - - vkd3d_mutex_unlock(&worker->mutex); -} - static void vkd3d_fence_worker_move_enqueued_fences_locked(struct vkd3d_fence_worker *worker) { unsigned int i; @@ -432,7 +403,7 @@ static void vkd3d_wait_for_gpu_timeline_semaphores(struct vkd3d_fence_worker *wo if (FAILED(hr = d3d12_fence_signal(current->fence, counter_value, VK_NULL_HANDLE))) ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
- InterlockedDecrement(&current->fence->pending_worker_operation_count); + d3d12_fence_decref(current->fence); continue; }
@@ -480,7 +451,7 @@ static void vkd3d_wait_for_gpu_fences(struct vkd3d_fence_worker *worker) if (FAILED(hr = d3d12_fence_signal(current->fence, current->value, vk_fence))) ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
- InterlockedDecrement(&current->fence->pending_worker_operation_count); + d3d12_fence_decref(current->fence);
vkd3d_queue_update_sequence_number(current->queue, current->queue_sequence_number, device); continue; @@ -518,12 +489,6 @@ static void *vkd3d_fence_worker_main(void *arg) break; }
- if (worker->pending_fence_destruction) - { - vkd3d_cond_broadcast(&worker->fence_destruction_cond); - worker->pending_fence_destruction = false; - } - if (worker->enqueued_fence_count) { vkd3d_fence_worker_move_enqueued_fences_locked(worker); @@ -560,7 +525,6 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker, TRACE("worker %p.\n", worker);
worker->should_exit = false; - worker->pending_fence_destruction = false; worker->device = device;
worker->enqueued_fence_count = 0; @@ -1026,22 +990,35 @@ static ULONG STDMETHODCALLTYPE d3d12_fence_AddRef(ID3D12Fence *iface) return refcount; }
+static void d3d12_fence_incref(struct d3d12_fence *fence) +{ + InterlockedIncrement(&fence->internal_refcount); +} + static ULONG STDMETHODCALLTYPE d3d12_fence_Release(ID3D12Fence *iface) { struct d3d12_fence *fence = impl_from_ID3D12Fence(iface); ULONG refcount = InterlockedDecrement(&fence->refcount); - int rc;
TRACE("%p decreasing refcount to %u.\n", fence, refcount);
if (!refcount) + d3d12_fence_decref(fence); + + return refcount; +} + +static void d3d12_fence_decref(struct d3d12_fence *fence) +{ + ULONG internal_refcount = InterlockedDecrement(&fence->internal_refcount); + int rc; + + if (!internal_refcount) { struct d3d12_device *device = fence->device;
vkd3d_private_store_destroy(&fence->private_store);
- vkd3d_fence_worker_remove_fence(&device->fence_worker, fence); - d3d12_fence_destroy_vk_objects(fence);
vkd3d_free(fence->events); @@ -1052,8 +1029,6 @@ static ULONG STDMETHODCALLTYPE d3d12_fence_Release(ID3D12Fence *iface)
d3d12_device_release(device); } - - return refcount; }
static HRESULT STDMETHODCALLTYPE d3d12_fence_GetPrivateData(ID3D12Fence *iface, @@ -1380,6 +1355,7 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device * int rc;
fence->ID3D12Fence_iface.lpVtbl = &d3d12_fence_vtbl; + fence->internal_refcount = 1; fence->refcount = 1;
fence->value = initial_value; @@ -1419,8 +1395,6 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device *
memset(fence->old_vk_fences, 0, sizeof(fence->old_vk_fences));
- fence->pending_worker_operation_count = 0; - if (FAILED(hr = vkd3d_private_store_init(&fence->private_store))) { vkd3d_mutex_destroy(&fence->mutex); @@ -6496,7 +6470,7 @@ static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worke waiting_fence->queue = queue; ++worker->enqueued_fence_count;
- InterlockedIncrement(&fence->pending_worker_operation_count); + d3d12_fence_incref(fence);
vkd3d_cond_signal(&worker->cond); vkd3d_mutex_unlock(&worker->mutex); diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 56060b6d..a0163c8d 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -346,7 +346,6 @@ struct vkd3d_fence_worker struct vkd3d_cond cond; struct vkd3d_cond fence_destruction_cond; bool should_exit; - bool pending_fence_destruction;
LONG enqueued_fence_count; struct vkd3d_enqueued_fence @@ -532,6 +531,7 @@ struct vkd3d_pending_fence_wait struct d3d12_fence { ID3D12Fence ID3D12Fence_iface; + LONG internal_refcount; LONG refcount;
uint64_t value; @@ -555,8 +555,6 @@ struct d3d12_fence struct list semaphores; unsigned int semaphore_count;
- LONG pending_worker_operation_count; - VkFence old_vk_fences[VKD3D_MAX_VK_SYNC_OBJECTS];
struct d3d12_device *device;
This simplifies the handling of GPU waits, and in vkd3d-proton it is reported to increase performance when support for multiple Vulkan queues is added, because it avoids the problem of fences being signalled while they sit in the pending buffer waiting to be moved to the wait buffer.
Based on a vkd3d-proton patch by Philip Rebohle.
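A rough sketch of the resulting worker loop, using pthreads and placeholder types rather than the vkd3d wrappers (in the real code, wait_for_gpu_fence() blocks on a Vulkan fence or timeline semaphore and then signals the D3D12 fence): the enqueued array is taken over wholesale under the lock, and each entry is then waited on outside the lock.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

struct waiting_fence
{
    uint64_t value; /* plus the d3d12 fence and the Vulkan object to wait on */
};

struct fence_worker
{
    /* Initialised by whoever creates the worker thread. */
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    bool should_exit;
    struct waiting_fence *fences;
    size_t fence_count, fences_size;
};

static void wait_for_gpu_fence(struct fence_worker *worker, const struct waiting_fence *fence)
{
    /* The real code blocks on the Vulkan fence or timeline semaphore here,
     * signals the D3D12 fence, and drops its fence reference. */
    (void)worker;
    (void)fence;
}

static void *fence_worker_main(void *arg)
{
    struct waiting_fence *cur_fences = NULL, *old_fences;
    size_t cur_fences_size = 0, old_fences_size, cur_fence_count, i;
    struct fence_worker *worker = arg;

    for (;;)
    {
        pthread_mutex_lock(&worker->mutex);

        while (!worker->fence_count && !worker->should_exit)
            pthread_cond_wait(&worker->cond, &worker->mutex);

        if (worker->should_exit)
        {
            pthread_mutex_unlock(&worker->mutex);
            break;
        }

        /* Swap arrays so the queue can keep enqueueing while we wait. */
        old_fences = cur_fences;
        old_fences_size = cur_fences_size;
        cur_fences = worker->fences;
        cur_fences_size = worker->fences_size;
        cur_fence_count = worker->fence_count;
        worker->fences = old_fences;
        worker->fences_size = old_fences_size;
        worker->fence_count = 0;

        pthread_mutex_unlock(&worker->mutex);

        for (i = 0; i < cur_fence_count; ++i)
            wait_for_gpu_fence(worker, &cur_fences[i]);
    }

    free(cur_fences);
    return NULL;
}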
Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/command.c       | 274 ++++++++++++------------------------
 libs/vkd3d/device.c        |   8 +-
 libs/vkd3d/vkd3d_private.h |  29 ++--
 3 files changed, 96 insertions(+), 215 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 7a373b34..5f6cd7f7 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -280,22 +280,19 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, return hresult_from_errno(rc); }
- if (!vkd3d_array_reserve((void **)&worker->enqueued_fences, &worker->enqueued_fences_size, - worker->enqueued_fence_count + 1, sizeof(*worker->enqueued_fences))) + if (!vkd3d_array_reserve((void **)&worker->fences, &worker->fences_size, + worker->fence_count + 1, sizeof(*worker->fences))) { ERR("Failed to add GPU fence.\n"); vkd3d_mutex_unlock(&worker->mutex); return E_OUTOFMEMORY; }
- worker->enqueued_fences[worker->enqueued_fence_count].vk_fence = vk_fence; - worker->enqueued_fences[worker->enqueued_fence_count].vk_semaphore = VK_NULL_HANDLE; - waiting_fence = &worker->enqueued_fences[worker->enqueued_fence_count].waiting_fence; + waiting_fence = &worker->fences[worker->fence_count++]; waiting_fence->fence = fence; waiting_fence->value = value; - waiting_fence->queue = queue; + waiting_fence->u.vk_fence = vk_fence; waiting_fence->queue_sequence_number = queue_sequence_number; - ++worker->enqueued_fence_count;
d3d12_fence_incref(fence);
@@ -305,219 +302,124 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, return S_OK; }
-static void vkd3d_fence_worker_move_enqueued_fences_locked(struct vkd3d_fence_worker *worker) -{ - unsigned int i; - bool timeline; - size_t count; - bool ret; - - if (!worker->enqueued_fence_count) - return; - - count = worker->fence_count + worker->enqueued_fence_count; - - if ((timeline = worker->device->use_timeline_semaphores)) - { - ret = vkd3d_array_reserve((void **) &worker->vk_semaphores, &worker->vk_semaphores_size, - count, sizeof(*worker->vk_semaphores)); - ret &= vkd3d_array_reserve((void **) &worker->semaphore_wait_values, &worker->semaphore_wait_values_size, - count, sizeof(*worker->semaphore_wait_values)); - } - else - { - ret = vkd3d_array_reserve((void **)&worker->vk_fences, &worker->vk_fences_size, - count, sizeof(*worker->vk_fences)); - } - ret &= vkd3d_array_reserve((void **)&worker->fences, &worker->fences_size, - count, sizeof(*worker->fences)); - if (!ret) - { - ERR("Failed to reserve memory.\n"); - return; - } - - for (i = 0; i < worker->enqueued_fence_count; ++i) - { - struct vkd3d_enqueued_fence *current = &worker->enqueued_fences[i]; - - if (timeline) - { - worker->vk_semaphores[worker->fence_count] = current->vk_semaphore; - worker->semaphore_wait_values[worker->fence_count] = current->waiting_fence.value; - } - else - { - worker->vk_fences[worker->fence_count] = current->vk_fence; - } - - worker->fences[worker->fence_count] = current->waiting_fence; - ++worker->fence_count; - } - assert(worker->fence_count == count); - worker->enqueued_fence_count = 0; -} - -static void vkd3d_wait_for_gpu_timeline_semaphores(struct vkd3d_fence_worker *worker) +static void vkd3d_wait_for_gpu_timeline_semaphore(struct vkd3d_fence_worker *worker, + const struct vkd3d_waiting_fence *waiting_fence) { const struct d3d12_device *device = worker->device; const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; VkSemaphoreWaitInfoKHR wait_info; - VkSemaphore vk_semaphore; uint64_t counter_value; - unsigned int i, j; + VkResult vr; HRESULT hr; - int vr; - - if (!worker->fence_count) - return;
wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR; wait_info.pNext = NULL; - wait_info.flags = VK_SEMAPHORE_WAIT_ANY_BIT_KHR; - wait_info.pSemaphores = worker->vk_semaphores; - wait_info.semaphoreCount = worker->fence_count; - wait_info.pValues = worker->semaphore_wait_values; + wait_info.flags = 0; + wait_info.semaphoreCount = 1; + wait_info.pSemaphores = &waiting_fence->u.vk_semaphore; + wait_info.pValues = &waiting_fence->value;
vr = VK_CALL(vkWaitSemaphoresKHR(device->vk_device, &wait_info, ~(uint64_t)0)); if (vr == VK_TIMEOUT) return; if (vr != VK_SUCCESS) { - ERR("Failed to wait for Vulkan timeline semaphores, vr %d.\n", vr); + ERR("Failed to wait for Vulkan timeline semaphore, vr %d.\n", vr); return; }
- for (i = 0, j = 0; i < worker->fence_count; ++i) + if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, waiting_fence->u.vk_semaphore, + &counter_value))) < 0) { - struct vkd3d_waiting_fence *current = &worker->fences[i]; - - vk_semaphore = worker->vk_semaphores[i]; - if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, vk_semaphore, &counter_value))) < 0) - { - ERR("Failed to get Vulkan semaphore value, vr %d.\n", vr); - } - else if (counter_value >= current->value) - { - TRACE("Signaling fence %p value %#"PRIx64".\n", current->fence, current->value); - if (FAILED(hr = d3d12_fence_signal(current->fence, counter_value, VK_NULL_HANDLE))) - ERR("Failed to signal D3D12 fence, hr %#x.\n", hr); - - d3d12_fence_decref(current->fence); - continue; - } + ERR("Failed to get Vulkan semaphore value, vr %d.\n", vr); + } + else + { + TRACE("Signaling fence %p value %#"PRIx64".\n", waiting_fence->fence, waiting_fence->value); + if (FAILED(hr = d3d12_fence_signal(waiting_fence->fence, counter_value, VK_NULL_HANDLE))) + ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
- if (i != j) - { - worker->vk_semaphores[j] = worker->vk_semaphores[i]; - worker->semaphore_wait_values[j] = worker->semaphore_wait_values[i]; - worker->fences[j] = worker->fences[i]; - } - ++j; + d3d12_fence_decref(waiting_fence->fence); } - worker->fence_count = j; }
-static void vkd3d_wait_for_gpu_fences(struct vkd3d_fence_worker *worker) +static void vkd3d_wait_for_gpu_fence(struct vkd3d_fence_worker *worker, + const struct vkd3d_waiting_fence *waiting_fence) { struct d3d12_device *device = worker->device; const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; - unsigned int i, j; - VkFence vk_fence; HRESULT hr; int vr;
- if (!worker->fence_count) - return; - - vr = VK_CALL(vkWaitForFences(device->vk_device, - worker->fence_count, worker->vk_fences, VK_FALSE, ~(uint64_t)0)); + vr = VK_CALL(vkWaitForFences(device->vk_device, 1, &waiting_fence->u.vk_fence, VK_FALSE, ~(uint64_t)0)); if (vr == VK_TIMEOUT) return; if (vr != VK_SUCCESS) { - ERR("Failed to wait for Vulkan fences, vr %d.\n", vr); + ERR("Failed to wait for Vulkan fence, vr %d.\n", vr); return; }
- for (i = 0, j = 0; i < worker->fence_count; ++i) - { - vk_fence = worker->vk_fences[i]; - if (!(vr = VK_CALL(vkGetFenceStatus(device->vk_device, vk_fence)))) - { - struct vkd3d_waiting_fence *current = &worker->fences[i]; + TRACE("Signaling fence %p value %#"PRIx64".\n", waiting_fence->fence, waiting_fence->value); + if (FAILED(hr = d3d12_fence_signal(waiting_fence->fence, waiting_fence->value, waiting_fence->u.vk_fence))) + ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
- TRACE("Signaling fence %p value %#"PRIx64".\n", current->fence, current->value); - if (FAILED(hr = d3d12_fence_signal(current->fence, current->value, vk_fence))) - ERR("Failed to signal D3D12 fence, hr %#x.\n", hr); + d3d12_fence_decref(waiting_fence->fence);
- d3d12_fence_decref(current->fence); - - vkd3d_queue_update_sequence_number(current->queue, current->queue_sequence_number, device); - continue; - } - - if (vr != VK_NOT_READY) - ERR("Failed to get Vulkan fence status, vr %d.\n", vr); - - if (i != j) - { - worker->vk_fences[j] = worker->vk_fences[i]; - worker->fences[j] = worker->fences[i]; - } - ++j; - } - worker->fence_count = j; + vkd3d_queue_update_sequence_number(worker->queue, waiting_fence->queue_sequence_number, device); }
static void *vkd3d_fence_worker_main(void *arg) { + size_t old_fences_size, cur_fences_size = 0, cur_fence_count = 0; + struct vkd3d_waiting_fence *old_fences, *cur_fences = NULL; struct vkd3d_fence_worker *worker = arg; + unsigned int i; int rc;
vkd3d_set_thread_name("vkd3d_fence");
for (;;) { - worker->wait_for_gpu_fences(worker); + if ((rc = vkd3d_mutex_lock(&worker->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + break; + }
- if (!worker->fence_count || InterlockedAdd(&worker->enqueued_fence_count, 0)) + if (!worker->fence_count && !worker->should_exit && (rc = vkd3d_cond_wait(&worker->cond, &worker->mutex))) { - if ((rc = vkd3d_mutex_lock(&worker->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - break; - } + ERR("Failed to wait on condition variable, error %d.\n", rc); + vkd3d_mutex_unlock(&worker->mutex); + break; + }
- if (worker->enqueued_fence_count) - { - vkd3d_fence_worker_move_enqueued_fences_locked(worker); - } - else - { - if (worker->should_exit) - { - vkd3d_mutex_unlock(&worker->mutex); - break; - } + if (worker->should_exit) + break;
- if ((rc = vkd3d_cond_wait(&worker->cond, &worker->mutex))) - { - ERR("Failed to wait on condition variable, error %d.\n", rc); - vkd3d_mutex_unlock(&worker->mutex); - break; - } - } + old_fences_size = cur_fences_size; + old_fences = cur_fences;
- vkd3d_mutex_unlock(&worker->mutex); - } + cur_fence_count = worker->fence_count; + cur_fences_size = worker->fences_size; + cur_fences = worker->fences; + + worker->fence_count = 0; + worker->fences_size = old_fences_size; + worker->fences = old_fences; + + vkd3d_mutex_unlock(&worker->mutex); + + for (i = 0; i < cur_fence_count; ++i) + worker->wait_for_gpu_fence(worker, &cur_fences[i]); }
+ vkd3d_free(cur_fences); return NULL; }
-HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker, - struct d3d12_device *device) +static HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker, + struct vkd3d_queue *queue, struct d3d12_device *device) { HRESULT hr; int rc; @@ -525,25 +427,15 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker, TRACE("worker %p.\n", worker);
worker->should_exit = false; + worker->queue = queue; worker->device = device;
- worker->enqueued_fence_count = 0; - worker->enqueued_fences = NULL; - worker->enqueued_fences_size = 0; - worker->fence_count = 0; - - worker->vk_fences = NULL; - worker->vk_fences_size = 0; worker->fences = NULL; worker->fences_size = 0; - worker->vk_semaphores = NULL; - worker->vk_semaphores_size = 0; - worker->semaphore_wait_values = NULL; - worker->semaphore_wait_values_size = 0;
- worker->wait_for_gpu_fences = device->use_timeline_semaphores - ? vkd3d_wait_for_gpu_timeline_semaphores : vkd3d_wait_for_gpu_fences; + worker->wait_for_gpu_fence = device->vk_info.KHR_timeline_semaphore + ? vkd3d_wait_for_gpu_timeline_semaphore : vkd3d_wait_for_gpu_fence;
if ((rc = vkd3d_mutex_init(&worker->mutex))) { @@ -577,7 +469,7 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker, return hr; }
-HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker, +static HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker, struct d3d12_device *device) { HRESULT hr; @@ -603,11 +495,7 @@ HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker, vkd3d_cond_destroy(&worker->cond); vkd3d_cond_destroy(&worker->fence_destruction_cond);
- vkd3d_free(worker->enqueued_fences); - vkd3d_free(worker->vk_fences); vkd3d_free(worker->fences); - vkd3d_free(worker->vk_semaphores); - vkd3d_free(worker->semaphore_wait_values);
return S_OK; } @@ -6227,6 +6115,8 @@ static ULONG STDMETHODCALLTYPE d3d12_command_queue_Release(ID3D12CommandQueue *i { struct d3d12_device *device = command_queue->device;
+ vkd3d_fence_worker_stop(&command_queue->fence_worker, device); + vkd3d_private_store_destroy(&command_queue->private_store);
vkd3d_free(command_queue); @@ -6455,20 +6345,18 @@ static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worke return hresult_from_errno(rc); }
- if (!vkd3d_array_reserve((void **)&worker->enqueued_fences, &worker->enqueued_fences_size, - worker->enqueued_fence_count + 1, sizeof(*worker->enqueued_fences))) + if (!vkd3d_array_reserve((void **)&worker->fences, &worker->fences_size, + worker->fence_count + 1, sizeof(*worker->fences))) { ERR("Failed to add GPU timeline semaphore.\n"); vkd3d_mutex_unlock(&worker->mutex); return E_OUTOFMEMORY; }
- worker->enqueued_fences[worker->enqueued_fence_count].vk_semaphore = vk_semaphore; - waiting_fence = &worker->enqueued_fences[worker->enqueued_fence_count].waiting_fence; + waiting_fence = &worker->fences[worker->fence_count++]; waiting_fence->fence = fence; waiting_fence->value = value; - waiting_fence->queue = queue; - ++worker->enqueued_fence_count; + waiting_fence->u.vk_semaphore = vk_semaphore;
d3d12_fence_incref(fence);
@@ -6574,7 +6462,10 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * }
if (device->use_timeline_semaphores) - return vkd3d_enqueue_timeline_semaphore(&device->fence_worker, vk_semaphore, fence, value, vkd3d_queue); + { + return vkd3d_enqueue_timeline_semaphore(&command_queue->fence_worker, + vk_semaphore, fence, value, vkd3d_queue); + }
if (vk_semaphore && SUCCEEDED(hr = d3d12_fence_add_vk_semaphore(fence, vk_semaphore, vk_fence, value))) vk_semaphore = VK_NULL_HANDLE; @@ -6582,8 +6473,11 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * vr = VK_CALL(vkGetFenceStatus(device->vk_device, vk_fence)); if (vr == VK_NOT_READY) { - if (SUCCEEDED(hr = vkd3d_enqueue_gpu_fence(&device->fence_worker, vk_fence, fence, value, vkd3d_queue, sequence_number))) + if (SUCCEEDED(hr = vkd3d_enqueue_gpu_fence(&command_queue->fence_worker, + vk_fence, fence, value, vkd3d_queue, sequence_number))) + { vk_fence = VK_NULL_HANDLE; + } } else if (vr == VK_SUCCESS) { @@ -6978,6 +6872,12 @@ static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue, if (FAILED(hr = vkd3d_private_store_init(&queue->private_store))) return hr;
+ if (FAILED(hr = vkd3d_fence_worker_start(&queue->fence_worker, queue->vkd3d_queue, device))) + { + vkd3d_private_store_destroy(&queue->private_store); + return hr; + } + d3d12_device_add_ref(queue->device = device);
return S_OK; diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index 0b643727..a1041f2a 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -2711,7 +2711,6 @@ static ULONG STDMETHODCALLTYPE d3d12_device_Release(ID3D12Device *iface) vkd3d_gpu_va_allocator_cleanup(&device->gpu_va_allocator); vkd3d_gpu_descriptor_allocator_cleanup(&device->gpu_descriptor_allocator); vkd3d_render_pass_cache_cleanup(&device->render_pass_cache, device); - vkd3d_fence_worker_stop(&device->fence_worker, device); d3d12_device_destroy_pipeline_cache(device); d3d12_device_destroy_vkd3d_queues(device); for (i = 0; i < ARRAY_SIZE(device->desc_mutex); ++i) @@ -4346,11 +4345,8 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, if (FAILED(hr = vkd3d_private_store_init(&device->private_store))) goto out_free_pipeline_cache;
- if (FAILED(hr = vkd3d_fence_worker_start(&device->fence_worker, device))) - goto out_free_private_store; - if (FAILED(hr = vkd3d_init_format_info(device))) - goto out_stop_fence_worker; + goto out_free_private_store;
if (FAILED(hr = vkd3d_init_null_resources(&device->null_resources, device))) goto out_cleanup_format_info; @@ -4382,8 +4378,6 @@ out_destroy_null_resources: vkd3d_destroy_null_resources(&device->null_resources, device); out_cleanup_format_info: vkd3d_cleanup_format_info(device); -out_stop_fence_worker: - vkd3d_fence_worker_stop(&device->fence_worker, device); out_free_private_store: vkd3d_private_store_destroy(&device->private_store); out_free_pipeline_cache: diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index a0163c8d..350382cd 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -335,7 +335,11 @@ struct vkd3d_waiting_fence { struct d3d12_fence *fence; uint64_t value; - struct vkd3d_queue *queue; + union + { + VkFence vk_fence; + VkSemaphore vk_semaphore; + } u; uint64_t queue_sequence_number; };
@@ -347,33 +351,16 @@ struct vkd3d_fence_worker struct vkd3d_cond fence_destruction_cond; bool should_exit;
- LONG enqueued_fence_count; - struct vkd3d_enqueued_fence - { - VkFence vk_fence; - VkSemaphore vk_semaphore; - struct vkd3d_waiting_fence waiting_fence; - } *enqueued_fences; - size_t enqueued_fences_size; - size_t fence_count; - VkFence *vk_fences; - size_t vk_fences_size; struct vkd3d_waiting_fence *fences; size_t fences_size; - VkSemaphore *vk_semaphores; - size_t vk_semaphores_size; - uint64_t *semaphore_wait_values; - size_t semaphore_wait_values_size;
- void (*wait_for_gpu_fences)(struct vkd3d_fence_worker *worker); + void (*wait_for_gpu_fence)(struct vkd3d_fence_worker *worker, const struct vkd3d_waiting_fence *enqueued_fence);
+ struct vkd3d_queue *queue; struct d3d12_device *device; };
-HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker, struct d3d12_device *device); -HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker, struct d3d12_device *device); - struct vkd3d_gpu_va_allocation { D3D12_GPU_VIRTUAL_ADDRESS base; @@ -1338,6 +1325,7 @@ struct d3d12_command_queue
struct vkd3d_queue *vkd3d_queue;
+ struct vkd3d_fence_worker fence_worker; const struct d3d12_fence *last_waited_fence; uint64_t last_waited_fence_value;
@@ -1440,7 +1428,6 @@ struct d3d12_device
struct vkd3d_gpu_descriptor_allocator gpu_descriptor_allocator; struct vkd3d_gpu_va_allocator gpu_va_allocator; - struct vkd3d_fence_worker fence_worker;
struct vkd3d_mutex mutex; struct vkd3d_mutex desc_mutex[8];
On Tue, 26 Apr 2022 at 15:48, Conor McCarthy <cmccarthy@codeweavers.com> wrote:
> @@ -525,25 +427,15 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker,
>      TRACE("worker %p.\n", worker);
>  
>      worker->should_exit = false;
> +    worker->queue = queue;
>      worker->device = device;
>  
> -    worker->enqueued_fence_count = 0;
> -    worker->enqueued_fences = NULL;
> -    worker->enqueued_fences_size = 0;
> -    worker->fence_count = 0;
> -
> -    worker->vk_fences = NULL;
> -    worker->vk_fences_size = 0;
>      worker->fences = NULL;
>      worker->fences_size = 0;
> -    worker->vk_semaphores = NULL;
> -    worker->vk_semaphores_size = 0;
> -    worker->semaphore_wait_values = NULL;
> -    worker->semaphore_wait_values_size = 0;
>  
> -    worker->wait_for_gpu_fences = device->use_timeline_semaphores
> -            ? vkd3d_wait_for_gpu_timeline_semaphores : vkd3d_wait_for_gpu_fences;
> +    worker->wait_for_gpu_fence = device->vk_info.KHR_timeline_semaphore
> +            ? vkd3d_wait_for_gpu_timeline_semaphore : vkd3d_wait_for_gpu_fence;
I think the original "device->use_timeline_semaphores" check was correct. As-is, this will try to use vkd3d_wait_for_gpu_timeline_semaphore() on fences without a timeline semaphore for me, with predictable results.
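I.e., presumably keeping something along these lines:

    worker->wait_for_gpu_fence = device->use_timeline_semaphores
            ? vkd3d_wait_for_gpu_timeline_semaphore : vkd3d_wait_for_gpu_fence;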
Order does not need to be preserved here, and another function will need to add to this array when mapped timeline semaphores are implemented.
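For reference, the unordered removal idiom this relies on, shown in isolation (an illustrative helper, not part of the patch itself):

#include <stddef.h>

/* Remove element i from an unordered array by overwriting it with the
 * last element; O(1), but does not preserve ordering. */
static void swap_remove(int *array, size_t *count, size_t i)
{
    array[i] = array[--(*count)];
}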
Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/command.c       | 42 +++++++++++++++++++++++---------------
 libs/vkd3d/vkd3d_private.h |  4 ++--
 2 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 5f6cd7f7..376321f4 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -590,18 +590,19 @@ static void d3d12_fence_garbage_collect_vk_semaphores_locked(struct d3d12_fence { struct d3d12_device *device = fence->device; const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; - struct vkd3d_signaled_semaphore *current, *p; - unsigned int semaphore_count; + struct vkd3d_signaled_semaphore *current; + unsigned int i, semaphore_count;
semaphore_count = fence->semaphore_count; if (!destroy_all && semaphore_count < VKD3D_MAX_VK_SYNC_OBJECTS) return;
- LIST_FOR_EACH_ENTRY_SAFE(current, p, &fence->semaphores, struct vkd3d_signaled_semaphore, entry) + for (i = 0; i < fence->semaphore_count; ++i) { if (!destroy_all && fence->semaphore_count < VKD3D_MAX_VK_SYNC_OBJECTS) break;
+ current = &fence->semaphores[i]; /* The semaphore doesn't have a pending signal operation if the fence * was signaled. */ if ((current->vk_fence || current->is_acquired) && !destroy_all) @@ -612,10 +613,7 @@ static void d3d12_fence_garbage_collect_vk_semaphores_locked(struct d3d12_fence assert(!current->is_acquired);
VK_CALL(vkDestroySemaphore(device->vk_device, current->vk_semaphore, NULL)); - list_remove(&current->entry); - vkd3d_free(current); - - --fence->semaphore_count; + fence->semaphores[i--] = fence->semaphores[--fence->semaphore_count]; }
if (semaphore_count != fence->semaphore_count) @@ -656,6 +654,7 @@ static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct struct vkd3d_signaled_semaphore *semaphore; struct vkd3d_signaled_semaphore *current; uint64_t semaphore_value; + unsigned int i; int rc;
TRACE("fence %p, value %#"PRIx64".\n", fence, value); @@ -669,8 +668,9 @@ static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct semaphore = NULL; semaphore_value = ~(uint64_t)0;
- LIST_FOR_EACH_ENTRY(current, &fence->semaphores, struct vkd3d_signaled_semaphore, entry) + for (i = 0; i < fence->semaphore_count; ++i) { + current = &fence->semaphores[i]; /* Prefer a semaphore with the smallest value. */ if (!current->is_acquired && current->value >= value && semaphore_value >= current->value) { @@ -693,6 +693,7 @@ static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct
static void d3d12_fence_remove_vk_semaphore(struct d3d12_fence *fence, struct vkd3d_signaled_semaphore *semaphore) { + size_t i; int rc;
if ((rc = vkd3d_mutex_lock(&fence->mutex))) @@ -703,10 +704,8 @@ static void d3d12_fence_remove_vk_semaphore(struct d3d12_fence *fence, struct vk
assert(semaphore->is_acquired);
- list_remove(&semaphore->entry); - vkd3d_free(semaphore); - - --fence->semaphore_count; + i = semaphore - fence->semaphores; + fence->semaphores[i] = fence->semaphores[--fence->semaphore_count];
vkd3d_mutex_unlock(&fence->mutex); } @@ -751,14 +750,20 @@ static HRESULT d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence,
d3d12_fence_garbage_collect_vk_semaphores_locked(fence, false);
+ if (!vkd3d_array_reserve((void**)&fence->semaphores, &fence->semaphores_size, + fence->semaphore_count + 1, sizeof(*fence->semaphores))) + { + ERR("Failed to add semaphore.\n"); + vkd3d_mutex_unlock(&fence->mutex); + return false; + } + + semaphore = &fence->semaphores[fence->semaphore_count++]; semaphore->value = value; semaphore->vk_semaphore = vk_semaphore; semaphore->vk_fence = vk_fence; semaphore->is_acquired = false;
- list_add_tail(&fence->semaphores, &semaphore->entry); - ++fence->semaphore_count; - vkd3d_mutex_unlock(&fence->mutex);
return hr; @@ -821,8 +826,9 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF { const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
- LIST_FOR_EACH_ENTRY(current, &fence->semaphores, struct vkd3d_signaled_semaphore, entry) + for (i = 0; i < fence->semaphore_count; ++i) { + current = &fence->semaphores[i]; if (current->vk_fence == vk_fence) current->vk_fence = VK_NULL_HANDLE; } @@ -910,6 +916,7 @@ static void d3d12_fence_decref(struct d3d12_fence *fence) d3d12_fence_destroy_vk_objects(fence);
vkd3d_free(fence->events); + vkd3d_free(fence->semaphores); if ((rc = vkd3d_mutex_destroy(&fence->mutex))) ERR("Failed to destroy mutex, error %d.\n", rc); vkd3d_cond_destroy(&fence->null_event_cond); @@ -1278,7 +1285,8 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device * fence->pending_timeline_value = initial_value; fence->gpu_wait_count = 0;
- list_init(&fence->semaphores); + fence->semaphores = NULL; + fence->semaphores_size = 0; fence->semaphore_count = 0;
memset(fence->old_vk_fences, 0, sizeof(fence->old_vk_fences)); diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 350382cd..4e03145d 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -501,7 +501,6 @@ HRESULT vkd3d_set_private_data_interface(struct vkd3d_private_store *store, cons
struct vkd3d_signaled_semaphore { - struct list entry; uint64_t value; VkSemaphore vk_semaphore; VkFence vk_fence; @@ -539,7 +538,8 @@ struct d3d12_fence struct vkd3d_pending_fence_wait gpu_waits[VKD3D_MAX_FENCE_WAITING_QUEUES]; unsigned int gpu_wait_count;
- struct list semaphores; + struct vkd3d_signaled_semaphore *semaphores; + size_t semaphores_size; unsigned int semaphore_count;
VkFence old_vk_fences[VKD3D_MAX_VK_SYNC_OBJECTS];
On Tue, 26 Apr 2022 at 15:48, Conor McCarthy <cmccarthy@codeweavers.com> wrote:
> @@ -590,18 +590,19 @@ static void d3d12_fence_garbage_collect_vk_semaphores_locked(struct d3d12_fence
>  {
>      struct d3d12_device *device = fence->device;
>      const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
> -    struct vkd3d_signaled_semaphore *current, *p;
> -    unsigned int semaphore_count;
> +    struct vkd3d_signaled_semaphore *current;
> +    unsigned int i, semaphore_count;
>  
>      semaphore_count = fence->semaphore_count;
>      if (!destroy_all && semaphore_count < VKD3D_MAX_VK_SYNC_OBJECTS)
>          return;
>  
> -    LIST_FOR_EACH_ENTRY_SAFE(current, p, &fence->semaphores, struct vkd3d_signaled_semaphore, entry)
> +    for (i = 0; i < fence->semaphore_count; ++i)
>      {
>          if (!destroy_all && fence->semaphore_count < VKD3D_MAX_VK_SYNC_OBJECTS)
>              break;
>  
> +        current = &fence->semaphores[i];
>          /* The semaphore doesn't have a pending signal operation if the fence
>           * was signaled. */
>          if ((current->vk_fence || current->is_acquired) && !destroy_all)
> @@ -612,10 +613,7 @@ static void d3d12_fence_garbage_collect_vk_semaphores_locked(struct d3d12_fence
>          assert(!current->is_acquired);
>  
>          VK_CALL(vkDestroySemaphore(device->vk_device, current->vk_semaphore, NULL));
> -        list_remove(&current->entry);
> -        vkd3d_free(current);
> -
> -        --fence->semaphore_count;
> +        fence->semaphores[i--] = fence->semaphores[--fence->semaphore_count];
>      }
I suppose that works, although decrementing "i" here only to have the loop control increment it again seems a bit awkward; more so for the case where "i == 0". It seems tempting to suggest using a while-loop here, only incrementing "i" when we take the "continue" path, and then using "*current = fence->semaphores[--fence->semaphore_count];" for the last line of the block.
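Roughly like this, I suppose, reusing the surrounding context from the patch (untested sketch):

    i = 0;
    while (i < fence->semaphore_count)
    {
        if (!destroy_all && fence->semaphore_count < VKD3D_MAX_VK_SYNC_OBJECTS)
            break;

        current = &fence->semaphores[i];
        /* The semaphore doesn't have a pending signal operation if the fence
         * was signaled. */
        if ((current->vk_fence || current->is_acquired) && !destroy_all)
        {
            ++i;
            continue;
        }

        if (current->vk_fence)
            WARN("Destroying potentially pending semaphore.\n");
        assert(!current->is_acquired);

        VK_CALL(vkDestroySemaphore(device->vk_device, current->vk_semaphore, NULL));
        *current = fence->semaphores[--fence->semaphore_count];
    }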
> @@ -703,10 +704,8 @@ static void d3d12_fence_remove_vk_semaphore(struct d3d12_fence *fence, struct vk
>  
>      assert(semaphore->is_acquired);
>  
> -    list_remove(&semaphore->entry);
> -    vkd3d_free(semaphore);
> -
> -    --fence->semaphore_count;
> +    i = semaphore - fence->semaphores;
> +    fence->semaphores[i] = fence->semaphores[--fence->semaphore_count];
I.e., "*semaphore = fence->semaphores[--fence->semaphore_count];", right?
Correct fence behaviour requires:
Map monotonically increasing timeline values to fence virtual values to avoid invalid use of Vulkan timeline semaphores. In particular, d3d12 requires support for non-increasing values and for value jumps of >= 4G (a sketch of this mapping follows below).
Create a worker thread for each queue to handle queue commands. This allows wait submission to be blocked until an unblocking signal is submitted, so out-of-order waits are handled correctly when two d3d12 queues are mapped to the same Vulkan queue.
Threaded queue submission also fixes the old fence implementation so it is fully functional, though a bit less efficient than timeline semaphores.
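As a rough sketch of the virtual value mapping (illustrative names and simplified locking; the real code keeps these entries in the fence's semaphore array): each pending signal pairs the application's virtual value with a monotonically increasing physical timeline value, and a wait is translated to the smallest physical value whose virtual value satisfies it.

#include <stddef.h>
#include <stdint.h>

struct pending_signal
{
    uint64_t virtual_value;  /* value the application signals/waits on */
    uint64_t timeline_value; /* monotonic Vulkan timeline payload */
};

/* Record a queued signal and return the physical value to submit. */
static uint64_t add_pending_signal(struct pending_signal *signals, size_t *count,
        uint64_t *pending_timeline_value, uint64_t virtual_value)
{
    struct pending_signal *s = &signals[(*count)++];

    s->virtual_value = virtual_value;
    s->timeline_value = ++*pending_timeline_value;
    return s->timeline_value;
}

/* Find the smallest physical value satisfying a virtual wait value;
 * returns 0 (a no-op wait) if no pending signal can satisfy it. */
static uint64_t timeline_wait_value(const struct pending_signal *signals, size_t count,
        uint64_t virtual_value)
{
    uint64_t target = UINT64_MAX;
    size_t i;

    for (i = 0; i < count; ++i)
    {
        if (virtual_value <= signals[i].virtual_value && signals[i].timeline_value < target)
            target = signals[i].timeline_value;
    }
    return target == UINT64_MAX ? 0 : target;
}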
Based in part on vkd3d-proton patches by Hans-Kristian Arntzen.
Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com>
---
v2: Always broadcast on the null event condition after signalling.
Threaded command queues on their own seem to bring out the problems with the existing fence implementation, which results in games crashing. Combining them with the fence changes avoids that issue.
---
 libs/vkd3d/command.c       | 893 +++++++++++++++++++++----------------
 libs/vkd3d/device.c        |  79 ----
 libs/vkd3d/vkd3d_private.h |  81 +++-
 tests/d3d12.c              |   4 +-
 4 files changed, 564 insertions(+), 493 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 376321f4..f4430966 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -23,6 +23,7 @@ static void d3d12_fence_incref(struct d3d12_fence *fence); static void d3d12_fence_decref(struct d3d12_fence *fence); static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence); +static void d3d12_fence_signal_timeline_semaphore(struct d3d12_fence *fence, uint64_t timeline_value);
HRESULT vkd3d_queue_create(struct d3d12_device *device, uint32_t family_index, const VkQueueFamilyProperties *properties, struct vkd3d_queue **queue) @@ -48,9 +49,6 @@ HRESULT vkd3d_queue_create(struct d3d12_device *device, object->vk_queue_flags = properties->queueFlags; object->timestamp_bits = properties->timestampValidBits;
- object->wait_completion_semaphore = VK_NULL_HANDLE; - object->pending_wait_completion_value = 0; - object->semaphores = NULL; object->semaphores_size = 0; object->semaphore_count = 0; @@ -66,20 +64,6 @@ HRESULT vkd3d_queue_create(struct d3d12_device *device, return S_OK; }
-bool vkd3d_queue_init_timeline_semaphore(struct vkd3d_queue *queue, struct d3d12_device *device) -{ - VkResult vr; - - if (!queue->wait_completion_semaphore - && (vr = vkd3d_create_timeline_semaphore(device, 0, &queue->wait_completion_semaphore)) < 0) - { - WARN("Failed to create timeline semaphore, vr %d.\n", vr); - return false; - } - - return true; -} - void vkd3d_queue_destroy(struct vkd3d_queue *queue, struct d3d12_device *device) { const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; @@ -94,8 +78,6 @@ void vkd3d_queue_destroy(struct vkd3d_queue *queue, struct d3d12_device *device)
vkd3d_free(queue->semaphores);
- VK_CALL(vkDestroySemaphore(device->vk_device, queue->wait_completion_semaphore, NULL)); - for (i = 0; i < ARRAY_SIZE(queue->old_vk_semaphores); ++i) { if (queue->old_vk_semaphores[i]) @@ -265,7 +247,7 @@ static VkResult vkd3d_queue_create_vk_semaphore_locked(struct vkd3d_queue *queue }
/* Fence worker thread */ -static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, +static bool vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, VkFence vk_fence, struct d3d12_fence *fence, uint64_t value, struct vkd3d_queue *queue, uint64_t queue_sequence_number) { @@ -277,7 +259,7 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, if ((rc = vkd3d_mutex_lock(&worker->mutex))) { ERR("Failed to lock mutex, error %d.\n", rc); - return hresult_from_errno(rc); + return false; }
if (!vkd3d_array_reserve((void **)&worker->fences, &worker->fences_size, @@ -285,7 +267,7 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, { ERR("Failed to add GPU fence.\n"); vkd3d_mutex_unlock(&worker->mutex); - return E_OUTOFMEMORY; + return false; }
waiting_fence = &worker->fences[worker->fence_count++]; @@ -299,7 +281,7 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker, vkd3d_cond_signal(&worker->cond); vkd3d_mutex_unlock(&worker->mutex);
- return S_OK; + return true; }
static void vkd3d_wait_for_gpu_timeline_semaphore(struct vkd3d_fence_worker *worker, @@ -308,9 +290,7 @@ static void vkd3d_wait_for_gpu_timeline_semaphore(struct vkd3d_fence_worker *wor const struct d3d12_device *device = worker->device; const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; VkSemaphoreWaitInfoKHR wait_info; - uint64_t counter_value; VkResult vr; - HRESULT hr;
wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR; wait_info.pNext = NULL; @@ -328,19 +308,10 @@ static void vkd3d_wait_for_gpu_timeline_semaphore(struct vkd3d_fence_worker *wor return; }
- if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, waiting_fence->u.vk_semaphore, - &counter_value))) < 0) - { - ERR("Failed to get Vulkan semaphore value, vr %d.\n", vr); - } - else - { - TRACE("Signaling fence %p value %#"PRIx64".\n", waiting_fence->fence, waiting_fence->value); - if (FAILED(hr = d3d12_fence_signal(waiting_fence->fence, counter_value, VK_NULL_HANDLE))) - ERR("Failed to signal D3D12 fence, hr %#x.\n", hr); + TRACE("Signaling fence %p value %#"PRIx64".\n", waiting_fence->fence, waiting_fence->value); + d3d12_fence_signal_timeline_semaphore(waiting_fence->fence, waiting_fence->value);
- d3d12_fence_decref(waiting_fence->fence); - } + d3d12_fence_decref(waiting_fence->fence); }
static void vkd3d_wait_for_gpu_fence(struct vkd3d_fence_worker *worker, @@ -605,14 +576,14 @@ static void d3d12_fence_garbage_collect_vk_semaphores_locked(struct d3d12_fence current = &fence->semaphores[i]; /* The semaphore doesn't have a pending signal operation if the fence * was signaled. */ - if ((current->vk_fence || current->is_acquired) && !destroy_all) + if ((current->u.binary.vk_fence || current->u.binary.is_acquired) && !destroy_all) continue;
- if (current->vk_fence) + if (current->u.binary.vk_fence) WARN("Destroying potentially pending semaphore.\n"); - assert(!current->is_acquired); + assert(!current->u.binary.is_acquired);
- VK_CALL(vkDestroySemaphore(device->vk_device, current->vk_semaphore, NULL)); + VK_CALL(vkDestroySemaphore(device->vk_device, current->u.binary.vk_semaphore, NULL)); fence->semaphores[i--] = fence->semaphores[--fence->semaphore_count]; }
@@ -648,23 +619,16 @@ static void d3d12_fence_destroy_vk_objects(struct d3d12_fence *fence) vkd3d_mutex_unlock(&fence->mutex); }
-static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct d3d12_fence *fence, +static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore_locked(struct d3d12_fence *fence, uint64_t value, uint64_t *completed_value) { struct vkd3d_signaled_semaphore *semaphore; struct vkd3d_signaled_semaphore *current; uint64_t semaphore_value; unsigned int i; - int rc;
TRACE("fence %p, value %#"PRIx64".\n", fence, value);
- if ((rc = vkd3d_mutex_lock(&fence->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - return VK_NULL_HANDLE; - } - semaphore = NULL; semaphore_value = ~(uint64_t)0;
@@ -672,7 +636,7 @@ static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct { current = &fence->semaphores[i]; /* Prefer a semaphore with the smallest value. */ - if (!current->is_acquired && current->value >= value && semaphore_value >= current->value) + if (!current->u.binary.is_acquired && current->value >= value && semaphore_value >= current->value) { semaphore = current; semaphore_value = current->value; @@ -682,12 +646,10 @@ static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct }
if (semaphore) - semaphore->is_acquired = true; + semaphore->u.binary.is_acquired = true;
*completed_value = fence->value;
- vkd3d_mutex_unlock(&fence->mutex); - return semaphore; }
@@ -702,7 +664,7 @@ static void d3d12_fence_remove_vk_semaphore(struct d3d12_fence *fence, struct vk return; }
- assert(semaphore->is_acquired); + assert(semaphore->u.binary.is_acquired);
i = semaphore - fence->semaphores; fence->semaphores[i] = fence->semaphores[--fence->semaphore_count]; @@ -720,32 +682,24 @@ static void d3d12_fence_release_vk_semaphore(struct d3d12_fence *fence, struct v return; }
- assert(semaphore->is_acquired); - semaphore->is_acquired = false; + assert(semaphore->u.binary.is_acquired); + semaphore->u.binary.is_acquired = false;
vkd3d_mutex_unlock(&fence->mutex); }
-static HRESULT d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence, - VkSemaphore vk_semaphore, VkFence vk_fence, uint64_t value) +static bool d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence, VkSemaphore vk_semaphore, + VkFence vk_fence, uint64_t value, const struct vkd3d_queue *signalling_queue) { struct vkd3d_signaled_semaphore *semaphore; - HRESULT hr = S_OK; int rc;
TRACE("fence %p, value %#"PRIx64".\n", fence, value);
- if (!(semaphore = vkd3d_malloc(sizeof(*semaphore)))) - { - ERR("Failed to add semaphore.\n"); - return E_OUTOFMEMORY; - } - if ((rc = vkd3d_mutex_lock(&fence->mutex))) { ERR("Failed to lock mutex, error %d.\n", rc); - vkd3d_free(semaphore); - return E_FAIL; + return false; }
d3d12_fence_garbage_collect_vk_semaphores_locked(fence, false); @@ -760,16 +714,17 @@ static HRESULT d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence,
semaphore = &fence->semaphores[fence->semaphore_count++]; semaphore->value = value; - semaphore->vk_semaphore = vk_semaphore; - semaphore->vk_fence = vk_fence; - semaphore->is_acquired = false; + semaphore->u.binary.vk_semaphore = vk_semaphore; + semaphore->u.binary.vk_fence = vk_fence; + semaphore->u.binary.is_acquired = false; + semaphore->signalling_queue = signalling_queue;
vkd3d_mutex_unlock(&fence->mutex);
- return hr; + return true; }
-static bool d3d12_fence_signal_external_events_locked(struct d3d12_fence *fence) +static void d3d12_fence_signal_external_events_locked(struct d3d12_fence *fence) { struct d3d12_device *device = fence->device; bool signal_null_event_cond = false; @@ -801,7 +756,21 @@ static bool d3d12_fence_signal_external_events_locked(struct d3d12_fence *fence)
fence->event_count = j;
- return signal_null_event_cond; + if (signal_null_event_cond) + vkd3d_cond_broadcast(&fence->null_event_cond); +} + +static void d3d12_fence_update_pending_value_locked(struct d3d12_fence *fence) +{ + uint64_t new_max_pending_value = 0; + unsigned int i; + + for (i = 0; i < fence->semaphore_count; ++i) + new_max_pending_value = max(fence->semaphores[i].value, new_max_pending_value); + + fence->max_pending_value = max(fence->value, new_max_pending_value); + /* If we're signalling the fence, wake up any submission threads which can now safely kick work. */ + vkd3d_cond_broadcast(&fence->cond); }
static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence) @@ -819,8 +788,7 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF
fence->value = value;
- if (d3d12_fence_signal_external_events_locked(fence)) - vkd3d_cond_broadcast(&fence->null_event_cond); + d3d12_fence_signal_external_events_locked(fence);
if (vk_fence) { @@ -829,8 +797,8 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF for (i = 0; i < fence->semaphore_count; ++i) { current = &fence->semaphores[i]; - if (current->vk_fence == vk_fence) - current->vk_fence = VK_NULL_HANDLE; + if (current->u.binary.vk_fence == vk_fence) + current->u.binary.vk_fence = VK_NULL_HANDLE; }
for (i = 0; i < ARRAY_SIZE(fence->old_vk_fences); ++i) @@ -847,11 +815,135 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF VK_CALL(vkDestroyFence(device->vk_device, vk_fence, NULL)); }
+ d3d12_fence_update_pending_value_locked(fence); + vkd3d_mutex_unlock(&fence->mutex);
return S_OK; }
+static void d3d12_fence_block_until_pending_value_reaches_locked(struct d3d12_fence *fence, uint64_t pending_value) +{ + while (pending_value > fence->max_pending_value) + { + TRACE("Blocking wait on fence %p until it reaches 0x%"PRIx64".\n", fence, pending_value); + vkd3d_cond_wait(&fence->cond, &fence->mutex); + } +} + +static bool d3d12_fence_can_elide_wait_semaphore_locked(const struct d3d12_fence *fence, + uint64_t wait_value, const struct vkd3d_queue *waiting_queue) +{ + unsigned int i; + + /* Relevant if the semaphore has been signalled already on host. + * We should not wait on the timeline semaphore directly, we can simply submit in-place. */ + if (fence->value >= wait_value) + return true; + + /* We can elide a wait if we can use the submission order guarantee. + * If there is a pending signal on this queue which will satisfy the wait, + * submission barrier will implicitly complete the wait, + * and we don't have to eat the overhead of submitting an extra wait on top. + * This will essentially always trigger on single-queue. + */ + for (i = 0; i < fence->semaphore_count; ++i) + { + if (fence->semaphores[i].signalling_queue == waiting_queue && fence->semaphores[i].value >= wait_value) + return true; + } + + return false; +} + +static uint64_t d3d12_fence_add_pending_signal_locked(struct d3d12_fence *fence, uint64_t virtual_value, + const struct vkd3d_queue *signalling_queue) +{ + struct vkd3d_signaled_semaphore *semaphore; + + if (!vkd3d_array_reserve((void**)&fence->semaphores, &fence->semaphores_size, + fence->semaphore_count + 1, sizeof(*fence->semaphores))) + { + return 0; + } + + semaphore = &fence->semaphores[fence->semaphore_count++]; + semaphore->value = virtual_value; + semaphore->u.timeline_value = ++fence->pending_timeline_value; + semaphore->signalling_queue = signalling_queue; + return fence->pending_timeline_value; +} + +static uint64_t d3d12_fence_get_timeline_wait_value_locked(struct d3d12_fence *fence, uint64_t virtual_value) +{ + uint64_t target_timeline_value = UINT64_MAX; + unsigned int i; + + /* This shouldn't happen, we will have elided the wait completely in can_elide_wait_semaphore_locked. */ + assert(virtual_value > fence->value); + + /* Find the smallest physical value which is at least the virtual value. */ + for (i = 0; i < fence->semaphore_count; ++i) + { + if (virtual_value <= fence->semaphores[i].value) + target_timeline_value = min(target_timeline_value, fence->semaphores[i].u.timeline_value); + } + + if (target_timeline_value == UINT64_MAX) + { + FIXME("Cannot find a pending timeline semaphore wait value. Emitting a noop wait.\n"); + return 0; + } + else + { + return target_timeline_value; + } +} + +static void d3d12_fence_signal_timeline_semaphore(struct d3d12_fence *fence, uint64_t timeline_value) +{ + bool did_signal; + unsigned int i; + int rc; + + if ((rc = vkd3d_mutex_lock(&fence->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return; + } + + /* With multiple fence workers, it is possible that signal calls are out of + * order. The physical value itself is monotonic, but we need to make sure + * that all signals happen in correct order if there are fence rewinds. + * We don't expect the loop to run more than once, but there might be + * extreme edge cases where we signal 2 or more. 
*/ + while (fence->timeline_value < timeline_value) + { + ++fence->timeline_value; + did_signal = false; + + for (i = 0; i < fence->semaphore_count; ++i) + { + if (fence->timeline_value == fence->semaphores[i].u.timeline_value) + { + fence->value = fence->semaphores[i].value; + d3d12_fence_signal_external_events_locked(fence); + fence->semaphores[i] = fence->semaphores[--fence->semaphore_count]; + did_signal = true; + break; + } + } + + if (!did_signal) + FIXME("Did not signal a virtual value.\n"); + } + + /* In case we have a rewind signalled from GPU, we need to recompute the max pending timeline value. */ + d3d12_fence_update_pending_value_locked(fence); + + vkd3d_mutex_unlock(&fence->mutex); +} + static HRESULT STDMETHODCALLTYPE d3d12_fence_QueryInterface(ID3D12Fence *iface, REFIID riid, void **object) { @@ -919,6 +1011,8 @@ static void d3d12_fence_decref(struct d3d12_fence *fence) vkd3d_free(fence->semaphores); if ((rc = vkd3d_mutex_destroy(&fence->mutex))) ERR("Failed to destroy mutex, error %d.\n", rc); + if ((rc = vkd3d_cond_destroy(&fence->cond))) + ERR("Failed to destroy cond, error %d.\n", rc); vkd3d_cond_destroy(&fence->null_event_cond); vkd3d_free(fence);
@@ -1058,100 +1152,8 @@ static HRESULT STDMETHODCALLTYPE d3d12_fence_SetEventOnCompletion(ID3D12Fence *i return S_OK; }
-static inline bool d3d12_fence_gpu_wait_is_completed(const struct d3d12_fence *fence, unsigned int i) -{ - const struct d3d12_device *device = fence->device; - const struct vkd3d_vk_device_procs *vk_procs; - uint64_t value; - VkResult vr; - - vk_procs = &device->vk_procs; - - if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, - fence->gpu_waits[i].queue->wait_completion_semaphore, &value))) >= 0) - { - return value >= fence->gpu_waits[i].pending_value; - } - - ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr); - return true; -} - -static inline bool d3d12_fence_has_pending_gpu_ops_locked(struct d3d12_fence *fence) -{ - const struct d3d12_device *device = fence->device; - const struct vkd3d_vk_device_procs *vk_procs; - uint64_t value; - unsigned int i; - VkResult vr; - - for (i = 0; i < fence->gpu_wait_count; ++i) - { - if (d3d12_fence_gpu_wait_is_completed(fence, i) && i < --fence->gpu_wait_count) - fence->gpu_waits[i] = fence->gpu_waits[fence->gpu_wait_count]; - } - if (fence->gpu_wait_count) - return true; - - /* Check for pending signals too. */ - if (fence->value >= fence->pending_timeline_value) - return false; - - vk_procs = &device->vk_procs; - - /* Check the actual semaphore value in case fence->value update is lagging. */ - if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, fence->timeline_semaphore, &value))) < 0) - { - ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr); - return false; - } - - return value < fence->pending_timeline_value; -} - -/* Replace the VkSemaphore with a new one to allow a lower value to be set. Ideally apps will - * only use this to reset the fence when no operations are pending on the queue. */ -static HRESULT d3d12_fence_reinit_timeline_semaphore_locked(struct d3d12_fence *fence, uint64_t value) -{ - const struct d3d12_device *device = fence->device; - const struct vkd3d_vk_device_procs *vk_procs; - VkSemaphore timeline_semaphore; - VkResult vr; - - if (d3d12_fence_has_pending_gpu_ops_locked(fence)) - { - /* This situation is not very likely because it means a fence with pending waits and/or signals was - * signalled on the CPU to a lower value. For now, emit a fixme so it can be patched if necessary. - * A patch already exists for this but it's not pretty. */ - FIXME("Unable to re-initialise timeline semaphore to a lower value due to pending GPU ops.\n"); - return E_FAIL; - } - - if ((vr = vkd3d_create_timeline_semaphore(device, value, &timeline_semaphore)) < 0) - { - WARN("Failed to create timeline semaphore, vr %d.\n", vr); - return hresult_from_vk_result(vr); - } - - fence->value = value; - fence->pending_timeline_value = value; - - WARN("Replacing timeline semaphore with a new object.\n"); - - vk_procs = &device->vk_procs; - - VK_CALL(vkDestroySemaphore(device->vk_device, fence->timeline_semaphore, NULL)); - fence->timeline_semaphore = timeline_semaphore; - - return S_OK; -} - static HRESULT d3d12_fence_signal_cpu_timeline_semaphore(struct d3d12_fence *fence, uint64_t value) { - const struct d3d12_device *device = fence->device; - VkSemaphoreSignalInfoKHR info; - HRESULT hr = S_OK; - VkResult vr; int rc;
if ((rc = vkd3d_mutex_lock(&fence->mutex))) @@ -1160,48 +1162,13 @@ static HRESULT d3d12_fence_signal_cpu_timeline_semaphore(struct d3d12_fence *fen return hresult_from_errno(rc); }
- /* We must only signal a value which is greater than the current value. - * That value can be in the range of current known value (fence->value), or as large as pending_timeline_value. - * Pending timeline value signal might be blocked by another synchronization primitive, and thus statically - * cannot be that value, so the safest thing to do is to check the current value which is updated by the fence - * wait thread continuously. This check is technically racy since the value might be immediately out of date, - * but there is no way to avoid this. */ - if (value > fence->value) - { - const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; - - /* Sanity check against the delta limit. */ - if (value - fence->value > device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference) - { - FIXME("Timeline semaphore delta is %"PRIu64", but implementation only supports a delta of %"PRIu64".\n", - value - fence->value, device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference); - } - - info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR; - info.pNext = NULL; - info.semaphore = fence->timeline_semaphore; - info.value = value; - if ((vr = VK_CALL(vkSignalSemaphoreKHR(device->vk_device, &info))) >= 0) - { - fence->value = value; - if (value > fence->pending_timeline_value) - fence->pending_timeline_value = value; - } - else - { - ERR("Failed to signal timeline semaphore, vr %d.\n", vr); - hr = hresult_from_vk_result(vr); - } - } - else if (value < fence->value) - { - hr = d3d12_fence_reinit_timeline_semaphore_locked(fence, value); - } - + fence->value = value; d3d12_fence_signal_external_events_locked(fence); + d3d12_fence_update_pending_value_locked(fence);
vkd3d_mutex_unlock(&fence->mutex); - return hr; + + return S_OK; }
static HRESULT STDMETHODCALLTYPE d3d12_fence_Signal(ID3D12Fence *iface, UINT64 value) @@ -1254,6 +1221,7 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device * fence->refcount = 1;
fence->value = initial_value; + fence->max_pending_value = initial_value;
if ((rc = vkd3d_mutex_init(&fence->mutex))) { @@ -1261,11 +1229,18 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device * return hresult_from_errno(rc); }
+ if ((rc = vkd3d_cond_init(&fence->cond))) + { + ERR("Failed to initialize cond variable, error %d.\n", rc); + hr = hresult_from_errno(rc); + goto fail_destroy_mutex; + } + if ((rc = vkd3d_cond_init(&fence->null_event_cond))) { ERR("Failed to initialize cond variable, error %d.\n", rc); - vkd3d_mutex_destroy(&fence->mutex); - return hresult_from_errno(rc); + hr = hresult_from_errno(rc); + goto fail_destroy_cond; }
if (flags) @@ -1276,14 +1251,15 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device * fence->event_count = 0;
fence->timeline_semaphore = VK_NULL_HANDLE; - if (device->use_timeline_semaphores && (vr = vkd3d_create_timeline_semaphore(device, initial_value, + fence->timeline_value = 0; + fence->pending_timeline_value = 0; + if (device->vk_info.KHR_timeline_semaphore && (vr = vkd3d_create_timeline_semaphore(device, 0, &fence->timeline_semaphore)) < 0) { WARN("Failed to create timeline semaphore, vr %d.\n", vr); - return hresult_from_vk_result(vr); + hr = hresult_from_vk_result(vr); + goto fail_destroy_null_cond; } - fence->pending_timeline_value = initial_value; - fence->gpu_wait_count = 0;
fence->semaphores = NULL; fence->semaphores_size = 0; @@ -1293,14 +1269,21 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device *
if (FAILED(hr = vkd3d_private_store_init(&fence->private_store))) { - vkd3d_mutex_destroy(&fence->mutex); - vkd3d_cond_destroy(&fence->null_event_cond); - return hr; + goto fail_destroy_null_cond; }
d3d12_device_add_ref(fence->device = device);
return S_OK; + +fail_destroy_null_cond: + vkd3d_cond_destroy(&fence->null_event_cond); +fail_destroy_cond: + vkd3d_cond_destroy(&fence->cond); +fail_destroy_mutex: + vkd3d_mutex_destroy(&fence->mutex); + + return hr; }
HRESULT d3d12_fence_create(struct d3d12_device *device, @@ -6074,6 +6057,41 @@ HRESULT d3d12_command_list_create(struct d3d12_device *device, return S_OK; }
+static HRESULT d3d12_command_queue_add_submission_locked(struct d3d12_command_queue *queue, + const struct d3d12_command_queue_submission *sub) +{ + if (!vkd3d_array_reserve((void**)&queue->submissions, &queue->submissions_size, + queue->submissions_count + 1, sizeof(*queue->submissions))) + { + return E_OUTOFMEMORY; + } + + queue->submissions[queue->submissions_count++] = *sub; + vkd3d_cond_signal(&queue->submission_cond); + return S_OK; +} + +static HRESULT d3d12_command_queue_add_submission(struct d3d12_command_queue *queue, + const struct d3d12_command_queue_submission *sub) +{ + HRESULT hr; + + vkd3d_mutex_lock(&queue->submission_mutex); + hr = d3d12_command_queue_add_submission_locked(queue, sub); + vkd3d_mutex_unlock(&queue->submission_mutex); + return hr; +} + +static void d3d12_command_queue_submit_stop(struct d3d12_command_queue *queue) +{ + struct d3d12_command_queue_submission sub; + HRESULT hr; + + sub.type = VKD3D_SUBMISSION_STOP; + if (FAILED(hr = d3d12_command_queue_add_submission(queue, &sub))) + ERR("Failed to submit command, hr %#x.\n", hr); +} + /* ID3D12CommandQueue */ static inline struct d3d12_command_queue *impl_from_ID3D12CommandQueue(ID3D12CommandQueue *iface) { @@ -6124,7 +6142,12 @@ static ULONG STDMETHODCALLTYPE d3d12_command_queue_Release(ID3D12CommandQueue *i struct d3d12_device *device = command_queue->device;
vkd3d_fence_worker_stop(&command_queue->fence_worker, device); + d3d12_command_queue_submit_stop(command_queue); + vkd3d_join_thread(device->vkd3d_instance, &command_queue->submission_thread); + vkd3d_mutex_destroy(&command_queue->submission_mutex); + vkd3d_cond_destroy(&command_queue->submission_cond);
+ vkd3d_free(command_queue->submissions); vkd3d_private_store_destroy(&command_queue->private_store);
vkd3d_free(command_queue); @@ -6229,18 +6252,17 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm UINT command_list_count, ID3D12CommandList * const *command_lists) { struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface); - const struct vkd3d_vk_device_procs *vk_procs; + struct d3d12_command_queue_submission sub; struct d3d12_command_list *cmd_list; - struct VkSubmitInfo submit_desc; VkCommandBuffer *buffers; - VkQueue vk_queue; unsigned int i; - VkResult vr; + HRESULT hr;
TRACE("iface %p, command_list_count %u, command_lists %p.\n", iface, command_list_count, command_lists);
- vk_procs = &command_queue->device->vk_procs; + if (!command_list_count) + return;
if (!(buffers = vkd3d_calloc(command_list_count, sizeof(*buffers)))) { @@ -6263,29 +6285,11 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm buffers[i] = cmd_list->vk_command_buffer; }
- submit_desc.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_desc.pNext = NULL; - submit_desc.waitSemaphoreCount = 0; - submit_desc.pWaitSemaphores = NULL; - submit_desc.pWaitDstStageMask = NULL; - submit_desc.commandBufferCount = command_list_count; - submit_desc.pCommandBuffers = buffers; - submit_desc.signalSemaphoreCount = 0; - submit_desc.pSignalSemaphores = NULL; - - if (!(vk_queue = vkd3d_queue_acquire(command_queue->vkd3d_queue))) - { - ERR("Failed to acquire queue %p.\n", command_queue->vkd3d_queue); - vkd3d_free(buffers); - return; - } - - if ((vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_desc, VK_NULL_HANDLE))) < 0) - ERR("Failed to submit queue(s), vr %d.\n", vr); - - vkd3d_queue_release(command_queue->vkd3d_queue); - - vkd3d_free(buffers); + sub.type = VKD3D_SUBMISSION_EXECUTE; + sub.u.execute.cmd = buffers; + sub.u.execute.cmd_count = command_list_count; + if (FAILED(hr = d3d12_command_queue_add_submission(command_queue, &sub))) + ERR("Failed to submit command, hr %#x.\n", hr); }
static void STDMETHODCALLTYPE d3d12_command_queue_SetMarker(ID3D12CommandQueue *iface, @@ -6307,39 +6311,7 @@ static void STDMETHODCALLTYPE d3d12_command_queue_EndEvent(ID3D12CommandQueue *i FIXME("iface %p stub!\n", iface); }
-static HRESULT d3d12_fence_update_gpu_signal_timeline_semaphore(struct d3d12_fence *fence, uint64_t value) -{ - const struct d3d12_device *device = fence->device; - int rc; - - if ((rc = vkd3d_mutex_lock(&fence->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - return hresult_from_errno(rc); - } - - /* If we're attempting to async signal a fence with a value which is not strictly increasing the payload value, - * warn about this case. Do not treat this as an error since it works at least with RADV and Nvidia drivers and - * there's no workaround on the GPU side. */ - if (value <= fence->pending_timeline_value) - { - WARN("Fence %p values are not strictly increasing. Pending values: old %"PRIu64", new %"PRIu64".\n", - fence, fence->pending_timeline_value, value); - } - /* Sanity check against the delta limit. Use the current fence value. */ - else if (value - fence->value > device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference) - { - FIXME("Timeline semaphore delta is %"PRIu64", but implementation only supports a delta of %"PRIu64".\n", - value - fence->value, device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference); - } - fence->pending_timeline_value = value; - - vkd3d_mutex_unlock(&fence->mutex); - - return S_OK; -} - -static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worker, VkSemaphore vk_semaphore, +static void vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worker, VkSemaphore vk_semaphore, struct d3d12_fence *fence, uint64_t value, struct vkd3d_queue *queue) { struct vkd3d_waiting_fence *waiting_fence; @@ -6350,7 +6322,7 @@ static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worke if ((rc = vkd3d_mutex_lock(&worker->mutex))) { ERR("Failed to lock mutex, error %d.\n", rc); - return hresult_from_errno(rc); + return; }
if (!vkd3d_array_reserve((void **)&worker->fences, &worker->fences_size, @@ -6358,7 +6330,7 @@ static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worke { ERR("Failed to add GPU timeline semaphore.\n"); vkd3d_mutex_unlock(&worker->mutex); - return E_OUTOFMEMORY; + return; }
waiting_fence = &worker->fences[worker->fence_count++]; @@ -6370,39 +6342,58 @@ static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worke
vkd3d_cond_signal(&worker->cond); vkd3d_mutex_unlock(&worker->mutex); - - return S_OK; }
static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *iface, ID3D12Fence *fence_iface, UINT64 value) { struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface); + struct d3d12_fence *fence = unsafe_impl_from_ID3D12Fence(fence_iface); + struct d3d12_command_queue_submission sub; + + TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value); + + d3d12_fence_incref(fence); + + sub.type = VKD3D_SUBMISSION_SIGNAL; + sub.u.signal.fence = fence; + sub.u.signal.value = value; + return d3d12_command_queue_add_submission(command_queue, &sub); +} + +static void d3d12_command_queue_signal(struct d3d12_command_queue *command_queue, + struct d3d12_fence *fence, uint64_t value) +{ VkTimelineSemaphoreSubmitInfoKHR timeline_submit_info; const struct vkd3d_vk_device_procs *vk_procs; VkSemaphore vk_semaphore = VK_NULL_HANDLE; VkFence vk_fence = VK_NULL_HANDLE; struct vkd3d_queue *vkd3d_queue; uint64_t sequence_number = 0; + uint64_t timeline_value = 0; struct d3d12_device *device; - struct d3d12_fence *fence; VkSubmitInfo submit_info; VkQueue vk_queue; + int rc = -1; VkResult vr; - HRESULT hr; - - TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value);
device = command_queue->device; vk_procs = &device->vk_procs; vkd3d_queue = command_queue->vkd3d_queue;
- fence = unsafe_impl_from_ID3D12Fence(fence_iface); - - if (device->use_timeline_semaphores) + if (device->vk_info.KHR_timeline_semaphore) { - if (FAILED(hr = d3d12_fence_update_gpu_signal_timeline_semaphore(fence, value))) - return hr; + if ((rc = vkd3d_mutex_lock(&fence->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return; + } + if (!(timeline_value = d3d12_fence_add_pending_signal_locked(fence, value, vkd3d_queue))) + { + WARN("Failed to add pending signal.\n"); + vkd3d_mutex_unlock(&fence->mutex); + return; + }
vk_semaphore = fence->timeline_semaphore; assert(vk_semaphore); @@ -6412,18 +6403,17 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * if ((vr = d3d12_fence_create_vk_fence(fence, &vk_fence)) < 0) { WARN("Failed to create Vulkan fence, vr %d.\n", vr); - goto fail_vkresult; + goto fail; } }
if (!(vk_queue = vkd3d_queue_acquire(vkd3d_queue))) { ERR("Failed to acquire queue %p.\n", vkd3d_queue); - hr = E_FAIL; goto fail; }
- if (!device->use_timeline_semaphores && (vr = vkd3d_queue_create_vk_semaphore_locked(vkd3d_queue, + if (!device->vk_info.KHR_timeline_semaphore && (vr = vkd3d_queue_create_vk_semaphore_locked(vkd3d_queue, device, &vk_semaphore)) < 0) { ERR("Failed to create Vulkan semaphore, vr %d.\n", vr); @@ -6440,11 +6430,11 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * submit_info.signalSemaphoreCount = vk_semaphore ? 1 : 0; submit_info.pSignalSemaphores = &vk_semaphore;
- if (device->use_timeline_semaphores) + if (device->vk_info.KHR_timeline_semaphore) { timeline_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; timeline_submit_info.pNext = NULL; - timeline_submit_info.pSignalSemaphoreValues = &value; + timeline_submit_info.pSignalSemaphoreValues = &timeline_value; timeline_submit_info.signalSemaphoreValueCount = submit_info.signalSemaphoreCount; timeline_submit_info.waitSemaphoreValueCount = 0; timeline_submit_info.pWaitSemaphoreValues = NULL; @@ -6452,7 +6442,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * }
vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, vk_fence)); - if (!device->use_timeline_semaphores && vr >= 0) + if (!device->vk_info.KHR_timeline_semaphore && vr >= 0) { sequence_number = ++vkd3d_queue->submitted_sequence_number;
@@ -6463,41 +6453,43 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *
vkd3d_queue_release(vkd3d_queue);
+ if (!rc) + { + vkd3d_mutex_unlock(&fence->mutex); + rc = -1; + } + if (vr < 0) { WARN("Failed to submit signal operation, vr %d.\n", vr); - goto fail_vkresult; + goto fail; }
- if (device->use_timeline_semaphores) + if (device->vk_info.KHR_timeline_semaphore) { return vkd3d_enqueue_timeline_semaphore(&command_queue->fence_worker, - vk_semaphore, fence, value, vkd3d_queue); + vk_semaphore, fence, timeline_value, vkd3d_queue); }
- if (vk_semaphore && SUCCEEDED(hr = d3d12_fence_add_vk_semaphore(fence, vk_semaphore, vk_fence, value))) + if (vk_semaphore && d3d12_fence_add_vk_semaphore(fence, vk_semaphore, vk_fence, value, vkd3d_queue)) vk_semaphore = VK_NULL_HANDLE;
vr = VK_CALL(vkGetFenceStatus(device->vk_device, vk_fence)); if (vr == VK_NOT_READY) { - if (SUCCEEDED(hr = vkd3d_enqueue_gpu_fence(&command_queue->fence_worker, - vk_fence, fence, value, vkd3d_queue, sequence_number))) - { + if (vkd3d_enqueue_gpu_fence(&command_queue->fence_worker, vk_fence, fence, value, vkd3d_queue, sequence_number)) vk_fence = VK_NULL_HANDLE; - } } else if (vr == VK_SUCCESS) { TRACE("Already signaled %p, value %#"PRIx64".\n", fence, value); - hr = d3d12_fence_signal(fence, value, vk_fence); + d3d12_fence_signal(fence, value, vk_fence); vk_fence = VK_NULL_HANDLE; vkd3d_queue_update_sequence_number(vkd3d_queue, sequence_number, device); } else { FIXME("Failed to get fence status, vr %d.\n", vr); - hr = hresult_from_vk_result(vr); }
if (vk_fence || vk_semaphore) @@ -6507,18 +6499,17 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * goto fail; }
- return hr; + return;
-fail_vkresult: - hr = hresult_from_vk_result(vr); fail: + if (!rc) + vkd3d_mutex_unlock(&fence->mutex); VK_CALL(vkDestroyFence(device->vk_device, vk_fence, NULL)); - if (!device->use_timeline_semaphores) + if (!device->vk_info.KHR_timeline_semaphore) VK_CALL(vkDestroySemaphore(device->vk_device, vk_semaphore, NULL)); - return hr; }
-static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_queue *command_queue, +static void d3d12_command_queue_wait_binary_semaphore_locked(struct d3d12_command_queue *command_queue, struct d3d12_fence *fence, uint64_t value) { static const VkPipelineStageFlagBits wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; @@ -6529,23 +6520,24 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu VkSubmitInfo submit_info; VkQueue vk_queue; VkResult vr; - HRESULT hr;
vk_procs = &command_queue->device->vk_procs; queue = command_queue->vkd3d_queue;
- semaphore = d3d12_fence_acquire_vk_semaphore(fence, value, &completed_value); + semaphore = d3d12_fence_acquire_vk_semaphore_locked(fence, value, &completed_value); + + vkd3d_mutex_unlock(&fence->mutex); + if (!semaphore && completed_value >= value) { /* We don't get a Vulkan semaphore if the fence was signaled on CPU. */ TRACE("Already signaled %p, value %#"PRIx64".\n", fence, completed_value); - return S_OK; + return; }
if (!(vk_queue = vkd3d_queue_acquire(queue))) { ERR("Failed to acquire queue %p.\n", queue); - hr = E_FAIL; goto fail; }
@@ -6562,13 +6554,13 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu }
vkd3d_queue_release(queue); - return S_OK; + return; }
submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submit_info.pNext = NULL; submit_info.waitSemaphoreCount = 1; - submit_info.pWaitSemaphores = &semaphore->vk_semaphore; + submit_info.pWaitSemaphores = &semaphore->u.binary.vk_semaphore; submit_info.pWaitDstStageMask = &wait_stage_mask; submit_info.commandBufferCount = 0; submit_info.pCommandBuffers = NULL; @@ -6580,13 +6572,12 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu { ERR("Failed to allocate memory for semaphore.\n"); vkd3d_queue_release(queue); - hr = E_OUTOFMEMORY; goto fail; }
if ((vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE))) >= 0) { - queue->semaphores[queue->semaphore_count].vk_semaphore = semaphore->vk_semaphore; + queue->semaphores[queue->semaphore_count].vk_semaphore = semaphore->u.binary.vk_semaphore; queue->semaphores[queue->semaphore_count].sequence_number = queue->submitted_sequence_number + 1; ++queue->semaphore_count;
@@ -6599,60 +6590,17 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu if (vr < 0) { WARN("Failed to submit wait operation, vr %d.\n", vr); - hr = hresult_from_vk_result(vr); goto fail; }
d3d12_fence_remove_vk_semaphore(fence, semaphore); - return S_OK; + return;
fail: d3d12_fence_release_vk_semaphore(fence, semaphore); - return hr; }
-static inline void d3d12_fence_update_gpu_wait(struct d3d12_fence *fence, const struct vkd3d_queue *queue) -{ - unsigned int i; - bool found; - int rc; - - if ((rc = vkd3d_mutex_lock(&fence->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - return; - } - - for (i = 0, found = false; i < fence->gpu_wait_count; ++i) - { - if (fence->gpu_waits[i].queue == queue) - { - fence->gpu_waits[i].pending_value = queue->pending_wait_completion_value; - found = true; - } - else if (d3d12_fence_gpu_wait_is_completed(fence, i) && i < --fence->gpu_wait_count) - { - fence->gpu_waits[i] = fence->gpu_waits[fence->gpu_wait_count]; - } - } - - if (!found) - { - if (fence->gpu_wait_count < ARRAY_SIZE(fence->gpu_waits)) - { - fence->gpu_waits[fence->gpu_wait_count].queue = queue; - fence->gpu_waits[fence->gpu_wait_count++].pending_value = queue->pending_wait_completion_value; - } - else - { - FIXME("Unable to track GPU fence wait.\n"); - } - } - - vkd3d_mutex_unlock(&fence->mutex); -} - -static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_queue *command_queue, +static void d3d12_command_queue_wait(struct d3d12_command_queue *command_queue, struct d3d12_fence *fence, uint64_t value) { static const VkPipelineStageFlagBits wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; @@ -6660,25 +6608,47 @@ static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_ const struct vkd3d_vk_device_procs *vk_procs; struct vkd3d_queue *queue; VkSubmitInfo submit_info; + uint64_t wait_count; VkQueue vk_queue; VkResult vr;
vk_procs = &command_queue->device->vk_procs; queue = command_queue->vkd3d_queue;
+ vkd3d_mutex_lock(&fence->mutex); + + /* This is the critical part required to support out-of-order signal. + * Normally we would be able to submit waits and signals out of order, but + * we don't have virtualized queues in Vulkan, so we need to handle the case + * where multiple queues alias over the same physical queue, so effectively, + * we need to manage out-of-order submits ourselves. */ + d3d12_fence_block_until_pending_value_reaches_locked(fence, value); + + /* If a host signal unblocked us, or we know that the fence has reached a specific value, there is no need + * to queue up a wait. */ + if (d3d12_fence_can_elide_wait_semaphore_locked(fence, value, queue)) + { + TRACE("Eliding wait on fence %p, value %#"PRIx64".\n", fence, value); + vkd3d_mutex_unlock(&fence->mutex); + return; + } + + if (!command_queue->device->vk_info.KHR_timeline_semaphore) + return d3d12_command_queue_wait_binary_semaphore_locked(command_queue, fence, value); + + wait_count = d3d12_fence_get_timeline_wait_value_locked(fence, value); + + /* We can unlock the fence here. The queue semaphore will not be signalled to signal_value + * until we have submitted, so the semaphore cannot be destroyed before the call to vkQueueSubmit. */ + vkd3d_mutex_unlock(&fence->mutex); + assert(fence->timeline_semaphore); timeline_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; timeline_submit_info.pNext = NULL; + timeline_submit_info.waitSemaphoreValueCount = 1; + timeline_submit_info.pWaitSemaphoreValues = &wait_count; timeline_submit_info.signalSemaphoreValueCount = 0; timeline_submit_info.pSignalSemaphoreValues = NULL; - timeline_submit_info.waitSemaphoreValueCount = 1; - timeline_submit_info.pWaitSemaphoreValues = &value; - - if (!(vk_queue = vkd3d_queue_acquire(queue))) - { - ERR("Failed to acquire queue %p.\n", queue); - return E_FAIL; - }
submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submit_info.pNext = &timeline_submit_info; @@ -6690,14 +6660,11 @@ static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_ submit_info.signalSemaphoreCount = 0; submit_info.pSignalSemaphores = NULL;
- ++queue->pending_wait_completion_value; - - submit_info.signalSemaphoreCount = 1; - submit_info.pSignalSemaphores = &queue->wait_completion_semaphore; - timeline_submit_info.signalSemaphoreValueCount = 1; - timeline_submit_info.pSignalSemaphoreValues = &queue->pending_wait_completion_value; - - d3d12_fence_update_gpu_wait(fence, queue); + if (!(vk_queue = vkd3d_queue_acquire(queue))) + { + ERR("Failed to acquire queue %p.\n", queue); + return; + }
vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE));
@@ -6706,10 +6673,7 @@ static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_ if (vr < 0) { WARN("Failed to submit wait operation, vr %d.\n", vr); - return hresult_from_vk_result(vr); } - - return S_OK; }
static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Wait(ID3D12CommandQueue *iface, @@ -6717,14 +6681,16 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Wait(ID3D12CommandQueue *if { struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface); struct d3d12_fence *fence = unsafe_impl_from_ID3D12Fence(fence_iface); + struct d3d12_command_queue_submission sub;
TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value);
- if (command_queue->device->use_timeline_semaphores) - return d3d12_command_queue_wait_timeline_semaphore(command_queue, fence, value); + d3d12_fence_incref(fence);
- FIXME_ONCE("KHR_timeline_semaphore is not available or incompatible. Some wait commands may be unsupported.\n"); - return d3d12_command_queue_wait_binary_semaphore(command_queue, fence, value); + sub.type = VKD3D_SUBMISSION_WAIT; + sub.u.wait.fence = fence; + sub.u.wait.value = value; + return d3d12_command_queue_add_submission(command_queue, &sub); }
static HRESULT STDMETHODCALLTYPE d3d12_command_queue_GetTimestampFrequency(ID3D12CommandQueue *iface, @@ -6848,10 +6814,118 @@ static const struct ID3D12CommandQueueVtbl d3d12_command_queue_vtbl = d3d12_command_queue_GetDesc, };
+static void d3d12_command_queue_acquire_serialised(struct d3d12_command_queue *queue) +{ + /* In order to make sure all pending operations queued so far have been submitted, we build a drain + * task which will increment the queue_drain_count once the thread has finished all its work. */ + struct d3d12_command_queue_submission sub; + uint64_t current_drain; + HRESULT hr; + + sub.type = VKD3D_SUBMISSION_DRAIN; + + vkd3d_mutex_lock(&queue->submission_mutex); + + current_drain = ++queue->target_drain_count; + if (FAILED(hr = d3d12_command_queue_add_submission_locked(queue, &sub))) + ERR("Failed to submit command, hr %#x.\n", hr); + + while (current_drain != queue->queue_drain_count) + vkd3d_cond_wait(&queue->submission_cond, &queue->submission_mutex); +} + +static void d3d12_command_queue_release_serialised(struct d3d12_command_queue *queue) +{ + vkd3d_mutex_unlock(&queue->submission_mutex); +} + +static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queue, + VkCommandBuffer *cmd, unsigned int count) +{ + const struct vkd3d_vk_device_procs *vk_procs = &command_queue->device->vk_procs; + struct vkd3d_queue *vkd3d_queue = command_queue->vkd3d_queue; + VkSubmitInfo submit_desc; + VkQueue vk_queue; + VkResult vr; + + TRACE("queue %p, command_list_count %u, command_lists %p.\n", + command_queue, count, cmd); + + memset(&submit_desc, 0, sizeof(submit_desc)); + + if (!(vk_queue = vkd3d_queue_acquire(vkd3d_queue))) + { + ERR("Failed to acquire queue %p.\n", vkd3d_queue); + return; + } + + submit_desc.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_desc.commandBufferCount = count; + submit_desc.pCommandBuffers = cmd; + + if ((vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_desc, VK_NULL_HANDLE))) < 0) + ERR("Failed to submit queue(s), vr %d.\n", vr); + + vkd3d_queue_release(vkd3d_queue); +} + +static void *d3d12_command_queue_submission_worker_main(void *userdata) +{ + struct d3d12_command_queue_submission submission; + struct d3d12_command_queue *queue = userdata; + + vkd3d_set_thread_name("vkd3d_queue"); + + for (;;) + { + vkd3d_mutex_lock(&queue->submission_mutex); + while (!queue->submissions_count) + vkd3d_cond_wait(&queue->submission_cond, &queue->submission_mutex); + + submission = queue->submissions[0]; + memmove(queue->submissions, queue->submissions + 1, --queue->submissions_count * sizeof(submission)); + vkd3d_mutex_unlock(&queue->submission_mutex); + + switch (submission.type) + { + case VKD3D_SUBMISSION_WAIT: + d3d12_command_queue_wait(queue, submission.u.wait.fence, submission.u.wait.value); + d3d12_fence_decref(submission.u.wait.fence); + break; + + case VKD3D_SUBMISSION_SIGNAL: + d3d12_command_queue_signal(queue, submission.u.signal.fence, submission.u.signal.value); + d3d12_fence_decref(submission.u.signal.fence); + break; + + case VKD3D_SUBMISSION_EXECUTE: + d3d12_command_queue_execute(queue, submission.u.execute.cmd, submission.u.execute.cmd_count); + vkd3d_free(submission.u.execute.cmd); + break; + + case VKD3D_SUBMISSION_DRAIN: + vkd3d_mutex_lock(&queue->submission_mutex); + ++queue->queue_drain_count; + vkd3d_cond_signal(&queue->submission_cond); + vkd3d_mutex_unlock(&queue->submission_mutex); + break; + + case VKD3D_SUBMISSION_STOP: + TRACE("Stopping command queue %p.\n", queue); + return NULL; + + default: + FIXME("Unhandled submission type %u.\n", submission.type); + break; + } + } +} + static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue, struct d3d12_device *device, const D3D12_COMMAND_QUEUE_DESC *desc) { HRESULT hr; + int rc;
queue->ID3D12CommandQueue_iface.lpVtbl = &d3d12_command_queue_vtbl; queue->refcount = 1; @@ -6866,6 +6940,12 @@ static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue, queue->last_waited_fence = NULL; queue->last_waited_fence_value = 0;
+ queue->submissions = NULL; + queue->submissions_count = 0; + queue->submissions_size = 0; + queue->target_drain_count = 0; + queue->queue_drain_count = 0; + if (desc->Priority == D3D12_COMMAND_QUEUE_PRIORITY_GLOBAL_REALTIME) { FIXME("Global realtime priority is not implemented.\n"); @@ -6880,15 +6960,40 @@ static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue, if (FAILED(hr = vkd3d_private_store_init(&queue->private_store))) return hr;
+ if ((rc = vkd3d_mutex_init(&queue->submission_mutex)) < 0) + { + hr = hresult_from_errno(rc); + goto fail_submission_mutex; + } + + if ((rc = vkd3d_cond_init(&queue->submission_cond)) < 0) + { + hr = hresult_from_errno(rc); + goto fail_submission_cond; + } + if (FAILED(hr = vkd3d_fence_worker_start(&queue->fence_worker, queue->vkd3d_queue, device))) + goto fail_fence_worker_start; + + if ((rc = vkd3d_create_thread(device->vkd3d_instance, d3d12_command_queue_submission_worker_main, queue, &queue->submission_thread)) < 0) { - vkd3d_private_store_destroy(&queue->private_store); - return hr; + hr = hresult_from_errno(rc); + goto fail_pthread_create; }
d3d12_device_add_ref(queue->device = device);
return S_OK; + +fail_pthread_create: + vkd3d_fence_worker_stop(&queue->fence_worker, device); +fail_fence_worker_start: + vkd3d_cond_destroy(&queue->submission_cond); +fail_submission_cond: + vkd3d_mutex_destroy(&queue->submission_mutex); +fail_submission_mutex: + vkd3d_private_store_destroy(&queue->private_store); + return hr; }
HRESULT d3d12_command_queue_create(struct d3d12_device *device, @@ -6924,6 +7029,9 @@ VkQueue vkd3d_acquire_vk_queue(ID3D12CommandQueue *queue) { struct d3d12_command_queue *d3d12_queue = impl_from_ID3D12CommandQueue(queue);
+ /* For external users of the Vulkan queue, we must ensure that the queue is drained + * so that submissions happen in the desired order. */ + d3d12_command_queue_acquire_serialised(d3d12_queue); return vkd3d_queue_acquire(d3d12_queue->vkd3d_queue); }
@@ -6931,7 +7039,8 @@ void vkd3d_release_vk_queue(ID3D12CommandQueue *queue) { struct d3d12_command_queue *d3d12_queue = impl_from_ID3D12CommandQueue(queue);
- return vkd3d_queue_release(d3d12_queue->vkd3d_queue); + vkd3d_queue_release(d3d12_queue->vkd3d_queue); + d3d12_command_queue_release_serialised(d3d12_queue); }
/* ID3D12CommandSignature */ diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index a1041f2a..99e65a51 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -747,7 +747,6 @@ struct vkd3d_physical_device_info VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT texel_buffer_alignment_properties; VkPhysicalDeviceTransformFeedbackPropertiesEXT xfb_properties; VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT vertex_divisor_properties; - VkPhysicalDeviceTimelineSemaphorePropertiesKHR timeline_semaphore_properties;
VkPhysicalDeviceProperties2KHR properties2;
@@ -772,7 +771,6 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i VkPhysicalDeviceDescriptorIndexingPropertiesEXT *descriptor_indexing_properties; VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *vertex_divisor_properties; VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *buffer_alignment_properties; - VkPhysicalDeviceTimelineSemaphorePropertiesKHR *timeline_semaphore_properties; VkPhysicalDeviceDescriptorIndexingFeaturesEXT *descriptor_indexing_features; VkPhysicalDeviceRobustness2FeaturesEXT *robustness2_features; VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *vertex_divisor_features; @@ -799,7 +797,6 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i vertex_divisor_features = &info->vertex_divisor_features; vertex_divisor_properties = &info->vertex_divisor_properties; timeline_semaphore_features = &info->timeline_semaphore_features; - timeline_semaphore_properties = &info->timeline_semaphore_properties; xfb_features = &info->xfb_features; xfb_properties = &info->xfb_properties;
@@ -841,8 +838,6 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i vk_prepend_struct(&info->properties2, xfb_properties); vertex_divisor_properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT; vk_prepend_struct(&info->properties2, vertex_divisor_properties); - timeline_semaphore_properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR; - vk_prepend_struct(&info->properties2, timeline_semaphore_properties);
if (vulkan_info->KHR_get_physical_device_properties2) VK_CALL(vkGetPhysicalDeviceProperties2KHR(physical_device, &info->properties2)); @@ -1431,7 +1426,6 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device, vulkan_info->rasterization_stream = physical_device_info->xfb_properties.transformFeedbackRasterizationStreamSelect; vulkan_info->transform_feedback_queries = physical_device_info->xfb_properties.transformFeedbackQueries; vulkan_info->max_vertex_attrib_divisor = max(physical_device_info->vertex_divisor_properties.maxVertexAttribDivisor, 1); - vulkan_info->timeline_semaphore_properties = physical_device_info->timeline_semaphore_properties;
device->feature_options.DoublePrecisionFloatShaderOps = features->shaderFloat64; device->feature_options.OutputMergerLogicOp = features->logicOp; @@ -1908,75 +1902,6 @@ static bool d3d12_is_64k_msaa_supported(struct d3d12_device *device) && info.Alignment <= 0x10000; }
-/* A lower value can be signalled on a D3D12 fence. Vulkan timeline semaphores - * do not support this, but test if it works anyway. */ -static bool d3d12_is_timeline_semaphore_supported(const struct d3d12_device *device) -{ - const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; - VkTimelineSemaphoreSubmitInfoKHR timeline_submit_info; - VkSemaphore timeline_semaphore; - VkSubmitInfo submit_info; - bool result = false; - uint64_t value = 0; - VkQueue vk_queue; - VkResult vr; - - if (!device->vk_info.KHR_timeline_semaphore) - return false; - - if ((vr = vkd3d_create_timeline_semaphore(device, 1, &timeline_semaphore)) < 0) - { - WARN("Failed to create timeline semaphore, vr %d.\n", vr); - return false; - } - - if (!(vk_queue = vkd3d_queue_acquire(device->direct_queue))) - { - ERR("Failed to acquire queue %p.\n", device->direct_queue); - VK_CALL(vkDestroySemaphore(device->vk_device, timeline_semaphore, NULL)); - return false; - } - - submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_info.pNext = &timeline_submit_info; - submit_info.waitSemaphoreCount = 0; - submit_info.pWaitSemaphores = NULL; - submit_info.pWaitDstStageMask = NULL; - submit_info.commandBufferCount = 0; - submit_info.pCommandBuffers = NULL; - submit_info.signalSemaphoreCount = 1; - submit_info.pSignalSemaphores = &timeline_semaphore; - - timeline_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; - timeline_submit_info.pNext = NULL; - timeline_submit_info.pSignalSemaphoreValues = &value; - timeline_submit_info.signalSemaphoreValueCount = 1; - timeline_submit_info.waitSemaphoreValueCount = 0; - timeline_submit_info.pWaitSemaphoreValues = NULL; - - vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE)); - - if (vr >= 0) - { - if ((vr = VK_CALL(vkQueueWaitIdle(vk_queue))) < 0) - WARN("Failed to wait for queue, vr %d.\n", vr); - - if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, timeline_semaphore, &value))) < 0) - ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr); - else if (!(result = !value)) - WARN("Disabling timeline semaphore use due to incompatible behaviour.\n"); - } - else - { - WARN("Failed to submit signal operation, vr %d.\n", vr); - } - - vkd3d_queue_release(device->direct_queue); - VK_CALL(vkDestroySemaphore(device->vk_device, timeline_semaphore, NULL)); - - return result; -} - static HRESULT vkd3d_create_vk_device(struct d3d12_device *device, const struct vkd3d_device_create_info *create_info) { @@ -2075,10 +2000,6 @@ static HRESULT vkd3d_create_vk_device(struct d3d12_device *device, }
device->feature_options4.MSAA64KBAlignedTextureSupported = d3d12_is_64k_msaa_supported(device); - device->use_timeline_semaphores = d3d12_is_timeline_semaphore_supported(device) - && vkd3d_queue_init_timeline_semaphore(device->direct_queue, device) - && vkd3d_queue_init_timeline_semaphore(device->compute_queue, device) - && vkd3d_queue_init_timeline_semaphore(device->copy_queue, device);
TRACE("Created Vulkan device %p.\n", vk_device);
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 4e03145d..9989f20a 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -59,7 +59,6 @@ #define VKD3D_MAX_SHADER_EXTENSIONS 3u #define VKD3D_MAX_SHADER_STAGES 5u #define VKD3D_MAX_VK_SYNC_OBJECTS 4u -#define VKD3D_MAX_FENCE_WAITING_QUEUES 4u #define VKD3D_MAX_DESCRIPTOR_SETS 64u /* D3D12 binding tier 3 has a limit of 2048 samplers. */ #define VKD3D_MAX_DESCRIPTOR_SET_SAMPLERS 2048u @@ -152,8 +151,6 @@ struct vkd3d_vulkan_info
VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT texel_buffer_alignment_properties;
- VkPhysicalDeviceTimelineSemaphorePropertiesKHR timeline_semaphore_properties; - unsigned int shader_extension_count; enum vkd3d_shader_spirv_extension shader_extensions[VKD3D_MAX_SHADER_EXTENSIONS];
@@ -502,15 +499,17 @@ HRESULT vkd3d_set_private_data_interface(struct vkd3d_private_store *store, cons struct vkd3d_signaled_semaphore { uint64_t value; - VkSemaphore vk_semaphore; - VkFence vk_fence; - bool is_acquired; -}; - -struct vkd3d_pending_fence_wait -{ - const struct vkd3d_queue *queue; - uint64_t pending_value; + union + { + struct + { + VkSemaphore vk_semaphore; + VkFence vk_fence; + bool is_acquired; + } binary; + uint64_t timeline_value; + } u; + const struct vkd3d_queue *signalling_queue; };
/* ID3D12Fence */ @@ -521,7 +520,9 @@ struct d3d12_fence LONG refcount;
uint64_t value; + uint64_t max_pending_value; struct vkd3d_mutex mutex; + struct vkd3d_cond cond; struct vkd3d_cond null_event_cond;
struct vkd3d_waiting_event @@ -534,9 +535,8 @@ struct d3d12_fence size_t event_count;
VkSemaphore timeline_semaphore; + uint64_t timeline_value; uint64_t pending_timeline_value; - struct vkd3d_pending_fence_wait gpu_waits[VKD3D_MAX_FENCE_WAITING_QUEUES]; - unsigned int gpu_wait_count;
struct vkd3d_signaled_semaphore *semaphores; size_t semaphores_size; @@ -1294,9 +1294,6 @@ struct vkd3d_queue VkQueueFlags vk_queue_flags; uint32_t timestamp_bits;
- VkSemaphore wait_completion_semaphore; - uint64_t pending_wait_completion_value; - struct { VkSemaphore vk_semaphore; @@ -1311,10 +1308,47 @@ struct vkd3d_queue VkQueue vkd3d_queue_acquire(struct vkd3d_queue *queue); HRESULT vkd3d_queue_create(struct d3d12_device *device, uint32_t family_index, const VkQueueFamilyProperties *properties, struct vkd3d_queue **queue); -bool vkd3d_queue_init_timeline_semaphore(struct vkd3d_queue *queue, struct d3d12_device *device); void vkd3d_queue_destroy(struct vkd3d_queue *queue, struct d3d12_device *device); void vkd3d_queue_release(struct vkd3d_queue *queue);
+enum vkd3d_submission_type +{ + VKD3D_SUBMISSION_WAIT, + VKD3D_SUBMISSION_SIGNAL, + VKD3D_SUBMISSION_EXECUTE, + VKD3D_SUBMISSION_DRAIN, + VKD3D_SUBMISSION_STOP, +}; + +struct d3d12_command_queue_submission_wait +{ + struct d3d12_fence *fence; + uint64_t value; +}; + +struct d3d12_command_queue_submission_signal +{ + struct d3d12_fence *fence; + uint64_t value; +}; + +struct d3d12_command_queue_submission_execute +{ + VkCommandBuffer *cmd; + unsigned int cmd_count; +}; + +struct d3d12_command_queue_submission +{ + enum vkd3d_submission_type type; + union + { + struct d3d12_command_queue_submission_wait wait; + struct d3d12_command_queue_submission_signal signal; + struct d3d12_command_queue_submission_execute execute; + } u; +}; + /* ID3D12CommandQueue */ struct d3d12_command_queue { @@ -1331,6 +1365,16 @@ struct d3d12_command_queue
struct d3d12_device *device;
+ struct vkd3d_mutex submission_mutex; + struct vkd3d_cond submission_cond; + union vkd3d_thread_handle submission_thread; + + struct d3d12_command_queue_submission *submissions; + size_t submissions_count; + size_t submissions_size; + uint64_t target_drain_count; + uint64_t queue_drain_count; + struct vkd3d_private_store private_store; };
@@ -1470,7 +1514,6 @@ struct d3d12_device VkDescriptorPoolSize vk_pool_sizes[VKD3D_DESCRIPTOR_POOL_COUNT]; struct vkd3d_vk_descriptor_heap_layout vk_descriptor_heap_layouts[VKD3D_SET_INDEX_COUNT]; bool use_vk_heaps; - bool use_timeline_semaphores; };
HRESULT d3d12_device_create(struct vkd3d_instance *instance, diff --git a/tests/d3d12.c b/tests/d3d12.c index c992c6de..0f892e4e 100644 --- a/tests/d3d12.c +++ b/tests/d3d12.c @@ -33089,9 +33089,7 @@ static void test_queue_wait(void) command_list = context.list; queue = context.queue;
- /* 'queue2' must not map to the same command queue as 'queue', or Wait() before GPU signal will fail. - * Using a compute queue fixes this on most hardware, but it may still fail on low spec hardware. */ - queue2 = create_command_queue(device, D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL); + queue2 = create_command_queue(device, D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL);
event = create_event(); ok(event, "Failed to create event.\n");
I can confirm that my machine still reproduces the hang in all d3d12 tests without this patch, and that this patch fixes them. I don't see any validation errors in the fence tests either.
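
For reviewers who want to see the serialisation scheme in isolation: the sketch below shows the same submission-thread/drain handshake with plain pthreads. All names, the fixed-size task array, and the int payload are illustrative only, not the vkd3d API; the patch itself uses the vkd3d thread/mutex/cond wrappers, grows the submission array with vkd3d_array_reserve(), and carries WAIT/SIGNAL/EXECUTE payloads.

/* Stand-alone sketch (illustrative names, not vkd3d code) of the submission
 * worker plus the drain handshake used by d3d12_command_queue_acquire_serialised(). */
#include <pthread.h>
#include <stdint.h>
#include <string.h>

enum task_type { TASK_EXECUTE, TASK_DRAIN, TASK_STOP };

struct task { enum task_type type; int payload; };

struct worker_queue
{
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    struct task tasks[64];        /* fixed size to keep the sketch short */
    size_t count;
    uint64_t target_drain_count;  /* incremented by callers requesting a drain */
    uint64_t queue_drain_count;   /* incremented by the worker on each DRAIN */
};

static void queue_push_locked(struct worker_queue *q, struct task t)
{
    q->tasks[q->count++] = t;     /* overflow handling elided in this sketch */
    pthread_cond_broadcast(&q->cond);
}

static void queue_push(struct worker_queue *q, struct task t)
{
    pthread_mutex_lock(&q->mutex);
    queue_push_locked(q, t);
    pthread_mutex_unlock(&q->mutex);
}

static void *worker_main(void *arg)
{
    struct worker_queue *q = arg;
    struct task t;

    for (;;)
    {
        pthread_mutex_lock(&q->mutex);
        while (!q->count)
            pthread_cond_wait(&q->cond, &q->mutex);
        t = q->tasks[0];
        memmove(q->tasks, q->tasks + 1, --q->count * sizeof(t));
        pthread_mutex_unlock(&q->mutex);

        switch (t.type)
        {
            case TASK_EXECUTE:
                /* real work, e.g. vkQueueSubmit(), goes here */
                break;
            case TASK_DRAIN:
                pthread_mutex_lock(&q->mutex);
                ++q->queue_drain_count;
                pthread_cond_broadcast(&q->cond);
                pthread_mutex_unlock(&q->mutex);
                break;
            case TASK_STOP:
                return NULL;
        }
    }
}

/* Block until every task queued before this call has been consumed, and keep
 * the mutex held so nothing new can be queued until the release call. */
static void queue_acquire_serialised(struct worker_queue *q)
{
    struct task t = { TASK_DRAIN, 0 };
    uint64_t current;

    pthread_mutex_lock(&q->mutex);
    current = ++q->target_drain_count;
    queue_push_locked(q, t);
    while (q->queue_drain_count < current)
        pthread_cond_wait(&q->cond, &q->mutex);
}

static void queue_release_serialised(struct worker_queue *q)
{
    pthread_mutex_unlock(&q->mutex);
}

int main(void)
{
    struct worker_queue q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };
    struct task execute = { TASK_EXECUTE, 1 }, stop = { TASK_STOP, 0 };
    pthread_t thread;

    pthread_create(&thread, NULL, worker_main, &q);
    queue_push(&q, execute);
    queue_acquire_serialised(&q);   /* returns only after the execute task ran */
    queue_release_serialised(&q);
    queue_push(&q, stop);
    pthread_join(thread, NULL);
    return 0;
}

The point of the acquire/release pair is the same as in the patch: vkd3d_acquire_vk_queue() can block until everything queued earlier has actually reached the Vulkan queue, and it keeps the submission mutex held so nothing is queued behind the caller's back until vkd3d_release_vk_queue().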