Fence virtual values must be mapped to strictly increasing timeline values to avoid invalid use of Vulkan timeline semaphores. In particular, non-increasing values and value jumps of >= 4G are permitted in d3d12 but not on Vulkan timeline semaphores.
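The idea is to pair each virtual value signalled on the D3D12 fence with the next value of a monotonic physical counter, and to signal only the physical value on the Vulkan semaphore. A minimal, self-contained sketch of that pairing follows; the names here are illustrative only, while the patch itself keeps these records in fence->semaphores and the counter in fence->pending_timeline_value.

  #include <stddef.h>
  #include <stdint.h>

  /* Illustrative model only: one record per pending D3D12 Signal(). */
  struct pending_signal
  {
      uint64_t virtual_value;  /* value passed to ID3D12Fence Signal()/Wait() */
      uint64_t physical_value; /* monotonic value used on the VkSemaphore */
  };

  static uint64_t add_pending_signal(struct pending_signal *signals, size_t *count,
          uint64_t *physical_counter, uint64_t virtual_value)
  {
      struct pending_signal *s = &signals[(*count)++];

      s->virtual_value = virtual_value;
      /* Always increment by one, so the timeline payload stays valid even if
       * the app rewinds the fence or jumps it by 4G or more. */
      s->physical_value = ++*physical_counter;
      return s->physical_value;
  }

A GPU wait for a virtual value then waits on the smallest pending physical value whose virtual value satisfies it, and the fence worker translates completed physical values back into virtual ones.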
Different virtual D3D12 command queues may map to the same Vulkan queue. If a wait for value N is submitted on one command queue, and a signal for >= N is then submitted on another, but both are sent to the same Vulkan queue, the wait will never complete. The solution is to buffer out-of-order waits, and any subsequent queue commands, until an unblocking signal value is submitted on a different D3D12 queue or signaled on the CPU.
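In outline, a wait is submitted immediately only when the queue has no buffered ops and a signal of at least the awaited value is already pending on the fence; otherwise the wait and every later op on that queue are buffered, and replayed in order once a new signal raises the fence's maximum pending value. A simplified, self-contained sketch of the replay step follows; the names are illustrative, and the real logic lives in d3d12_command_queue_flush_ops() below.

  #include <stddef.h>
  #include <stdint.h>

  enum cs_op { CS_OP_WAIT, CS_OP_SIGNAL, CS_OP_EXECUTE };

  struct cs_op_data
  {
      enum cs_op opcode;
      uint64_t value; /* fence value for WAIT/SIGNAL ops */
  };

  /* Replay buffered ops in submission order, stopping at the first wait whose
   * value still exceeds the fence's maximum pending value. */
  static size_t flush_ops(const struct cs_op_data *ops, size_t count, uint64_t max_pending_value)
  {
      size_t i;

      for (i = 0; i < count; ++i)
      {
          if (ops[i].opcode == CS_OP_WAIT && ops[i].value > max_pending_value)
              break; /* Still blocked; this op and all later ones stay buffered. */
          /* Otherwise submit the wait, signal or execute to the Vulkan queue. */
      }
      return i; /* Number of ops flushed. */
  }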
Buffering out-of-order waits also fixes the old fence implementation so it is fully functional, though a bit less efficient than timeline semaphores.
Based in part on vkd3d-proton patches by Hans-Kristian Arntzen. Unlike the vkd3d-proton implementation, this patch does not use worker threads for submissions to the Vulkan queue.
Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/command.c       | 894 ++++++++++++++++++++++---------------
 libs/vkd3d/device.c        |  81 +---
 libs/vkd3d/vkd3d_private.h |  78 +++-
 tests/d3d12.c              |   4 +-
 4 files changed, 602 insertions(+), 455 deletions(-)
diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index b187c65b..d0782e5a 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -22,7 +22,11 @@
static void d3d12_fence_incref(struct d3d12_fence *fence); static void d3d12_fence_decref(struct d3d12_fence *fence); -static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence); +static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence, bool on_cpu); +static void d3d12_fence_signal_timeline_semaphore(struct d3d12_fence *fence, uint64_t timeline_value); +static HRESULT d3d12_command_queue_signal(struct d3d12_command_queue *command_queue, + struct d3d12_fence *fence, uint64_t value); +static bool d3d12_command_queue_flush_ops(struct d3d12_command_queue *queue, bool *flushed_any);
HRESULT vkd3d_queue_create(struct d3d12_device *device, uint32_t family_index, const VkQueueFamilyProperties *properties, struct vkd3d_queue **queue) @@ -48,9 +52,6 @@ HRESULT vkd3d_queue_create(struct d3d12_device *device, object->vk_queue_flags = properties->queueFlags; object->timestamp_bits = properties->timestampValidBits;
- object->wait_completion_semaphore = VK_NULL_HANDLE; - object->pending_wait_completion_value = 0; - object->semaphores = NULL; object->semaphores_size = 0; object->semaphore_count = 0; @@ -66,20 +67,6 @@ HRESULT vkd3d_queue_create(struct d3d12_device *device, return S_OK; }
-bool vkd3d_queue_init_timeline_semaphore(struct vkd3d_queue *queue, struct d3d12_device *device) -{ - VkResult vr; - - if (!queue->wait_completion_semaphore - && (vr = vkd3d_create_timeline_semaphore(device, 0, &queue->wait_completion_semaphore)) < 0) - { - WARN("Failed to create timeline semaphore, vr %d.\n", vr); - return false; - } - - return true; -} - void vkd3d_queue_destroy(struct vkd3d_queue *queue, struct d3d12_device *device) { const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; @@ -94,8 +81,6 @@ void vkd3d_queue_destroy(struct vkd3d_queue *queue, struct d3d12_device *device)
vkd3d_free(queue->semaphores);
- VK_CALL(vkDestroySemaphore(device->vk_device, queue->wait_completion_semaphore, NULL)); - for (i = 0; i < ARRAY_SIZE(queue->old_vk_semaphores); ++i) { if (queue->old_vk_semaphores[i]) @@ -308,9 +293,7 @@ static void vkd3d_wait_for_gpu_timeline_semaphore(struct vkd3d_fence_worker *wor const struct d3d12_device *device = worker->device; const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; VkSemaphoreWaitInfoKHR wait_info; - uint64_t counter_value; VkResult vr; - HRESULT hr;
wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR; wait_info.pNext = NULL; @@ -328,19 +311,10 @@ static void vkd3d_wait_for_gpu_timeline_semaphore(struct vkd3d_fence_worker *wor return; }
- if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, waiting_fence->u.vk_semaphore, - &counter_value))) < 0) - { - ERR("Failed to get Vulkan semaphore value, vr %d.\n", vr); - } - else - { - TRACE("Signaling fence %p value %#"PRIx64".\n", waiting_fence->fence, waiting_fence->value); - if (FAILED(hr = d3d12_fence_signal(waiting_fence->fence, counter_value, VK_NULL_HANDLE))) - ERR("Failed to signal D3D12 fence, hr %#x.\n", hr); + TRACE("Signaling fence %p value %#"PRIx64".\n", waiting_fence->fence, waiting_fence->value); + d3d12_fence_signal_timeline_semaphore(waiting_fence->fence, waiting_fence->value);
- d3d12_fence_decref(waiting_fence->fence); - } + d3d12_fence_decref(waiting_fence->fence); }
static void vkd3d_wait_for_gpu_fence(struct vkd3d_fence_worker *worker, @@ -361,7 +335,7 @@ static void vkd3d_wait_for_gpu_fence(struct vkd3d_fence_worker *worker, }
TRACE("Signaling fence %p value %#"PRIx64".\n", waiting_fence->fence, waiting_fence->value); - if (FAILED(hr = d3d12_fence_signal(waiting_fence->fence, waiting_fence->value, waiting_fence->u.vk_fence))) + if (FAILED(hr = d3d12_fence_signal(waiting_fence->fence, waiting_fence->value, waiting_fence->u.vk_fence, false))) ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
d3d12_fence_decref(waiting_fence->fence); @@ -434,7 +408,7 @@ static HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker, worker->fences = NULL; worker->fences_size = 0;
- worker->wait_for_gpu_fence = device->use_timeline_semaphores + worker->wait_for_gpu_fence = device->vk_info.KHR_timeline_semaphore ? vkd3d_wait_for_gpu_timeline_semaphore : vkd3d_wait_for_gpu_fence;
if ((rc = vkd3d_mutex_init(&worker->mutex))) @@ -606,17 +580,17 @@ static void d3d12_fence_garbage_collect_vk_semaphores_locked(struct d3d12_fence current = &fence->semaphores[i]; /* The semaphore doesn't have a pending signal operation if the fence * was signaled. */ - if ((current->vk_fence || current->is_acquired) && !destroy_all) + if ((current->u.binary.vk_fence || current->u.binary.is_acquired) && !destroy_all) { ++i; continue; }
- if (current->vk_fence) + if (current->u.binary.vk_fence) WARN("Destroying potentially pending semaphore.\n"); - assert(!current->is_acquired); + assert(!current->u.binary.is_acquired);
- VK_CALL(vkDestroySemaphore(device->vk_device, current->vk_semaphore, NULL)); + VK_CALL(vkDestroySemaphore(device->vk_device, current->u.binary.vk_semaphore, NULL)); fence->semaphores[i] = fence->semaphores[--fence->semaphore_count]; }
@@ -652,23 +626,16 @@ static void d3d12_fence_destroy_vk_objects(struct d3d12_fence *fence) vkd3d_mutex_unlock(&fence->mutex); }
-static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct d3d12_fence *fence, +static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore_locked(struct d3d12_fence *fence, uint64_t value, uint64_t *completed_value) { struct vkd3d_signaled_semaphore *semaphore; struct vkd3d_signaled_semaphore *current; uint64_t semaphore_value; unsigned int i; - int rc;
TRACE("fence %p, value %#"PRIx64".\n", fence, value);
- if ((rc = vkd3d_mutex_lock(&fence->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - return VK_NULL_HANDLE; - } - semaphore = NULL; semaphore_value = ~(uint64_t)0;
@@ -676,7 +643,7 @@ static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct { current = &fence->semaphores[i]; /* Prefer a semaphore with the smallest value. */ - if (!current->is_acquired && current->value >= value && semaphore_value >= current->value) + if (!current->u.binary.is_acquired && current->value >= value && semaphore_value >= current->value) { semaphore = current; semaphore_value = current->value; @@ -686,12 +653,10 @@ static struct vkd3d_signaled_semaphore *d3d12_fence_acquire_vk_semaphore(struct }
if (semaphore) - semaphore->is_acquired = true; + semaphore->u.binary.is_acquired = true;
*completed_value = fence->value;
- vkd3d_mutex_unlock(&fence->mutex); - return semaphore; }
@@ -705,7 +670,7 @@ static void d3d12_fence_remove_vk_semaphore(struct d3d12_fence *fence, struct vk return; }
- assert(semaphore->is_acquired); + assert(semaphore->u.binary.is_acquired);
*semaphore = fence->semaphores[--fence->semaphore_count];
@@ -722,32 +687,133 @@ static void d3d12_fence_release_vk_semaphore(struct d3d12_fence *fence, struct v return; }
- assert(semaphore->is_acquired); - semaphore->is_acquired = false; + assert(semaphore->u.binary.is_acquired); + semaphore->u.binary.is_acquired = false;
vkd3d_mutex_unlock(&fence->mutex); }
-static HRESULT d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence, - VkSemaphore vk_semaphore, VkFence vk_fence, uint64_t value) +static void d3d12_fence_update_pending_value_locked(struct d3d12_fence *fence) +{ + uint64_t new_max_pending_value; + unsigned int i; + + for (i = 0, new_max_pending_value = 0; i < fence->semaphore_count; ++i) + new_max_pending_value = max(fence->semaphores[i].value, new_max_pending_value); + + fence->max_pending_value = max(fence->value, new_max_pending_value); +} + +static HRESULT d3d12_fence_update_pending_value(struct d3d12_fence *fence) +{ + int rc; + + if ((rc = vkd3d_mutex_lock(&fence->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return hresult_from_errno(rc); + } + + d3d12_fence_update_pending_value_locked(fence); + + vkd3d_mutex_unlock(&fence->mutex); + + return S_OK; +} + +static HRESULT d3d12_device_add_blocked_command_queues(struct d3d12_device *device, + struct d3d12_command_queue * const *command_queues, unsigned int count) { - struct vkd3d_signaled_semaphore *semaphore; HRESULT hr = S_OK; + unsigned int i; int rc;
- TRACE("fence %p, value %#"PRIx64".\n", fence, value); + if ((rc = vkd3d_mutex_lock(&device->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return hresult_from_errno(rc); + }
- if (!(semaphore = vkd3d_malloc(sizeof(*semaphore)))) + if ((i = ARRAY_SIZE(device->blocked_queues) - device->blocked_queue_count) < count) { - ERR("Failed to add semaphore.\n"); - return E_OUTOFMEMORY; + FIXME("Failed to add %u blocked command queue(s) to device %p.\n", count - i, device); + count = i; + hr = E_FAIL; + } + + for (i = 0; i < count; ++i) + device->blocked_queues[device->blocked_queue_count++] = command_queues[i]; + + vkd3d_mutex_unlock(&device->mutex); + return hr; +} + +static HRESULT d3d12_device_flush_blocked_queues_once(struct d3d12_device *device, bool *flushed_any) +{ + struct d3d12_command_queue *blocked_queues[VKD3D_MAX_DEVICE_BLOCKED_QUEUES]; + unsigned int i, blocked_queue_count; + int rc; + + *flushed_any = false; + + if ((rc = vkd3d_mutex_lock(&device->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return hresult_from_errno(rc); + } + + /* Flush any ops unblocked by a new pending value. These cannot be flushed + * with the device locked, so move the queue pointers to a local array. */ + blocked_queue_count = device->blocked_queue_count; + memcpy(blocked_queues, device->blocked_queues, blocked_queue_count * sizeof(blocked_queues[0])); + device->blocked_queue_count = 0; + + vkd3d_mutex_unlock(&device->mutex); + + i = 0; + while (i < blocked_queue_count) + { + if (d3d12_command_queue_flush_ops(blocked_queues[i], flushed_any)) + blocked_queues[i] = blocked_queues[--blocked_queue_count]; + else + ++i; + } + + /* None of these queues could have been re-added during the above loop because + * blocked queues always have a nonzero op count. */ + return d3d12_device_add_blocked_command_queues(device, blocked_queues, blocked_queue_count); +} + +static HRESULT d3d12_device_flush_blocked_queues(struct d3d12_device *device) +{ + bool flushed_any; + HRESULT hr; + + /* Executing an op on one queue may unblock another, so repeat until nothing is flushed. */ + do + { + if (!device->blocked_queue_count) + return S_OK; + if (FAILED(hr = d3d12_device_flush_blocked_queues_once(device, &flushed_any))) + return hr; } + while (flushed_any); + + return S_OK; +} + +static HRESULT d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence, VkSemaphore vk_semaphore, + VkFence vk_fence, uint64_t value, const struct vkd3d_queue *signalling_queue) +{ + struct vkd3d_signaled_semaphore *semaphore; + int rc; + + TRACE("fence %p, value %#"PRIx64".\n", fence, value);
if ((rc = vkd3d_mutex_lock(&fence->mutex))) { ERR("Failed to lock mutex, error %d.\n", rc); - vkd3d_free(semaphore); - return E_FAIL; + return hresult_from_errno(rc); }
d3d12_fence_garbage_collect_vk_semaphores_locked(fence, false); @@ -757,21 +823,24 @@ static HRESULT d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence, { ERR("Failed to add semaphore.\n"); vkd3d_mutex_unlock(&fence->mutex); - return false; + return E_OUTOFMEMORY; }
semaphore = &fence->semaphores[fence->semaphore_count++]; semaphore->value = value; - semaphore->vk_semaphore = vk_semaphore; - semaphore->vk_fence = vk_fence; - semaphore->is_acquired = false; + semaphore->u.binary.vk_semaphore = vk_semaphore; + semaphore->u.binary.vk_fence = vk_fence; + semaphore->u.binary.is_acquired = false; + semaphore->signalling_queue = signalling_queue; + + d3d12_fence_update_pending_value_locked(fence);
vkd3d_mutex_unlock(&fence->mutex);
- return hr; + return d3d12_device_flush_blocked_queues(fence->device); }
-static bool d3d12_fence_signal_external_events_locked(struct d3d12_fence *fence) +static void d3d12_fence_signal_external_events_locked(struct d3d12_fence *fence) { struct d3d12_device *device = fence->device; bool signal_null_event_cond = false; @@ -803,10 +872,11 @@ static bool d3d12_fence_signal_external_events_locked(struct d3d12_fence *fence)
fence->event_count = j;
- return signal_null_event_cond; + if (signal_null_event_cond) + vkd3d_cond_broadcast(&fence->null_event_cond); }
-static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence) +static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence, bool on_cpu) { struct d3d12_device *device = fence->device; struct vkd3d_signaled_semaphore *current; @@ -821,8 +891,7 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF
fence->value = value;
- if (d3d12_fence_signal_external_events_locked(fence)) - vkd3d_cond_broadcast(&fence->null_event_cond); + d3d12_fence_signal_external_events_locked(fence);
if (vk_fence) { @@ -831,8 +900,8 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF for (i = 0; i < fence->semaphore_count; ++i) { current = &fence->semaphores[i]; - if (current->vk_fence == vk_fence) - current->vk_fence = VK_NULL_HANDLE; + if (current->u.binary.vk_fence == vk_fence) + current->u.binary.vk_fence = VK_NULL_HANDLE; }
for (i = 0; i < ARRAY_SIZE(fence->old_vk_fences); ++i) @@ -849,9 +918,101 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF VK_CALL(vkDestroyFence(device->vk_device, vk_fence, NULL)); }
+ d3d12_fence_update_pending_value_locked(fence); + vkd3d_mutex_unlock(&fence->mutex);
- return S_OK; + return on_cpu ? d3d12_device_flush_blocked_queues(device) : S_OK; +} + +static uint64_t d3d12_fence_add_pending_timeline_signal(struct d3d12_fence *fence, uint64_t virtual_value, + const struct vkd3d_queue *signalling_queue) +{ + struct vkd3d_signaled_semaphore *semaphore; + int rc; + + if ((rc = vkd3d_mutex_lock(&fence->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return hresult_from_errno(rc); + } + + if (!vkd3d_array_reserve((void **)&fence->semaphores, &fence->semaphores_size, + fence->semaphore_count + 1, sizeof(*fence->semaphores))) + { + return 0; + } + + semaphore = &fence->semaphores[fence->semaphore_count++]; + semaphore->value = virtual_value; + semaphore->u.timeline_value = ++fence->pending_timeline_value; + semaphore->signalling_queue = signalling_queue; + + vkd3d_mutex_unlock(&fence->mutex); + + return fence->pending_timeline_value; +} + +static uint64_t d3d12_fence_get_timeline_wait_value_locked(struct d3d12_fence *fence, uint64_t virtual_value) +{ + uint64_t target_timeline_value = UINT64_MAX; + unsigned int i; + + /* Find the smallest physical value which is at least the virtual value. */ + for (i = 0; i < fence->semaphore_count; ++i) + { + if (virtual_value <= fence->semaphores[i].value) + target_timeline_value = min(target_timeline_value, fence->semaphores[i].u.timeline_value); + } + + /* No timeline value will be found if it was already signaled on the GPU and handled in + * the worker thread. A wait must still be emitted as a barrier against command re-ordering. */ + return (target_timeline_value == UINT64_MAX) ? 0 : target_timeline_value; +} + +static void d3d12_fence_signal_timeline_semaphore(struct d3d12_fence *fence, uint64_t timeline_value) +{ + bool did_signal; + unsigned int i; + int rc; + + if ((rc = vkd3d_mutex_lock(&fence->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return; + } + + /* With multiple fence workers, it is possible that signal calls are out of + * order. The physical value itself is monotonic, but we need to make sure + * that all signals happen in correct order if there are fence rewinds. + * We don't expect the loop to run more than once, but there might be + * extreme edge cases where we signal 2 or more. */ + while (fence->timeline_value < timeline_value) + { + ++fence->timeline_value; + did_signal = false; + + for (i = 0; i < fence->semaphore_count; ++i) + { + if (fence->timeline_value == fence->semaphores[i].u.timeline_value) + { + fence->value = fence->semaphores[i].value; + d3d12_fence_signal_external_events_locked(fence); + fence->semaphores[i] = fence->semaphores[--fence->semaphore_count]; + did_signal = true; + break; + } + } + + if (!did_signal) + FIXME("Did not signal a virtual value.\n"); + } + + /* If a rewind remains queued, the virtual value deleted above may be + * greater than any pending value, so update the max pending value. */ + d3d12_fence_update_pending_value_locked(fence); + + vkd3d_mutex_unlock(&fence->mutex); }
static HRESULT STDMETHODCALLTYPE d3d12_fence_QueryInterface(ID3D12Fence *iface, @@ -1060,100 +1221,8 @@ static HRESULT STDMETHODCALLTYPE d3d12_fence_SetEventOnCompletion(ID3D12Fence *i return S_OK; }
-static inline bool d3d12_fence_gpu_wait_is_completed(const struct d3d12_fence *fence, unsigned int i) -{ - const struct d3d12_device *device = fence->device; - const struct vkd3d_vk_device_procs *vk_procs; - uint64_t value; - VkResult vr; - - vk_procs = &device->vk_procs; - - if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, - fence->gpu_waits[i].queue->wait_completion_semaphore, &value))) >= 0) - { - return value >= fence->gpu_waits[i].pending_value; - } - - ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr); - return true; -} - -static inline bool d3d12_fence_has_pending_gpu_ops_locked(struct d3d12_fence *fence) -{ - const struct d3d12_device *device = fence->device; - const struct vkd3d_vk_device_procs *vk_procs; - uint64_t value; - unsigned int i; - VkResult vr; - - for (i = 0; i < fence->gpu_wait_count; ++i) - { - if (d3d12_fence_gpu_wait_is_completed(fence, i) && i < --fence->gpu_wait_count) - fence->gpu_waits[i] = fence->gpu_waits[fence->gpu_wait_count]; - } - if (fence->gpu_wait_count) - return true; - - /* Check for pending signals too. */ - if (fence->value >= fence->pending_timeline_value) - return false; - - vk_procs = &device->vk_procs; - - /* Check the actual semaphore value in case fence->value update is lagging. */ - if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, fence->timeline_semaphore, &value))) < 0) - { - ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr); - return false; - } - - return value < fence->pending_timeline_value; -} - -/* Replace the VkSemaphore with a new one to allow a lower value to be set. Ideally apps will - * only use this to reset the fence when no operations are pending on the queue. */ -static HRESULT d3d12_fence_reinit_timeline_semaphore_locked(struct d3d12_fence *fence, uint64_t value) -{ - const struct d3d12_device *device = fence->device; - const struct vkd3d_vk_device_procs *vk_procs; - VkSemaphore timeline_semaphore; - VkResult vr; - - if (d3d12_fence_has_pending_gpu_ops_locked(fence)) - { - /* This situation is not very likely because it means a fence with pending waits and/or signals was - * signalled on the CPU to a lower value. For now, emit a fixme so it can be patched if necessary. - * A patch already exists for this but it's not pretty. */ - FIXME("Unable to re-initialise timeline semaphore to a lower value due to pending GPU ops.\n"); - return E_FAIL; - } - - if ((vr = vkd3d_create_timeline_semaphore(device, value, &timeline_semaphore)) < 0) - { - WARN("Failed to create timeline semaphore, vr %d.\n", vr); - return hresult_from_vk_result(vr); - } - - fence->value = value; - fence->pending_timeline_value = value; - - WARN("Replacing timeline semaphore with a new object.\n"); - - vk_procs = &device->vk_procs; - - VK_CALL(vkDestroySemaphore(device->vk_device, fence->timeline_semaphore, NULL)); - fence->timeline_semaphore = timeline_semaphore; - - return S_OK; -} - static HRESULT d3d12_fence_signal_cpu_timeline_semaphore(struct d3d12_fence *fence, uint64_t value) { - const struct d3d12_device *device = fence->device; - VkSemaphoreSignalInfoKHR info; - HRESULT hr = S_OK; - VkResult vr; int rc;
if ((rc = vkd3d_mutex_lock(&fence->mutex))) @@ -1162,48 +1231,13 @@ static HRESULT d3d12_fence_signal_cpu_timeline_semaphore(struct d3d12_fence *fen return hresult_from_errno(rc); }
- /* We must only signal a value which is greater than the current value. - * That value can be in the range of current known value (fence->value), or as large as pending_timeline_value. - * Pending timeline value signal might be blocked by another synchronization primitive, and thus statically - * cannot be that value, so the safest thing to do is to check the current value which is updated by the fence - * wait thread continuously. This check is technically racy since the value might be immediately out of date, - * but there is no way to avoid this. */ - if (value > fence->value) - { - const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; - - /* Sanity check against the delta limit. */ - if (value - fence->value > device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference) - { - FIXME("Timeline semaphore delta is %"PRIu64", but implementation only supports a delta of %"PRIu64".\n", - value - fence->value, device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference); - } - - info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR; - info.pNext = NULL; - info.semaphore = fence->timeline_semaphore; - info.value = value; - if ((vr = VK_CALL(vkSignalSemaphoreKHR(device->vk_device, &info))) >= 0) - { - fence->value = value; - if (value > fence->pending_timeline_value) - fence->pending_timeline_value = value; - } - else - { - ERR("Failed to signal timeline semaphore, vr %d.\n", vr); - hr = hresult_from_vk_result(vr); - } - } - else if (value < fence->value) - { - hr = d3d12_fence_reinit_timeline_semaphore_locked(fence, value); - } - + fence->value = value; d3d12_fence_signal_external_events_locked(fence); + d3d12_fence_update_pending_value_locked(fence);
vkd3d_mutex_unlock(&fence->mutex); - return hr; + + return d3d12_device_flush_blocked_queues(fence->device); }
static HRESULT STDMETHODCALLTYPE d3d12_fence_Signal(ID3D12Fence *iface, UINT64 value) @@ -1214,7 +1248,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_fence_Signal(ID3D12Fence *iface, UINT64 v
if (fence->timeline_semaphore) return d3d12_fence_signal_cpu_timeline_semaphore(fence, value); - return d3d12_fence_signal(fence, value, VK_NULL_HANDLE); + return d3d12_fence_signal(fence, value, VK_NULL_HANDLE, true); }
static const struct ID3D12FenceVtbl d3d12_fence_vtbl = @@ -1257,6 +1291,7 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device * fence->refcount = 1;
fence->value = initial_value; + fence->max_pending_value = initial_value;
if ((rc = vkd3d_mutex_init(&fence->mutex))) { @@ -1279,15 +1314,15 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device * fence->event_count = 0;
fence->timeline_semaphore = VK_NULL_HANDLE; - if (device->use_timeline_semaphores && (vr = vkd3d_create_timeline_semaphore(device, initial_value, + fence->timeline_value = 0; + fence->pending_timeline_value = 0; + if (device->vk_info.KHR_timeline_semaphore && (vr = vkd3d_create_timeline_semaphore(device, 0, &fence->timeline_semaphore)) < 0) { WARN("Failed to create timeline semaphore, vr %d.\n", vr); hr = hresult_from_vk_result(vr); goto fail_destroy_null_cond; } - fence->pending_timeline_value = initial_value; - fence->gpu_wait_count = 0;
fence->semaphores = NULL; fence->semaphores_size = 0; @@ -6136,6 +6171,9 @@ static ULONG STDMETHODCALLTYPE d3d12_command_queue_Release(ID3D12CommandQueue *i
vkd3d_fence_worker_stop(&command_queue->fence_worker, device);
+ vkd3d_mutex_destroy(&command_queue->op_mutex); + vkd3d_free(command_queue->ops); + vkd3d_private_store_destroy(&command_queue->private_store);
vkd3d_free(command_queue); @@ -6205,6 +6243,14 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_GetDevice(ID3D12CommandQueu return d3d12_device_query_interface(command_queue->device, iid, device); }
+static struct vkd3d_cs_op_data *d3d12_command_queue_require_space_locked(struct d3d12_command_queue *queue) +{ + if (!vkd3d_array_reserve((void **)&queue->ops, &queue->ops_size, queue->ops_count + 1, sizeof(*queue->ops))) + return NULL; + + return &queue->ops[queue->ops_count++]; +} + static void STDMETHODCALLTYPE d3d12_command_queue_UpdateTileMappings(ID3D12CommandQueue *iface, ID3D12Resource *resource, UINT region_count, const D3D12_TILED_RESOURCE_COORDINATE *region_start_coordinates, @@ -6236,22 +6282,50 @@ static void STDMETHODCALLTYPE d3d12_command_queue_CopyTileMappings(ID3D12Command src_region_start_coordinate, region_size, flags); }
+static void d3d12_command_queue_execute(struct d3d12_command_queue *command_queue, + VkCommandBuffer *buffers, unsigned int count) +{ + const struct vkd3d_vk_device_procs *vk_procs = &command_queue->device->vk_procs; + struct vkd3d_queue *vkd3d_queue = command_queue->vkd3d_queue; + VkSubmitInfo submit_desc; + VkQueue vk_queue; + VkResult vr; + + memset(&submit_desc, 0, sizeof(submit_desc)); + + if (!(vk_queue = vkd3d_queue_acquire(vkd3d_queue))) + { + ERR("Failed to acquire queue %p.\n", vkd3d_queue); + return; + } + + submit_desc.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_desc.commandBufferCount = count; + submit_desc.pCommandBuffers = buffers; + + if ((vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_desc, VK_NULL_HANDLE))) < 0) + ERR("Failed to submit queue(s), vr %d.\n", vr); + + vkd3d_queue_release(vkd3d_queue); + + vkd3d_free(buffers); +} + static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12CommandQueue *iface, UINT command_list_count, ID3D12CommandList * const *command_lists) { struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface); - const struct vkd3d_vk_device_procs *vk_procs; struct d3d12_command_list *cmd_list; - struct VkSubmitInfo submit_desc; + struct vkd3d_cs_op_data *op; VkCommandBuffer *buffers; - VkQueue vk_queue; unsigned int i; - VkResult vr; + int rc;
TRACE("iface %p, command_list_count %u, command_lists %p.\n", iface, command_list_count, command_lists);
- vk_procs = &command_queue->device->vk_procs; + if (!command_list_count) + return;
if (!(buffers = vkd3d_calloc(command_list_count, sizeof(*buffers)))) { @@ -6274,29 +6348,30 @@ static void STDMETHODCALLTYPE d3d12_command_queue_ExecuteCommandLists(ID3D12Comm buffers[i] = cmd_list->vk_command_buffer; }
- submit_desc.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_desc.pNext = NULL; - submit_desc.waitSemaphoreCount = 0; - submit_desc.pWaitSemaphores = NULL; - submit_desc.pWaitDstStageMask = NULL; - submit_desc.commandBufferCount = command_list_count; - submit_desc.pCommandBuffers = buffers; - submit_desc.signalSemaphoreCount = 0; - submit_desc.pSignalSemaphores = NULL; - - if (!(vk_queue = vkd3d_queue_acquire(command_queue->vkd3d_queue))) + if ((rc = vkd3d_mutex_lock(&command_queue->op_mutex))) { - ERR("Failed to acquire queue %p.\n", command_queue->vkd3d_queue); - vkd3d_free(buffers); + ERR("Failed to lock mutex, error %d.\n", rc); return; }
- if ((vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_desc, VK_NULL_HANDLE))) < 0) - ERR("Failed to submit queue(s), vr %d.\n", vr); + if (!command_queue->ops_count) + { + d3d12_command_queue_execute(command_queue, buffers, command_list_count); + vkd3d_mutex_unlock(&command_queue->op_mutex); + return; + }
- vkd3d_queue_release(command_queue->vkd3d_queue); + if (!(op = d3d12_command_queue_require_space_locked(command_queue))) + { + ERR("Failed to add op.\n"); + return; + } + op->opcode = VKD3D_CS_OP_EXECUTE; + op->u.execute.buffers = buffers; + op->u.execute.buffer_count = command_list_count;
- vkd3d_free(buffers); + vkd3d_mutex_unlock(&command_queue->op_mutex); + return; }
static void STDMETHODCALLTYPE d3d12_command_queue_SetMarker(ID3D12CommandQueue *iface, @@ -6318,38 +6393,6 @@ static void STDMETHODCALLTYPE d3d12_command_queue_EndEvent(ID3D12CommandQueue *i FIXME("iface %p stub!\n", iface); }
-static HRESULT d3d12_fence_update_gpu_signal_timeline_semaphore(struct d3d12_fence *fence, uint64_t value) -{ - const struct d3d12_device *device = fence->device; - int rc; - - if ((rc = vkd3d_mutex_lock(&fence->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - return hresult_from_errno(rc); - } - - /* If we're attempting to async signal a fence with a value which is not strictly increasing the payload value, - * warn about this case. Do not treat this as an error since it works at least with RADV and Nvidia drivers and - * there's no workaround on the GPU side. */ - if (value <= fence->pending_timeline_value) - { - WARN("Fence %p values are not strictly increasing. Pending values: old %"PRIu64", new %"PRIu64".\n", - fence, fence->pending_timeline_value, value); - } - /* Sanity check against the delta limit. Use the current fence value. */ - else if (value - fence->value > device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference) - { - FIXME("Timeline semaphore delta is %"PRIu64", but implementation only supports a delta of %"PRIu64".\n", - value - fence->value, device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference); - } - fence->pending_timeline_value = value; - - vkd3d_mutex_unlock(&fence->mutex); - - return S_OK; -} - static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worker, VkSemaphore vk_semaphore, struct d3d12_fence *fence, uint64_t value, struct vkd3d_queue *queue) { @@ -6389,31 +6432,68 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * ID3D12Fence *fence_iface, UINT64 value) { struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface); + struct d3d12_fence *fence = unsafe_impl_from_ID3D12Fence(fence_iface); + struct vkd3d_cs_op_data *op; + HRESULT hr = S_OK; + int rc; + + TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value); + + if ((rc = vkd3d_mutex_lock(&command_queue->op_mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return hresult_from_errno(rc); + } + + if (!command_queue->ops_count) + { + hr = d3d12_command_queue_signal(command_queue, fence, value); + goto done; + } + + if (!(op = d3d12_command_queue_require_space_locked(command_queue))) + { + hr = E_OUTOFMEMORY; + goto done; + } + op->opcode = VKD3D_CS_OP_SIGNAL; + op->u.signal.fence = fence; + op->u.signal.value = value; + + d3d12_fence_incref(fence); + +done: + vkd3d_mutex_unlock(&command_queue->op_mutex); + return hr; +} + +static HRESULT d3d12_command_queue_signal(struct d3d12_command_queue *command_queue, + struct d3d12_fence *fence, uint64_t value) +{ VkTimelineSemaphoreSubmitInfoKHR timeline_submit_info; const struct vkd3d_vk_device_procs *vk_procs; VkSemaphore vk_semaphore = VK_NULL_HANDLE; VkFence vk_fence = VK_NULL_HANDLE; struct vkd3d_queue *vkd3d_queue; uint64_t sequence_number = 0; + uint64_t timeline_value = 0; struct d3d12_device *device; - struct d3d12_fence *fence; VkSubmitInfo submit_info; VkQueue vk_queue; VkResult vr; HRESULT hr;
- TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value); - device = command_queue->device; vk_procs = &device->vk_procs; vkd3d_queue = command_queue->vkd3d_queue;
- fence = unsafe_impl_from_ID3D12Fence(fence_iface); - - if (device->use_timeline_semaphores) + if (device->vk_info.KHR_timeline_semaphore) { - if (FAILED(hr = d3d12_fence_update_gpu_signal_timeline_semaphore(fence, value))) - return hr; + if (!(timeline_value = d3d12_fence_add_pending_timeline_signal(fence, value, vkd3d_queue))) + { + ERR("Failed to add pending signal.\n"); + return E_OUTOFMEMORY; + }
vk_semaphore = fence->timeline_semaphore; assert(vk_semaphore); @@ -6434,7 +6514,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * goto fail; }
- if (!device->use_timeline_semaphores && (vr = vkd3d_queue_create_vk_semaphore_locked(vkd3d_queue, + if (!device->vk_info.KHR_timeline_semaphore && (vr = vkd3d_queue_create_vk_semaphore_locked(vkd3d_queue, device, &vk_semaphore)) < 0) { ERR("Failed to create Vulkan semaphore, vr %d.\n", vr); @@ -6451,11 +6531,11 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * submit_info.signalSemaphoreCount = vk_semaphore ? 1 : 0; submit_info.pSignalSemaphores = &vk_semaphore;
- if (device->use_timeline_semaphores) + if (device->vk_info.KHR_timeline_semaphore) { timeline_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; timeline_submit_info.pNext = NULL; - timeline_submit_info.pSignalSemaphoreValues = &value; + timeline_submit_info.pSignalSemaphoreValues = &timeline_value; timeline_submit_info.signalSemaphoreValueCount = submit_info.signalSemaphoreCount; timeline_submit_info.waitSemaphoreValueCount = 0; timeline_submit_info.pWaitSemaphoreValues = NULL; @@ -6463,7 +6543,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * }
vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, vk_fence)); - if (!device->use_timeline_semaphores && vr >= 0) + if (!device->vk_info.KHR_timeline_semaphore && vr >= 0) { sequence_number = ++vkd3d_queue->submitted_sequence_number;
@@ -6480,13 +6560,22 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * goto fail_vkresult; }
- if (device->use_timeline_semaphores) + if (device->vk_info.KHR_timeline_semaphore) { + if (FAILED(hr = d3d12_fence_update_pending_value(fence))) + return hr; + + if (FAILED(hr = d3d12_device_flush_blocked_queues(device))) + return hr; + + vk_semaphore = fence->timeline_semaphore; + assert(vk_semaphore); + return vkd3d_enqueue_timeline_semaphore(&command_queue->fence_worker, - vk_semaphore, fence, value, vkd3d_queue); + vk_semaphore, fence, timeline_value, vkd3d_queue); }
- if (vk_semaphore && SUCCEEDED(hr = d3d12_fence_add_vk_semaphore(fence, vk_semaphore, vk_fence, value))) + if (vk_semaphore && SUCCEEDED(hr = d3d12_fence_add_vk_semaphore(fence, vk_semaphore, vk_fence, value, vkd3d_queue))) vk_semaphore = VK_NULL_HANDLE;
vr = VK_CALL(vkGetFenceStatus(device->vk_device, vk_fence)); @@ -6501,7 +6590,7 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue * else if (vr == VK_SUCCESS) { TRACE("Already signaled %p, value %#"PRIx64".\n", fence, value); - hr = d3d12_fence_signal(fence, value, vk_fence); + hr = d3d12_fence_signal(fence, value, vk_fence, false); vk_fence = VK_NULL_HANDLE; vkd3d_queue_update_sequence_number(vkd3d_queue, sequence_number, device); } @@ -6524,12 +6613,12 @@ fail_vkresult: hr = hresult_from_vk_result(vr); fail: VK_CALL(vkDestroyFence(device->vk_device, vk_fence, NULL)); - if (!device->use_timeline_semaphores) + if (!device->vk_info.KHR_timeline_semaphore) VK_CALL(vkDestroySemaphore(device->vk_device, vk_semaphore, NULL)); return hr; }
-static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_queue *command_queue, +static HRESULT d3d12_command_queue_wait_binary_semaphore_locked(struct d3d12_command_queue *command_queue, struct d3d12_fence *fence, uint64_t value) { static const VkPipelineStageFlagBits wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; @@ -6545,7 +6634,10 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu vk_procs = &command_queue->device->vk_procs; queue = command_queue->vkd3d_queue;
- semaphore = d3d12_fence_acquire_vk_semaphore(fence, value, &completed_value); + semaphore = d3d12_fence_acquire_vk_semaphore_locked(fence, value, &completed_value); + + vkd3d_mutex_unlock(&fence->mutex); + if (!semaphore && completed_value >= value) { /* We don't get a Vulkan semaphore if the fence was signaled on CPU. */ @@ -6568,7 +6660,7 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu } else { - FIXME("Failed to acquire Vulkan semaphore for fence %p, value %#"PRIx64 + WARN("Failed to acquire Vulkan semaphore for fence %p, value %#"PRIx64 ", completed value %#"PRIx64".\n", fence, value, completed_value); }
@@ -6579,7 +6671,7 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submit_info.pNext = NULL; submit_info.waitSemaphoreCount = 1; - submit_info.pWaitSemaphores = &semaphore->vk_semaphore; + submit_info.pWaitSemaphores = &semaphore->u.binary.vk_semaphore; submit_info.pWaitDstStageMask = &wait_stage_mask; submit_info.commandBufferCount = 0; submit_info.pCommandBuffers = NULL; @@ -6597,7 +6689,7 @@ static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_qu
if ((vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE))) >= 0) { - queue->semaphores[queue->semaphore_count].vk_semaphore = semaphore->vk_semaphore; + queue->semaphores[queue->semaphore_count].vk_semaphore = semaphore->u.binary.vk_semaphore; queue->semaphores[queue->semaphore_count].sequence_number = queue->submitted_sequence_number + 1; ++queue->semaphore_count;
@@ -6622,48 +6714,7 @@ fail: return hr; }
-static inline void d3d12_fence_update_gpu_wait(struct d3d12_fence *fence, const struct vkd3d_queue *queue) -{ - unsigned int i; - bool found; - int rc; - - if ((rc = vkd3d_mutex_lock(&fence->mutex))) - { - ERR("Failed to lock mutex, error %d.\n", rc); - return; - } - - for (i = 0, found = false; i < fence->gpu_wait_count; ++i) - { - if (fence->gpu_waits[i].queue == queue) - { - fence->gpu_waits[i].pending_value = queue->pending_wait_completion_value; - found = true; - } - else if (d3d12_fence_gpu_wait_is_completed(fence, i) && i < --fence->gpu_wait_count) - { - fence->gpu_waits[i] = fence->gpu_waits[fence->gpu_wait_count]; - } - } - - if (!found) - { - if (fence->gpu_wait_count < ARRAY_SIZE(fence->gpu_waits)) - { - fence->gpu_waits[fence->gpu_wait_count].queue = queue; - fence->gpu_waits[fence->gpu_wait_count++].pending_value = queue->pending_wait_completion_value; - } - else - { - FIXME("Unable to track GPU fence wait.\n"); - } - } - - vkd3d_mutex_unlock(&fence->mutex); -} - -static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_queue *command_queue, +static HRESULT d3d12_command_queue_wait_locked(struct d3d12_command_queue *command_queue, struct d3d12_fence *fence, uint64_t value) { static const VkPipelineStageFlagBits wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; @@ -6671,25 +6722,29 @@ static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_ const struct vkd3d_vk_device_procs *vk_procs; struct vkd3d_queue *queue; VkSubmitInfo submit_info; + uint64_t wait_value; VkQueue vk_queue; VkResult vr;
vk_procs = &command_queue->device->vk_procs; queue = command_queue->vkd3d_queue;
+ if (!command_queue->device->vk_info.KHR_timeline_semaphore) + return d3d12_command_queue_wait_binary_semaphore_locked(command_queue, fence, value); + + wait_value = d3d12_fence_get_timeline_wait_value_locked(fence, value); + + /* We can unlock the fence here. The queue semaphore will not be signalled to signal_value + * until we have submitted, so the semaphore cannot be destroyed before the call to vkQueueSubmit. */ + vkd3d_mutex_unlock(&fence->mutex); + assert(fence->timeline_semaphore); timeline_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; timeline_submit_info.pNext = NULL; + timeline_submit_info.waitSemaphoreValueCount = 1; + timeline_submit_info.pWaitSemaphoreValues = &wait_value; timeline_submit_info.signalSemaphoreValueCount = 0; timeline_submit_info.pSignalSemaphoreValues = NULL; - timeline_submit_info.waitSemaphoreValueCount = 1; - timeline_submit_info.pWaitSemaphoreValues = &value; - - if (!(vk_queue = vkd3d_queue_acquire(queue))) - { - ERR("Failed to acquire queue %p.\n", queue); - return E_FAIL; - }
submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submit_info.pNext = &timeline_submit_info; @@ -6701,14 +6756,11 @@ static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_ submit_info.signalSemaphoreCount = 0; submit_info.pSignalSemaphores = NULL;
- ++queue->pending_wait_completion_value; - - submit_info.signalSemaphoreCount = 1; - submit_info.pSignalSemaphores = &queue->wait_completion_semaphore; - timeline_submit_info.signalSemaphoreValueCount = 1; - timeline_submit_info.pSignalSemaphoreValues = &queue->pending_wait_completion_value; - - d3d12_fence_update_gpu_wait(fence, queue); + if (!(vk_queue = vkd3d_queue_acquire(queue))) + { + ERR("Failed to acquire queue %p.\n", queue); + return E_FAIL; + }
vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE));
@@ -6728,14 +6780,58 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Wait(ID3D12CommandQueue *if { struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface); struct d3d12_fence *fence = unsafe_impl_from_ID3D12Fence(fence_iface); + struct vkd3d_cs_op_data *op; + HRESULT hr = S_OK; + int rc;
TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value);
- if (command_queue->device->use_timeline_semaphores) - return d3d12_command_queue_wait_timeline_semaphore(command_queue, fence, value); + if ((rc = vkd3d_mutex_lock(&command_queue->op_mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return hresult_from_errno(rc); + } + if ((rc = vkd3d_mutex_lock(&fence->mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + hr = hresult_from_errno(rc); + goto done; + }
- FIXME_ONCE("KHR_timeline_semaphore is not available or incompatible. Some wait commands may be unsupported.\n"); - return d3d12_command_queue_wait_binary_semaphore(command_queue, fence, value); + if (!command_queue->ops_count && value <= fence->max_pending_value) + { + hr = d3d12_command_queue_wait_locked(command_queue, fence, value); + goto done; + } + + vkd3d_mutex_unlock(&fence->mutex); + + /* This is the critical part required to support out-of-order signal. + * Normally we would be able to submit waits and signals out of order, but + * we don't have virtualized queues in Vulkan, so we need to handle the case + * where multiple queues alias over the same physical queue, so effectively, + * we need to manage out-of-order submits ourselves. */ + + if (!command_queue->ops_count) + hr = d3d12_device_add_blocked_command_queues(command_queue->device, &command_queue, 1); + + if (FAILED(hr)) + goto done; + + if (!(op = d3d12_command_queue_require_space_locked(command_queue))) + { + hr = E_OUTOFMEMORY; + goto done; + } + op->opcode = VKD3D_CS_OP_WAIT; + op->u.wait.fence = fence; + op->u.wait.value = value; + + d3d12_fence_incref(fence); + +done: + vkd3d_mutex_unlock(&command_queue->op_mutex); + return hr; }
static HRESULT STDMETHODCALLTYPE d3d12_command_queue_GetTimestampFrequency(ID3D12CommandQueue *iface, @@ -6859,10 +6955,82 @@ static const struct ID3D12CommandQueueVtbl d3d12_command_queue_vtbl = d3d12_command_queue_GetDesc, };
+/* flushed_any is initialised by the caller. */ +static bool d3d12_command_queue_flush_ops(struct d3d12_command_queue *queue, bool *flushed_any) +{ + struct vkd3d_cs_op_data *op; + struct d3d12_fence *fence; + bool flushed_all = false; + unsigned int i; + int rc; + + if (!queue->ops_count) + return true; + + /* This function may be re-entered during a call below to d3d12_command_queue_signal(). + * We return true because the first caller is responsible for re-adding this queue to + * the flush list if it ends up returning false. */ + if (queue->is_flushing) + return true; + + if ((rc = vkd3d_mutex_lock(&queue->op_mutex))) + { + ERR("Failed to lock mutex, error %d.\n", rc); + return true; + } + + /* Currently only required for d3d12_command_queue_signal(), but set it here anyway. */ + queue->is_flushing = true; + + for (i = 0; i < queue->ops_count; ++i) + { + op = &queue->ops[i]; + switch (op->opcode) + { + case VKD3D_CS_OP_WAIT: + fence = op->u.wait.fence; + if (op->u.wait.value > fence->max_pending_value) + { + queue->ops_count -= i; + memmove(queue->ops, op, queue->ops_count * sizeof(*op)); + goto done; + } + vkd3d_mutex_lock(&fence->mutex); + d3d12_command_queue_wait_locked(queue, fence, op->u.wait.value); + d3d12_fence_decref(fence); + break; + + case VKD3D_CS_OP_SIGNAL: + d3d12_command_queue_signal(queue, op->u.signal.fence, op->u.signal.value); + d3d12_fence_decref(op->u.signal.fence); + break; + + case VKD3D_CS_OP_EXECUTE: + d3d12_command_queue_execute(queue, op->u.execute.buffers, op->u.execute.buffer_count); + break; + + default: + FIXME("Unhandled op type %u.\n", op->opcode); + break; + } + *flushed_any |= true; + } + + queue->ops_count = 0; + flushed_all = true; + +done: + queue->is_flushing = false; + + vkd3d_mutex_unlock(&queue->op_mutex); + return flushed_all; +} + static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue, struct d3d12_device *device, const D3D12_COMMAND_QUEUE_DESC *desc) { HRESULT hr; + int rc;
queue->ID3D12CommandQueue_iface.lpVtbl = &d3d12_command_queue_vtbl; queue->refcount = 1; @@ -6877,6 +7045,11 @@ static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue, queue->last_waited_fence = NULL; queue->last_waited_fence_value = 0;
+ queue->ops = NULL; + queue->ops_count = 0; + queue->ops_size = 0; + queue->is_flushing = false; + if (desc->Priority == D3D12_COMMAND_QUEUE_PRIORITY_GLOBAL_REALTIME) { FIXME("Global realtime priority is not implemented.\n"); @@ -6891,15 +7064,24 @@ static HRESULT d3d12_command_queue_init(struct d3d12_command_queue *queue, if (FAILED(hr = vkd3d_private_store_init(&queue->private_store))) return hr;
- if (FAILED(hr = vkd3d_fence_worker_start(&queue->fence_worker, queue->vkd3d_queue, device))) + if ((rc = vkd3d_mutex_init(&queue->op_mutex)) < 0) { - vkd3d_private_store_destroy(&queue->private_store); - return hr; + hr = hresult_from_errno(rc); + goto fail_destroy_private_store; }
+ if (FAILED(hr = vkd3d_fence_worker_start(&queue->fence_worker, queue->vkd3d_queue, device))) + goto fail_destroy_op_mutex; + d3d12_device_add_ref(queue->device = device);
return S_OK; + +fail_destroy_op_mutex: + vkd3d_mutex_destroy(&queue->op_mutex); +fail_destroy_private_store: + vkd3d_private_store_destroy(&queue->private_store); + return hr; }
HRESULT d3d12_command_queue_create(struct d3d12_device *device, @@ -6934,8 +7116,12 @@ uint32_t vkd3d_get_vk_queue_family_index(ID3D12CommandQueue *queue) VkQueue vkd3d_acquire_vk_queue(ID3D12CommandQueue *queue) { struct d3d12_command_queue *d3d12_queue = impl_from_ID3D12CommandQueue(queue); + VkQueue vk_queue = vkd3d_queue_acquire(d3d12_queue->vkd3d_queue); + + if (d3d12_queue->ops_count) + WARN("Acquired command queue %p with %zu remaining ops.\n", d3d12_queue, d3d12_queue->ops_count);
- return vkd3d_queue_acquire(d3d12_queue->vkd3d_queue); + return vk_queue; }
void vkd3d_release_vk_queue(ID3D12CommandQueue *queue) diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c index 5f8108ec..eaedc444 100644 --- a/libs/vkd3d/device.c +++ b/libs/vkd3d/device.c @@ -747,7 +747,6 @@ struct vkd3d_physical_device_info VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT texel_buffer_alignment_properties; VkPhysicalDeviceTransformFeedbackPropertiesEXT xfb_properties; VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT vertex_divisor_properties; - VkPhysicalDeviceTimelineSemaphorePropertiesKHR timeline_semaphore_properties;
VkPhysicalDeviceProperties2KHR properties2;
@@ -772,7 +771,6 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i VkPhysicalDeviceDescriptorIndexingPropertiesEXT *descriptor_indexing_properties; VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *vertex_divisor_properties; VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *buffer_alignment_properties; - VkPhysicalDeviceTimelineSemaphorePropertiesKHR *timeline_semaphore_properties; VkPhysicalDeviceDescriptorIndexingFeaturesEXT *descriptor_indexing_features; VkPhysicalDeviceRobustness2FeaturesEXT *robustness2_features; VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *vertex_divisor_features; @@ -799,7 +797,6 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i vertex_divisor_features = &info->vertex_divisor_features; vertex_divisor_properties = &info->vertex_divisor_properties; timeline_semaphore_features = &info->timeline_semaphore_features; - timeline_semaphore_properties = &info->timeline_semaphore_properties; xfb_features = &info->xfb_features; xfb_properties = &info->xfb_properties;
@@ -841,8 +838,6 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i vk_prepend_struct(&info->properties2, xfb_properties); vertex_divisor_properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT; vk_prepend_struct(&info->properties2, vertex_divisor_properties); - timeline_semaphore_properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR; - vk_prepend_struct(&info->properties2, timeline_semaphore_properties);
if (vulkan_info->KHR_get_physical_device_properties2) VK_CALL(vkGetPhysicalDeviceProperties2KHR(physical_device, &info->properties2)); @@ -1431,7 +1426,6 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device, vulkan_info->rasterization_stream = physical_device_info->xfb_properties.transformFeedbackRasterizationStreamSelect; vulkan_info->transform_feedback_queries = physical_device_info->xfb_properties.transformFeedbackQueries; vulkan_info->max_vertex_attrib_divisor = max(physical_device_info->vertex_divisor_properties.maxVertexAttribDivisor, 1); - vulkan_info->timeline_semaphore_properties = physical_device_info->timeline_semaphore_properties;
device->feature_options.DoublePrecisionFloatShaderOps = features->shaderFloat64; device->feature_options.OutputMergerLogicOp = features->logicOp; @@ -1908,75 +1902,6 @@ static bool d3d12_is_64k_msaa_supported(struct d3d12_device *device) && info.Alignment <= 0x10000; }
-/* A lower value can be signalled on a D3D12 fence. Vulkan timeline semaphores - * do not support this, but test if it works anyway. */ -static bool d3d12_is_timeline_semaphore_supported(const struct d3d12_device *device) -{ - const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; - VkTimelineSemaphoreSubmitInfoKHR timeline_submit_info; - VkSemaphore timeline_semaphore; - VkSubmitInfo submit_info; - bool result = false; - uint64_t value = 0; - VkQueue vk_queue; - VkResult vr; - - if (!device->vk_info.KHR_timeline_semaphore) - return false; - - if ((vr = vkd3d_create_timeline_semaphore(device, 1, &timeline_semaphore)) < 0) - { - WARN("Failed to create timeline semaphore, vr %d.\n", vr); - return false; - } - - if (!(vk_queue = vkd3d_queue_acquire(device->direct_queue))) - { - ERR("Failed to acquire queue %p.\n", device->direct_queue); - VK_CALL(vkDestroySemaphore(device->vk_device, timeline_semaphore, NULL)); - return false; - } - - submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_info.pNext = &timeline_submit_info; - submit_info.waitSemaphoreCount = 0; - submit_info.pWaitSemaphores = NULL; - submit_info.pWaitDstStageMask = NULL; - submit_info.commandBufferCount = 0; - submit_info.pCommandBuffers = NULL; - submit_info.signalSemaphoreCount = 1; - submit_info.pSignalSemaphores = &timeline_semaphore; - - timeline_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; - timeline_submit_info.pNext = NULL; - timeline_submit_info.pSignalSemaphoreValues = &value; - timeline_submit_info.signalSemaphoreValueCount = 1; - timeline_submit_info.waitSemaphoreValueCount = 0; - timeline_submit_info.pWaitSemaphoreValues = NULL; - - vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE)); - - if (vr >= 0) - { - if ((vr = VK_CALL(vkQueueWaitIdle(vk_queue))) < 0) - WARN("Failed to wait for queue, vr %d.\n", vr); - - if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, timeline_semaphore, &value))) < 0) - ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr); - else if (!(result = !value)) - WARN("Disabling timeline semaphore use due to incompatible behaviour.\n"); - } - else - { - WARN("Failed to submit signal operation, vr %d.\n", vr); - } - - vkd3d_queue_release(device->direct_queue); - VK_CALL(vkDestroySemaphore(device->vk_device, timeline_semaphore, NULL)); - - return result; -} - static HRESULT vkd3d_create_vk_device(struct d3d12_device *device, const struct vkd3d_device_create_info *create_info) { @@ -2075,10 +2000,6 @@ static HRESULT vkd3d_create_vk_device(struct d3d12_device *device, }
device->feature_options4.MSAA64KBAlignedTextureSupported = d3d12_is_64k_msaa_supported(device); - device->use_timeline_semaphores = d3d12_is_timeline_semaphore_supported(device) - && vkd3d_queue_init_timeline_semaphore(device->direct_queue, device) - && vkd3d_queue_init_timeline_semaphore(device->compute_queue, device) - && vkd3d_queue_init_timeline_semaphore(device->copy_queue, device);
TRACE("Created Vulkan device %p.\n", vk_device);
@@ -4362,6 +4283,8 @@ static HRESULT d3d12_device_init(struct d3d12_device *device, vkd3d_gpu_va_allocator_init(&device->gpu_va_allocator); vkd3d_time_domains_init(device);
+ device->blocked_queue_count = 0; + for (i = 0; i < ARRAY_SIZE(device->desc_mutex); ++i) vkd3d_mutex_init(&device->desc_mutex[i]);
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 4e03145d..f00181a2 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -59,7 +59,7 @@ #define VKD3D_MAX_SHADER_EXTENSIONS 3u #define VKD3D_MAX_SHADER_STAGES 5u #define VKD3D_MAX_VK_SYNC_OBJECTS 4u -#define VKD3D_MAX_FENCE_WAITING_QUEUES 4u +#define VKD3D_MAX_DEVICE_BLOCKED_QUEUES 16u #define VKD3D_MAX_DESCRIPTOR_SETS 64u /* D3D12 binding tier 3 has a limit of 2048 samplers. */ #define VKD3D_MAX_DESCRIPTOR_SET_SAMPLERS 2048u @@ -152,8 +152,6 @@ struct vkd3d_vulkan_info
VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT texel_buffer_alignment_properties;
- VkPhysicalDeviceTimelineSemaphorePropertiesKHR timeline_semaphore_properties; - unsigned int shader_extension_count; enum vkd3d_shader_spirv_extension shader_extensions[VKD3D_MAX_SHADER_EXTENSIONS];
@@ -502,15 +500,17 @@ HRESULT vkd3d_set_private_data_interface(struct vkd3d_private_store *store, cons struct vkd3d_signaled_semaphore { uint64_t value; - VkSemaphore vk_semaphore; - VkFence vk_fence; - bool is_acquired; -}; - -struct vkd3d_pending_fence_wait -{ - const struct vkd3d_queue *queue; - uint64_t pending_value; + union + { + struct + { + VkSemaphore vk_semaphore; + VkFence vk_fence; + bool is_acquired; + } binary; + uint64_t timeline_value; + } u; + const struct vkd3d_queue *signalling_queue; };
/* ID3D12Fence */ @@ -521,6 +521,7 @@ struct d3d12_fence LONG refcount;
uint64_t value; + uint64_t max_pending_value; struct vkd3d_mutex mutex; struct vkd3d_cond null_event_cond;
@@ -534,9 +535,8 @@ struct d3d12_fence size_t event_count;
VkSemaphore timeline_semaphore; + uint64_t timeline_value; uint64_t pending_timeline_value; - struct vkd3d_pending_fence_wait gpu_waits[VKD3D_MAX_FENCE_WAITING_QUEUES]; - unsigned int gpu_wait_count;
struct vkd3d_signaled_semaphore *semaphores; size_t semaphores_size; @@ -1294,9 +1294,6 @@ struct vkd3d_queue VkQueueFlags vk_queue_flags; uint32_t timestamp_bits;
- VkSemaphore wait_completion_semaphore; - uint64_t pending_wait_completion_value; - struct { VkSemaphore vk_semaphore; @@ -1311,10 +1308,45 @@ struct vkd3d_queue VkQueue vkd3d_queue_acquire(struct vkd3d_queue *queue); HRESULT vkd3d_queue_create(struct d3d12_device *device, uint32_t family_index, const VkQueueFamilyProperties *properties, struct vkd3d_queue **queue); -bool vkd3d_queue_init_timeline_semaphore(struct vkd3d_queue *queue, struct d3d12_device *device); void vkd3d_queue_destroy(struct vkd3d_queue *queue, struct d3d12_device *device); void vkd3d_queue_release(struct vkd3d_queue *queue);
+enum vkd3d_cs_op +{ + VKD3D_CS_OP_WAIT, + VKD3D_CS_OP_SIGNAL, + VKD3D_CS_OP_EXECUTE, +}; + +struct vkd3d_cs_wait +{ + struct d3d12_fence *fence; + uint64_t value; +}; + +struct vkd3d_cs_signal +{ + struct d3d12_fence *fence; + uint64_t value; +}; + +struct vkd3d_cs_execute +{ + VkCommandBuffer *buffers; + unsigned int buffer_count; +}; + +struct vkd3d_cs_op_data +{ + enum vkd3d_cs_op opcode; + union + { + struct vkd3d_cs_wait wait; + struct vkd3d_cs_signal signal; + struct vkd3d_cs_execute execute; + } u; +}; + /* ID3D12CommandQueue */ struct d3d12_command_queue { @@ -1331,6 +1363,12 @@ struct d3d12_command_queue
struct d3d12_device *device;
+ struct vkd3d_mutex op_mutex; + struct vkd3d_cs_op_data *ops; + size_t ops_count; + size_t ops_size; + bool is_flushing; + struct vkd3d_private_store private_store; };
@@ -1452,6 +1490,9 @@ struct d3d12_device unsigned int queue_family_count; VkTimeDomainEXT vk_host_time_domain;
+ struct d3d12_command_queue *blocked_queues[VKD3D_MAX_DEVICE_BLOCKED_QUEUES]; + unsigned int blocked_queue_count; + struct vkd3d_instance *vkd3d_instance;
IUnknown *parent; @@ -1470,7 +1511,6 @@ struct d3d12_device VkDescriptorPoolSize vk_pool_sizes[VKD3D_DESCRIPTOR_POOL_COUNT]; struct vkd3d_vk_descriptor_heap_layout vk_descriptor_heap_layouts[VKD3D_SET_INDEX_COUNT]; bool use_vk_heaps; - bool use_timeline_semaphores; };
HRESULT d3d12_device_create(struct vkd3d_instance *instance, diff --git a/tests/d3d12.c b/tests/d3d12.c index 015c3122..5f83a373 100644 --- a/tests/d3d12.c +++ b/tests/d3d12.c @@ -33224,9 +33224,7 @@ static void test_queue_wait(void) command_list = context.list; queue = context.queue;
- /* 'queue2' must not map to the same command queue as 'queue', or Wait() before GPU signal will fail. - * Using a compute queue fixes this on most hardware, but it may still fail on low spec hardware. */ - queue2 = create_command_queue(device, D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL); + queue2 = create_command_queue(device, D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL);
event = create_event(); ok(event, "Failed to create event.\n");