New subject: [PATCH vkd3d 2/2] tests: Test signalling a fence to a lower value after a GPU wait.

9 Mar 2022

D3D12 supports signalling a fence to a lower value, while Vulkan timeline
semaphores do not. On the GPU side this is handled by simply submitting
the signal anyway, because working around this is impractical and it
currently works. For CPU signals the Vulkan semaphore is replaced with a
new one at the lower value only if no waits and/or signals are pending on
the GPU. Otherwise, a fixme is emitted.
Partly based on a vkd3d-proton patch by Hans-Kristian Arntzen (not
including the handling of lower fence values).
The old implementation is used if KHR_timeline_semaphore is not
available.
Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com>
---
 libs/vkd3d/command.c       | 562 ++++++++++++++++++++++++++++++++++---
 libs/vkd3d/device.c        |  14 +
 libs/vkd3d/vkd3d_private.h |  25 ++
 libs/vkd3d/vulkan_procs.h  |   5 +
 tests/d3d12.c              |  11 +-
 5 files changed, 576 insertions(+), 41 deletions(-)

diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c
index e7375fb8..cef6d022 100644
--- a/libs/vkd3d/command.c
+++ b/libs/vkd3d/command.c
@@ -22,11 +22,31 @@
static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence);
+/* Create a Vulkan timeline semaphore starting at initial_value.
+ *
+ * The new handle is written to *timeline_semaphore by vkCreateSemaphore(); the
+ * VkResult of that call is returned unchanged. */
+static VkResult vkd3d_create_timeline_semaphore(const struct d3d12_device *device, uint64_t initial_value,
+        VkSemaphore *timeline_semaphore)
+{
+    /* NOTE(review): VK_CALL presumably dispatches through a local named vk_procs - keep the name. */
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+    VkSemaphoreTypeCreateInfoKHR timeline_info;
+    VkSemaphoreCreateInfo create_info;
+    VkResult vr;
+
+    /* Extension structure selecting the timeline type and its starting payload. */
+    timeline_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR;
+    timeline_info.pNext = NULL;
+    timeline_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR;
+    timeline_info.initialValue = initial_value;
+
+    /* Base create info, chained to the timeline extension structure. */
+    create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    create_info.pNext = &timeline_info;
+    create_info.flags = 0;
+
+    vr = VK_CALL(vkCreateSemaphore(device->vk_device, &create_info, NULL, timeline_semaphore));
+    return vr;
+}
+
 HRESULT vkd3d_queue_create(struct d3d12_device *device,
         uint32_t family_index, const VkQueueFamilyProperties *properties, struct vkd3d_queue **queue)
 {
     const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
     struct vkd3d_queue *object;
+    VkResult vr;
     int rc;
if (!(object = vkd3d_malloc(sizeof(*object))))
@@ -46,6 +66,15 @@ HRESULT vkd3d_queue_create(struct d3d12_device *device,
     object->vk_queue_flags = properties->queueFlags;
     object->timestamp_bits = properties->timestampValidBits;
+    object->wait_completion_semaphore = VK_NULL_HANDLE;
+    object->pending_wait_completion_value = 0;
+    if (device->vk_info.KHR_timeline_semaphore && (vr = vkd3d_create_timeline_semaphore(device, 0,
+            &object->wait_completion_semaphore)) < 0)
+    {
+        WARN("Failed to create timeline semaphore, vr %d.\n", vr);
+        return hresult_from_vk_result(vr);
+    }
+
     object->semaphores = NULL;
     object->semaphores_size = 0;
     object->semaphore_count = 0;
@@ -75,6 +104,8 @@ void vkd3d_queue_destroy(struct vkd3d_queue *queue, struct d3d12_device *device)
vkd3d_free(queue->semaphores);
+    VK_CALL(vkDestroySemaphore(device->vk_device, queue->wait_completion_semaphore, NULL));
+
     for (i = 0; i < ARRAY_SIZE(queue->old_vk_semaphores); ++i)
     {
         if (queue->old_vk_semaphores[i])
@@ -268,6 +299,7 @@ static HRESULT vkd3d_enqueue_gpu_fence(struct vkd3d_fence_worker *worker,
     }
worker->enqueued_fences[worker->enqueued_fence_count].vk_fence = vk_fence;
+    worker->enqueued_fences[worker->enqueued_fence_count].vk_semaphore = VK_NULL_HANDLE;
     waiting_fence = &worker->enqueued_fences[worker->enqueued_fence_count].waiting_fence;
     waiting_fence->fence = fence;
     waiting_fence->value = value;
@@ -317,6 +349,7 @@ static void vkd3d_fence_worker_remove_fence(struct vkd3d_fence_worker *worker, s
 static void vkd3d_fence_worker_move_enqueued_fences_locked(struct vkd3d_fence_worker *worker)
 {
     unsigned int i;
+    bool timeline;
     size_t count;
     bool ret;
@@ -325,8 +358,18 @@ static void vkd3d_fence_worker_move_enqueued_fences_locked(struct vkd3d_fence_wo
count = worker->fence_count + worker->enqueued_fence_count;
-    ret = vkd3d_array_reserve((void **)&worker->vk_fences, &worker->vk_fences_size,
-            count, sizeof(*worker->vk_fences));
+    if ((timeline = worker->device->vk_info.KHR_timeline_semaphore))
+    {
+        ret = vkd3d_array_reserve((void **) &worker->vk_semaphores, &worker->vk_semaphores_size,
+                count, sizeof(*worker->vk_semaphores));
+        ret &= vkd3d_array_reserve((void **) &worker->semaphore_wait_values, &worker->semaphore_wait_values_size,
+                count, sizeof(*worker->semaphore_wait_values));
+    }
+    else
+    {
+        ret = vkd3d_array_reserve((void **)&worker->vk_fences, &worker->vk_fences_size,
+                count, sizeof(*worker->vk_fences));
+    }
     ret &= vkd3d_array_reserve((void **)&worker->fences, &worker->fences_size,
             count, sizeof(*worker->fences));
     if (!ret)
@@ -339,7 +382,16 @@ static void vkd3d_fence_worker_move_enqueued_fences_locked(struct vkd3d_fence_wo
     {
         struct vkd3d_enqueued_fence *current = &worker->enqueued_fences[i];
-        worker->vk_fences[worker->fence_count] = current->vk_fence;
+        if (timeline)
+        {
+            worker->vk_semaphores[worker->fence_count] = current->vk_semaphore;
+            worker->semaphore_wait_values[worker->fence_count] = current->waiting_fence.value;
+        }
+        else
+        {
+            worker->vk_fences[worker->fence_count] = current->vk_fence;
+        }
+
         worker->fences[worker->fence_count] = current->waiting_fence;
         ++worker->fence_count;
     }
@@ -347,6 +399,66 @@ static void vkd3d_fence_worker_move_enqueued_fences_locked(struct vkd3d_fence_wo
     worker->enqueued_fence_count = 0;
 }
+/* Fence worker wait routine used when timeline semaphores are available.
+ *
+ * Blocks until at least one tracked semaphore reaches its awaited value, then
+ * signals every tracked D3D12 fence whose semaphore counter has reached the
+ * value it was waiting for and removes that entry from the worker's arrays.
+ * Entries that are not yet complete, or whose counter could not be queried,
+ * stay tracked. */
+static void vkd3d_wait_for_gpu_timeline_semaphores(struct vkd3d_fence_worker *worker)
+{
+    const struct d3d12_device *device = worker->device;
+    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+    VkSemaphoreWaitInfoKHR wait_info;
+    VkSemaphore vk_semaphore;
+    uint64_t counter_value;
+    unsigned int i, j;
+    HRESULT hr;
+    int vr;
+
+    /* Nothing is being tracked, so there is nothing to wait on. */
+    if (!worker->fence_count)
+        return;
+
+    /* Wait for ANY of the semaphores: vk_semaphores[i] is paired with semaphore_wait_values[i]. */
+    wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR;
+    wait_info.pNext = NULL;
+    wait_info.flags = VK_SEMAPHORE_WAIT_ANY_BIT_KHR;
+    wait_info.pSemaphores = worker->vk_semaphores;
+    wait_info.semaphoreCount = worker->fence_count;
+    wait_info.pValues = worker->semaphore_wait_values;
+
+    /* The timeout argument is the maximum uint64_t value. */
+    vr = VK_CALL(vkWaitSemaphoresKHR(device->vk_device, &wait_info, ~(uint64_t)0));
+    if (vr == VK_TIMEOUT)
+        return;
+    if (vr != VK_SUCCESS)
+    {
+        ERR("Failed to wait for Vulkan timeline semaphores, vr %d.\n", vr);
+        return;
+    }
+
+    /* Compact the arrays in place: i scans every entry, j is the next slot for an entry that is kept. */
+    for (i = 0, j = 0; i < worker->fence_count; ++i)
+    {
+        struct vkd3d_waiting_fence *current = &worker->fences[i];
+
+        vk_semaphore = worker->vk_semaphores[i];
+        if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, vk_semaphore, &counter_value))) < 0)
+        {
+            /* The query failed; fall through so the entry remains tracked. */
+            ERR("Failed to get Vulkan semaphore value, vr %d.\n", vr);
+        }
+        else if (counter_value >= current->value)
+        {
+            /* Completed: the fence is signalled with the observed counter value, not the awaited one. */
+            TRACE("Signaling fence %p value %#"PRIx64".\n", current->fence, current->value);
+            if (FAILED(hr = d3d12_fence_signal(current->fence, counter_value, VK_NULL_HANDLE)))
+                ERR("Failed to signal D3D12 fence, hr %#x.\n", hr);
+
+            /* Balances the increment performed when the entry was enqueued. */
+            InterlockedDecrement(&current->fence->pending_worker_operation_count);
+            continue;
+        }
+
+        /* Keep this entry, moving it down over any removed entries. */
+        if (i != j)
+        {
+            worker->vk_semaphores[j] = worker->vk_semaphores[i];
+            worker->semaphore_wait_values[j] = worker->semaphore_wait_values[i];
+            worker->fences[j] = worker->fences[i];
+        }
+        ++j;
+    }
+    worker->fence_count = j;
+}
+
 static void vkd3d_wait_for_gpu_fences(struct vkd3d_fence_worker *worker)
 {
     struct d3d12_device *device = worker->device;
@@ -408,7 +520,7 @@ static void *vkd3d_fence_worker_main(void *arg)
for (;;)
     {
-        vkd3d_wait_for_gpu_fences(worker);
+        worker->wait_for_gpu_fences(worker);
if (!worker->fence_count || InterlockedAdd(&worker->enqueued_fence_count, 0))
         {
@@ -473,6 +585,13 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker,
     worker->vk_fences_size = 0;
     worker->fences = NULL;
     worker->fences_size = 0;
+    worker->vk_semaphores = NULL;
+    worker->vk_semaphores_size = 0;
+    worker->semaphore_wait_values = NULL;
+    worker->semaphore_wait_values_size = 0;
+
+    worker->wait_for_gpu_fences = device->vk_info.KHR_timeline_semaphore
+            ? vkd3d_wait_for_gpu_timeline_semaphores : vkd3d_wait_for_gpu_fences;
if ((rc = vkd3d_mutex_init(&worker->mutex)))
     {
@@ -535,6 +654,8 @@ HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker,
     vkd3d_free(worker->enqueued_fences);
     vkd3d_free(worker->vk_fences);
     vkd3d_free(worker->fences);
+    vkd3d_free(worker->vk_semaphores);
+    vkd3d_free(worker->semaphore_wait_values);
return S_OK;
 }
@@ -684,6 +805,7 @@ static void d3d12_fence_destroy_vk_objects(struct d3d12_fence *fence)
     }
d3d12_fence_garbage_collect_vk_semaphores_locked(fence, true);
+    VK_CALL(vkDestroySemaphore(device->vk_device, fence->timeline_semaphore, NULL));
vkd3d_mutex_unlock(&fence->mutex);
 }
@@ -802,31 +924,21 @@ static HRESULT d3d12_fence_add_vk_semaphore(struct d3d12_fence *fence,
     return hr;
 }
-static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence)
+static bool d3d12_fence_signal_external_events_locked(struct d3d12_fence *fence)
 {
     struct d3d12_device *device = fence->device;
-    struct vkd3d_signaled_semaphore *current;
     bool signal_null_event_cond = false;
     unsigned int i, j;
-    int rc;
-
-    if ((rc = vkd3d_mutex_lock(&fence->mutex)))
-    {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return hresult_from_errno(rc);
-    }
-
-    fence->value = value;
for (i = 0, j = 0; i < fence->event_count; ++i)
     {
         struct vkd3d_waiting_event *current = &fence->events[i];
-        if (current->value <= value)
+        if (current->value <= fence->value)
         {
             if (current->event)
             {
-                fence->device->signal_event(current->event);
+                device->signal_event(current->event);
             }
             else
             {
@@ -841,9 +953,28 @@ static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkF
             ++j;
         }
     }
+
     fence->event_count = j;
-    if (signal_null_event_cond)
+    return signal_null_event_cond;
+}
+
+static HRESULT d3d12_fence_signal(struct d3d12_fence *fence, uint64_t value, VkFence vk_fence)
+{
+    struct d3d12_device *device = fence->device;
+    struct vkd3d_signaled_semaphore *current;
+    unsigned int i;
+    int rc;
+
+    if ((rc = vkd3d_mutex_lock(&fence->mutex)))
+    {
+        ERR("Failed to lock mutex, error %d.\n", rc);
+        return hresult_from_errno(rc);
+    }
+
+    fence->value = value;
+
+    if (d3d12_fence_signal_external_events_locked(fence))
         vkd3d_cond_broadcast(&fence->null_event_cond);
if (vk_fence)
@@ -1069,12 +1200,158 @@ static HRESULT STDMETHODCALLTYPE d3d12_fence_SetEventOnCompletion(ID3D12Fence *i
     return S_OK;
 }
+/* Check whether the GPU wait recorded in fence->gpu_waits[i] has finished.
+ *
+ * The wait counts as finished once the owning queue's wait completion
+ * semaphore has reached the entry's pending_value. If the counter cannot be
+ * queried, an error is logged and the wait is reported as finished. */
+static inline bool d3d12_fence_gpu_wait_is_completed(const struct d3d12_fence *fence, unsigned int i)
+{
+    const struct vkd3d_vk_device_procs *vk_procs = &fence->device->vk_procs;
+    uint64_t completed_value;
+    VkResult vr;
+
+    vr = VK_CALL(vkGetSemaphoreCounterValueKHR(fence->device->vk_device,
+            fence->gpu_waits[i].queue->wait_completion_semaphore, &completed_value));
+    if (vr < 0)
+    {
+        ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr);
+        return true;
+    }
+
+    return completed_value >= fence->gpu_waits[i].pending_value;
+}
+
+/* Report whether GPU waits or signals involving the fence are still outstanding.
+ *
+ * Completed entries are pruned from fence->gpu_waits first. If any wait
+ * remains the result is true. Otherwise the fence's pending timeline value is
+ * compared against fence->value and, if that looks pending, against the actual
+ * semaphore counter. A failed counter query is logged and treated as "nothing
+ * pending".
+ *
+ * The only caller in view invokes this with fence->mutex held. */
+static inline bool d3d12_fence_has_pending_gpu_ops_locked(struct d3d12_fence *fence)
+{
+    const struct d3d12_device *device = fence->device;
+    const struct vkd3d_vk_device_procs *vk_procs;
+    uint64_t value;
+    unsigned int i;
+    VkResult vr;
+
+    /* Prune completed waits. A removed entry is replaced by the last one, so the
+     * index is only advanced when the current entry is kept; otherwise the entry
+     * that was just moved into slot i would never be examined. */
+    i = 0;
+    while (i < fence->gpu_wait_count)
+    {
+        if (!d3d12_fence_gpu_wait_is_completed(fence, i))
+        {
+            ++i;
+            continue;
+        }
+
+        if (i < --fence->gpu_wait_count)
+            fence->gpu_waits[i] = fence->gpu_waits[fence->gpu_wait_count];
+    }
+    if (fence->gpu_wait_count)
+        return true;
+
+    /* Check for pending signals too. */
+    if (fence->value >= fence->pending_timeline_value)
+        return false;
+
+    vk_procs = &device->vk_procs;
+
+    /* Check the actual semaphore value in case fence->value update is lagging. */
+    if ((vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, fence->timeline_semaphore, &value))) < 0)
+    {
+        ERR("Failed to get Vulkan semaphore status, vr %d.\n", vr);
+        return false;
+    }
+
+    return value < fence->pending_timeline_value;
+}
+
+/* Replace the VkSemaphore with a new one to allow a lower value to be set. Ideally apps will
+ * only use this to reset the fence when no operations are pending on the queue.
+ *
+ * Returns VK_ERROR_UNKNOWN, leaving the fence untouched, if GPU waits or
+ * signals are still pending; returns the creation error if the replacement
+ * semaphore cannot be created. On success fence->value and
+ * fence->pending_timeline_value are both set to value, the old semaphore is
+ * destroyed and VK_SUCCESS is returned.
+ *
+ * The only caller in view invokes this with fence->mutex held. */
+static VkResult d3d12_fence_reinit_timeline_semaphore_locked(struct d3d12_fence *fence, uint64_t value)
+{
+    const struct d3d12_device *device = fence->device;
+    const struct vkd3d_vk_device_procs *vk_procs;
+    VkSemaphore timeline_semaphore;
+    VkResult vr;
+
+    if (d3d12_fence_has_pending_gpu_ops_locked(fence))
+    {
+        /* This situation is not very likely because it means a fence with pending waits and/or signals was
+         * signalled on the CPU to a lower value. For now, emit a fixme so it can be patched if necessary.
+         * A patch already exists for this but it's not pretty. */
+        FIXME("Unable to re-initialise timeline semaphore to a lower value due to pending GPU ops.\n");
+        return VK_ERROR_UNKNOWN;
+    }
+
+    /* Create the replacement first so that a failure leaves the existing semaphore in place. */
+    if ((vr = vkd3d_create_timeline_semaphore(device, value, &timeline_semaphore)) < 0)
+    {
+        WARN("Failed to create timeline semaphore, vr %d.\n", vr);
+        return vr;
+    }
+
+    fence->value = value;
+    fence->pending_timeline_value = value;
+
+    WARN("Replacing timeline semaphore with a new object.\n");
+
+    vk_procs = &device->vk_procs;
+
+    /* Destroy the old semaphore and install the new one, which starts at the requested value. */
+    VK_CALL(vkDestroySemaphore(device->vk_device, fence->timeline_semaphore, NULL));
+    fence->timeline_semaphore = timeline_semaphore;
+
+    return VK_SUCCESS;
+}
+
+/* Signal the fence from the CPU when it is backed by a timeline semaphore.
+ *
+ * A value above fence->value is signalled on the semaphore directly. A value
+ * below it cannot be expressed on a timeline semaphore, so the semaphore is
+ * re-created at the lower value, which fails while GPU operations are
+ * pending. An equal value changes nothing on the semaphore. In every case the
+ * events registered on the fence are re-evaluated against fence->value, and
+ * waiters on null_event_cond are woken when the helper asks for it.
+ *
+ * Returns the HRESULT for the Vulkan result of the signal/re-initialisation,
+ * or the mutex lock error. */
+static HRESULT d3d12_fence_signal_cpu_timeline_semaphore(struct d3d12_fence *fence, uint64_t value)
+{
+    const struct d3d12_device *device = fence->device;
+    VkSemaphoreSignalInfoKHR info;
+    VkResult vr = VK_SUCCESS;
+    int rc;
+
+    if ((rc = vkd3d_mutex_lock(&fence->mutex)))
+    {
+        ERR("Failed to lock mutex, error %d.\n", rc);
+        return hresult_from_errno(rc);
+    }
+
+    /* We must only signal a value which is greater than the current value.
+     * That value can be in the range of current known value (fence->value), or as large as pending_timeline_value.
+     * Pending timeline value signal might be blocked by another synchronization primitive, and thus statically
+     * cannot be that value, so the safest thing to do is to check the current value which is updated by the fence
+     * wait thread continuously. This check is technically racy since the value might be immediately out of date,
+     * but there is no way to avoid this. */
+    if (value > fence->value)
+    {
+        const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
+
+        /* Sanity check against the delta limit. */
+        if (value - fence->value > device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference)
+        {
+            FIXME("Timeline semaphore delta is %"PRIu64", but implementation only supports a delta of %"PRIu64".\n",
+                    value - fence->value, device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference);
+        }
+
+        info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR;
+        info.pNext = NULL;
+        info.semaphore = fence->timeline_semaphore;
+        info.value = value;
+        if ((vr = VK_CALL(vkSignalSemaphoreKHR(device->vk_device, &info))) >= 0)
+        {
+            fence->value = value;
+            /* Never lower the pending value here; a larger GPU signal may still be queued. */
+            if (value > fence->pending_timeline_value)
+                fence->pending_timeline_value = value;
+        }
+        else
+        {
+            ERR("Failed to signal timeline semaphore, vr %d.\n", vr);
+        }
+    }
+    else if (value < fence->value)
+    {
+        vr = d3d12_fence_reinit_timeline_semaphore_locked(fence, value);
+    }
+
+    /* The helper reports whether the null-event condition needs waking. Honour
+     * it here exactly as d3d12_fence_signal() does, otherwise threads blocked on
+     * null_event_cond are never notified of a CPU signal. */
+    if (d3d12_fence_signal_external_events_locked(fence))
+        vkd3d_cond_broadcast(&fence->null_event_cond);
+
+    vkd3d_mutex_unlock(&fence->mutex);
+    return hresult_from_vk_result(vr);
+}
+
 static HRESULT STDMETHODCALLTYPE d3d12_fence_Signal(ID3D12Fence *iface, UINT64 value)
 {
     struct d3d12_fence *fence = impl_from_ID3D12Fence(iface);
TRACE("iface %p, value %#"PRIx64".\n", iface, value);
+    if (fence->timeline_semaphore)
+        return d3d12_fence_signal_cpu_timeline_semaphore(fence, value);
     return d3d12_fence_signal(fence, value, VK_NULL_HANDLE);
 }
@@ -1108,6 +1385,7 @@ static struct d3d12_fence *unsafe_impl_from_ID3D12Fence(ID3D12Fence *iface)
 static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device *device,
         UINT64 initial_value, D3D12_FENCE_FLAGS flags)
 {
+    VkResult vr;
     HRESULT hr;
     int rc;
@@ -1136,6 +1414,16 @@ static HRESULT d3d12_fence_init(struct d3d12_fence *fence, struct d3d12_device *
     fence->events_size = 0;
     fence->event_count = 0;
+    fence->timeline_semaphore = VK_NULL_HANDLE;
+    if (device->vk_info.KHR_timeline_semaphore && (vr = vkd3d_create_timeline_semaphore(device, initial_value,
+            &fence->timeline_semaphore)) < 0)
+    {
+        WARN("Failed to create timeline semaphore, vr %d.\n", vr);
+        return hresult_from_vk_result(vr);
+    }
+    fence->pending_timeline_value = initial_value;
+    fence->gpu_wait_count = 0;
+
     list_init(&fence->semaphores);
     fence->semaphore_count = 0;
@@ -5991,18 +6279,88 @@ static void STDMETHODCALLTYPE d3d12_command_queue_EndEvent(ID3D12CommandQueue *i
     FIXME("iface %p stub!\n", iface);
 }
+/* Record that a GPU-side signal of the fence to `value` is about to be submitted.
+ *
+ * Under fence->mutex, fence->pending_timeline_value is unconditionally set to
+ * value. A value that does not exceed the previous pending value only produces
+ * a warning; a jump from fence->value larger than the device's
+ * maxTimelineSemaphoreValueDifference only produces a fixme.
+ *
+ * Returns S_OK, or the mutex lock error. */
+static HRESULT d3d12_fence_update_gpu_signal_timeline_semaphore(struct d3d12_fence *fence, uint64_t value)
+{
+    const struct d3d12_device *device = fence->device;
+    int rc;
+
+    rc = vkd3d_mutex_lock(&fence->mutex);
+    if (rc)
+    {
+        ERR("Failed to lock mutex, error %d.\n", rc);
+        return hresult_from_errno(rc);
+    }
+
+    if (value > fence->pending_timeline_value)
+    {
+        /* Monotonic signal: sanity check the delta against the current fence value. */
+        if (value - fence->value > device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference)
+        {
+            FIXME("Timeline semaphore delta is %"PRIu64", but implementation only supports a delta of %"PRIu64".\n",
+                    value - fence->value, device->vk_info.timeline_semaphore_properties.maxTimelineSemaphoreValueDifference);
+        }
+    }
+    else
+    {
+        /* An async signal that does not increase the payload value is not treated as an error: it works
+         * at least with RADV and Nvidia drivers and there's no workaround on the GPU side. */
+        WARN("Fence %p is being signalled non-monotonically. Old pending value %"PRIu64", new pending value %"PRIu64".\n",
+                fence, fence->pending_timeline_value, value);
+    }
+
+    fence->pending_timeline_value = value;
+
+    vkd3d_mutex_unlock(&fence->mutex);
+
+    return S_OK;
+}
+
+/* Queue a timeline semaphore wait for the fence worker.
+ *
+ * Under worker->mutex, appends an entry (semaphore, fence, awaited value,
+ * queue) to worker->enqueued_fences, bumps the fence's
+ * pending_worker_operation_count and signals worker->cond.
+ *
+ * Returns S_OK, E_OUTOFMEMORY if the array cannot grow, or the mutex lock
+ * error. */
+static HRESULT vkd3d_enqueue_timeline_semaphore(struct vkd3d_fence_worker *worker, VkSemaphore vk_semaphore,
+        struct d3d12_fence *fence, uint64_t value, struct vkd3d_queue *queue)
+{
+    struct vkd3d_enqueued_fence *slot;
+    HRESULT hr = S_OK;
+    int rc;
+
+    TRACE("worker %p, fence %p, value %#"PRIx64".\n", worker, fence, value);
+
+    rc = vkd3d_mutex_lock(&worker->mutex);
+    if (rc)
+    {
+        ERR("Failed to lock mutex, error %d.\n", rc);
+        return hresult_from_errno(rc);
+    }
+
+    if (vkd3d_array_reserve((void **)&worker->enqueued_fences, &worker->enqueued_fences_size,
+            worker->enqueued_fence_count + 1, sizeof(*worker->enqueued_fences)))
+    {
+        /* Take the slot address only after the reserve call, which is passed the array pointer to update. */
+        slot = &worker->enqueued_fences[worker->enqueued_fence_count];
+        slot->vk_semaphore = vk_semaphore;
+        slot->waiting_fence.fence = fence;
+        slot->waiting_fence.value = value;
+        slot->waiting_fence.queue = queue;
+        ++worker->enqueued_fence_count;
+
+        InterlockedIncrement(&fence->pending_worker_operation_count);
+
+        vkd3d_cond_signal(&worker->cond);
+    }
+    else
+    {
+        ERR("Failed to add GPU timeline semaphore.\n");
+        hr = E_OUTOFMEMORY;
+    }
+
+    vkd3d_mutex_unlock(&worker->mutex);
+
+    return hr;
+}
+
 static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *iface,
         ID3D12Fence *fence_iface, UINT64 value)
 {
     struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface);
+    VkTimelineSemaphoreSubmitInfoKHR timeline_submit_info;
     const struct vkd3d_vk_device_procs *vk_procs;
     VkSemaphore vk_semaphore = VK_NULL_HANDLE;
     VkFence vk_fence = VK_NULL_HANDLE;
     struct vkd3d_queue *vkd3d_queue;
+    uint64_t sequence_number = 0;
     struct d3d12_device *device;
     struct d3d12_fence *fence;
     VkSubmitInfo submit_info;
-    uint64_t sequence_number;
     VkQueue vk_queue;
     VkResult vr;
     HRESULT hr;
@@ -6015,10 +6373,21 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *
fence = unsafe_impl_from_ID3D12Fence(fence_iface);
-    if ((vr = d3d12_fence_create_vk_fence(fence, &vk_fence)) < 0)
+    if (device->vk_info.KHR_timeline_semaphore)
     {
-        WARN("Failed to create Vulkan fence, vr %d.\n", vr);
-        goto fail_vkresult;
+        if (FAILED(hr = d3d12_fence_update_gpu_signal_timeline_semaphore(fence, value)))
+            return hr;
+
+        vk_semaphore = fence->timeline_semaphore;
+        assert(vk_semaphore);
+    }
+    else
+    {
+        if ((vr = d3d12_fence_create_vk_fence(fence, &vk_fence)) < 0)
+        {
+            WARN("Failed to create Vulkan fence, vr %d.\n", vr);
+            goto fail_vkresult;
+        }
     }
if (!(vk_queue = vkd3d_queue_acquire(vkd3d_queue)))
@@ -6028,7 +6397,8 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *
         goto fail;
     }
-    if ((vr = vkd3d_queue_create_vk_semaphore_locked(vkd3d_queue, device, &vk_semaphore)) < 0)
+    if (!device->vk_info.KHR_timeline_semaphore && (vr = vkd3d_queue_create_vk_semaphore_locked(vkd3d_queue,
+            device, &vk_semaphore)) < 0)
     {
         ERR("Failed to create Vulkan semaphore, vr %d.\n", vr);
         vk_semaphore = VK_NULL_HANDLE;
@@ -6044,7 +6414,19 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *
     submit_info.signalSemaphoreCount = vk_semaphore ? 1 : 0;
     submit_info.pSignalSemaphores = &vk_semaphore;
-    if ((vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, vk_fence))) >= 0)
+    if (device->vk_info.KHR_timeline_semaphore)
+    {
+        timeline_submit_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
+        timeline_submit_info.pNext = NULL;
+        timeline_submit_info.pSignalSemaphoreValues = &value;
+        timeline_submit_info.signalSemaphoreValueCount = submit_info.signalSemaphoreCount;
+        timeline_submit_info.waitSemaphoreValueCount = 0;
+        timeline_submit_info.pWaitSemaphoreValues = NULL;
+        submit_info.pNext = &timeline_submit_info;
+    }
+
+    vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, vk_fence));
+    if (!device->vk_info.KHR_timeline_semaphore && vr >= 0)
     {
         sequence_number = ++vkd3d_queue->submitted_sequence_number;
@@ -6061,6 +6443,9 @@ static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Signal(ID3D12CommandQueue *
         goto fail_vkresult;
     }
+    if (device->vk_info.KHR_timeline_semaphore)
+        return vkd3d_enqueue_timeline_semaphore(&device->fence_worker, vk_semaphore, fence, value, vkd3d_queue);
+
     if (vk_semaphore && SUCCEEDED(hr = d3d12_fence_add_vk_semaphore(fence, vk_semaphore, vk_fence, value)))
         vk_semaphore = VK_NULL_HANDLE;
@@ -6096,32 +6481,27 @@ fail_vkresult:
     hr = hresult_from_vk_result(vr);
 fail:
     VK_CALL(vkDestroyFence(device->vk_device, vk_fence, NULL));
-    VK_CALL(vkDestroySemaphore(device->vk_device, vk_semaphore, NULL));
+    if (!device->vk_info.KHR_timeline_semaphore)
+        VK_CALL(vkDestroySemaphore(device->vk_device, vk_semaphore, NULL));
     return hr;
 }
-static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Wait(ID3D12CommandQueue *iface,
-        ID3D12Fence *fence_iface, UINT64 value)
+static HRESULT d3d12_command_queue_wait_binary_semaphore(struct d3d12_command_queue *command_queue,
+        struct d3d12_fence *fence, uint64_t value)
 {
     static const VkPipelineStageFlagBits wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
-    struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface);
     const struct vkd3d_vk_device_procs *vk_procs;
     struct vkd3d_signaled_semaphore *semaphore;
     uint64_t completed_value = 0;
     struct vkd3d_queue *queue;
-    struct d3d12_fence *fence;
     VkSubmitInfo submit_info;
     VkQueue vk_queue;
     VkResult vr;
     HRESULT hr;
-    TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value);
-
     vk_procs = &command_queue->device->vk_procs;
     queue = command_queue->vkd3d_queue;
-    fence = unsafe_impl_from_ID3D12Fence(fence_iface);
-
     semaphore = d3d12_fence_acquire_vk_semaphore(fence, value, &completed_value);
     if (!semaphore && completed_value >= value)
     {
@@ -6199,6 +6579,122 @@ fail:
     return hr;
 }
+/* Record that `queue` is about to wait on `fence` on the GPU.
+ *
+ * Under fence->mutex, the entry for `queue` in fence->gpu_waits is updated to
+ * the queue's current pending_wait_completion_value, or a new entry is
+ * appended if none exists and there is room. Entries for other queues whose
+ * waits have completed are pruned along the way. If the array is full a fixme
+ * is logged and the wait is not tracked. A mutex lock failure is logged and
+ * the function returns without recording anything. */
+static inline void d3d12_fence_update_gpu_wait(struct d3d12_fence *fence, const struct vkd3d_queue *queue)
+{
+    unsigned int i;
+    bool found;
+    int rc;
+
+    if ((rc = vkd3d_mutex_lock(&fence->mutex)))
+    {
+        ERR("Failed to lock mutex, error %d.\n", rc);
+        return;
+    }
+
+    /* A pruned entry is replaced by the last one, so the index must not advance
+     * in that case: the moved entry still has to be examined. Skipping it could
+     * miss the existing entry for `queue` and append a duplicate below. */
+    i = 0;
+    found = false;
+    while (i < fence->gpu_wait_count)
+    {
+        if (fence->gpu_waits[i].queue == queue)
+        {
+            fence->gpu_waits[i].pending_value = queue->pending_wait_completion_value;
+            found = true;
+            ++i;
+        }
+        else if (d3d12_fence_gpu_wait_is_completed(fence, i))
+        {
+            if (i < --fence->gpu_wait_count)
+                fence->gpu_waits[i] = fence->gpu_waits[fence->gpu_wait_count];
+        }
+        else
+        {
+            ++i;
+        }
+    }
+
+    if (!found)
+    {
+        if (fence->gpu_wait_count < ARRAY_SIZE(fence->gpu_waits))
+        {
+            fence->gpu_waits[fence->gpu_wait_count].queue = queue;
+            fence->gpu_waits[fence->gpu_wait_count++].pending_value = queue->pending_wait_completion_value;
+        }
+        else
+        {
+            FIXME("Unable to track GPU fence wait.\n");
+        }
+    }
+
+    vkd3d_mutex_unlock(&fence->mutex);
+}
+
+/* Make the command queue wait on the GPU until the fence's timeline semaphore reaches `value`.
+ *
+ * An empty batch is submitted that waits on the fence's semaphore and signals
+ * the queue's wait completion semaphore with a freshly incremented value. The
+ * wait is registered on the fence before submission.
+ *
+ * Returns S_OK, E_FAIL if the queue cannot be acquired, or the HRESULT for a
+ * failed vkQueueSubmit(). */
+static HRESULT d3d12_command_queue_wait_timeline_semaphore(struct d3d12_command_queue *command_queue,
+        struct d3d12_fence *fence, uint64_t value)
+{
+    static const VkPipelineStageFlagBits wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
+    const struct vkd3d_vk_device_procs *vk_procs = &command_queue->device->vk_procs;
+    struct vkd3d_queue *queue = command_queue->vkd3d_queue;
+    VkTimelineSemaphoreSubmitInfoKHR timeline_info;
+    VkSubmitInfo submit;
+    VkQueue vk_queue;
+    VkResult vr;
+
+    assert(fence->timeline_semaphore);
+
+    if (!(vk_queue = vkd3d_queue_acquire(queue)))
+    {
+        ERR("Failed to acquire queue %p.\n", queue);
+        return E_FAIL;
+    }
+
+    /* Reserve the value the completion semaphore will be signalled with once the wait has been satisfied. */
+    ++queue->pending_wait_completion_value;
+
+    timeline_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
+    timeline_info.pNext = NULL;
+    timeline_info.waitSemaphoreValueCount = 1;
+    timeline_info.pWaitSemaphoreValues = &value;
+    timeline_info.signalSemaphoreValueCount = 1;
+    timeline_info.pSignalSemaphoreValues = &queue->pending_wait_completion_value;
+
+    submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submit.pNext = &timeline_info;
+    submit.waitSemaphoreCount = 1;
+    submit.pWaitSemaphores = &fence->timeline_semaphore;
+    submit.pWaitDstStageMask = &wait_stage_mask;
+    submit.commandBufferCount = 0;
+    submit.pCommandBuffers = NULL;
+    submit.signalSemaphoreCount = 1;
+    submit.pSignalSemaphores = &queue->wait_completion_semaphore;
+
+    /* Register the wait on the fence before it is handed to Vulkan. */
+    d3d12_fence_update_gpu_wait(fence, queue);
+
+    vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit, VK_NULL_HANDLE));
+
+    vkd3d_queue_release(queue);
+
+    if (vr >= 0)
+        return S_OK;
+
+    WARN("Failed to submit wait operation, vr %d.\n", vr);
+    return hresult_from_vk_result(vr);
+}
+
+/* ID3D12CommandQueue::Wait entry point.
+ *
+ * Dispatches to the timeline semaphore implementation when the device has
+ * KHR_timeline_semaphore; otherwise logs a one-time fixme and uses the binary
+ * semaphore implementation. */
+static HRESULT STDMETHODCALLTYPE d3d12_command_queue_Wait(ID3D12CommandQueue *iface,
+        ID3D12Fence *fence_iface, UINT64 value)
+{
+    struct d3d12_command_queue *command_queue = impl_from_ID3D12CommandQueue(iface);
+    struct d3d12_fence *fence = unsafe_impl_from_ID3D12Fence(fence_iface);
+
+    TRACE("iface %p, fence %p, value %#"PRIx64".\n", iface, fence_iface, value);
+
+    if (!command_queue->device->vk_info.KHR_timeline_semaphore)
+    {
+        FIXME_ONCE("KHR_timeline_semaphore is not available. Some wait commands may be unsupported.\n");
+        return d3d12_command_queue_wait_binary_semaphore(command_queue, fence, value);
+    }
+
+    return d3d12_command_queue_wait_timeline_semaphore(command_queue, fence, value);
+}
+
 static HRESULT STDMETHODCALLTYPE d3d12_command_queue_GetTimestampFrequency(ID3D12CommandQueue *iface,
         UINT64 *frequency)
 {
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 59fa9af9..0c5278c0 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -129,6 +129,7 @@ static const struct vkd3d_optional_extension_info optional_device_extensions[] =
     VK_EXTENSION(KHR_MAINTENANCE3, KHR_maintenance3),
     VK_EXTENSION(KHR_PUSH_DESCRIPTOR, KHR_push_descriptor),
     VK_EXTENSION(KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE, KHR_sampler_mirror_clamp_to_edge),
+    VK_EXTENSION(KHR_TIMELINE_SEMAPHORE, KHR_timeline_semaphore),
     /* EXT extensions */
     VK_EXTENSION(EXT_CALIBRATED_TIMESTAMPS, EXT_calibrated_timestamps),
     VK_EXTENSION(EXT_CONDITIONAL_RENDERING, EXT_conditional_rendering),
@@ -683,6 +684,7 @@ struct vkd3d_physical_device_info
     VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT texel_buffer_alignment_properties;
     VkPhysicalDeviceTransformFeedbackPropertiesEXT xfb_properties;
     VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT vertex_divisor_properties;
+    VkPhysicalDeviceTimelineSemaphorePropertiesKHR timeline_semaphore_properties;
VkPhysicalDeviceProperties2KHR properties2;
@@ -694,6 +696,7 @@ struct vkd3d_physical_device_info
     VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT texel_buffer_alignment_features;
     VkPhysicalDeviceTransformFeedbackFeaturesEXT xfb_features;
     VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT vertex_divisor_features;
+    VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore_features;
VkPhysicalDeviceFeatures2 features2;
 };
@@ -705,10 +708,12 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i
     VkPhysicalDeviceDescriptorIndexingPropertiesEXT *descriptor_indexing_properties;
     VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *vertex_divisor_properties;
     VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *buffer_alignment_properties;
+    VkPhysicalDeviceTimelineSemaphorePropertiesKHR *timeline_semaphore_properties;
     VkPhysicalDeviceDescriptorIndexingFeaturesEXT *descriptor_indexing_features;
     VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *vertex_divisor_features;
     VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *buffer_alignment_features;
     VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *demote_features;
+    VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *timeline_semaphore_features;
     VkPhysicalDeviceDepthClipEnableFeaturesEXT *depth_clip_features;
     VkPhysicalDeviceMaintenance3Properties *maintenance3_properties;
     VkPhysicalDeviceTransformFeedbackPropertiesEXT *xfb_properties;
@@ -727,6 +732,8 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i
     buffer_alignment_properties = &info->texel_buffer_alignment_properties;
     vertex_divisor_features = &info->vertex_divisor_features;
     vertex_divisor_properties = &info->vertex_divisor_properties;
+    timeline_semaphore_features = &info->timeline_semaphore_features;
+    timeline_semaphore_properties = &info->timeline_semaphore_properties;
     xfb_features = &info->xfb_features;
     xfb_properties = &info->xfb_properties;
@@ -746,6 +753,8 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i
     vk_prepend_struct(&info->features2, xfb_features);
     vertex_divisor_features->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT;
     vk_prepend_struct(&info->features2, vertex_divisor_features);
+    timeline_semaphore_features->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR;
+    vk_prepend_struct(&info->features2, timeline_semaphore_features);
if (vulkan_info->KHR_get_physical_device_properties2)
         VK_CALL(vkGetPhysicalDeviceFeatures2KHR(physical_device, &info->features2));
@@ -764,6 +773,8 @@ static void vkd3d_physical_device_info_init(struct vkd3d_physical_device_info *i
     vk_prepend_struct(&info->properties2, xfb_properties);
     vertex_divisor_properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT;
     vk_prepend_struct(&info->properties2, vertex_divisor_properties);
+    timeline_semaphore_properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR;
+    vk_prepend_struct(&info->properties2, timeline_semaphore_properties);
if (vulkan_info->KHR_get_physical_device_properties2)
         VK_CALL(vkGetPhysicalDeviceProperties2KHR(physical_device, &info->properties2));
@@ -1322,6 +1333,7 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
     vulkan_info->rasterization_stream = physical_device_info->xfb_properties.transformFeedbackRasterizationStreamSelect;
     vulkan_info->transform_feedback_queries = physical_device_info->xfb_properties.transformFeedbackQueries;
     vulkan_info->max_vertex_attrib_divisor = max(physical_device_info->vertex_divisor_properties.maxVertexAttribDivisor, 1);
+    vulkan_info->timeline_semaphore_properties = physical_device_info->timeline_semaphore_properties;
device->feature_options.DoublePrecisionFloatShaderOps = features->shaderFloat64;
     device->feature_options.OutputMergerLogicOp = features->logicOp;
@@ -1444,6 +1456,8 @@ static HRESULT vkd3d_init_device_caps(struct d3d12_device *device,
         vulkan_info->EXT_shader_demote_to_helper_invocation = false;
     if (!physical_device_info->texel_buffer_alignment_features.texelBufferAlignment)
         vulkan_info->EXT_texel_buffer_alignment = false;
+    if (!physical_device_info->timeline_semaphore_features.timelineSemaphore)
+        vulkan_info->KHR_timeline_semaphore = false;
vulkan_info->texel_buffer_alignment_properties = physical_device_info->texel_buffer_alignment_properties;
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index ac0d5ce1..48777aac 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -59,6 +59,7 @@
 #define VKD3D_MAX_SHADER_EXTENSIONS       3u
 #define VKD3D_MAX_SHADER_STAGES           5u
 #define VKD3D_MAX_VK_SYNC_OBJECTS         4u
+#define VKD3D_MAX_FENCE_WAITING_QUEUES    4u
 #define VKD3D_MAX_DESCRIPTOR_SETS        64u
 /* D3D12 binding tier 3 has a limit of 2048 samplers. */
 #define VKD3D_MAX_DESCRIPTOR_SET_SAMPLERS 2048u
@@ -123,6 +124,7 @@ struct vkd3d_vulkan_info
     bool KHR_maintenance3;
     bool KHR_push_descriptor;
     bool KHR_sampler_mirror_clamp_to_edge;
+    bool KHR_timeline_semaphore;
     /* EXT device extensions */
     bool EXT_calibrated_timestamps;
     bool EXT_conditional_rendering;
@@ -147,6 +149,8 @@ struct vkd3d_vulkan_info
VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT texel_buffer_alignment_properties;
+    VkPhysicalDeviceTimelineSemaphorePropertiesKHR timeline_semaphore_properties;
+
     unsigned int shader_extension_count;
     enum vkd3d_shader_spirv_extension shader_extensions[VKD3D_MAX_SHADER_EXTENSIONS];
@@ -344,6 +348,7 @@ struct vkd3d_fence_worker
     struct vkd3d_enqueued_fence
     {
         VkFence vk_fence;
+        VkSemaphore vk_semaphore;
         struct vkd3d_waiting_fence waiting_fence;
     } *enqueued_fences;
     size_t enqueued_fences_size;
@@ -353,6 +358,12 @@ struct vkd3d_fence_worker
     size_t vk_fences_size;
     struct vkd3d_waiting_fence *fences;
     size_t fences_size;
+    VkSemaphore *vk_semaphores;
+    size_t vk_semaphores_size;
+    uint64_t *semaphore_wait_values;
+    size_t semaphore_wait_values_size;
+
+    void (*wait_for_gpu_fences)(struct vkd3d_fence_worker *worker);
struct d3d12_device *device;
 };
@@ -507,6 +518,12 @@ struct vkd3d_signaled_semaphore
     bool is_acquired;
 };
+struct vkd3d_pending_fence_wait
+{
+    const struct vkd3d_queue *queue;
+    uint64_t pending_value;
+};
+
 /* ID3D12Fence */
 struct d3d12_fence
 {
@@ -526,6 +543,11 @@ struct d3d12_fence
     size_t events_size;
     size_t event_count;
+    VkSemaphore timeline_semaphore;
+    uint64_t pending_timeline_value;
+    struct vkd3d_pending_fence_wait gpu_waits[VKD3D_MAX_FENCE_WAITING_QUEUES];
+    unsigned int gpu_wait_count;
+
     struct list semaphores;
     unsigned int semaphore_count;
@@ -1200,6 +1222,9 @@ struct vkd3d_queue
     VkQueueFlags vk_queue_flags;
     uint32_t timestamp_bits;
+    VkSemaphore wait_completion_semaphore;
+    uint64_t pending_wait_completion_value;
+
     struct
     {
         VkSemaphore vk_semaphore;
diff --git a/libs/vkd3d/vulkan_procs.h b/libs/vkd3d/vulkan_procs.h
index 60556735..34e0ab4b 100644
--- a/libs/vkd3d/vulkan_procs.h
+++ b/libs/vkd3d/vulkan_procs.h
@@ -195,6 +195,11 @@ VK_DEVICE_EXT_PFN(vkGetDescriptorSetLayoutSupportKHR)
 /* VK_KHR_push_descriptor */
 VK_DEVICE_EXT_PFN(vkCmdPushDescriptorSetKHR)
+/* VK_KHR_timeline_semaphore */
+VK_DEVICE_EXT_PFN(vkGetSemaphoreCounterValueKHR)
+VK_DEVICE_EXT_PFN(vkWaitSemaphoresKHR)
+VK_DEVICE_EXT_PFN(vkSignalSemaphoreKHR)
+
 /* VK_EXT_calibrated_timestamps */
 VK_DEVICE_EXT_PFN(vkGetCalibratedTimestampsEXT)
diff --git a/tests/d3d12.c b/tests/d3d12.c
index 5067dd97..ea64918f 100644
--- a/tests/d3d12.c
+++ b/tests/d3d12.c
@@ -33233,7 +33233,9 @@ static void test_queue_wait(void)
     command_list = context.list;
     queue = context.queue;
-    queue2 = create_command_queue(device, D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL);
+    /* 'queue2' must not map to the same command queue as 'queue', or Wait() before GPU signal will fail.
+     * Using a compute queue fixes this on most hardware, but it may still fail on low spec hardware. */
+    queue2 = create_command_queue(device, D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL);
event = create_event();
     ok(event, "Failed to create event.\n");
@@ -33298,12 +33300,6 @@ static void test_queue_wait(void)
     check_readback_data_uint(&rb, NULL, 0xff00ff00, 0);
     release_resource_readback(&rb);
-    if (!vkd3d_test_platform_is_windows())
-    {
-        skip("Wait() is not implemented yet.\n"); /* FIXME */
-        goto skip_tests;
-    }
-
     /* Wait() before CPU signal */
     update_buffer_data(cb, 0, sizeof(blue), &blue);
     queue_wait(queue, fence, 2);
@@ -33379,7 +33375,6 @@ static void test_queue_wait(void)
     check_readback_data_uint(&rb, NULL, 0xff00ff00, 0);
     release_resource_readback(&rb);
-skip_tests:
     /* Signal() and Wait() in the same command queue */
     update_buffer_data(cb, 0, sizeof(blue), &blue);
     queue_signal(queue, fence, 7);
-- 
2.34.1



    

[PATCH vkd3d v2 1/2] vkd3d: Use Vulkan timeline semaphores for D3D12 fences.

Signed-off-by: Conor McCarthy <cmccarthy@codeweavers.com>