This removes the tracking work entirely from the CS thread and replaces the interlocked operations with regular writes on the main thread. It also avoids accessing access_count in read-write mode from two threads.
Signed-off-by: Stefan Dösinger <stefan@codeweavers.com>
---
 dlls/wined3d/cs.c              |  4 --
 dlls/wined3d/wined3d_private.h | 96 +++++++++++++++++++++++++++++-----
 2 files changed, 82 insertions(+), 18 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 3eb4f39429a..10d7b4b8996 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -2751,8 +2751,6 @@ static void wined3d_cs_exec_update_sub_resource(struct wined3d_cs *cs, const voi op->sub_resource_idx, context, &op->bo, box, op->row_pitch, op->slice_pitch);
context_release(context); - - wined3d_resource_release(resource); }
void wined3d_device_context_emit_update_sub_resource(struct wined3d_device_context *context, @@ -2792,8 +2790,6 @@ void wined3d_device_context_emit_update_sub_resource(struct wined3d_device_conte op->row_pitch = row_pitch; op->slice_pitch = slice_pitch;
- wined3d_device_context_acquire_resource(context, resource); - wined3d_device_context_submit(context, WINED3D_CS_QUEUE_MAP); /* The data pointer may go away, so we need to wait until it is read. * Copying the data may be faster if it's small. */ diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 7c03e8a2c20..d797b26fff0 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -4314,7 +4314,7 @@ struct wined3d_resource LONG ref; LONG bind_count; LONG map_count; - LONG access_count; + ULONG access_time; struct wined3d_device *device; enum wined3d_resource_type type; enum wined3d_gl_resource_type gl_type; @@ -4358,17 +4358,6 @@ static inline ULONG wined3d_resource_decref(struct wined3d_resource *resource) return resource->resource_ops->resource_decref(resource); }
-static inline void wined3d_resource_acquire(struct wined3d_resource *resource) -{ - InterlockedIncrement(&resource->access_count); -} - -static inline void wined3d_resource_release(struct wined3d_resource *resource) -{ - LONG refcount = InterlockedDecrement(&resource->access_count); - assert(refcount >= 0); -} - static inline HRESULT wined3d_resource_get_sub_resource_desc(struct wined3d_resource *resource, unsigned int sub_resource_idx, struct wined3d_sub_resource_desc *desc) { @@ -5140,15 +5129,94 @@ void wined3d_device_context_emit_update_sub_resource(struct wined3d_device_conte HRESULT wined3d_device_context_emit_unmap(struct wined3d_device_context *context, struct wined3d_resource *resource, unsigned int sub_resource_idx) DECLSPEC_HIDDEN;
-static inline void wined3d_resource_wait_idle(struct wined3d_resource *resource)
+static inline void wined3d_resource_acquire(struct wined3d_resource *resource)
+{
+    const struct wined3d_cs *cs = resource->device->cs;
+    resource->access_time = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head;
+}
+
+static inline void wined3d_resource_release(struct wined3d_resource *resource)
+{
+}
+
+static inline void wined3d_resource_wait_idle(const struct wined3d_resource *resource)
 {
     const struct wined3d_cs *cs = resource->device->cs;
+    ULONG access_time, tail, head;
 
     if (!cs->thread || cs->thread_id == GetCurrentThreadId())
         return;
 
-    while (InterlockedCompareExchange(&resource->access_count, 0, 0))
+    /* A resource is considered busy between queueing a command that reads it and the execution of that
+     * command. We use the head and tail pointer of the default CS queue for tracking the access time.
+     * We can't track commands on the map queue this way. If a map command is handled asynchronously the
+     * resource fencing needs to be handled some other way.
+     *
+     * The queue head and tail will wrap around when the 32 bit ULONG is exhausted. We therefore need to
+     * handle a few cases:
+     *
+     * A...access_time in the resource
+     * H...queue write head
+     * T...queue read tail
+     *
+     * Case 1:
+     * |.....T------A-----H..........|
+     * The resource is busy because the access time is between tail and head. No wrap-around has happened.
+     *
+     * Case 2:
+     * |..A.....T---------H..........|
+     * The resource is idle, the last command using it has been executed.
+     *
+     * Case 3:
+     * |........T---------H.....A....|
+     * The resource is idle, the last command using it has been executed and the head and tail have since
+     * wrapped around.
+     *
+     * Case 4:
+     * |--A---H.................T----|
+     * The resource is busy. Head has wrapped around, tail not yet. Note that head < tail.
+     *
+     * Case 5:
+     * |------H....A............T----|
+     * The resource is idle. Head has wrapped around, tail not yet.
+     *
+     * Case 6:
+     *          A
+     *          T
+     * |........H....................|
+     *
+     * Queue is empty, resource therefore idle.
+     *
+     * It is possible that a resource has not been used for a long time and is idle, but the head and
+     * tail wrapped around in such a way that the previously set access time falls between head and tail.
+     * In this case we will incorrectly wait for the resource. Because we use the entire 32 bits of the
+     * counters and not just the bits needed to address the actual queue memory, this should happen rarely.
+     * If it turns out to be a problem we can switch to 64 bit counters or attempt to somehow mark the
+     * access time of resources invalid. CS packets are at least 4 byte aligned, so we could use the lower
+     * 2 bits in access_time for such a marker.
+     *
+     * Note that the access time is set before the command is submitted, so we have to wait until the
+     * tail is bigger than access_time, not equal. */
+    access_time = resource->access_time;
+    head = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head;
+    while (1)
+    {
+        tail = *(volatile ULONG *)&cs->queue[WINED3D_CS_QUEUE_DEFAULT].tail;
+        if (head == tail) /* Case 6, queue empty. */
+            break;
+
+        if (head > tail)
+        {
+            if (access_time >= head || access_time < tail) /* Case 2, 3: A outside [tail, head). */
+                break;
+        }
+        else if (access_time >= head && access_time < tail) /* Case 5: A inside [head, tail). */
+        {
+            break;
+        }
+        /* Case 1, 4 - busy, wait a little. */
         YieldProcessor();
+    }
 }
/* TODO: Add tests and support for FLOAT16_4 POSITIONT, D3DCOLOR position, other