This is the patchset described in https://www.winehq.org/pipermail/wine-devel/2022-January/204020.html . It simplifies and speeds up d3d resource tracking in a few ways:
*) Completely remove any burden on the CS thread. *) Replace interlocked ops on the client thread with a plain assignment. *) Piggy-back onto the queue's head and tail counters, which we already increment with interlocked ops.
I tested the impact with a microbenchmark: https://github.com/stefand/perftest/blob/main/resource_tracking_d3d11/resour...
Depending on the CPU it doubles or triples draw speed in that microbenchmark. In real games the effect is much less pronounced, but I do see about a 2% gain in World of Tanks. I also see a gain in Rocket League, but only if I hack away other known issues with Rocket League (UpdateSubResource in particular).
I have further improvements to resource tracking in mind that can be done on top of these patches: *) Separate read and write access times. *) Remove draw and compute tracking for d3d10+ clients and only track staging resources.
Matteo had some ideas to make the queue multi-writer thread safe to further reduce the use of wined3d_cs. This patchset makes this a bit more complicated because the head value cannot be inferred from the return value of require_space() and thus needs to be passed around separately to submit(). This can be done either with thread-local storage or via a separate parameter to require_space() and submit().
Stefan Dösinger (4): wined3d: Use extra bits in the queue head and tail counters. wined3d: Use the default queue index for resource fencing. wined3d: Remove the no-op wined3d_resource_release. wined3d: Move resource->type away from the access time field.
dlls/wined3d/cs.c | 170 ++++----------------------------- dlls/wined3d/resource.c | 1 - dlls/wined3d/wined3d_private.h | 97 +++++++++++++++---- 3 files changed, 100 insertions(+), 168 deletions(-)
Signed-off-by: Stefan Dösinger stefan@codeweavers.com
---
The next patches will use them for resource fences. We want as many extra bits as possible to reduce phantom waits due to wrap-arounds, see the next patch for details. --- dlls/wined3d/cs.c | 28 ++++++++++++++-------------- dlls/wined3d/wined3d_private.h | 3 ++- 2 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index f294be29091..3eb4f39429a 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -625,7 +625,7 @@ static const char *debug_cs_op(enum wined3d_cs_op op)
static struct wined3d_cs_packet *wined3d_next_cs_packet(const uint8_t *data, SIZE_T *offset) { - struct wined3d_cs_packet *packet = (struct wined3d_cs_packet *)&data[*offset]; + struct wined3d_cs_packet *packet = (struct wined3d_cs_packet *)&data[WINED3D_CS_QUEUE_MASK(*offset)];
*offset += offsetof(struct wined3d_cs_packet, data[packet->size]);
@@ -3232,7 +3232,7 @@ static const struct wined3d_device_context_ops wined3d_cs_st_ops = static BOOL wined3d_cs_queue_is_empty(const struct wined3d_cs *cs, const struct wined3d_cs_queue *queue) { wined3d_from_cs(cs); - return *(volatile LONG *)&queue->head == queue->tail; + return *(volatile ULONG *)&queue->head == queue->tail; }
static void wined3d_cs_queue_submit(struct wined3d_cs_queue *queue, struct wined3d_cs *cs) @@ -3240,10 +3240,10 @@ static void wined3d_cs_queue_submit(struct wined3d_cs_queue *queue, struct wined struct wined3d_cs_packet *packet; size_t packet_size;
- packet = (struct wined3d_cs_packet *)&queue->data[queue->head]; + packet = (struct wined3d_cs_packet *)&queue->data[WINED3D_CS_QUEUE_MASK(queue->head)]; TRACE("Queuing op %s at %p.\n", debug_cs_op(*(const enum wined3d_cs_op *)packet->data), packet); packet_size = FIELD_OFFSET(struct wined3d_cs_packet, data[packet->size]); - InterlockedExchange(&queue->head, (queue->head + packet_size) & (WINED3D_CS_QUEUE_SIZE - 1)); + InterlockedExchange((LONG *)&queue->head, queue->head + packet_size);
if (InterlockedCompareExchange(&cs->waiting_for_event, FALSE, TRUE)) SetEvent(cs->event); @@ -3264,6 +3264,7 @@ static void *wined3d_cs_queue_require_space(struct wined3d_cs_queue *queue, size size_t queue_size = ARRAY_SIZE(queue->data); size_t header_size, packet_size, remaining; struct wined3d_cs_packet *packet; + ULONG head = WINED3D_CS_QUEUE_MASK(queue->head);
header_size = FIELD_OFFSET(struct wined3d_cs_packet, data[0]); packet_size = FIELD_OFFSET(struct wined3d_cs_packet, data[size]); @@ -3276,7 +3277,7 @@ static void *wined3d_cs_queue_require_space(struct wined3d_cs_queue *queue, size return NULL; }
- remaining = queue_size - queue->head; + remaining = queue_size - head; if (remaining < packet_size) { size_t nop_size = remaining - header_size; @@ -3290,19 +3291,19 @@ static void *wined3d_cs_queue_require_space(struct wined3d_cs_queue *queue, size nop->opcode = WINED3D_CS_OP_NOP;
wined3d_cs_queue_submit(queue, cs); - assert(!queue->head); + head = WINED3D_CS_QUEUE_MASK(queue->head); + assert(!head); }
for (;;) { - LONG tail = *(volatile LONG *)&queue->tail; - LONG head = queue->head; - LONG new_pos; + ULONG tail = WINED3D_CS_QUEUE_MASK(*(volatile ULONG *)&queue->tail); + ULONG new_pos;
/* Empty. */ if (head == tail) break; - new_pos = (head + packet_size) & (WINED3D_CS_QUEUE_SIZE - 1); + new_pos = WINED3D_CS_QUEUE_MASK(head + packet_size); /* Head ahead of tail. We checked the remaining size above, so we only * need to make sure we don't make head equal to tail. */ if (head > tail && (new_pos != tail)) @@ -3316,7 +3317,7 @@ static void *wined3d_cs_queue_require_space(struct wined3d_cs_queue *queue, size head, tail, (unsigned long)packet_size); }
- packet = (struct wined3d_cs_packet *)&queue->data[queue->head]; + packet = (struct wined3d_cs_packet *)&queue->data[head]; packet->size = size; return packet->data; } @@ -3339,7 +3340,7 @@ static void wined3d_cs_mt_finish(struct wined3d_device_context *context, enum wi if (cs->thread_id == GetCurrentThreadId()) return wined3d_cs_st_finish(context, queue_id);
- while (cs->queue[queue_id].head != *(volatile LONG *)&cs->queue[queue_id].tail) + while (cs->queue[queue_id].head != *(volatile ULONG *)&cs->queue[queue_id].tail) YieldProcessor(); }
@@ -3465,8 +3466,7 @@ static DWORD WINAPI wined3d_cs_run(void *ctx) TRACE("%s at %p executed.\n", debug_cs_op(opcode), packet); }
- tail &= (WINED3D_CS_QUEUE_SIZE - 1); - InterlockedExchange(&queue->tail, tail); + InterlockedExchange((LONG *)&queue->tail, tail); }
cs->queue[WINED3D_CS_QUEUE_MAP].tail = cs->queue[WINED3D_CS_QUEUE_MAP].head; diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 277d5a94fb0..7c03e8a2c20 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -4953,10 +4953,11 @@ enum wined3d_push_constants #define WINED3D_CS_QUERY_POLL_INTERVAL 10u #define WINED3D_CS_QUEUE_SIZE 0x100000u #define WINED3D_CS_SPIN_COUNT 10000000u +#define WINED3D_CS_QUEUE_MASK(a) ((a) & (WINED3D_CS_QUEUE_SIZE - 1))
struct wined3d_cs_queue { - LONG head, tail; + ULONG head, tail; BYTE data[WINED3D_CS_QUEUE_SIZE]; };
This removes tracking work entirely from the CS thread and replaces interlocked with regular writes on the main thread. It also avoids accessing access_count in read-write mode from two threads.
Signed-off-by: Stefan Dösinger stefan@codeweavers.com --- dlls/wined3d/cs.c | 4 -- dlls/wined3d/wined3d_private.h | 96 +++++++++++++++++++++++++++++----- 2 files changed, 82 insertions(+), 18 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 3eb4f39429a..10d7b4b8996 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -2751,8 +2751,6 @@ static void wined3d_cs_exec_update_sub_resource(struct wined3d_cs *cs, const voi op->sub_resource_idx, context, &op->bo, box, op->row_pitch, op->slice_pitch);
context_release(context); - - wined3d_resource_release(resource); }
void wined3d_device_context_emit_update_sub_resource(struct wined3d_device_context *context, @@ -2792,8 +2790,6 @@ void wined3d_device_context_emit_update_sub_resource(struct wined3d_device_conte op->row_pitch = row_pitch; op->slice_pitch = slice_pitch;
- wined3d_device_context_acquire_resource(context, resource); - wined3d_device_context_submit(context, WINED3D_CS_QUEUE_MAP); /* The data pointer may go away, so we need to wait until it is read. * Copying the data may be faster if it's small. */ diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 7c03e8a2c20..d797b26fff0 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -4314,7 +4314,7 @@ struct wined3d_resource LONG ref; LONG bind_count; LONG map_count; - LONG access_count; + ULONG access_time; struct wined3d_device *device; enum wined3d_resource_type type; enum wined3d_gl_resource_type gl_type; @@ -4358,17 +4358,6 @@ static inline ULONG wined3d_resource_decref(struct wined3d_resource *resource) return resource->resource_ops->resource_decref(resource); }
-static inline void wined3d_resource_acquire(struct wined3d_resource *resource) -{ - InterlockedIncrement(&resource->access_count); -} - -static inline void wined3d_resource_release(struct wined3d_resource *resource) -{ - LONG refcount = InterlockedDecrement(&resource->access_count); - assert(refcount >= 0); -} - static inline HRESULT wined3d_resource_get_sub_resource_desc(struct wined3d_resource *resource, unsigned int sub_resource_idx, struct wined3d_sub_resource_desc *desc) { @@ -5140,15 +5129,94 @@ void wined3d_device_context_emit_update_sub_resource(struct wined3d_device_conte HRESULT wined3d_device_context_emit_unmap(struct wined3d_device_context *context, struct wined3d_resource *resource, unsigned int sub_resource_idx) DECLSPEC_HIDDEN;
-static inline void wined3d_resource_wait_idle(struct wined3d_resource *resource) +static inline void wined3d_resource_acquire(struct wined3d_resource *resource) +{ + const struct wined3d_cs *cs = resource->device->cs; + resource->access_time = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head; +} + +static inline void wined3d_resource_release(struct wined3d_resource *resource) +{ +} + +static inline void wined3d_resource_wait_idle(const struct wined3d_resource *resource) { const struct wined3d_cs *cs = resource->device->cs; + ULONG access_time, tail, head;
if (!cs->thread || cs->thread_id == GetCurrentThreadId()) return;
- while (InterlockedCompareExchange(&resource->access_count, 0, 0)) + /* A resource is considered busy between queueing a command that reads it and the execution of that + * command. We use the head and tail pointer of the default CS queue for tracking the access time. + * We can't track commands on the map queue this way. If a map command is handled asynchronously the + * resource fencing needs to be handled some other way. + * + * The queue head and tail will wrap around when the 32 bit ULONG is exhausted. We therefore need to + * handle a few cases: + * + * A...access_time in the resource + * H...queue write head + * T...queue read tail + * + * Case 1: + * |.....T------A-----H..........| + * The resource is busy because the access time is between head and tail. No wrap-around has happened. + * + * Case 2: + * |..A.....T---------H..........| + * The resource is idle, the last command using it has been executed. + * + * Case 3: + * |........T---------H.....A....| + * The resource is idle, the last command using it has been executed and the head and tail have since + * wrapped around. + * + * Case 4: + * |--A---H.................T----| + * Resource is busy, HEAD has wrapped around, tail not yet. Note that Head < Tail + * + * Case 5: + * |------H....A............T----| + * Resource is idle. Head has wrapped around, tail not yet. + * + * Case 6: + * A + * T + * |........H....................| + * + * Queue is empty, resource therefore idle. + * + * It is possible that a resource has not been used for a long time and is idle, but the head and + * tail wrapped around in such a way that the previously set access time falls between head and tail. + * In this case we will incorrectly wait for the resource. Because we use the entire 32 bits of the + * counters and not just the bits needed to address the actual queue memory, this should happen rarely. 
+ * If it turns out to be a problem we can switch to 64 bit counters or attempt to somehow mark the + * access time of resources invalid. CS packets are at least 4 byte aligned, so we could use the lower + * 2 bits in access_time for such a marker. + * + * Note that the access time is set before the command is submitted, so we have to wait until the + * tail is bigger than access_time, not equal. */ + access_time = resource->access_time; + head = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head; + while (1) + { + tail = *(volatile ULONG *)&cs->queue[WINED3D_CS_QUEUE_DEFAULT].tail; + if (head == tail) /* Case 6, queue empty. */ + break; + + if (head > tail) + { + if (access_time >= head || access_time < tail) /* Case 2, 3. */ + break; + } + else if (access_time > tail || access_time <= head) /* Case 5. */ + { + break; + } + /* Case 1, 4 - busy, wait a little. */ YieldProcessor(); + } }
/* TODO: Add tests and support for FLOAT16_4 POSITIONT, D3DCOLOR position, other
Hi,
On 23 Feb 2022, at 17:01, Stefan Dösinger stefan@codeweavers.com wrote:
- while (InterlockedCompareExchange(&resource->access_count, 0, 0))
- /* A resource is considered busy between queueing a command that reads it and the execution of that
* command. We use the head and tail pointer of the default CS queue for tracking the access time.
* We can't track commands on the map queue this way. If a map command is handled asynchronously the
* resource fencing needs to be handled some other way.
*
* The queue head and tail will wrap around when the 32 bit ULONG is exhausted. We therefore need to
* handle a few cases:
*
* A...access_time in the resource
* H...queue write head
* T...queue read tail
*
* Case 1:
* |.....T------A-----H..........|
* The resource is busy because the access time is between head and tail. No wrap-around has happened.
*
* Case 2:
* |..A.....T---------H..........|
* The resource is idle, the last command using it has been executed.
*
* Case 3:
* |........T---------H.....A....|
* The resource is idle, the last command using it has been executed and the head and tail have since
* wrapped around.
*
* Case 4:
* |--A---H.................T----|
* Resource is busy, HEAD has wrapped around, tail not yet. Note that Head < Tail
*
* Case 5:
* |------H....A............T----|
* Resource is idle. Head has wrapped around, tail not yet.
*
* Case 6:
* A
* T
* |........H....................|
*
There’s a case 7, |--H.................T--A--|, which I don’t think is handled correctly (it goes to the case 5 path). I think it could be handled uniformly with a slightly simpler condition like greater_wrap(A,T) && greater_wrap(H,A) => busy, where greater_wrap(x, y) = (x - y) < UINT_MAX/2. That assumes that head and tail are close together relative to the range size, but it seems reasonable to me.
* Queue is empty, resource therefore idle.
*
* It is possible that a resource has not been used for a long time and is idle, but the head and
* tail wrapped around in such a way that the previously set access time falls between head and tail.
* In this case we will incorrectly wait for the resource. Because we use the entire 32 bits of the
* counters and not just the bits needed to address the actual queue memory, this should happen rarely.
* If it turns out to be a problem we can switch to 64 bit counters or attempt to somehow mark the
* access time of resources invalid. CS packets are at least 4 byte aligned, so we could use the lower
* 2 bits in access_time for such a marker.
*
* Note that the access time is set before the command is submitted, so we have to wait until the
* tail is bigger than access_time, not equal. */
- access_time = resource->access_time;
- head = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head;
- while (1)
- {
tail = *(volatile ULONG *)&cs->queue[WINED3D_CS_QUEUE_DEFAULT].tail;
if (head == tail) /* Case 6, queue empty. */
break;
if (head > tail)
{
if (access_time >= head || access_time < tail) /* Case 2, 3. */
break;
}
else if (access_time > tail || access_time <= head) /* Case 5. */
{
break;
}
/* Case 1, 4 - busy, wait a little. */ YieldProcessor();
- }
}
- Jan
Signed-off-by: Stefan Dösinger stefan@codeweavers.com
---
I am open to renaming resource_acquire to something like resource_access, resource_fence or resource_readwrite. The latter is my favorite, but until we introduce read and write timestamps and waits I think resource_acquire is still a decent name. --- dlls/wined3d/cs.c | 138 ++------------------------------- dlls/wined3d/resource.c | 1 - dlls/wined3d/wined3d_private.h | 4 - 3 files changed, 5 insertions(+), 138 deletions(-)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 10d7b4b8996..f63f1197b3c 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -643,7 +643,6 @@ static void wined3d_cs_exec_present(struct wined3d_cs *cs, const void *data) const struct wined3d_cs_present *op = data; const struct wined3d_swapchain_desc *desc; struct wined3d_swapchain *swapchain; - unsigned int i;
swapchain = op->swapchain; desc = &swapchain->state.desc; @@ -714,12 +713,6 @@ static void wined3d_cs_exec_present(struct wined3d_cs *cs, const void *data) } }
- wined3d_resource_release(&swapchain->front_buffer->resource); - for (i = 0; i < desc->backbuffer_count; ++i) - { - wined3d_resource_release(&swapchain->back_buffers[i]->resource); - } - InterlockedDecrement(&cs->pending_presents); }
@@ -763,21 +756,9 @@ static void wined3d_cs_exec_clear(struct wined3d_cs *cs, const void *data) { struct wined3d_device *device = cs->c.device; const struct wined3d_cs_clear *op = data; - unsigned int i;
device->blitter->ops->blitter_clear(device->blitter, device, op->rt_count, &op->fb, op->rect_count, op->rects, &op->draw_rect, op->flags, &op->color, op->depth, op->stencil); - - if (op->flags & WINED3DCLEAR_TARGET) - { - for (i = 0; i < op->rt_count; ++i) - { - if (op->fb.render_targets[i]) - wined3d_resource_release(op->fb.render_targets[i]->resource); - } - } - if (op->flags & (WINED3DCLEAR_ZBUFFER | WINED3DCLEAR_STENCIL)) - wined3d_resource_release(op->fb.depth_stencil->resource); }
void wined3d_cs_emit_clear(struct wined3d_cs *cs, DWORD rect_count, const RECT *rects, @@ -892,39 +873,6 @@ static void acquire_shader_resources(struct wined3d_device_context *context, uns } }
-static void release_shader_resources(const struct wined3d_state *state, unsigned int shader_mask) -{ - struct wined3d_shader_sampler_map_entry *entry; - struct wined3d_shader_resource_view *view; - struct wined3d_shader *shader; - unsigned int i, j; - - for (i = 0; i < WINED3D_SHADER_TYPE_COUNT; ++i) - { - if (!(shader_mask & (1u << i))) - continue; - - if (!(shader = state->shader[i])) - continue; - - for (j = 0; j < WINED3D_MAX_CBS; ++j) - { - if (state->cb[i][j].buffer) - wined3d_resource_release(&state->cb[i][j].buffer->resource); - } - - for (j = 0; j < shader->reg_maps.sampler_map.count; ++j) - { - entry = &shader->reg_maps.sampler_map.entries[j]; - - if (!(view = state->shader_resource_view[i][entry->resource_idx])) - continue; - - wined3d_resource_release(view->resource); - } - } -} - static void acquire_unordered_access_resources(struct wined3d_device_context *context, const struct wined3d_shader *shader, struct wined3d_unordered_access_view * const *views) { @@ -945,26 +893,6 @@ static void acquire_unordered_access_resources(struct wined3d_device_context *co } }
-static void release_unordered_access_resources(const struct wined3d_shader *shader, - struct wined3d_unordered_access_view * const *views) -{ - unsigned int i; - - if (!shader) - return; - - for (i = 0; i < MAX_UNORDERED_ACCESS_VIEWS; ++i) - { - if (!shader->reg_maps.uav_resource_info[i].type) - continue; - - if (!views[i]) - continue; - - wined3d_resource_release(views[i]->resource); - } -} - static void wined3d_cs_exec_dispatch(struct wined3d_cs *cs, const void *data) { const struct wined3d_cs_dispatch *op = data; @@ -974,13 +902,6 @@ static void wined3d_cs_exec_dispatch(struct wined3d_cs *cs, const void *data) WARN("No compute shader bound, skipping dispatch.\n"); else cs->c.device->adapter->adapter_ops->adapter_dispatch_compute(cs->c.device, state, &op->parameters); - - if (op->parameters.indirect) - wined3d_resource_release(&op->parameters.u.indirect.buffer->resource); - - release_shader_resources(state, 1u << WINED3D_SHADER_TYPE_COMPUTE); - release_unordered_access_resources(state->shader[WINED3D_SHADER_TYPE_COMPUTE], - state->unordered_access_view[WINED3D_PIPELINE_COMPUTE]); }
static void acquire_compute_pipeline_resources(struct wined3d_device_context *context) @@ -1081,40 +1002,6 @@ static void wined3d_cs_exec_draw(struct wined3d_cs *cs, const void *data) state->patch_vertex_count = op->patch_vertex_count;
cs->c.device->adapter->adapter_ops->adapter_draw_primitive(cs->c.device, state, &op->parameters); - - if (op->parameters.indirect) - { - struct wined3d_buffer *buffer = op->parameters.u.indirect.buffer; - wined3d_resource_release(&buffer->resource); - } - - if (op->parameters.indexed) - wined3d_resource_release(&state->index_buffer->resource); - for (i = 0; i < ARRAY_SIZE(state->streams); ++i) - { - if (state->streams[i].buffer) - wined3d_resource_release(&state->streams[i].buffer->resource); - } - for (i = 0; i < ARRAY_SIZE(state->stream_output); ++i) - { - if (state->stream_output[i].buffer) - wined3d_resource_release(&state->stream_output[i].buffer->resource); - } - for (i = 0; i < ARRAY_SIZE(state->textures); ++i) - { - if (state->textures[i]) - wined3d_resource_release(&state->textures[i]->resource); - } - for (i = 0; i < d3d_info->limits.max_rt_count; ++i) - { - if (state->fb.render_targets[i]) - wined3d_resource_release(state->fb.render_targets[i]->resource); - } - if (state->fb.depth_stencil) - wined3d_resource_release(state->fb.depth_stencil->resource); - release_shader_resources(state, ~(1u << WINED3D_SHADER_TYPE_COMPUTE)); - release_unordered_access_resources(state->shader[WINED3D_SHADER_TYPE_PIXEL], - state->unordered_access_view[WINED3D_PIPELINE_GRAPHICS]); }
static void acquire_graphics_pipeline_resources(struct wined3d_device_context *context, @@ -2409,7 +2296,6 @@ static void wined3d_cs_exec_preload_resource(struct wined3d_cs *cs, const void * struct wined3d_resource *resource = op->resource;
resource->resource_ops->resource_preload(resource); - wined3d_resource_release(resource); }
void wined3d_cs_emit_preload_resource(struct wined3d_cs *cs, struct wined3d_resource *resource) @@ -2431,7 +2317,6 @@ static void wined3d_cs_exec_unload_resource(struct wined3d_cs *cs, const void *d struct wined3d_resource *resource = op->resource;
resource->resource_ops->resource_unload(resource); - wined3d_resource_release(resource); }
void wined3d_cs_emit_unload_resource(struct wined3d_cs *cs, struct wined3d_resource *resource) @@ -2614,14 +2499,14 @@ static void wined3d_cs_exec_blt_sub_resource(struct wined3d_cs *cs, const void * { FIXME("Flags %#x not implemented for %s resources.\n", op->flags, debug_d3dresourcetype(op->dst_resource->type)); - goto error; + return; }
if (!(op->flags & WINED3D_BLT_RAW) && op->src_resource->format != op->dst_resource->format) { FIXME("Format conversion not implemented for %s resources.\n", debug_d3dresourcetype(op->dst_resource->type)); - goto error; + return; }
update_w = op->dst_box.right - op->dst_box.left; @@ -2633,7 +2518,7 @@ static void wined3d_cs_exec_blt_sub_resource(struct wined3d_cs *cs, const void * { FIXME("Stretching not implemented for %s resources.\n", debug_d3dresourcetype(op->dst_resource->type)); - goto error; + return; }
dst_texture = texture_from_resource(op->dst_resource); @@ -2652,7 +2537,7 @@ static void wined3d_cs_exec_blt_sub_resource(struct wined3d_cs *cs, const void * ERR("Failed to load source sub-resource into %s.\n", wined3d_debug_location(location)); context_release(context); - goto error; + return; }
level = op->dst_sub_resource_idx % dst_texture->level_count; @@ -2668,7 +2553,7 @@ static void wined3d_cs_exec_blt_sub_resource(struct wined3d_cs *cs, const void * { ERR("Failed to load destination sub-resource.\n"); context_release(context); - goto error; + return; }
wined3d_texture_get_memory(src_texture, op->src_sub_resource_idx, &addr, location); @@ -2691,11 +2576,6 @@ static void wined3d_cs_exec_blt_sub_resource(struct wined3d_cs *cs, const void * &op->src_box, op->flags, &op->fx, op->filter))) FIXME("Blit failed.\n"); } - -error: - if (op->src_resource) - wined3d_resource_release(op->src_resource); - wined3d_resource_release(op->dst_resource); }
void wined3d_device_context_emit_blt_sub_resource(struct wined3d_device_context *context, @@ -2813,8 +2693,6 @@ static void wined3d_cs_exec_add_dirty_texture_region(struct wined3d_cs *cs, cons ERR("Failed to load location %s.\n", wined3d_debug_location(texture->resource.map_binding)); } context_release(context); - - wined3d_resource_release(&texture->resource); }
void wined3d_cs_emit_add_dirty_texture_region(struct wined3d_cs *cs, @@ -2841,8 +2719,6 @@ static void wined3d_cs_exec_clear_unordered_access_view(struct wined3d_cs *cs, c context = context_acquire(cs->c.device, NULL, 0); cs->c.device->adapter->adapter_ops->adapter_clear_uav(context, view, &op->clear_value, op->fp); context_release(context); - - wined3d_resource_release(view->resource); }
void wined3d_device_context_emit_clear_uav(struct wined3d_device_context *context, @@ -2870,8 +2746,6 @@ static void wined3d_cs_exec_copy_uav_counter(struct wined3d_cs *cs, const void * context = context_acquire(cs->c.device, NULL, 0); wined3d_unordered_access_view_copy_counter(view, op->buffer, op->offset, context); context_release(context); - - wined3d_resource_release(&op->buffer->resource); }
void wined3d_device_context_emit_copy_uav_counter(struct wined3d_device_context *context, @@ -2899,8 +2773,6 @@ static void wined3d_cs_exec_generate_mipmaps(struct wined3d_cs *cs, const void * context = context_acquire(cs->c.device, NULL, 0); cs->c.device->adapter->adapter_ops->adapter_generate_mipmap(context, view); context_release(context); - - wined3d_resource_release(view->resource); }
void wined3d_device_context_emit_generate_mipmaps(struct wined3d_device_context *context, diff --git a/dlls/wined3d/resource.c b/dlls/wined3d/resource.c index 94d024ce87d..ac18afc1589 100644 --- a/dlls/wined3d/resource.c +++ b/dlls/wined3d/resource.c @@ -236,7 +236,6 @@ static void wined3d_resource_destroy_object(void *object)
wined3d_resource_free_sysmem(resource); context_resource_released(resource->device, resource); - wined3d_resource_release(resource); }
void resource_cleanup(struct wined3d_resource *resource) diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index d797b26fff0..4cf454549f6 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -5135,10 +5135,6 @@ static inline void wined3d_resource_acquire(struct wined3d_resource *resource) resource->access_time = cs->queue[WINED3D_CS_QUEUE_DEFAULT].head; }
-static inline void wined3d_resource_release(struct wined3d_resource *resource) -{ -} - static inline void wined3d_resource_wait_idle(const struct wined3d_resource *resource) { const struct wined3d_cs *cs = resource->device->cs;
I am not signing off on this patch yet as I think a more thorough look at all our structures would be beneficial. E.g. we might want to move a few things (bind_count, map_count, etc.) into wined3d_client_resource and look at the alignment of our allocations.
This apparently avoids cache line stealing when the main thread sets the access time and the CS thread reads the resource type for preloading purposes.
The reason to move type and not access_time is that ref and bind_count are accessed from the main thread too. --- dlls/wined3d/wined3d_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 4cf454549f6..fbc39b53cbe 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -4316,7 +4316,6 @@ struct wined3d_resource LONG map_count; ULONG access_time; struct wined3d_device *device; - enum wined3d_resource_type type; enum wined3d_gl_resource_type gl_type; const struct wined3d_format *format; unsigned int format_flags; @@ -4346,6 +4345,7 @@ struct wined3d_resource
int32_t srv_bind_count_device; int32_t rtv_bind_count_device; + enum wined3d_resource_type type; };
static inline ULONG wined3d_resource_incref(struct wined3d_resource *resource)