Signed-off-by: Zebediah Figura <z.figura12@gmail.com>
---
 dlls/wined3d/cs.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c
index 50a08334dab..e6f84134795 100644
--- a/dlls/wined3d/cs.c
+++ b/dlls/wined3d/cs.c
@@ -490,6 +490,7 @@ struct wined3d_cs_update_sub_resource
     unsigned int sub_resource_idx;
     struct wined3d_box box;
     struct wined3d_sub_resource_data data;
+    /* If data.data is NULL, the structure is followed by the data in memory. */
 };
 
 struct wined3d_cs_add_dirty_texture_region
@@ -2597,6 +2598,7 @@ void wined3d_device_context_emit_blt_sub_resource(struct wined3d_device_context
 static void wined3d_cs_exec_update_sub_resource(struct wined3d_cs *cs, const void *data)
 {
     const struct wined3d_cs_update_sub_resource *op = data;
+    const void *src_data = op->data.data ? op->data.data : (const BYTE *)(op + 1);
     struct wined3d_resource *resource = op->resource;
     const struct wined3d_box *box = &op->box;
     unsigned int width, height, depth, level;
@@ -2617,7 +2619,7 @@ static void wined3d_cs_exec_update_sub_resource(struct wined3d_cs *cs, const voi
             goto done;
         }
 
-        wined3d_buffer_upload_data(buffer, context, box, op->data.data);
+        wined3d_buffer_upload_data(buffer, context, box, src_data);
         wined3d_buffer_invalidate_location(buffer, ~WINED3D_LOCATION_BUFFER);
         goto done;
     }
@@ -2630,7 +2632,7 @@ static void wined3d_cs_exec_update_sub_resource(struct wined3d_cs *cs, const voi
     depth = wined3d_texture_get_level_depth(texture, level);
 
     addr.buffer_object = 0;
-    addr.addr = op->data.data;
+    addr.addr = src_data;
 
     /* Only load the sub-resource for partial updates. */
     if (!box->left && !box->top && !box->front
@@ -3375,8 +3377,35 @@ static void wined3d_deferred_context_update_sub_resource(struct wined3d_device_c
         struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
         const void *data, unsigned int row_pitch, unsigned int slice_pitch)
 {
-    FIXME("context %p, resource %p, sub_resource_idx %u, box %s, data %p, row_pitch %u, slice_pitch %u, stub!\n",
-            context, resource, sub_resource_idx, debug_box(box), data, row_pitch, slice_pitch);
+    struct wined3d_cs_update_sub_resource *op;
+    size_t data_size;
+
+    if (resource->type == WINED3D_RTYPE_BUFFER)
+    {
+        data_size = box->right - box->left;
+    }
+    else
+    {
+        const struct wined3d_format *format = resource->format;
+
+        data_size = (box->back - box->front - 1) * slice_pitch
+                + ((box->bottom - box->top - 1) / format->block_height) * row_pitch
+                + ((box->right - box->left + format->block_width - 1) / format->block_width) * format->block_byte_count;
+    }
+
+    op = wined3d_device_context_require_space(context, sizeof(*op) + data_size, WINED3D_CS_QUEUE_DEFAULT);
+    op->opcode = WINED3D_CS_OP_UPDATE_SUB_RESOURCE;
+    op->resource = resource;
+    op->sub_resource_idx = sub_resource_idx;
+    op->box = *box;
+    op->data.row_pitch = row_pitch;
+    op->data.slice_pitch = slice_pitch;
+    op->data.data = NULL;
+    memcpy(op + 1, data, data_size);
+
+    wined3d_device_context_acquire_resource(context, resource);
+
+    wined3d_device_context_submit(context, WINED3D_CS_QUEUE_DEFAULT);
 }
 
 static void wined3d_deferred_context_issue_query(struct wined3d_device_context *context,
Signed-off-by: Zebediah Figura <z.figura12@gmail.com>
---
 dlls/d3d11/tests/d3d11.c       |  15 +---
 dlls/wined3d/buffer.c          |  14 ++++
 dlls/wined3d/cs.c              | 136 ++++++++++++++++++++++++++++++---
 dlls/wined3d/resource.c        |  29 +++----
 dlls/wined3d/texture.c         |  25 ++++++
 dlls/wined3d/wined3d_private.h |   5 ++
 6 files changed, 189 insertions(+), 35 deletions(-)
diff --git a/dlls/d3d11/tests/d3d11.c b/dlls/d3d11/tests/d3d11.c
index 408c0492cd2..a2fdc116e61 100644
--- a/dlls/d3d11/tests/d3d11.c
+++ b/dlls/d3d11/tests/d3d11.c
@@ -32735,21 +32735,14 @@ static void test_deferred_context_map(void)
     ok(hr == E_INVALIDARG, "Got unexpected hr %#x.\n", hr);
 
     hr = ID3D11DeviceContext_Map(deferred, (ID3D11Resource *)buffer, 0, D3D11_MAP_WRITE, 0, &map_desc);
-    todo_wine ok(hr == E_INVALIDARG, "Got unexpected hr %#x.\n", hr);
+    ok(hr == E_INVALIDARG, "Got unexpected hr %#x.\n", hr);
 
     hr = ID3D11DeviceContext_Map(deferred, (ID3D11Resource *)buffer, 0, D3D11_MAP_WRITE_NO_OVERWRITE, 0, &map_desc);
-    todo_wine ok(hr == D3D11_ERROR_DEFERRED_CONTEXT_MAP_WITHOUT_INITIAL_DISCARD, "Got unexpected hr %#x.\n", hr);
+    ok(hr == D3D11_ERROR_DEFERRED_CONTEXT_MAP_WITHOUT_INITIAL_DISCARD, "Got unexpected hr %#x.\n", hr);
 
     hr = ID3D11DeviceContext_Map(deferred, (ID3D11Resource *)buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &map_desc);
-    todo_wine ok(hr == S_OK, "Got unexpected hr %#x.\n", hr);
-    if (hr != S_OK)
-    {
-        ID3D11Buffer_Release(buffer2);
-        ID3D11Buffer_Release(buffer);
-        ID3D11DeviceContext_Release(deferred);
-        release_test_context(&test_context);
-        return;
-    }
+    ok(hr == S_OK, "Got unexpected hr %#x.\n", hr);
+
     map_data = map_desc.pData;
 
     /* The previous contents of map_data are undefined and may in practice be
      * uninitialized garbage. */
diff --git a/dlls/wined3d/buffer.c b/dlls/wined3d/buffer.c
index 12d038c0120..12b90cb54c2 100644
--- a/dlls/wined3d/buffer.c
+++ b/dlls/wined3d/buffer.c
@@ -826,6 +826,19 @@ struct wined3d_resource * CDECL wined3d_buffer_get_resource(struct wined3d_buffe
     return &buffer->resource;
 }
 
+static HRESULT buffer_resource_sub_resource_get_size(struct wined3d_resource *resource,
+        unsigned int sub_resource_idx, unsigned int *size, unsigned int *row_pitch, unsigned int *slice_pitch)
+{
+    if (sub_resource_idx)
+    {
+        WARN("Invalid sub_resource_idx %u.\n", sub_resource_idx);
+        return E_INVALIDARG;
+    }
+
+    *size = *row_pitch = *slice_pitch = resource->size;
+    return S_OK;
+}
+
 static HRESULT buffer_resource_sub_resource_map(struct wined3d_resource *resource, unsigned int sub_resource_idx,
         struct wined3d_map_desc *map_desc, const struct wined3d_box *box, uint32_t flags)
 {
@@ -1084,6 +1097,7 @@ static const struct wined3d_resource_ops buffer_resource_ops =
     buffer_resource_decref,
     buffer_resource_preload,
     buffer_resource_unload,
+    buffer_resource_sub_resource_get_size,
     buffer_resource_sub_resource_map,
     buffer_resource_sub_resource_unmap,
 };
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c
index e6f84134795..1f7213232fa 100644
--- a/dlls/wined3d/cs.c
+++ b/dlls/wined3d/cs.c
@@ -26,6 +26,13 @@ WINE_DECLARE_DEBUG_CHANNEL(fps);
 #define WINED3D_INITIAL_CS_SIZE 4096
 
+struct wined3d_acquired_resource
+{
+    struct wined3d_resource *resource;
+    unsigned int sub_resource_idx;
+    void *sysmem;
+};
+
 struct wined3d_command_list
 {
     LONG refcount;
@@ -36,7 +43,7 @@ struct wined3d_command_list
     void *data;
 
     SIZE_T resource_count;
-    struct wined3d_resource **resources;
+    struct wined3d_acquired_resource *resources;
 
     /* List of command lists queued for execution on this command list. We might
      * be the only thing holding a pointer to another command list, so we need
@@ -48,9 +55,13 @@ static void wined3d_command_list_destroy_object(void *object)
 {
     struct wined3d_command_list *list = object;
+    SIZE_T i;
 
     TRACE("list %p.\n", list);
 
+    for (i = 0; i < list->resource_count; ++i)
+        wined3d_free_sysmem(list->resources[i].sysmem);
+
     heap_free(list->resources);
     heap_free(list->data);
     heap_free(list);
@@ -79,7 +90,7 @@ ULONG CDECL wined3d_command_list_decref(struct wined3d_command_list *list)
         for (i = 0; i < list->command_list_count; ++i)
             wined3d_command_list_decref(list->command_lists[i]);
         for (i = 0; i < list->resource_count; ++i)
-            wined3d_resource_decref(list->resources[i]);
+            wined3d_resource_decref(list->resources[i].resource);
 
         wined3d_cs_destroy_object(device->cs, wined3d_command_list_destroy_object, list);
     }
@@ -139,6 +150,7 @@ enum wined3d_cs_op
     WINED3D_CS_OP_COPY_UAV_COUNTER,
     WINED3D_CS_OP_GENERATE_MIPMAPS,
     WINED3D_CS_OP_EXECUTE_COMMAND_LIST,
+    WINED3D_CS_OP_UPLOAD_SUB_RESOURCE,
     WINED3D_CS_OP_STOP,
 };
@@ -469,6 +481,15 @@ struct wined3d_cs_unmap
     HRESULT *hr;
 };
 
+struct wined3d_cs_upload_sub_resource
+{
+    enum wined3d_cs_op opcode;
+    struct wined3d_resource *resource;
+    unsigned int sub_resource_idx;
+    unsigned int size;
+    const void *data;
+};
+
 struct wined3d_cs_blt_sub_resource
 {
     enum wined3d_cs_op opcode;
@@ -616,6 +637,7 @@ static const char *debug_cs_op(enum wined3d_cs_op op)
         WINED3D_TO_STR(WINED3D_CS_OP_COPY_UAV_COUNTER);
         WINED3D_TO_STR(WINED3D_CS_OP_GENERATE_MIPMAPS);
         WINED3D_TO_STR(WINED3D_CS_OP_EXECUTE_COMMAND_LIST);
+        WINED3D_TO_STR(WINED3D_CS_OP_UPLOAD_SUB_RESOURCE);
         WINED3D_TO_STR(WINED3D_CS_OP_STOP);
 #undef WINED3D_TO_STR
     }
@@ -2342,7 +2364,7 @@ static void wined3d_cs_execute_command_list(struct wined3d_device_context *conte
     op->list = list;
 
     for (i = 0; i < list->resource_count; ++i)
-        wined3d_resource_acquire(list->resources[i]);
+        wined3d_resource_acquire(list->resources[i].resource);
 
     wined3d_device_context_submit(context, WINED3D_CS_QUEUE_DEFAULT);
@@ -2463,6 +2485,28 @@ static HRESULT wined3d_cs_unmap(struct wined3d_device_context *context, struct w
     return hr;
 }
 
+static void wined3d_cs_exec_upload_sub_resource(struct wined3d_cs *cs, const void *data)
+{
+    const struct wined3d_cs_upload_sub_resource *op = data;
+    struct wined3d_resource *resource = op->resource;
+    unsigned int sub_resource_idx = op->sub_resource_idx;
+    struct wined3d_map_desc map_desc;
+    HRESULT hr;
+
+    if (FAILED(hr = resource->resource_ops->resource_sub_resource_map(resource,
+            sub_resource_idx, &map_desc, NULL, WINED3D_MAP_WRITE | WINED3D_MAP_DISCARD)))
+    {
+        ERR("Failed to map resource, hr %#x.\n", hr);
+        return;
+    }
+
+    memcpy(map_desc.data, op->data, op->size);
+
+    resource->resource_ops->resource_sub_resource_unmap(resource, sub_resource_idx);
+
+    wined3d_resource_release(resource);
+}
+
 static void wined3d_cs_exec_blt_sub_resource(struct wined3d_cs *cs, const void *data)
 {
     const struct wined3d_cs_blt_sub_resource *op = data;
@@ -2870,6 +2914,7 @@ static void (* const wined3d_cs_op_handlers[])(struct wined3d_cs *cs, const void
     /* WINED3D_CS_OP_COPY_UAV_COUNTER   */ wined3d_cs_exec_copy_uav_counter,
     /* WINED3D_CS_OP_GENERATE_MIPMAPS   */ wined3d_cs_exec_generate_mipmaps,
     /* WINED3D_CS_OP_EXECUTE_COMMAND_LIST */ wined3d_cs_exec_execute_command_list,
+    /* WINED3D_CS_OP_UPLOAD_SUB_RESOURCE  */ wined3d_cs_exec_upload_sub_resource,
 };
 static void wined3d_cs_exec_execute_command_list(struct wined3d_cs *cs, const void *data)
@@ -3301,7 +3346,7 @@ struct wined3d_deferred_context
     void *data;
 
     SIZE_T resource_count, resources_capacity;
-    struct wined3d_resource **resources;
+    struct wined3d_acquired_resource *resources;
 
     /* List of command lists queued for execution on this context. A command
      * list can be the only thing holding a pointer to another command list, so
@@ -3361,16 +3406,78 @@ static HRESULT wined3d_deferred_context_map(struct wined3d_device_context *conte
         struct wined3d_resource *resource, unsigned int sub_resource_idx, struct wined3d_map_desc *map_desc,
         const struct wined3d_box *box, unsigned int flags)
 {
-    FIXME("context %p, resource %p, sub_resource_idx %u, map_desc %p, box %p, flags %#x, stub!\n",
-            context, resource, sub_resource_idx, map_desc, box, flags);
-    return E_NOTIMPL;
+    struct wined3d_deferred_context *deferred = wined3d_deferred_context_from_context(context);
+    struct wined3d_acquired_resource *acquired_resource;
+    unsigned int size;
+    HRESULT hr;
+
+    if (box)
+    {
+        ERR("Unexpected box.\n");
+        return E_INVALIDARG;
+    }
+
+    if (FAILED(hr = resource->resource_ops->resource_sub_resource_get_size(resource,
+            sub_resource_idx, &size, &map_desc->row_pitch, &map_desc->slice_pitch)))
+        return E_INVALIDARG;
+
+    if (flags & WINED3D_MAP_DISCARD)
+    {
+        struct wined3d_cs_upload_sub_resource *op;
+        void *sysmem;
+
+        if (!(sysmem = wined3d_allocate_sysmem(size)))
+            return E_OUTOFMEMORY;
+
+        if (!wined3d_array_reserve((void **)&deferred->resources, &deferred->resources_capacity,
+                deferred->resource_count + 1, sizeof(*deferred->resources)))
+            return E_OUTOFMEMORY;
+
+        acquired_resource = &deferred->resources[deferred->resource_count++];
+        acquired_resource->resource = resource;
+        wined3d_resource_incref(resource);
+        acquired_resource->sub_resource_idx = sub_resource_idx;
+        acquired_resource->sysmem = sysmem;
+
+        op = wined3d_device_context_require_space(context, sizeof(*op), WINED3D_CS_QUEUE_DEFAULT);
+        op->opcode = WINED3D_CS_OP_UPLOAD_SUB_RESOURCE;
+        op->resource = resource;
+        op->sub_resource_idx = sub_resource_idx;
+        op->size = size;
+        op->data = sysmem;
+
+        map_desc->data = sysmem;
+        return S_OK;
+    }
+    else if (flags & WINED3D_MAP_NOOVERWRITE)
+    {
+        int i = deferred->resource_count;
+
+        while (i--)
+        {
+            acquired_resource = &deferred->resources[i];
+
+            if (acquired_resource->resource == resource
+                    && acquired_resource->sub_resource_idx == sub_resource_idx && acquired_resource->sysmem)
+            {
+                map_desc->data = acquired_resource->sysmem;
+                return S_OK;
+            }
+        }
+
+        return D3D11_ERROR_DEFERRED_CONTEXT_MAP_WITHOUT_INITIAL_DISCARD;
+    }
+    else
+    {
+        WARN("Invalid flags %#x, returning E_INVALIDARG.\n", flags);
+        return E_INVALIDARG;
+    }
 }
 
 static HRESULT wined3d_deferred_context_unmap(struct wined3d_device_context *context,
         struct wined3d_resource *resource, unsigned int sub_resource_idx)
 {
-    FIXME("context %p, resource %p, sub_resource_idx %u, stub!\n", context, resource, sub_resource_idx);
-    return E_NOTIMPL;
+    return S_OK;
 }
 
 static void wined3d_deferred_context_update_sub_resource(struct wined3d_device_context *context,
@@ -3423,13 +3530,17 @@ static void wined3d_deferred_context_acquire_resource(struct wined3d_device_cont
         struct wined3d_resource *resource)
 {
     struct wined3d_deferred_context *deferred = wined3d_deferred_context_from_context(context);
+    struct wined3d_acquired_resource *acquired_resource;
 
     if (!wined3d_array_reserve((void **)&deferred->resources, &deferred->resources_capacity,
             deferred->resource_count + 1, sizeof(*deferred->resources)))
         return;
 
-    deferred->resources[deferred->resource_count++] = resource;
+    acquired_resource = &deferred->resources[deferred->resource_count++];
+    acquired_resource->resource = resource;
     wined3d_resource_incref(resource);
+    acquired_resource->sub_resource_idx = 0;
+    acquired_resource->sysmem = NULL;
 }
 
 static void wined3d_deferred_context_execute_command_list(struct wined3d_device_context *context,
@@ -3504,7 +3615,10 @@ void CDECL wined3d_deferred_context_destroy(struct wined3d_device_context *conte
     TRACE("context %p.\n", context);
 
     for (i = 0; i < deferred->resource_count; ++i)
-        wined3d_resource_decref(deferred->resources[i]);
+    {
+        wined3d_resource_decref(deferred->resources[i].resource);
+        wined3d_free_sysmem(deferred->resources[i].sysmem);
+    }
     heap_free(deferred->resources);
 
     wined3d_state_destroy(deferred->c.state);
diff --git a/dlls/wined3d/resource.c b/dlls/wined3d/resource.c
index 58e3e5c77fd..42e45a2745c 100644
--- a/dlls/wined3d/resource.c
+++ b/dlls/wined3d/resource.c
@@ -334,24 +334,32 @@ void CDECL wined3d_resource_preload(struct wined3d_resource *resource)
     wined3d_cs_emit_preload_resource(resource->device->cs, resource);
 }
-static BOOL wined3d_resource_allocate_sysmem(struct wined3d_resource *resource)
+void *wined3d_allocate_sysmem(SIZE_T size)
 {
     void **p;
-    SIZE_T align = RESOURCE_ALIGNMENT - 1 + sizeof(*p);
+    static const SIZE_T align = RESOURCE_ALIGNMENT - 1 + sizeof(*p);
     void *mem;
 
-    if (!(mem = heap_alloc_zero(resource->size + align)))
+    if (!(mem = heap_alloc_zero(size + align)))
     {
         ERR("Failed to allocate system memory.\n");
-        return FALSE;
+        return NULL;
     }
 
     p = (void **)(((ULONG_PTR)mem + align) & ~(RESOURCE_ALIGNMENT - 1)) - 1;
     *p = mem;
 
-    resource->heap_memory = ++p;
+    return ++p;
+}
 
-    return TRUE;
+void wined3d_free_sysmem(void *mem)
+{
+    void **p = mem;
+
+    if (!p)
+        return;
+
+    heap_free(*(--p));
 }
 
 BOOL wined3d_resource_prepare_sysmem(struct wined3d_resource *resource)
@@ -359,17 +367,12 @@ BOOL wined3d_resource_prepare_sysmem(struct wined3d_resource *resource)
     if (resource->heap_memory)
         return TRUE;
 
-    return wined3d_resource_allocate_sysmem(resource);
+    return !!(resource->heap_memory = wined3d_allocate_sysmem(resource->size));
 }
 
 void wined3d_resource_free_sysmem(struct wined3d_resource *resource)
 {
-    void **p = resource->heap_memory;
-
-    if (!p)
-        return;
-
-    heap_free(*(--p));
+    wined3d_free_sysmem(resource->heap_memory);
     resource->heap_memory = NULL;
 }
 
diff --git a/dlls/wined3d/texture.c b/dlls/wined3d/texture.c
index c7a9b4da3e1..663503bdf39 100644
--- a/dlls/wined3d/texture.c
+++ b/dlls/wined3d/texture.c
@@ -3506,6 +3506,30 @@ static void texture_resource_unload(struct wined3d_resource *resource)
     resource_unload(&texture->resource);
 }
 
+static HRESULT texture_resource_sub_resource_get_size(struct wined3d_resource *resource,
+        unsigned int sub_resource_idx, unsigned int *size, unsigned int *row_pitch, unsigned int *slice_pitch)
+{
+    struct wined3d_texture *texture = texture_from_resource(resource);
+    unsigned int texture_level = sub_resource_idx % texture->level_count;
+    struct wined3d_texture_sub_resource *sub_resource;
+
+    if (!(sub_resource = wined3d_texture_get_sub_resource(texture, sub_resource_idx)))
+        return E_INVALIDARG;
+
+    if (resource->format_flags & WINED3DFMT_FLAG_BROKEN_PITCH)
+    {
+        *row_pitch = wined3d_texture_get_level_width(texture, texture_level) * resource->format->byte_count;
+        *slice_pitch = wined3d_texture_get_level_height(texture, texture_level) * (*row_pitch);
+    }
+    else
+    {
+        wined3d_texture_get_pitch(texture, texture_level, row_pitch, slice_pitch);
+    }
+
+    *size = sub_resource->size;
+    return S_OK;
+}
+
 static HRESULT texture_resource_sub_resource_map(struct wined3d_resource *resource, unsigned int sub_resource_idx,
         struct wined3d_map_desc *map_desc, const struct wined3d_box *box, DWORD flags)
 {
@@ -3692,6 +3716,7 @@ static const struct wined3d_resource_ops texture_resource_ops =
     texture_resource_decref,
     texture_resource_preload,
     texture_resource_unload,
+    texture_resource_sub_resource_get_size,
     texture_resource_sub_resource_map,
     texture_resource_sub_resource_unmap,
 };
diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h
index 8e3efccffc2..db6e1619e8f 100644
--- a/dlls/wined3d/wined3d_private.h
+++ b/dlls/wined3d/wined3d_private.h
@@ -4048,6 +4048,8 @@ struct wined3d_resource_ops
     ULONG (*resource_decref)(struct wined3d_resource *resource);
     void (*resource_preload)(struct wined3d_resource *resource);
     void (*resource_unload)(struct wined3d_resource *resource);
+    HRESULT (*resource_sub_resource_get_size)(struct wined3d_resource *resource, unsigned int sub_resource_idx,
+            unsigned int *size, unsigned int *row_pitch, unsigned int *slice_pitch);
     HRESULT (*resource_sub_resource_map)(struct wined3d_resource *resource, unsigned int sub_resource_idx,
             struct wined3d_map_desc *map_desc, const struct wined3d_box *box, DWORD flags);
     HRESULT (*resource_sub_resource_unmap)(struct wined3d_resource *resource, unsigned int sub_resource_idx);
@@ -4131,6 +4133,9 @@ void wined3d_resource_update_draw_binding(struct wined3d_resource *resource) DEC
 #define RESOURCE_ALIGNMENT 16
 #define WINED3D_CONSTANT_BUFFER_ALIGNMENT 16
 
+void *wined3d_allocate_sysmem(SIZE_T size) DECLSPEC_HIDDEN;
+void wined3d_free_sysmem(void *mem) DECLSPEC_HIDDEN;
+
 #define WINED3D_LOCATION_DISCARDED 0x00000001
 #define WINED3D_LOCATION_SYSMEM 0x00000002
 #define WINED3D_LOCATION_BUFFER 0x00000008
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=91375
Your paranoid android.
=== w1064v1809 (32 bit report) ===
d3d11:
d3d11.c:5810: Test failed: Got unexpected IAVertices count: 0.
d3d11.c:5811: Test failed: Got unexpected IAPrimitives count: 0.
d3d11.c:5812: Test failed: Got unexpected VSInvocations count: 0.
d3d11.c:5815: Test failed: Got unexpected CInvocations count: 0.
d3d11.c:5816: Test failed: Got unexpected CPrimitives count: 0.
=== debiant2 (32 bit Arabic:Morocco report) ===
d3d11:
d3d11.c:5833: Test failed: d3d11.c:5894: Test marked todo: Got unexpected hr 0x1.
d3d11.c:9660: Test failed: d3d11.c:15021: Test marked todo: Got hr 0 for WRITE_DISCARD.
=== debiant2 (32 bit Chinese:China report) ===
d3d11:
d3d11.c:9660: Test failed: d3d11.c:15043: Test 60: Got unexpected color 0xffffff00 at (3, 0).
d3d11.c:9660: Test failed: Test marked todo: Got hr 0 for WRITE_DISCARD.
On Fri, May 28, 2021 at 7:22 AM Zebediah Figura z.figura12@gmail.com wrote:
Signed-off-by: Zebediah Figura <z.figura12@gmail.com>
---
 dlls/wined3d/cs.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)
So this is, AFAIR, more or less what the staging patchset did. I don't think that we want to put the data in the command stream though.
Always copying the data from the user-provided source to some other storage is definitely the right choice: we don't want to block the application (more or less directly) to make sure that the data stays around until we need it. That's bad for performance for immediate contexts and just impossible for deferred, so not much to argue there.

At the same time we also want to minimize the amount of copies. Ideally, for a GPU resource, here we'd upload the data right to the GPU (e.g. inside a persistently mapped buffer), ready to be used whenever it's time. For a CPU resource instead we'd copy the data into its final place in sysmem (what will eventually become its "heap_memory"). We don't quite have the infrastructure for it at the moment but that would be the goal IMO. What you do in patch 2/2 is more along the lines of this idea.

The sysmem allocation mechanism works okay for the time being and could be extended to handle other types of memory, e.g. it could eventually take charge of the vulkan allocator, making it generic and usable from GL too. I don't think it has to be managed by each deferred context (i.e. it should probably be per-device), although the current scheme has the advantage of not having to care about concurrency.
So, at least in my view, I'd bring the allocator bits from patch 2/2 into patch 1/2 and use that for update_sub_resource, both the deferred and the immediate variants (the latter could be its own patch). You'd only need the DISCARD portion for update_sub_resource, so the rest can (and probably should) stay in the map() patch.
Yes, mostly what Matteo said.
On Fri, 28 May 2021 at 15:39, Matteo Bruni matteo.mystral@gmail.com wrote:
So this is, AFAIR, more or less what the staging patchset did. I don't think that we want to put the data in the command stream though.
It may not be too bad for deferred contexts, but for immediate contexts probably not, no.
Always copying the data from the user-provided source to some other storage is definitely the right choice: we don't want to block the application (more or less directly) to make sure that the data stays around until we need it. That's bad for performance for immediate contexts and just impossible for deferred, so not much to argue there. At the same time we also want to minimize the amount of copies. Ideally, for a GPU resource, here we'd upload the data right to the GPU (e.g. inside a persistently mapped buffer), ready to be used whenever it's time. For a CPU resource instead we'd copy the data into its final place in sysmem (what will eventually become its "heap_memory"). We don't quite have the infrastructure for it at the moment but that would be the goal IMO. What you do in patch 2/2 is more along the lines of this idea. The sysmem allocation mechanism works okay for the time being and could be extended to handle other types of memory, e.g. it could eventually take charge of the vulkan allocator, making it generic and usable from GL too. I don't think it has to be managed by each deferred context (i.e. it should probably be per-device), although the current scheme has the advantage of not having to care about concurrency.
So, at least in my view, I'd bring the allocator bits from patch 2/2 into patch 1/2 and use that for update_sub_resource, both the deferred and the immediate variants (the latter could be its own patch). You'd only need the DISCARD portion for update_sub_resource, so the rest can (and probably should) stay in the map() patch.
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
        uint32_t flags, struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
Replying to both mails at once, sort of...
On 5/31/21 4:53 AM, Henri Verbeet wrote:
Yes, mostly what Matteo said.
On Fri, 28 May 2021 at 15:39, Matteo Bruni matteo.mystral@gmail.com wrote:
So this is, AFAIR, more or less what the staging patchset did. I don't think that we want to put the data in the command stream though.
It may not be too bad for deferred contexts, but for immediate contexts probably not, no.
Always copying the data from the user-provided source to some other storage is definitely the right choice: we don't want to block the application (more or less directly) to make sure that the data stays around until we need it. That's bad for performance for immediate contexts and just impossible for deferred, so not much to argue there. At the same time we also want to minimize the amount of copies. Ideally, for a GPU resource, here we'd upload the data right to the GPU (e.g. inside a persistently mapped buffer), ready to be used whenever it's time.
To clarify, when talking about uploading directly to the GPU, are you just referring to map() here, or update_sub_resource() as well?
Does it even make sense to use a persistently mapped GPU buffer for a resource mapped/updated by a deferred context? Obligatory disclaimer that I don't know anything about GPU-side performance, but obviously you can't reuse an earlier mapping for UpdateSubResource() or for DISCARD maps, and I'm failing to understand why an application would ever use NOOVERWRITE on a deferred context.
For a CPU resource instead we'd copy the data into its final place in sysmem (what will eventually become its "heap_memory"). We don't quite have the infrastructure for it at the moment but that would be the goal IMO. What you do in patch 2/2 is more along the lines of this idea. The sysmem allocation mechanism works okay for the time being and could be extended to handle other types of memory, e.g. it could eventually take charge of the vulkan allocator, making it generic and usable from GL too. I don't think it has to be managed by each deferred context (i.e. it should probably be per-device), although the current scheme has the advantage of not having to care about concurrency.
In a sense there is no "current scheme", i.e. deferred contexts are tracking which memory belongs to which resource because that *has* to be done per-context, but nobody's managing custom heaps. Unless I misunderstand what's meant here?
So, at least in my view, I'd bring the allocator bits from patch 2/2 into patch 1/2 and use that for update_sub_resource, both the deferred and the immediate variants (the latter could be its own patch). You'd only need the DISCARD portion for update_sub_resource, so the rest can (and probably should) stay in the map() patch.
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
        uint32_t flags, struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
This function strikes me as more than a little confusing, partly because it seems to be returning two partially overlapping things, with mostly different usages...
i.e. I'd presume the idea is for wined3d_device_context_update_sub_resource() to retrieve this and then call something like wined3d_context_copy_bo_address() to copy from sysmem, which makes sense, but then what are we doing with map_ptr?
Or, if we use this in wined3d_device_context_map(), couldn't we just call wined3d_context_map_bo_address() on the returned wined3d_bo_address?
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
We'd still presumably need an unmap, though, unless the idea above is to *always* allocate new (GPU or CPU) memory, which seems like a much bigger step.
Unless I'm misreading things, this also seems to imply that *_resource_sub_resource_map() shouldn't exist anymore, right?
On Mon, 31 May 2021 at 23:45, Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 4:53 AM, Henri Verbeet wrote:
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
        uint32_t flags, struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
This function strikes me as more than a little confusing, partly because it seems to be returning two partially overlapping things, with mostly different usages...
i.e. I'd presume the idea is for wined3d_device_context_update_sub_resource() to retrieve this and then call something like wined3d_context_copy_bo_address() to copy from sysmem, which makes sense, but then what are we doing with map_ptr?
Well, wined3d_context_copy_bo_address() needs a wined3d_context, which you typically don't have on application threads.
The basic idea is that you'd pass "address" to WINED3D_CS_OP_UPDATE_SUB_RESOURCE, and "map_ptr" to the application. (In case of map/unmap; for update_sub_resource() you'd just copy the data into "map_ptr".)
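A rough sketch of that flow for update_sub_resource(), with the caveat that none of this is existing wined3d code: wined3d_device_context_get_upload_bo() is only the helper proposed above, the op's "addr"/"row_pitch"/"slice_pitch" members stand in for the current "data" member, and update_sub_resource_size() is a made-up name for the size computation that patch 1/2 already does:

/* Illustrative sketch only; see the caveats above. */
static void device_context_update_sub_resource_sketch(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
        const void *data, unsigned int row_pitch, unsigned int slice_pitch)
{
    struct wined3d_cs_update_sub_resource *op;
    struct wined3d_bo_address addr;
    void *map_ptr;

    /* The context hands back both an opaque address for the CS thread and a
     * CPU-visible pointer for the caller; a real implementation would fall
     * back to the existing path when this fails, the sketch just gives up. */
    if (!wined3d_device_context_get_upload_bo(context, resource, sub_resource_idx, box,
            WINED3D_MAP_WRITE | WINED3D_MAP_DISCARD, &addr, &map_ptr))
        return;

    /* "map_ptr" is what we copy into on the application thread... */
    memcpy(map_ptr, data, update_sub_resource_size(resource, box, row_pitch, slice_pitch));

    /* ...while "addr" travels with the op and is resolved on the CS thread. */
    op = wined3d_device_context_require_space(context, sizeof(*op), WINED3D_CS_QUEUE_DEFAULT);
    op->opcode = WINED3D_CS_OP_UPDATE_SUB_RESOURCE;
    op->resource = resource;
    op->sub_resource_idx = sub_resource_idx;
    op->box = *box;
    op->addr = addr;
    op->row_pitch = row_pitch;
    op->slice_pitch = slice_pitch;

    wined3d_device_context_acquire_resource(context, resource);
    wined3d_device_context_submit(context, WINED3D_CS_QUEUE_DEFAULT);
}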
Or, if we use this in wined3d_device_context_map(), couldn't we just call wined3d_context_map_bo_address() on the returned wined3d_bo_address?
Conceptually, sure, you could split this into allocate/map/unmap. In practice though, you'll always want the upload buffer to be mapped, in order to copy data into it from system memory. And in fact, if we're going to return a GPU buffer here, it should generally already be mapped. (For various reasons; for 64-bit applications, we use persistent maps if we can; for OpenGL, we want to avoid making GL calls from application threads; if we have a dedicated upload buffer belonging to the device context, it would have been mapped during creation.) When needed, unmap can happen when the upload buffer is consumed.
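For readers following along, "persistent map" above refers to the GL_ARB_buffer_storage style of mapping; a minimal sketch in plain OpenGL, not wined3d's actual buffer code (which wraps the same idea):

/* Requires GL 4.4 or ARB_buffer_storage and a current GL context; the
 * function name is made up for illustration. */
static void *create_persistent_upload_buffer(GLsizeiptr size, GLuint *bo)
{
    const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;

    glGenBuffers(1, bo);
    glBindBuffer(GL_COPY_WRITE_BUFFER, *bo);
    /* Immutable storage; PERSISTENT allows the mapping to remain valid while
     * the buffer is used by GL commands. */
    glBufferStorage(GL_COPY_WRITE_BUFFER, size, NULL, flags);
    /* Map once, at creation time; afterwards application threads can simply
     * memcpy() into the returned pointer without making any GL calls, which
     * is the point being made above. */
    return glMapBufferRange(GL_COPY_WRITE_BUFFER, 0, size, flags);
}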
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
We'd still presumably need an unmap, though, unless the idea above is to *always* allocate new (GPU or CPU) memory, which seems like a much bigger step.
Not necessarily, but wined3d_cs_exec_update_sub_resource() can unmap if needed.
Unless I'm misreading things, this also seems to imply that *_resource_sub_resource_map() shouldn't exist anymore, right?
Non-discard maps still exist, and wined3d_device_context_get_upload_bo() wouldn't be used for those.
On 5/31/21 6:43 PM, Henri Verbeet wrote:
On Mon, 31 May 2021 at 23:45, Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 4:53 AM, Henri Verbeet wrote:
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
        uint32_t flags, struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
This function strikes me as more than a little confusing, partly because it seems to be returning two partially overlapping things, with mostly different usages...
i.e. I'd presume the idea is for wined3d_device_context_update_sub_resource() to retrieve this and then call something like wined3d_context_copy_bo_address() to copy from sysmem, which makes sense, but then what are we doing with map_ptr?
Well, wined3d_context_copy_bo_address() needs a wined3d_context, which you typically don't have on application threads.
The basic idea is that you'd pass "address" to WINED3D_CS_OP_UPDATE_SUB_RESOURCE, and "map_ptr " to the application. (In case of map/unmap; for update_sub_resource() you'd just copy the data into "map_ptr".)
Or, if we use this in wined3d_device_context_map(), couldn't we just call wined3d_context_map_bo_address() on the returned wined3d_bo_address?
Conceptually, sure, you could split this into allocate/map/unmap. In practice though, you'll always want the upload buffer to be mapped, in order to copy data into it from system memory. And in fact, if we're going to return a GPU buffer here, it should generally already be mapped. (For various reasons; for 64-bit applications, we use persistent maps if we can; for OpenGL, we want to avoid making GL calls from application threads; if we have a dedicated upload buffer belonging to the device context, it would have been mapped during creation.) When needed, unmap can happen when the upload buffer is consumed.
Thanks for explaining the rationale there, that does help a lot.
It still feels awkward to me to pass the same information maybe twice, but I guess the wined3d_bo_address part should basically be considered opaque above the CS thread.
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
We'd still presumably need an unmap, though, unless the idea above is to *always* allocate new (GPU or CPU) memory, which seems like a much bigger step.
Not necessarily, but wined3d_cs_exec_update_sub_resource() can unmap if needed.
Unless I'm misreading things, this also seems to imply that *_resource_sub_resource_map() shouldn't exist anymore, right?
Non-discard maps still exist, and wined3d_device_context_get_upload_bo() wouldn't be used for those.
So the idea goes like this:
(1) Pass a whole wined3d_bo_address as part of struct wined3d_cs_update_sub_resource.
(2) Introduce wined3d_device_context_get_upload_bo() to allocate memory. At first, only use it for deferred contexts.
(3) Use wined3d_device_context_get_upload_bo() for immediate contexts as well, for both DISCARD and NOOVERWRITE maps and for update_sub_resource. This implies always allocating new CPU/GPU memory, going through the DEFAULT queue, and not waiting for the upload to complete.
The idea that map/unmap/update_sub_resource can go away seems to imply (3) to me, but I'm more than a little curious as to whether it's actually desirable in all cases. Even with persistent mapping you still need to allocate new memory, which isn't exactly cheap, especially with large resources.
And then there's the aforementioned non-discard maps—don't we still need an explicit WINED3D_CS_OP_UNMAP for those? Or any case (are there any?) where we don't want to persistently map a resource?
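For reference, point (1) above presumably amounts to something like the following; the field layout is hypothetical, only the substitution of a wined3d_bo_address for the sysmem pointer is what is actually being proposed:

/* Hypothetical layout for point (1); the existing structure carries a
 * wined3d_sub_resource_data instead. */
struct wined3d_cs_update_sub_resource
{
    enum wined3d_cs_op opcode;
    struct wined3d_resource *resource;
    unsigned int sub_resource_idx;
    struct wined3d_box box;
    struct wined3d_bo_address addr; /* replaces "struct wined3d_sub_resource_data data" */
    unsigned int row_pitch, slice_pitch;
};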
On Fri, Jun 11, 2021 at 5:40 AM Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 6:43 PM, Henri Verbeet wrote:
On Mon, 31 May 2021 at 23:45, Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 4:53 AM, Henri Verbeet wrote:
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
        uint32_t flags, struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
This function strikes me as more than a little confusing, partly because it seems to be returning two partially overlapping things, with mostly different usages...
i.e. I'd presume the idea is for wined3d_device_context_update_sub_resource() to retrieve this and then call something like wined3d_context_copy_bo_address() to copy from sysmem, which makes sense, but then what are we doing with map_ptr?
Well, wined3d_context_copy_bo_address() needs a wined3d_context, which you typically don't have on application threads.
The basic idea is that you'd pass "address" to WINED3D_CS_OP_UPDATE_SUB_RESOURCE, and "map_ptr " to the application. (In case of map/unmap; for update_sub_resource() you'd just copy the data into "map_ptr".)
Or, if we use this in wined3d_device_context_map(), couldn't we just call wined3d_context_map_bo_address() on the returned wined3d_bo_address?
Conceptually, sure, you could split this into allocate/map/unmap. In practice though, you'll always want the upload buffer to be mapped, in order to copy data into it from system memory. And in fact, if we're going to return a GPU buffer here, it should generally already be mapped. (For various reasons; for 64-bit applications, we use persistent maps if we can; for OpenGL, we want to avoid making GL calls from application threads; if we have a dedicated upload buffer belonging to the device context, it would have been mapped during creation.) When needed, unmap can happen when the upload buffer is consumed.
Thanks for explaining the rationale there, that does help a lot.
It still feels awkward to me to pass the same information maybe twice, but I guess the wined3d_bo_address part should basically be considered opaque above the CS thread.
Yes, as I understand it that's to be used by wined3d to keep track of the lifetime of the memory block.
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
We'd still presumably need an unmap, though, unless the idea above is to *always* allocate new (GPU or CPU) memory, which seems like a much bigger step.
Not necessarily, but wined3d_cs_exec_update_sub_resource() can unmap if needed.
Unless I'm misreading things, this also seems to imply that *_resource_sub_resource_map() shouldn't exist anymore, right?
Non-discard maps still exist, and wined3d_device_context_get_upload_bo() wouldn't be used for those.
So the idea goes like this:
(1) Pass a whole wined3d_bo_address as part of struct wined3d_cs_update_sub_resource.
(2) Introduce wined3d_device_context_get_upload_bo() to allocate memory. At first, only use it for deferred contexts.
(3) Use wined3d_device_context_get_upload_bo() for immediate contexts as well, for both DISCARD and NOOVERWRITE maps and for update_sub_resource. This implies always allocating new CPU/GPU memory, going through the DEFAULT queue, and not waiting for the upload to complete.
wined3d_device_context_get_upload_bo() should probably avoid the queues entirely, at least in the normal case. More below.
The idea that map/unmap/update_sub_resource can go away seems to imply (3) to me, but I'm more than a little curious as to whether it's actually desirable in all cases. Even with persistent mapping you still need to allocate new memory, which isn't exactly cheap, especially with large resources.
You don't need to actually allocate new memory every time, just return memory that you know is available at the moment. The specific mechanism used by wined3d_device_context_get_upload_bo() isn't important to the callers but yes, the API we want would in fact return a new, independent memory area for each DISCARD map. It's the only way to make things fast with constant buffer updates, to name the most critical example. For some of those you'll easily have thousands of DISCARD maps per frame and if you don't want to block on buffer maps and e.g. you have one frame latency to play with, that means having thousands of "instances" of each buffer moving around at any given time.

WRT the mechanism, it might be allocating a large chunk of system memory and suballocating from it, keeping track of the lifetime of each chunk as it is returned to the application and then back to wined3d. Or suballocating from GPU memory. It could even be creating a ton of GL buffers and flipping them around as needed, it doesn't matter; the idea is that each memory chunk can be reused, once we know the memory is available again because it's not used by wined3d (e.g. if it's a block of system memory that's been uploaded to the GPU) or the GPU (e.g. if we directly returned the memory where the GPU resource is stored) anymore.

Ideally, when the application is in a steady state, there should be next to no actual memory allocation or GL / Vulkan resource creation. But, in principle, everything would work just as well (if not as fast) if we allocated system memory every time wined3d_device_context_get_upload_bo() is called and freed it once wined3d is done with uploading the data that was stored into it by the application.
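A minimal sketch of the kind of mechanism described here, with entirely hypothetical names; a real version would recycle individual blocks (and lock, if shared between contexts) rather than only resetting once everything has drained:

#include <stddef.h>
#include <stdint.h>

#define UPLOAD_ALIGNMENT 16

/* One chunk of upload memory per device, handed out linearly and rewound once
 * a counter advanced by the CS thread shows that everything previously handed
 * out has been consumed. */
struct upload_heap
{
    unsigned char *mem;     /* e.g. a persistently mapped buffer, or plain sysmem */
    size_t size, head;      /* total size and next free offset */
    uint64_t last_id;       /* id of the most recent block handed out */
    uint64_t completed_id;  /* most recent id the CS thread has retired */
};

static void *upload_heap_get(struct upload_heap *heap, size_t size, uint64_t *id)
{
    void *block;

    size = (size + UPLOAD_ALIGNMENT - 1) & ~(size_t)(UPLOAD_ALIGNMENT - 1);

    /* Everything we handed out has been consumed: the chunk is free again. */
    if (heap->completed_id >= heap->last_id)
        heap->head = 0;

    /* Out of space: the caller falls back to a plain allocation (or blocks). */
    if (size > heap->size - heap->head)
        return NULL;

    block = heap->mem + heap->head;
    heap->head += size;
    *id = ++heap->last_id;  /* the CS thread advances completed_id past this when done */
    return block;
}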
FWIW, the DYNAMIC d3d usage is supposed to flag the resources that are going to be updated frequently and are performance sensitive. In principle we could reserve wined3d_device_context_get_upload_bo() to DYNAMIC resources only.
And then there's the aforementioned non-discard maps—don't we still need an explicit WINED3D_CS_OP_UNMAP for those? Or any case (are there any?) where we don't want to persistently map a resource?
We might want to keep the existing CS operations for these cases, sure. I suspect it will become clearer further down the road.
On 6/11/21 2:44 AM, Matteo Bruni wrote:
On Fri, Jun 11, 2021 at 5:40 AM Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 6:43 PM, Henri Verbeet wrote:
On Mon, 31 May 2021 at 23:45, Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 4:53 AM, Henri Verbeet wrote:
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx, const struct wined3d_box *box,
        uint32_t flags, struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
This function strikes me as more than a little confusing, partly because it seems to be returning two partially overlapping things, with mostly different usages...
i.e. I'd presume the idea is for wined3d_device_context_update_sub_resource() to retrieve this and then call something like wined3d_context_copy_bo_address() to copy from sysmem, which makes sense, but then what are we doing with map_ptr?
Well, wined3d_context_copy_bo_address() needs a wined3d_context, which you typically don't have on application threads.
The basic idea is that you'd pass "address" to WINED3D_CS_OP_UPDATE_SUB_RESOURCE, and "map_ptr " to the application. (In case of map/unmap; for update_sub_resource() you'd just copy the data into "map_ptr".)
Or, if we use this in wined3d_device_context_map(), couldn't we just call wined3d_context_map_bo_address() on the returned wined3d_bo_address?
Conceptually, sure, you could split this into allocate/map/unmap. In practice though, you'll always want the upload buffer to be mapped, in order to copy data into it from system memory. And in fact, if we're going to return a GPU buffer here, it should generally already be mapped. (For various reasons; for 64-bit applications, we use persistent maps if we can; for OpenGL, we want to avoid making GL calls from application threads; if we have a dedicated upload buffer belonging to the device context, it would have been mapped during creation.) When needed, unmap can happen when the upload buffer is consumed.
Thanks for explaining the rationale there, that does help a lot.
It still feels awkward to me to pass the same information maybe twice, but I guess the wined3d_bo_address part should basically be considered opaque above the CS thread.
Yes, as I understand it that's to be used by wined3d to keep track of the lifetime of the memory block.
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
We'd still presumably need an unmap, though, unless the idea above is to *always* allocate new (GPU or CPU) memory, which seems like a much bigger step.
Not necessarily, but wined3d_cs_exec_update_sub_resource() can unmap if needed.
Unless I'm misreading things, this also seems to imply that *_resource_sub_resource_map() shouldn't exist anymore, right?
Non-discard maps still exist, and wined3d_device_context_get_upload_bo() wouldn't be used for those.
So the idea goes like this:
(1) Pass a whole wined3d_bo_address as part of struct wined3d_cs_update_sub_resource.
(2) Introduce wined3d_device_context_get_upload_bo() to allocate memory. At first, only use it for deferred contexts.
(3) Use wined3d_device_context_get_upload_bo() for immediate contexts as well, for both DISCARD and NOOVERWRITE maps and for update_sub_resource. This implies always allocating new CPU/GPU memory, going through the DEFAULT queue, and not waiting for the upload to complete.
wined3d_device_context_get_upload_bo() should probably avoid the queues entirely, at least in the normal case. More below.
Right, sorry, I don't mean that wined3d_device_context_get_upload_bo() goes through the queues, but that the WINED3D_CS_UPDATE_SUB_RESOURCE goes through the DEFAULT queue instead of the MAP queue once it's submitted.
The idea that map/unmap/update_sub_resource can go away seems to imply (3) to me, but I'm more than a little curious as to whether it's actually desirable in all cases. Even with persistent mapping you still need to allocate new memory, which isn't exactly cheap, especially with large resources.
You don't need to actually allocate new memory every time, just return memory that you know is available at the moment. The specific mechanism used by wined3d_device_get_upload_bo() isn't important to the callers but yes, the API we want would in fact return a new, independent memory area for each DISCARD map. It's the only way to make things fast with constant buffer updates, to name the most critical example. For some of those you'll easily have thousands of DISCARD maps per frame and if you don't want to block on buffer maps and e.g. you have one frame latency to play with, that means having thousands of "instances" of each buffer moving around at any given time. WRT the mechanism, it might be allocating a large chunk of system memory and suballocating from it, keeping track of the lifetime of each chunk as it is returned to the application and then back to wined3d. Or suballocating from GPU memory. It could even be creating a ton of GL buffers and flipping them around as needed, it doesn't matter; the idea is that each memory chunk can be reused, once we know the memory is available again because it's not used by wined3d (e.g. if it's a block of system memory that's been uploaded to the GPU) or the GPU (e.g. if we directly returned the memory where the GPU resource is stored) anymore. Ideally, when the application is in a steady state, there should be next to no actual memory allocation or GL / Vulkan resource creation. But, in principle, everything would work just as well (if not as fast) if we allocated system memory every time wined3d_device_context_get_upload_bo() is called and freed it once wined3d is done with uploading the data that was stored into it by the application.
FWIW, the DYNAMIC d3d usage is supposed to flag the resources that are going to be updated frequently and are performance sensitive. In principle we could reserve wined3d_device_context_get_upload_bo() to DYNAMIC resources only.
And then there's the aforementioned non-discard maps—don't we still need an explicit WINED3D_CS_OP_UNMAP for those? Or any case (are there any?) where we don't want to persistently map a resource?
We might want to keep the existing CS operations for these cases, sure. I suspect it will become clearer further down the road.
Okay, many thanks to you and Henri for those answers; I think I have a clear enough picture now.
On Fri, 11 Jun 2021 at 05:39, Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 6:43 PM, Henri Verbeet wrote:
Conceptually, sure, you could split this into allocate/map/unmap. In practice though, you'll always want the upload buffer to be mapped, in order to copy data into it from system memory. And in fact, if we're going to return a GPU buffer here, it should generally already be mapped. (For various reasons; for 64-bit applications, we use persistent maps if we can; for OpenGL, we want to avoid making GL calls from application threads; if we have a dedicated upload buffer belonging to the device context, it would have been mapped during creation.) When needed, unmap can happen when the upload buffer is consumed.
Thanks for explaining the rationale there, that does help a lot.
It still feels awkward to me to pass the same information maybe twice, but I guess the wined3d_bo_address part should basically be considered opaque above the CS thread.
In a sense, it's opaque outside the adapter specific code. I.e., resolving the "buffer_object" part of the address is specific to a particular adapter implementation. You could think of those as address spaces, if you like. Of course "buffer_object" 0 is special because it's CPU memory, but there isn't necessarily any relation between the "addr" part and what would get returned in "map_ptr".
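As a toy illustration of the "address spaces" point (not the actual adapter code, and assuming a GL adapter that stores the GL buffer name in buffer_object and treats addr as an offset in that case):

/* Illustrative only: buffer_object 0 means "addr" is already a CPU pointer;
 * a non-zero buffer_object is assumed here to name a GL buffer, with "addr"
 * acting as an offset into it. */
static void *resolve_bo_address_gl_sketch(const struct wined3d_bo_address *data, size_t size)
{
    if (!data->buffer_object)
        return data->addr;      /* CPU address space */

    /* GL address space: only the GL adapter, on the CS thread, can do this. */
    glBindBuffer(GL_COPY_WRITE_BUFFER, (GLuint)data->buffer_object);
    return glMapBufferRange(GL_COPY_WRITE_BUFFER, (GLintptr)(ULONG_PTR)data->addr,
            size, GL_MAP_WRITE_BIT);
}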
Non-discard maps still exist, and wined3d_device_context_get_upload_bo() wouldn't be used for those.
So the idea goes like this:
(1) Pass a whole wined3d_bo_address as part of struct wined3d_cs_update_sub_resource.
(2) Introduce wined3d_device_context_get_upload_bo() to allocate memory. At first, only use it for deferred contexts.
(3) Use wined3d_device_context_get_upload_bo() for immediate contexts as well, for both DISCARD and NOOVERWRITE maps and for update_sub_resource. This implies always allocating new CPU/GPU memory, going through the DEFAULT queue, and not waiting for the upload to complete.
As Matteo said, more "available" than "new", but essentially yes.
The idea that map/unmap/update_sub_resource can go away seems to imply (3) to me, but I'm more than a little curious as to whether it's actually desirable in all cases. Even with persistent mapping you still need to allocate new memory, which isn't exactly cheap, especially with large resources.
Well, we can make wined3d_device_context_get_upload_bo() fail when desired, and the caller would then need to handle that, probably by using the existing path.
And then there's the aforementioned non-discard maps—don't we still need an explicit WINED3D_CS_OP_UNMAP for those? Or any case (are there any?) where we don't want to persistently map a resource?
Yes, we're much more conservative about creating persistent mappings on 32-bit, for example.
As usual my reply ended up being unwieldy and confusing. Hopefully it's still helpful...
On Mon, May 31, 2021 at 11:45 PM Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 4:53 AM, Henri Verbeet wrote:
Yes, mostly what Matteo said.
On Fri, 28 May 2021 at 15:39, Matteo Bruni matteo.mystral@gmail.com wrote:
So this is, AFAIR, more or less what the staging patchset did. I don't think that we want to put the data in the command stream though.
It may not be too bad for deferred contexts, but for immediate contexts probably not, no.
Always copying the data from the user-provided source to some other storage is definitely the right choice: we don't want to block the application (more or less directly) to make sure that the data stays around until we need it. That's bad for performance for immediate contexts and just impossible for deferred, so not much to argue there. At the same time we also want to minimize the amount of copies. Ideally, for a GPU resource, here we'd upload the data right to the GPU (e.g. inside a persistently mapped buffer), ready to be used whenever it's time.
To clarify, when talking about uploading directly to the GPU, are you just referring to map() here, or update_sub_resource() as well?
Potentially both cases. A DISCARD map is virtually indistinguishable from update_sub_resource().
Does it even make sense to use a persistently mapped GPU buffer for a resource mapped/updated by a deferred context? Obligatory disclaimer that I don't know anything about GPU-side performance, but obviously you can't reuse an earlier mapping for UpdateSubResource() or for DISCARD maps, and I'm failing to understand why an application would ever use NOOVERWRITE on a deferred context.
I don't have all the details of deferred contexts down, but I'm pretty sure that you can. All that you need to know is that the memory block you return to the application is not in use (be it by wined3d or by the underlying API / GPU) by the time the application will access it. Whether the memory is backed by a buffer object, suballocated from a larger GPU-side resource or a random chunk of system memory is an implementation detail.
NOOVERWRITE also makes perfect sense with deferred contexts. The general idea behind DISCARD and NOOVERWRITE is to gradually "fill" a larger resource over a number of draws, without requiring any synchronization between the draws. This is probably largely or entirely obvious but I'll go through the whole story to make sure we're on the same page.
The natural use case goes like this: the application wants to draw some dynamic geometry (i.e. generated at runtime, with variable primitive count). At initialization time it creates a couple large buffers (e.g. 1 MB each), one for vertices and one for indices. Then at runtime it maps both with DISCARD, fills them with whatever data is necessary for the current draw (e.g. vertex and index data for 100 triangles), keeping track of how much memory is still available in either buffer, unmaps and draws. After a while it decides that it needs to draw some more dynamic stuff. Now it does something very similar to the previous time, except if there is still some free space available it will map the buffer with NOOVERWRITE and "append" to the buffer, writing the new data after the area used by the first draw. NOOVERWRITE is a promise to d3d to not touch the data in use by any previous draw, which in practice means that the implementation can return the same memory as the previous map, rather than blocking to make sure that any pending draw using the same buffer is completed. Eventually the application will run out of space from one buffer or the other. No problem, at that point the application will map the relevant buffer with DISCARD and restart filling the buffer from the top. Here DISCARD is a different promise to d3d: the application tells d3d that it won't try to make use of the old contents of the buffer. What it implies for the implementation is that an entirely new "physical" buffer can be made available to the application, which will take the place of the previous "physical" buffer once pending draws are done with. Afterwards the application will resume mapping the buffer with NOOVERWRITE for the following dynamic draws, until the buffer fills up again and so on and so forth. Worth mentioning is that the "new" buffer that the implementation can return on a DISCARD map might in fact be an old buffer, for example coming from a buffer discarded some time earlier, currently unused and sitting idle. Or it might be a portion of a larger physical buffer that the implementation knows it's not in use.
Of course applications can and often do find ways to use those primitives more creatively, but this is the basic idea. Also, this works just as well for immediate or for deferred contexts.
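As a purely illustrative application-side sketch of that pattern (not wined3d code): the vertex type, fill logic and sizes are made up, and the buffer is assumed to have been created with D3D11_USAGE_DYNAMIC, D3D11_CPU_ACCESS_WRITE and a matching input layout/stride.

#define COBJMACROS
#include <d3d11.h>
#include <string.h>

#define DYNAMIC_VB_SIZE (1024 * 1024) /* the 1 MB buffer from the example */

struct vertex { float position[3]; float colour[4]; };

static unsigned int vb_pos; /* bytes of the buffer already used */

static void draw_dynamic(ID3D11DeviceContext *context, ID3D11Buffer *vb,
        const struct vertex *data, unsigned int count)
{
    unsigned int size = count * sizeof(*data);
    D3D11_MAP map_type = D3D11_MAP_WRITE_NO_OVERWRITE;
    D3D11_MAPPED_SUBRESOURCE map_desc;

    if (vb_pos + size > DYNAMIC_VB_SIZE)
    {
        /* Out of space: promise not to use the old contents, and start
         * filling the (new "physical") buffer from the top again. */
        map_type = D3D11_MAP_WRITE_DISCARD;
        vb_pos = 0;
    }

    ID3D11DeviceContext_Map(context, (ID3D11Resource *)vb, 0, map_type, 0, &map_desc);
    memcpy((BYTE *)map_desc.pData + vb_pos, data, size);
    ID3D11DeviceContext_Unmap(context, (ID3D11Resource *)vb, 0);

    /* Assumes the bound vertex buffer stride is sizeof(struct vertex). */
    ID3D11DeviceContext_Draw(context, count, vb_pos / sizeof(*data));
    vb_pos += size;
}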
For a CPU resource instead we'd copy the data into its final place in sysmem (what will eventually become its "heap_memory"). We don't quite have the infrastructure for it at the moment but that would be the goal IMO. What you do in patch 2/2 is more along the lines of this idea. The sysmem allocation mechanism works okay for the time being and could be extended to handle other types of memory, e.g. it could eventually take charge of the vulkan allocator, making it generic and usable from GL too. I don't think it has to be managed by each deferred context (i.e. it should probably be per-device), although the current scheme has the advantage of not having to care about concurrency.
In a sense there is no "current scheme", i.e. deferred contexts are tracking which memory belongs to which resource because that *has* to be done per-context, but nobody's managing custom heaps. Unless I misunderstand what's meant here?
Why? Memory is memory and is a per-device resource. All you need to do is return some available memory to the application whenever it asks for a resource map, and to do that you need to keep track of how memory is used and for how long.
So, at least in my view, I'd bring the allocator bits from patch 2/2 into patch 1/2 and use that for update_sub_resource, both the deferred and the immediate variants (the latter could be its own patch). You'd only need the DISCARD portion for update_sub_resource, so the rest can (and probably should) stay in the map() patch.
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx,
        const struct wined3d_box *box, uint32_t flags,
        struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
This function strikes me as more than a little confusing, partly because it seems to be returning two partially overlapping things, with mostly different usages...
i.e. I'd presume the idea is for wined3d_device_context_update_sub_resource() to retrieve this and then call something like wined3d_context_copy_bo_address() to copy from sysmem, which makes sense, but then what are we doing with map_ptr?
Or, if we use this in wined3d_device_context_map(), couldn't we just call wined3d_context_map_bo_address() on the returned wined3d_bo_address?
As I understand it, the idea is to return map_ptr to the application and keep track of address internally (i.e. it's the reference to the "physical resource" associated with the memory returned via map_ptr). You'll want to keep track of those struct wined3d_bo_address and in particular of when they'll become available again at some point in the future.
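A hypothetical bookkeeping sketch of that idea, assuming the usual wined3d_private.h context; the structure and field names are illustrative only, not an existing wined3d interface:

struct upload_allocation
{
    struct wined3d_bo_address address; /* the "physical" memory backing the map */
    void *map_ptr;                     /* pointer handed back to the application */
    uint64_t free_after;               /* CS/GPU point after which it can be reused */
    struct list entry;
};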
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
We'd still presumably need an unmap, though, unless the idea above is to *always* allocate new (GPU or CPU) memory, which seems like a much bigger step.
I'm not entirely on top of the latest changes in wined3d and I'm not sure what we want the various _ops structs to look like eventually, but AFAIU we do want to always allocate new memory for DISCARD maps (and we basically do that already, both for Vulkan and GL, although in a somewhat roundabout fashion). As I mentioned earlier, it doesn't have to be actually new memory, if we happen to have some available memory around we can use it instead. I think the point is that maps really need to go through wined3d_device_context_ops / the CS only because, sometimes, you need new memory. Once you have a call that caters to that situation, you should be set. All the other situations can ideally be handled on the "client" side, without crossing the CS boundary. There are a few complications (what about READ maps from a STAGING resource? updates to a non-DYNAMIC buffer?) but those are the special cases, the WRITE-only DISCARD / NOOVERWRITE dynamic buffer map is the interesting use case that deserves to be streamlined. For deferred contexts, it's the only allowed case and that's no coincidence.
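A rough sketch of that routing, with assumed names and signatures rather than the real code: only maps that need fresh memory (or hit one of the special cases) cross over to the per-context ops / the CS, everything else is satisfied on the client side.

HRESULT wined3d_device_context_map(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx,
        struct wined3d_map_desc *map_desc, const struct wined3d_box *box, unsigned int flags)
{
    struct wined3d_bo_address address;
    void *map_ptr;

    if ((flags & (WINED3D_MAP_DISCARD | WINED3D_MAP_NOOVERWRITE))
            && wined3d_device_context_get_upload_bo(context, resource,
            sub_resource_idx, box, flags, &address, &map_ptr))
    {
        /* Fast path: no CS round trip, no waiting. "address" is tracked
         * internally; pitches omitted for brevity. */
        map_desc->data = map_ptr;
        return WINED3D_OK;
    }

    /* Special cases (READ maps on STAGING resources, non-DYNAMIC updates,
     * ...) still go through the per-context ops / the CS. */
    return context->ops->map(context, resource, sub_resource_idx, map_desc, box, flags);
}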
Unless I'm misreading things, this also seems to imply that *_resource_sub_resource_map() shouldn't exist anymore, right?
That's a separate question I think, as that's the API the upper wined3d "layers" use to access physical resources. It might need to be modified along the way to better handle these new concepts, but it seems likely that we'll still need something in the same fashion.
On Tue, 1 Jun 2021 at 02:12, Matteo Bruni matteo.mystral@gmail.com wrote:
Does it even make sense to use a persistently mapped GPU buffer for a resource mapped/updated by a deferred context? Obligatory disclaimer that I don't know anything about GPU-side performance, but obviously you can't reuse an earlier mapping for UpdateSubResource() or for DISCARD maps, and I'm failing to understand why an application would ever use NOOVERWRITE on a deferred context.
I don't have all the details of deferred contexts down, but I'm pretty sure that you can. All that you need to know is that the memory block you return to the application is not in use (be it by wined3d or by the underlying API / GPU) by the time the application will access it. Whether the memory is backed by a buffer object, suballocated from a larger GPU-side resource or a random chunk of system memory is an implementation detail.
NOOVERWRITE also makes perfect sense with deferred contexts. The general idea behind DISCARD and NOOVERWRITE is to gradually "fill" a larger resource over a number of draws, without requiring any synchronization between the draws. This is probably largely or entirely obvious but I'll go through the whole story to make sure we're on the same page.
The natural use case goes like this: the application wants to draw some dynamic geometry (i.e. generated at runtime, with variable primitive count). At initialization time it creates a couple large buffers (e.g. 1 MB each), one for vertices and one for indices. Then at runtime it maps both with DISCARD, fills them with whatever data is necessary for the current draw (e.g. vertex and index data for 100 triangles), keeping track of how much memory is still available in either buffer, unmaps and draws. After a while it decides that it needs to draw some more dynamic stuff. Now it does something very similar to the previous time, except if there is still some free space available it will map the buffer with NOOVERWRITE and "append" to the buffer, writing the new data after the area used by the first draw. NOOVERWRITE is a promise to d3d to not touch the data in use by any previous draw, which in practice means that the implementation can return the same memory as the previous map, rather than blocking to make sure that any pending draw using the same buffer is completed. Eventually the application will run out of space from one buffer or the other. No problem, at that point the application will map the relevant buffer with DISCARD and restart filling the buffer from the top. Here DISCARD is a different promise to d3d: the application tells d3d that it won't try to make use of the old contents of the buffer. What it implies for the implementation is that an entirely new "physical" buffer can be made available to the application, which will take the place of the previous "physical" buffer once pending draws are done with. Afterwards the application will resume mapping the buffer with NOOVERWRITE for the following dynamic draws, until the buffer fills up again and so on and so forth. Worth mentioning is that the "new" buffer that the implementation can return on a DISCARD map might in fact be an old buffer, for example coming from a buffer discarded some time earlier, currently unused and sitting idle. Or it might be a portion of a larger physical buffer that the implementation knows it's not in use.
Of course applications can and often do find ways to use those primitives more creatively, but this is the basic idea. Also, this works just as well for immediate or for deferred contexts.
Although you could certainly argue that since deferred contexts/command lists are created in advance, there should be no need for NOOVERWRITE maps, and the application could simply fill the entire resource (or at least, the part it's going to use) with the initial DISCARD map. (The case for DISCARD maps vs UpdateSubResource() is perhaps a little more subtle; when using GPU buffers, DISCARD maps would allow you to avoid a copy.) I suspect it's mostly for convenience; NOOVERWRITE maps on deferred contexts allow applications to render in the same way to deferred contexts as they would to immediate contexts.
On 6/1/21 2:48 AM, Henri Verbeet wrote:
Although you could certainly argue that since deferred contexts/command lists are created in advance, there should be no need for NOOVERWRITE maps, and the application could simply fill the entire resource (or at least, the part it's going to use) with the initial DISCARD map. (The case for DISCARD maps vs UpdateSubResource() is perhaps a little more subtle; when using GPU buffers, DISCARD maps would allow you to avoid a copy.) I suspect it's mostly for convenience; NOOVERWRITE maps on deferred contexts allow applications to render in the same way to deferred contexts as they would to immediate contexts.
I think that's mostly what was confusing me about it, yeah, but on reflection it would make sense that the application doesn't actually know all of the data it's going to upload in advance :-/
On 5/31/21 7:12 PM, Matteo Bruni wrote:
As usual my reply ended up being unwieldy and confusing. Hopefully it's still helpful...
On Mon, May 31, 2021 at 11:45 PM Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 5/31/21 4:53 AM, Henri Verbeet wrote:
Yes, mostly what Matteo said.
On Fri, 28 May 2021 at 15:39, Matteo Bruni matteo.mystral@gmail.com wrote:
So this is, AFAIR, more or less what the staging patchset did. I don't think that we want to put the data in the command stream though.
It may not be too bad for deferred contexts, but for immediate contexts probably not, no.
Always copying the data from the user-provided source to some other storage is definitely the right choice: we don't want to block the application (more or less directly) to make sure that the data stays around until we need it. That's bad for performance for immediate contexts and just impossible for deferred, so not much to argue there. At the same time we also want to minimize the amount of copies. Ideally, for a GPU resource, here we'd upload the data right to the GPU (e.g. inside a persistently mapped buffer), ready to be used whenever it's time.
To clarify, when talking about uploading directly to the GPU, are you just referring to map() here, or update_sub_resource() as well?
Potentially both cases. A DISCARD map is virtually indistinguishable from update_sub_resource().
Does it even make sense to use a persistently mapped GPU buffer for a resource mapped/updated by a deferred context? Obligatory disclaimer that I don't know anything about GPU-side performance, but obviously you can't reuse an earlier mapping for UpdateSubResource() or for DISCARD maps, and I'm failing to understand why an application would ever use NOOVERWRITE on a deferred context.
I don't have all the details of deferred contexts down, but I'm pretty sure that you can. All that you need to know is that the memory block you return to the application is not in use (be it by wined3d or by the underlying API / GPU) by the time the application will access it. Whether the memory is backed by a buffer object, suballocated from a larger GPU-side resource or a random chunk of system memory is an implementation detail.
I think there may be some miscommunication here, but I'm talking about the application doing something like:
1. map resource with DISCARD
2. draw from it
3. map resource with DISCARD, filling it with different data
4. draw from it
(3) can't return the same memory as (1) on a deferred context, nor can we throw away the contents of (1) when we get (3).
If the application never uses NOOVERWRITE or partial maps (and I don't know how common that actually is), it comes down to a single blit to a GPU resource. I'm guessing that's not really an ideal place to use a persistent map.
NOOVERWRITE also makes perfect sense with deferred contexts. The general idea behind DISCARD and NOOVERWRITE is to gradually "fill" a larger resource over a number of draws, without requiring any synchronization between the draws. This is probably largely or entirely obvious but I'll go through the whole story to make sure we're on the same page.
The natural use case goes like this: the application wants to draw some dynamic geometry (i.e. generated at runtime, with variable primitive count). At initialization time it creates a couple large buffers (e.g. 1 MB each), one for vertices and one for indices. Then at runtime it maps both with DISCARD, fills them with whatever data is necessary for the current draw (e.g. vertex and index data for 100 triangles), keeping track of how much memory is still available in either buffer, unmaps and draws. After a while it decides that it needs to draw some more dynamic stuff. Now it does something very similar to the previous time, except if there is still some free space available it will map the buffer with NOOVERWRITE and "append" to the buffer, writing the new data after the area used by the first draw. NOOVERWRITE is a promise to d3d to not touch the data in use by any previous draw, which in practice means that the implementation can return the same memory as the previous map, rather than blocking to make sure that any pending draw using the same buffer is completed. Eventually the application will run out of space from one buffer or the other. No problem, at that point the application will map the relevant buffer with DISCARD and restart filling the buffer from the top. Here DISCARD is a different promise to d3d: the application tells d3d that it won't try to make use of the old contents of the buffer. What it implies for the implementation is that an entirely new "physical" buffer can be made available to the application, which will take the place of the previous "physical" buffer once pending draws are done with. Afterwards the application will resume mapping the buffer with NOOVERWRITE for the following dynamic draws, until the buffer fills up again and so on and so forth. Worth mentioning is that the "new" buffer that the implementation can return on a DISCARD map might in fact be an old buffer, for example coming from a buffer discarded some time earlier, currently unused and sitting idle. Or it might be a portion of a larger physical buffer that the implementation knows it's not in use.
Of course applications can and often do find ways to use those primitives more creatively, but this is the basic idea. Also, this works just as well for immediate or for deferred contexts.
For a CPU resource instead we'd copy the data into its final place in sysmem (what will eventually become its "heap_memory"). We don't quite have the infrastructure for it at the moment but that would be the goal IMO. What you do in patch 2/2 is more along the lines of this idea. The sysmem allocation mechanism works okay for the time being and could be extended to handle other types of memory, e.g. it could eventually take charge of the vulkan allocator, making it generic and usable from GL too. I don't think it has to be managed by each deferred context (i.e. it should probably be per-device), although the current scheme has the advantage of not having to care about concurrency.
In a sense there is no "current scheme", i.e. deferred contexts are tracking which memory belongs to which resource because that *has* to be done per-context, but nobody's managing custom heaps. Unless I misunderstand what's meant here?
Why? Memory is memory and is a per-device resource. All you need to do is return some available memory to the application whenever it asks for a resource map, and to do that you need to keep track of how memory is used and for how long.
The other thing a deferred context needs to do is keep track of the last bit of memory it returned for each resource, and return that same bit of memory in case of a NOOVERWRITE map.
Actually I guess we don't *need* to do that; we could return a new bit of memory; but Windows does (which means that applications might depend on it), and it strikes me as better for performance...
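A hypothetical sketch of that deferred-context bookkeeping (the structure and its fields are assumptions for illustration, not the actual cs.c implementation): remember the memory returned by the last DISCARD map of each sub-resource, so a later NOOVERWRITE map can hand back the same pointer.

struct wined3d_deferred_upload
{
    struct wined3d_resource *resource;
    unsigned int sub_resource_idx;
    void *sysmem; /* allocation backing the map */
};

static void *deferred_context_find_upload(const struct wined3d_deferred_upload *uploads,
        SIZE_T upload_count, struct wined3d_resource *resource, unsigned int sub_resource_idx)
{
    SIZE_T i = upload_count;

    /* Search backwards: the most recent DISCARD map wins. */
    while (i--)
    {
        if (uploads[i].resource == resource && uploads[i].sub_resource_idx == sub_resource_idx)
            return uploads[i].sysmem;
    }
    return NULL;
}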
So, at least in my view, I'd bring the allocator bits from patch 2/2 into patch 1/2 and use that for update_sub_resource, both the deferred and the immediate variants (the latter could be its own patch). You'd only need the DISCARD portion for update_sub_resource, so the rest can (and probably should) stay in the map() patch.
My (very rough) proposal for this would be to replace the "data" field in the wined3d_cs_update_sub_resource structure with a wined3d_bo_address, and then introduce something like the following:
static bool wined3d_device_context_get_upload_bo(struct wined3d_device_context *context,
        struct wined3d_resource *resource, unsigned int sub_resource_idx,
        const struct wined3d_box *box, uint32_t flags,
        struct wined3d_bo_address *address, void **map_ptr)
{
    ...
}
This function strikes me as more than a little confusing, partly because it seems to be returning two partially overlapping things, with mostly different usages...
i.e. I'd presume the idea is for wined3d_device_context_update_sub_resource() to retrieve this and then call something like wined3d_context_copy_bo_address() to copy from sysmem, which makes sense, but then what are we doing with map_ptr?
Or, if we use this in wined3d_device_context_map(), couldn't we just call wined3d_context_map_bo_address() on the returned wined3d_bo_address?
As I understand it, the idea is to return map_ptr to the application and keep track of address internally (i.e. it's the reference to the "physical resource" associated with the memory returned via map_ptr). You'll want to keep track of those struct wined3d_bo_address and in particular of when they'll become available again at some point in the future.
For deferred contexts you should then be able to do something (conceptually) very similar to patch 2/2, and in the end we may not even need map/unmap/update_sub_resource in the wined3d_device_context_ops structure.
We'd still presumably need an unmap, though, unless the idea above is to *always* allocate new (GPU or CPU) memory, which seems like a much bigger step.
I'm not entirely on top of the latest changes in wined3d and I'm not sure what we want the various _ops structs to look like eventually, but AFAIU we do want to always allocate new memory for DISCARD maps (and we basically do that already, both for Vulkan and GL, although in a somewhat roundabout fashion). As I mentioned earlier, it doesn't have to be actually new memory, if we happen to have some available memory around we can use it instead. I think the point is that maps really need to go through wined3d_device_context_ops / the CS only because, sometimes, you need new memory. Once you have a call that caters to that situation, you should be set. All the other situations can ideally be handled on the "client" side, without crossing the CS boundary. There are a few complications (what about READ maps from a STAGING resource? updates to a non-DYNAMIC buffer?) but those are the special cases, the WRITE-only DISCARD / NOOVERWRITE dynamic buffer map is the interesting use case that deserves to be streamlined. For deferred contexts, it's the only allowed case and that's no coincidence.
Unless I'm misreading things, this also seems to imply that *_resource_sub_resource_map() shouldn't exist anymore, right?
That's a separate question I think, as that's the API the upper wined3d "layers" use to access physical resources. It might need to be modified along the way to better handle these new concepts, but it seems likely that we'll still need something in the same fashion.