From: Steve Schnepp steve.schnepp@pwkf.org
Convert all consecutive calls to d7_DrawPrimitive() into a single call to wined3d.
The buffer is flushed upon every state change. It is also flushed when incompatible arguments are passed to d7_DrawPrimitive(), such as a different fvf or a different primitive_type.
Note that this does *increase* the amount of data to transmit. This is not an issue because the bandwidth overhead is much less costly than the overhead of multiple calls. It also leverages wined3d primitives directly and allocates space directly in the wined3d device buffer.
Finally, only a subset of the calls get buffered in order to ensure that the disruption is minimal.
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=33814
---
Several remarks:
1. I finally squashed everything into a single commit as it was too cumbersome to maintain the separation. I can split it at the end once everything else is fine.
2. I didn't change (yet) the ddraw_perf channel to d3d_perf as d3d_perf isn't used at all in ddraw.dll.
3. I used HRESULT as a return value for buffer_add instead of bool. I do not like returning bool when it is not obvious what true/false mean. Yet, I don't mind changing it to an enum if one prefers. And of course, I can change it to bool if one insists ;-) --- dlls/ddraw/ddraw_private.h | 37 +++++++ dlls/ddraw/device.c | 216 ++++++++++++++++++++++++++++++++++++- dlls/ddraw/surface.c | 18 ++++ 3 files changed, 268 insertions(+), 3 deletions(-)
diff --git a/dlls/ddraw/ddraw_private.h b/dlls/ddraw/ddraw_private.h index 18ec1d84a5b..3bb5b3790ca 100644 --- a/dlls/ddraw/ddraw_private.h +++ b/dlls/ddraw/ddraw_private.h @@ -314,6 +314,21 @@ DWORD ddraw_allocate_handle(struct ddraw_handle_table *t, void *object, enum ddr void *ddraw_free_handle(struct ddraw_handle_table *t, DWORD handle, enum ddraw_handle_type type) DECLSPEC_HIDDEN; void *ddraw_get_object(struct ddraw_handle_table *t, DWORD handle, enum ddraw_handle_type type) DECLSPEC_HIDDEN;
+#define VERTEX_BUFFER_SIZE 4096 + +struct vertex_batch { + D3DPRIMITIVETYPE primitive_type; + DWORD fvf; + UINT stride; + char *vertices; + + /* vertex_count arg has a DWORD type in D3D calls, + * but we won't go beyond an unsigned int capacity, as documentation says + * it is limited to D3DMAXNUMVERTICES (65,535) */ + unsigned int vertex_count; + unsigned int vertex_pos; +}; + struct d3d_device { /* IUnknown */ @@ -366,6 +381,9 @@ struct d3d_device
struct wined3d_stateblock *recording, *state, *update_state; const struct wined3d_stateblock_state *stateblock_state; + + /* Vertex buffer for squashing DrawPrimitive() calls before sending it to wined3d */ + struct vertex_batch vertex_batch; };
HRESULT d3d_device_create(struct ddraw *ddraw, const GUID *guid, struct ddraw_surface *target, IUnknown *rt_iface, @@ -747,4 +765,23 @@ HRESULT hr_ddraw_from_wined3d(HRESULT hr) DECLSPEC_HIDDEN; void viewport_alloc_active_light_index(struct d3d_light *light) DECLSPEC_HIDDEN; void viewport_free_active_light_index(struct d3d_light *light) DECLSPEC_HIDDEN;
+HRESULT ddraw_buffer_flush(struct d3d_device *device); + +/* Helper to call ddraw_buffer_flush() only if needed. + * + * It enables the compiler to inline the test so it does not have a function + * call penalty. + * + * Otherwise it even shows itself in perf as it is called *quite* often. + * With this helper, even the test disappears from perf. */ +static inline HRESULT ddraw_buffer_flush_if_needed(struct d3d_device *device) { + /* Nothing to do if it is empty */ + if (! device) + return D3D_OK; + if (! device->vertex_batch.vertex_count) + return D3D_OK; + + return ddraw_buffer_flush(device); +} + #endif diff --git a/dlls/ddraw/device.c b/dlls/ddraw/device.c index 1cfef5007d5..b34ad56b4dc 100644 --- a/dlls/ddraw/device.c +++ b/dlls/ddraw/device.c @@ -31,6 +31,7 @@
WINE_DEFAULT_DEBUG_CHANNEL(ddraw); WINE_DECLARE_DEBUG_CHANNEL(winediag); +WINE_DECLARE_DEBUG_CHANNEL(d3d_perf);
/* The device ID */ const GUID IID_D3DDEVICE_WineD3D = { @@ -1571,6 +1572,178 @@ static HRESULT WINAPI d3d_device1_BeginScene(IDirect3DDevice *iface) return IDirect3DDevice7_BeginScene(&device->IDirect3DDevice7_iface); }
+/* Flushing the buffer if it isn't empty. + * + * It will delegate to a single call to wined3d with the correct parameters, + * and a (hopefully) huge list of vertices/indices. */ +HRESULT ddraw_buffer_flush(struct d3d_device *device) { + HRESULT hr; + + TRACE("device %p primitive_type %#x vertex_count %05u\n", device, device->vertex_batch.primitive_type, device->vertex_batch.vertex_count); + + /* Calling wined3d directly */ + wined3d_streaming_buffer_unmap(&device->vertex_buffer); + wined3d_mutex_lock(); + + + hr = wined3d_stateblock_set_stream_source(device->state, 0, device->vertex_buffer.buffer, 0, device->vertex_batch.stride); + if (FAILED(hr)) + goto done; + + wined3d_stateblock_set_vertex_declaration(device->state, ddraw_find_decl(device->ddraw, device->vertex_batch.fvf)); + + switch (device->vertex_batch.primitive_type) { + case D3DPT_POINTLIST: + wined3d_device_context_set_primitive_type(device->immediate_context, wined3d_primitive_type_from_ddraw(D3DPT_POINTLIST), 0); + break; + case D3DPT_TRIANGLEFAN: + wined3d_device_context_set_primitive_type(device->immediate_context, wined3d_primitive_type_from_ddraw(D3DPT_TRIANGLELIST), 0); + break; + case D3DPT_LINELIST: + wined3d_device_context_set_primitive_type(device->immediate_context, wined3d_primitive_type_from_ddraw(D3DPT_LINELIST), 0); + break; + default: + FIXME("primitive_type %#x not supported yet\n", device->vertex_batch.primitive_type); + wined3d_device_context_set_primitive_type(device->immediate_context, wined3d_primitive_type_from_ddraw(D3DPT_TRIANGLELIST), 0); + } + + wined3d_device_apply_stateblock(device->wined3d_device, device->state); + d3d_device_sync_surfaces(device); + + wined3d_device_context_draw(device->immediate_context, + device->vertex_batch.vertex_pos / device->vertex_batch.stride, + device->vertex_batch.vertex_count, 0, 0); + +done: + wined3d_mutex_unlock(); + + /* Reset the buffer */ + device->vertex_batch.vertices = NULL; + device->vertex_batch.vertex_count = 0; + + return hr; +} 
+ +static void ddraw_buffer_add_indices_list(struct d3d_device *device, void* vertices, DWORD vertex_count) { + UINT stride = device->vertex_batch.stride; + + /* All the points are already in the same order, copy them over */ + memcpy(device->vertex_batch.vertices + device->vertex_batch.vertex_count * stride, vertices, vertex_count * stride); + device->vertex_batch.vertex_count += vertex_count; +} + +static void ddraw_buffer_add_indices_trianglefan(struct d3d_device *device, void* vertices, DWORD vertex_count) { + UINT stride = device->vertex_batch.stride; + unsigned int idx; + + /* The first triangle is the same, therefore the vertexesh are simply copied over */ + memcpy(device->vertex_batch.vertices + device->vertex_batch.vertex_count * stride, vertices, 3 * stride); + device->vertex_batch.vertex_count += 3; + + /* Next triangles are recreated with : 2 next vertices then the 1rst one. + * So, it will *increase* the number of total vertices from 4 to 6, 5 to 9, 6 to 12, ... */ + for (idx = 2; idx < vertex_count-1; idx ++) { + char* next_vertice = (char*) vertices + idx * stride; + + /* Copy the 2 next ones */ + memcpy(device->vertex_batch.vertices + device->vertex_batch.vertex_count * stride, next_vertice, 2 * stride); + device->vertex_batch.vertex_count += 2; + + /* Copy the first one again */ + memcpy(device->vertex_batch.vertices + device->vertex_batch.vertex_count * stride, vertices, stride); + device->vertex_batch.vertex_count += 1; + } +} + +/* + * Note : Adding to the buffer transforms the D3DPT_TRIANGLEFAN primitive into a D3DPT_TRIANGLELIST. + * Otherwise, we cannot concatenate them as TRIANGLE FAN has the first vertex in common to the whole list. 
+ */ +static HRESULT ddraw_buffer_add(struct d3d_device *device, D3DPRIMITIVETYPE primitive_type, DWORD fvf, void *vertices, DWORD vertex_count, DWORD flags, UINT stride) { + HRESULT hr; + + TRACE("device %p, primitive_type %#x, fvf %#lx, vertices %p, vertex_count %lu, flags %#lx, stride %u.\n", + device, primitive_type, fvf, vertices, vertex_count, flags, stride); + + if (device->vertex_batch.vertex_count) { + /* if already-buffered vertexes do not match the one that we want to add, flush. */ + if (primitive_type != device->vertex_batch.primitive_type) { + TRACE_(d3d_perf)("Buffering failed due to mismatched primitive_type %d != buffer.primitive_type %d \n", primitive_type, device->vertex_batch.primitive_type); + ddraw_buffer_flush(device); + } else if (fvf != device->vertex_batch.fvf) { + TRACE_(d3d_perf)("Buffering failed due to mismatched fvf %ld != buffer.fvf %ld \n", fvf, device->vertex_batch.fvf); + ddraw_buffer_flush(device); + } else if (device->vertex_batch.vertex_count + vertex_count * 2 > VERTEX_BUFFER_SIZE) { + /* We double the number of vertices to add since + * - it is a very fast mul + * - the number will never more than double + * - the precision it offers is good enough */ + FIXME_(d3d_perf)("Buffering failed due to almost full buffer vertex_count %u, adding %lu, max %u \n", device->vertex_batch.vertex_count, vertex_count, VERTEX_BUFFER_SIZE); + ddraw_buffer_flush(device); + } + } + + switch(primitive_type) { + case D3DPT_TRIANGLEFAN: + case D3DPT_LINELIST: + case D3DPT_POINTLIST: + /* Supported primitives */ + break; + default: + FIXME("primitive_type %#x is not supported, not buffering\n", primitive_type); + goto fail; + } + + /* Need to test again vertex_count as a flush resets it to 0 */ + if (! device->vertex_batch.vertex_count) { + /* New buffer, setting everything up */ + device->vertex_batch.primitive_type = primitive_type; + device->vertex_batch.fvf = fvf; + device->vertex_batch.stride = stride; + + /* We map & unmap directly. 
+ * That way, we only reserve the space and other calls will have a new one. + * It should not happen, but let's be safe. + * + * We will fill it with following calls */ + hr = wined3d_streaming_buffer_map(device->wined3d_device, &device->vertex_buffer, + VERTEX_BUFFER_SIZE * stride, stride, + &device->vertex_batch.vertex_pos, (void**) &device->vertex_batch.vertices); + if (FAILED(hr)) + goto fail; + } + + /* Create the index */ + switch(primitive_type) { + case D3DPT_TRIANGLEFAN: + if (vertex_count < 3) { + WARN("vertex_count %lu lower than 3. not buffering.\n", vertex_count); + goto fail; + } + ddraw_buffer_add_indices_trianglefan(device, vertices, vertex_count); + break; + case D3DPT_LINELIST: + case D3DPT_POINTLIST: + ddraw_buffer_add_indices_list(device, vertices, vertex_count); + break; + default: + ERR("primitive_type %#x not supported\n", primitive_type); + } + + assert(device->vertex_batch.vertex_count < VERTEX_BUFFER_SIZE); + + /* Buffered ! */ + return D3D_OK; + +fail: + if (device->vertex_batch.vertices) { + wined3d_streaming_buffer_unmap(&device->vertex_buffer); + device->vertex_batch.vertices = NULL; + } + return WINED3DERR_NOTAVAILABLE; +} + + /***************************************************************************** * IDirect3DDevice7::EndScene * @@ -1592,6 +1765,8 @@ static HRESULT d3d_device7_EndScene(IDirect3DDevice7 *iface)
TRACE("iface %p.\n", iface);
+ ddraw_buffer_flush_if_needed(device); + wined3d_mutex_lock(); hr = wined3d_device_end_scene(device->wined3d_device); wined3d_mutex_unlock(); @@ -1861,6 +2036,8 @@ static HRESULT d3d_device_set_render_target(struct d3d_device *device, return DDERR_INVALIDPARAMS; }
+ ddraw_buffer_flush_if_needed(device); + rtv = ddraw_surface_get_rendertarget_view(target); if (FAILED(hr = wined3d_device_context_set_rendertarget_views(device->immediate_context, 0, 1, &rtv, FALSE))) return hr; @@ -2548,6 +2725,8 @@ static HRESULT d3d_device7_SetRenderState(IDirect3DDevice7 *iface,
TRACE("iface %p, state %#x, value %#lx.\n", iface, state, value);
+ ddraw_buffer_flush_if_needed(device); + wined3d_mutex_lock(); /* Some render states need special care */ switch (state) @@ -3132,6 +3311,8 @@ static HRESULT d3d_device7_SetTransform(IDirect3DDevice7 *iface,
TRACE("iface %p, state %#x, matrix %p.\n", iface, state, matrix);
+ ddraw_buffer_flush_if_needed(device); + if (!matrix) return DDERR_INVALIDPARAMS;
@@ -3307,6 +3488,8 @@ static HRESULT d3d_device7_MultiplyTransform(IDirect3DDevice7 *iface,
TRACE("iface %p, state %#x, matrix %p.\n", iface, state, matrix);
+ ddraw_buffer_flush_if_needed(device); + /* Note: D3DMATRIX is compatible with struct wined3d_matrix. */ wined3d_mutex_lock(); wined3d_stateblock_multiply_transform(device->state, @@ -3451,10 +3634,19 @@ static HRESULT d3d_device7_DrawPrimitive(IDirect3DDevice7 *iface, stride = get_flexible_vertex_size(fvf); size = vertex_count * stride;
+ hr = ddraw_buffer_add(device, primitive_type, fvf, vertices, vertex_count, flags, stride); + if (hr == D3D_OK) { + /* Buffered successfuly -> returning immediatly :-) */ + return D3D_OK; + } + + WARN("Buffering failed, using the old (slow) path\n"); + wined3d_mutex_lock();
- if (FAILED(hr = wined3d_streaming_buffer_upload(device->wined3d_device, - &device->vertex_buffer, vertices, size, stride, &vb_pos))) + hr = wined3d_streaming_buffer_upload(device->wined3d_device, + &device->vertex_buffer, vertices, size, stride, &vb_pos); + if (FAILED(hr)) goto done;
hr = wined3d_stateblock_set_stream_source(device->state, 0, device->vertex_buffer.buffer, 0, stride); @@ -3494,10 +3686,12 @@ static HRESULT WINAPI d3d_device7_DrawPrimitive_FPUPreserve(IDirect3DDevice7 *if return hr; }
-static void setup_lighting(const struct d3d_device *device, DWORD fvf, DWORD flags) +static void setup_lighting(struct d3d_device *device, DWORD fvf, DWORD flags) { BOOL enable = TRUE;
+ ddraw_buffer_flush_if_needed(device); + /* Ignore the D3DFVF_XYZRHW case here, wined3d takes care of that */ if (!device->material || !(fvf & D3DFVF_NORMAL) || (flags & D3DDP_DONOTLIGHT)) enable = FALSE; @@ -4641,6 +4835,8 @@ static HRESULT d3d_device7_SetTexture(IDirect3DDevice7 *iface,
TRACE("iface %p, stage %lu, texture %p.\n", iface, stage, texture);
+ ddraw_buffer_flush_if_needed(device); + if (surf && (surf->surface_desc.ddsCaps.dwCaps & DDSCAPS_TEXTURE)) { if (surf->draw_texture) @@ -4925,6 +5121,8 @@ static HRESULT d3d_device7_SetTextureStageState(IDirect3DDevice7 *iface, TRACE("iface %p, stage %lu, state %#x, value %#lx.\n", iface, stage, state, value);
+ ddraw_buffer_flush_if_needed(device); + if (state > D3DTSS_TEXTURETRANSFORMFLAGS) { WARN("Invalid state %#x passed.\n", state); @@ -5195,6 +5393,8 @@ static HRESULT d3d_device7_SetViewport(IDirect3DDevice7 *iface, D3DVIEWPORT7 *vi if (!viewport) return DDERR_INVALIDPARAMS;
+ ddraw_buffer_flush_if_needed(device); + wined3d_mutex_lock(); if (!(rtv = wined3d_device_context_get_rendertarget_view(device->immediate_context, 0))) { @@ -5307,6 +5507,8 @@ static HRESULT d3d_device7_SetMaterial(IDirect3DDevice7 *iface, D3DMATERIAL7 *ma if (!material) return DDERR_INVALIDPARAMS;
+ ddraw_buffer_flush_if_needed(device); + wined3d_mutex_lock(); /* Note: D3DMATERIAL7 is compatible with struct wined3d_material. */ wined3d_stateblock_set_material(device->update_state, (const struct wined3d_material *)material); @@ -5400,6 +5602,8 @@ static HRESULT d3d_device7_SetLight(IDirect3DDevice7 *iface, DWORD light_idx, D3
TRACE("iface %p, light_idx %lu, light %p.\n", iface, light_idx, light);
+ ddraw_buffer_flush_if_needed(device); + wined3d_mutex_lock(); /* Note: D3DLIGHT7 is compatible with struct wined3d_light. */ hr = wined3d_stateblock_set_light(device->update_state, light_idx, (const struct wined3d_light *)light); @@ -6279,6 +6483,8 @@ static HRESULT d3d_device7_LightEnable(IDirect3DDevice7 *iface, DWORD light_idx,
TRACE("iface %p, light_idx %lu, enabled %#x.\n", iface, light_idx, enabled);
+ ddraw_buffer_flush_if_needed(device); + wined3d_mutex_lock(); hr = wined3d_stateblock_set_light_enable(device->update_state, light_idx, enabled); wined3d_mutex_unlock(); @@ -6381,6 +6587,8 @@ static HRESULT d3d_device7_SetClipPlane(IDirect3DDevice7 *iface, DWORD idx, D3DV if (!plane) return DDERR_INVALIDPARAMS;
+ ddraw_buffer_flush_if_needed(device); + wined3d_plane = (struct wined3d_vec4 *)plane;
wined3d_mutex_lock(); @@ -6796,6 +7004,8 @@ enum wined3d_depth_buffer_type d3d_device_update_depth_stencil(struct d3d_device return WINED3D_ZB_FALSE; }
+ ddraw_buffer_flush_if_needed(device); + dsi = impl_from_IDirectDrawSurface7(depthStencil); wined3d_device_context_set_depth_stencil_view(device->immediate_context, ddraw_surface_get_rendertarget_view(dsi)); diff --git a/dlls/ddraw/surface.c b/dlls/ddraw/surface.c index b75775cf252..b681c8e255b 100644 --- a/dlls/ddraw/surface.c +++ b/dlls/ddraw/surface.c @@ -1165,6 +1165,8 @@ static HRESULT WINAPI ddraw_surface7_Lock(IDirectDrawSurface7 *iface, TRACE("iface %p, rect %s, surface_desc %p, flags %#lx, h %p.\n", iface, wine_dbgstr_rect(rect), surface_desc, flags, h);
+ ddraw_buffer_flush_if_needed(surface->device1); + if (!surface_validate_lock_desc(surface, (DDSURFACEDESC *)surface_desc, &surface_desc_size)) return DDERR_INVALIDPARAMS;
@@ -1718,6 +1720,7 @@ static HRESULT WINAPI DECLSPEC_HOTPATCH ddraw_surface1_Blt(IDirectDrawSurface *i TRACE("iface %p, dst_rect %s, src_surface %p, src_rect %s, flags %#lx, fx %p.\n", iface, wine_dbgstr_rect(dst_rect), src_surface, wine_dbgstr_rect(src_rect), flags, fx);
+ unsupported_flags = DDBLT_ALPHADEST | DDBLT_ALPHADESTCONSTOVERRIDE | DDBLT_ALPHADESTNEG @@ -1758,6 +1761,11 @@ static HRESULT WINAPI DECLSPEC_HOTPATCH ddraw_surface1_Blt(IDirectDrawSurface *i return DDERR_NORASTEROPHW; }
+ if (src_impl) + ddraw_buffer_flush_if_needed(src_impl->device1); + if (dst_impl) + ddraw_buffer_flush_if_needed(dst_impl->device1); + wined3d_mutex_lock();
if (flags & (DDBLT_COLORFILL | DDBLT_DEPTHFILL)) @@ -2351,6 +2359,8 @@ static HRESULT WINAPI ddraw_surface7_GetDC(IDirectDrawSurface7 *iface, HDC *dc) if (!dc) return DDERR_INVALIDPARAMS;
+ ddraw_buffer_flush_if_needed(surface->device1); + wined3d_mutex_lock(); if (surface->dc) hr = DDERR_DCALREADYCREATED; @@ -4402,6 +4412,9 @@ static HRESULT WINAPI DECLSPEC_HOTPATCH ddraw_surface7_BltFast(IDirectDrawSurfac TRACE("iface %p, dst_x %lu, dst_y %lu, src_surface %p, src_rect %s, flags %#lx.\n", iface, dst_x, dst_y, src_surface, wine_dbgstr_rect(src_rect), trans);
+ ddraw_buffer_flush_if_needed(src_impl->device1); + ddraw_buffer_flush_if_needed(dst_impl->device1); + dst_w = dst_impl->surface_desc.dwWidth; dst_h = dst_impl->surface_desc.dwHeight;
@@ -5360,6 +5373,9 @@ static HRESULT WINAPI d3d_texture2_Load(IDirect3DTexture2 *iface, IDirect3DTextu return D3D_OK; }
+ ddraw_buffer_flush_if_needed(src_surface->device1); + ddraw_buffer_flush_if_needed(dst_surface->device1); + wined3d_mutex_lock();
dst_resource = wined3d_texture_get_resource(ddraw_surface_get_default_texture(dst_surface, DDRAW_SURFACE_WRITE)); @@ -6615,6 +6631,8 @@ HRESULT ddraw_surface_create(struct ddraw *ddraw, const DDSURFACEDESC2 *surface_ ddraw->d3ddevice->recording = NULL; ddraw->d3ddevice->update_state = ddraw->d3ddevice->state; } + + ddraw_buffer_flush_if_needed(ddraw->d3ddevice); wined3d_stateblock_reset(ddraw->state);
if (FAILED(hr = wined3d_device_reset(ddraw->wined3d_device,