We can generally skip them, but make sure that subsequent barriers synchronize with all accumulated usages.
This is motivated by an issue where a program using a single buffer to store both indices and vertex attributes causes superfluous barriers on each draw call.
Signed-off-by: Jan Sikorski jsikorski@codeweavers.com
---
 dlls/wined3d/buffer.c          | 44 +++++++++++++++++++++-------------
 dlls/wined3d/texture.c         | 31 +++++++++++++++---------
 dlls/wined3d/wined3d_private.h | 15 ++++++++++++
 3 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/dlls/wined3d/buffer.c b/dlls/wined3d/buffer.c
index 4420885b15f..ef5727030c8 100644
--- a/dlls/wined3d/buffer.c
+++ b/dlls/wined3d/buffer.c
@@ -1588,28 +1588,38 @@ void wined3d_buffer_vk_barrier(struct wined3d_buffer_vk *buffer_vk,
     if (buffer_vk->bind_mask && buffer_vk->bind_mask != bind_mask)
     {
         const struct wined3d_vk_info *vk_info = context_vk->vk_info;
+        VkAccessFlags src_access_mask, dst_access_mask;
         VkBufferMemoryBarrier vk_barrier;
 
         TRACE(" %s -> %s.\n", wined3d_debug_bind_flags(buffer_vk->bind_mask), wined3d_debug_bind_flags(bind_mask));
 
-        wined3d_context_vk_end_current_render_pass(context_vk);
-
-        vk_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        vk_barrier.pNext = NULL;
-        vk_barrier.srcAccessMask = vk_access_mask_from_bind_flags(buffer_vk->bind_mask);
-        vk_barrier.dstAccessMask = vk_access_mask_from_bind_flags(bind_mask);
-        vk_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        vk_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        vk_barrier.buffer = buffer_vk->bo.vk_buffer;
-        vk_barrier.offset = buffer_vk->bo.buffer_offset;
-        vk_barrier.size = buffer_vk->b.resource.size;
-        VK_CALL(vkCmdPipelineBarrier(wined3d_context_vk_get_command_buffer(context_vk),
-                vk_pipeline_stage_mask_from_bind_flags(buffer_vk->bind_mask),
-                vk_pipeline_stage_mask_from_bind_flags(bind_mask),
-                0, 0, NULL, 1, &vk_barrier, 0, NULL));
-    }
-    buffer_vk->bind_mask = bind_mask;
+        src_access_mask = vk_access_mask_from_bind_flags(buffer_vk->bind_mask);
+        dst_access_mask = vk_access_mask_from_bind_flags(bind_mask);
+
+        if (!vk_access_mask_is_read_only(src_access_mask | dst_access_mask))
+        {
+            wined3d_context_vk_end_current_render_pass(context_vk);
+
+            vk_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+            vk_barrier.pNext = NULL;
+            vk_barrier.srcAccessMask = src_access_mask;
+            vk_barrier.dstAccessMask = dst_access_mask;
+            vk_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+            vk_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+            vk_barrier.buffer = buffer_vk->bo.vk_buffer;
+            vk_barrier.offset = buffer_vk->bo.buffer_offset;
+            vk_barrier.size = buffer_vk->b.resource.size;
+            VK_CALL(vkCmdPipelineBarrier(wined3d_context_vk_get_command_buffer(context_vk),
+                    vk_pipeline_stage_mask_from_bind_flags(buffer_vk->bind_mask),
+                    vk_pipeline_stage_mask_from_bind_flags(bind_mask),
+                    0, 0, NULL, 1, &vk_barrier, 0, NULL));
+            buffer_vk->bind_mask = bind_mask;
+            return;
+        }
+    }
+
+    buffer_vk->bind_mask |= bind_mask;
 }
 
 HRESULT CDECL wined3d_buffer_create(struct wined3d_device *device, const struct wined3d_buffer_desc *desc,
diff --git a/dlls/wined3d/texture.c b/dlls/wined3d/texture.c
index c1247fbc56b..687626a0d56 100644
--- a/dlls/wined3d/texture.c
+++ b/dlls/wined3d/texture.c
@@ -5251,22 +5251,31 @@ void wined3d_texture_vk_barrier(struct wined3d_texture_vk *texture_vk,
 
     if (texture_vk->bind_mask && texture_vk->bind_mask != bind_mask)
     {
+        VkAccessFlags src_access_mask, dst_access_mask;
+
         TRACE(" %s -> %s.\n", wined3d_debug_bind_flags(texture_vk->bind_mask), wined3d_debug_bind_flags(bind_mask));
 
-        vk_range.aspectMask = vk_aspect_mask_from_format(texture_vk->t.resource.format);
-        vk_range.baseMipLevel = 0;
-        vk_range.levelCount = VK_REMAINING_MIP_LEVELS;
-        vk_range.baseArrayLayer = 0;
-        vk_range.layerCount = VK_REMAINING_ARRAY_LAYERS;
+        src_access_mask = vk_access_mask_from_bind_flags(texture_vk->bind_mask);
+        dst_access_mask = vk_access_mask_from_bind_flags(bind_mask);
 
-        wined3d_context_vk_image_barrier(context_vk, wined3d_context_vk_get_command_buffer(context_vk),
-                vk_pipeline_stage_mask_from_bind_flags(texture_vk->bind_mask),
-                vk_pipeline_stage_mask_from_bind_flags(bind_mask),
-                vk_access_mask_from_bind_flags(texture_vk->bind_mask), vk_access_mask_from_bind_flags(bind_mask),
-                texture_vk->layout, texture_vk->layout, texture_vk->image.vk_image, &vk_range);
+        if (!vk_access_mask_is_read_only(src_access_mask | dst_access_mask))
+        {
+            vk_range.aspectMask = vk_aspect_mask_from_format(texture_vk->t.resource.format);
+            vk_range.baseMipLevel = 0;
+            vk_range.levelCount = VK_REMAINING_MIP_LEVELS;
+            vk_range.baseArrayLayer = 0;
+            vk_range.layerCount = VK_REMAINING_ARRAY_LAYERS;
+
+            wined3d_context_vk_image_barrier(context_vk, wined3d_context_vk_get_command_buffer(context_vk),
+                    vk_pipeline_stage_mask_from_bind_flags(texture_vk->bind_mask),
+                    vk_pipeline_stage_mask_from_bind_flags(bind_mask), src_access_mask, dst_access_mask,
+                    texture_vk->layout, texture_vk->layout, texture_vk->image.vk_image, &vk_range);
+            texture_vk->bind_mask = bind_mask;
+            return;
+        }
     }
-    texture_vk->bind_mask = bind_mask;
+    texture_vk->bind_mask |= bind_mask;
 }
 
 static void ffp_blitter_destroy(struct wined3d_blitter *blitter, struct wined3d_context *context)
diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h
index 86eae149306..ae4ae1bf160 100644
--- a/dlls/wined3d/wined3d_private.h
+++ b/dlls/wined3d/wined3d_private.h
@@ -317,6 +317,21 @@ VkImageViewType vk_image_view_type_from_wined3d(enum wined3d_resource_type type,
 VkPipelineStageFlags vk_pipeline_stage_mask_from_bind_flags(uint32_t bind_flags) DECLSPEC_HIDDEN;
 VkShaderStageFlagBits vk_shader_stage_from_wined3d(enum wined3d_shader_type shader_type) DECLSPEC_HIDDEN;
 
+static inline bool vk_access_mask_is_read_only(VkAccessFlags flags)
+{
+    static const VkAccessFlags read_flags =
+            VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
+            VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | VK_ACCESS_SHADER_READ_BIT |
+            VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
+            VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_HOST_READ_BIT | VK_ACCESS_MEMORY_READ_BIT |
+            VK_ACCESS_COMMAND_PREPROCESS_READ_BIT_NV | VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
+            VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR |
+            VK_ACCESS_FRAGMENT_SHADING_RATE_ATTACHMENT_READ_BIT_KHR | VK_ACCESS_FRAGMENT_DENSITY_MAP_READ_BIT_EXT |
+            VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT;
+
+    return (flags & read_flags) == flags;
+}
+
 static inline enum wined3d_cmp_func wined3d_sanitize_cmp_func(enum wined3d_cmp_func func)
 {
     if (func < WINED3D_CMP_NEVER || func > WINED3D_CMP_ALWAYS)
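Condensed, the new control flow amounts to something like the following standalone sketch (hypothetical helper names and a reduced set of read bits; the real change is in the diff above): a barrier is only needed when a write is involved, and read-only usages keep accumulating so that the next write barrier synchronizes with all of them.

#include <stdbool.h>
#include <stdio.h>
#include <vulkan/vulkan.h>

/* Reduced set of read-only access bits; the patch lists the full set. */
static bool access_mask_is_read_only(VkAccessFlags flags)
{
    static const VkAccessFlags read_flags = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT
            | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_SHADER_READ_BIT
            | VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_HOST_READ_BIT | VK_ACCESS_MEMORY_READ_BIT;

    return (flags & read_flags) == flags;
}

/* Decide whether a new access to a resource needs a barrier.  The accumulated
 * mask plays the role of buffer_vk->bind_mask / texture_vk->bind_mask. */
static bool access_needs_barrier(VkAccessFlags *accumulated, VkAccessFlags new_access)
{
    if (*accumulated && *accumulated != new_access
            && !access_mask_is_read_only(*accumulated | new_access))
    {
        /* A write is involved: the caller emits a barrier whose source covers
         * everything accumulated so far; tracking restarts from the new access. */
        *accumulated = new_access;
        return true;
    }

    /* First, identical, or read-to-read transition: skip the barrier, but
     * remember the access so a later barrier synchronizes with it as well. */
    *accumulated |= new_access;
    return false;
}

int main(void)
{
    VkAccessFlags mask = 0;

    printf("%d\n", access_needs_barrier(&mask, VK_ACCESS_INDEX_READ_BIT));            /* 0: first use */
    printf("%d\n", access_needs_barrier(&mask, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT)); /* 0: read -> read */
    printf("%d\n", access_needs_barrier(&mask, VK_ACCESS_TRANSFER_WRITE_BIT));        /* 1: write involved */
    return 0;
}

The sketch tracks VkAccessFlags directly, while the patch tracks wined3d bind flags and derives the access and stage masks from them.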
On Thu, 19 Aug 2021 at 10:56, Jan Sikorski jsikorski@codeweavers.com wrote:
> We can generally skip them, but make sure that subsequent barriers synchronize with all accumulated usages.
Is that safe? I don't much doubt that this should work in practice on common desktop GPUs, but what does the spec say? In particular, I'd be a little concerned that e.g. VK_PIPELINE_STAGE_VERTEX_INPUT_BIT barriers may not guarantee visibility to e.g. fragment stages.
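For concreteness, a sketch of the kind of sequence in question (hypothetical handles and draws, not the actual wined3d code path): the only barrier after a transfer write targets the vertex input stage, and the later read-to-read transition to a fragment-shader read would be skipped under the proposed change.

#include <vulkan/vulkan.h>

/* Sketch only: 'cmd' and 'buffer' are assumed to be valid handles, and the
 * buffer has just been written by a transfer (e.g. vkCmdCopyBuffer()). */
static void record_draws(VkCommandBuffer cmd, VkBuffer buffer, uint32_t index_count)
{
    const VkBufferMemoryBarrier barrier =
    {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, /* the earlier copy */
        .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,     /* first use: index buffer */
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = buffer,
        .offset = 0,
        .size = VK_WHOLE_SIZE,
    };

    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
            0, 0, NULL, 1, &barrier, 0, NULL);

    /* Draw 1 reads the buffer as an index buffer (vertex input stage). */
    vkCmdDrawIndexed(cmd, index_count, 1, 0, 0, 0);

    /* Draw 2 also reads the buffer, but from the fragment shader.  With the
     * patch, INDEX_READ -> SHADER_READ is a read-to-read transition, so no
     * second barrier is recorded; whether the transfer write is then
     * guaranteed to be visible to VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT is
     * the question. */
    vkCmdDrawIndexed(cmd, index_count, 1, 0, 0, 0);
}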
> +static inline bool vk_access_mask_is_read_only(VkAccessFlags flags)
> +{
> +    static const VkAccessFlags read_flags =
> +            VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
> +            VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | VK_ACCESS_SHADER_READ_BIT |
> +            VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
> +            VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_HOST_READ_BIT | VK_ACCESS_MEMORY_READ_BIT |
> +            VK_ACCESS_COMMAND_PREPROCESS_READ_BIT_NV | VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
> +            VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR |
> +            VK_ACCESS_FRAGMENT_SHADING_RATE_ATTACHMENT_READ_BIT_KHR | VK_ACCESS_FRAGMENT_DENSITY_MAP_READ_BIT_EXT |
> +            VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT;
> +
> +    return (flags & read_flags) == flags;
> +}
We typically break lines before operators, not after.
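In that style the declaration would read something like this (abbreviated to a few of the bits):

    static const VkAccessFlags read_flags = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT
            | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_SHADER_READ_BIT
            | VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_HOST_READ_BIT | VK_ACCESS_MEMORY_READ_BIT;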
On 20 Aug 2021, at 09:51, Henri Verbeet hverbeet@gmail.com wrote:
> On Thu, 19 Aug 2021 at 10:56, Jan Sikorski jsikorski@codeweavers.com wrote:
>> We can generally skip them, but make sure that subsequent barriers synchronize with all accumulated usages.
>
> Is that safe? I don't much doubt that this should work in practice on common desktop GPUs, but what does the spec say? In particular, I'd be a little concerned that e.g. VK_PIPELINE_STAGE_VERTEX_INPUT_BIT barriers may not guarantee visibility to e.g. fragment stages.
What scenario do you have in mind? You mean that we'd skip a barrier that would otherwise chain to some previous write? I think you're right. If I'm getting this right, a clearer example would be a write, then a barrier to a read late in the pipeline (1st draw call), then a barrier to a read early in the pipeline (2nd draw call); that second barrier would be skipped, and the write would race with the second draw. The same would probably hold with the order of the two draw calls reversed. It would probably be preferable to synchronize with the write instead of the previous read anyway, to maintain parallelism.
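One way to read "sync with the write": track the stage and access masks of the last write to the resource and use those as the barrier source for each new kind of read, instead of chaining from the previous read. A rough sketch with hypothetical names, not a proposed patch:

#include <vulkan/vulkan.h>

/* Hypothetical per-buffer tracking state: the last write's stage/access
 * masks and the read stages already synchronized with that write. */
struct write_tracker
{
    VkPipelineStageFlags write_stage_mask;
    VkAccessFlags write_access_mask;
    VkPipelineStageFlags visible_read_stages;
};

/* Record a new read of 'buffer'; emit a barrier sourced at the last write,
 * but only for read stages that haven't been made visible yet. */
static void sync_read_with_write(struct write_tracker *t, VkCommandBuffer cmd, VkBuffer buffer,
        VkPipelineStageFlags read_stage_mask, VkAccessFlags read_access_mask)
{
    if (!t->write_access_mask || !(read_stage_mask & ~t->visible_read_stages))
        return;

    const VkBufferMemoryBarrier barrier =
    {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        .srcAccessMask = t->write_access_mask,
        .dstAccessMask = read_access_mask,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = buffer,
        .offset = 0,
        .size = VK_WHOLE_SIZE,
    };

    vkCmdPipelineBarrier(cmd, t->write_stage_mask, read_stage_mask, 0, 0, NULL, 1, &barrier, 0, NULL);
    t->visible_read_stages |= read_stage_mask;
}

That way two read-only usages are never ordered against each other, but each new read stage still gets visibility of the write.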
- Jan