This would eliminate the todo for the precise mad() test in !718. Maybe we need test results on nvidia and intel to decide if we actually want this.
-- v7: vkd3d-shader/ir: Implement MAD in two operations if flagged as precise.
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d-shader/ir.c | 56 +++++++++++++++++++ libs/vkd3d-shader/vkd3d_shader_private.h | 25 +++++++++ .../hlsl/arithmetic-float-uniform.shader_test | 2 +- 3 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/ir.c b/libs/vkd3d-shader/ir.c index 8af537390..704cd8387 100644 --- a/libs/vkd3d-shader/ir.c +++ b/libs/vkd3d-shader/ir.c @@ -1825,6 +1825,8 @@ struct cf_flattener unsigned int control_flow_depth; struct cf_flattener_info *control_flow_info; size_t control_flow_info_size; + + unsigned int precise_mad_temp_idx; };
static void cf_flattener_set_error(struct cf_flattener *flattener, enum vkd3d_result error) @@ -2036,6 +2038,54 @@ static void VKD3D_PRINTF_FUNC(3, 4) cf_flattener_create_block_name(struct cf_fla flattener->block_names[block_id] = buffer.buffer; }
+/* The Shader Model 5 Assembly documentation states: "If components of a mad + * instruction are tagged as precise, the hardware must execute a mad instruction + * or the exact equivalent, and it cannot split it into a multiply followed by an add." + * But DXIL.rst states the opposite: "Floating point multiply & add. This operation is + * not fused for "precise" operations." + * Windows drivers seem to conform with the latter, for SM 4-5 and SM 6. */ +static bool cf_flattener_handle_precise_mad(struct cf_flattener *flattener, + const struct vkd3d_shader_instruction *instruction) +{ + struct vkd3d_shader_instruction *mul_ins, *add_ins; + struct vsir_program *program = flattener->program; + struct vkd3d_shader_dst_param *mul_dst; + + if (!(instruction->flags & VKD3DSI_PRECISE_XYZW)) + return cf_flattener_copy_instruction(flattener, instruction); + + if (!(mul_ins = cf_flattener_require_space(flattener, 2))) + return false; + add_ins = mul_ins + 1; + + if (!flattener->precise_mad_temp_idx) + flattener->precise_mad_temp_idx = 1 + program->temp_count++; + + *mul_ins = *instruction; + mul_ins->handler_idx = VKD3DSIH_MUL; + mul_ins->src_count = 2; + + if (!(vsir_instruction_init_with_params(program, add_ins, &instruction->location, VKD3DSIH_ADD, 1, 2))) + return false; + add_ins->flags = instruction->flags & VKD3DSI_PRECISE_XYZW; + + mul_dst = mul_ins->dst; + *add_ins->dst = *mul_dst; + + mul_dst->modifiers = 0; + vsir_register_init(&mul_dst->reg, VKD3DSPR_TEMP, instruction->src[0].reg.data_type, 1); + mul_dst->reg.dimension = add_ins->dst->reg.dimension; + mul_dst->reg.idx[0].offset = flattener->precise_mad_temp_idx - 1; + + add_ins->src[0].reg = mul_dst->reg; + add_ins->src[0].swizzle = vsir_swizzle_from_writemask(mul_dst->write_mask); + add_ins->src[0].modifiers = 0; + add_ins->src[1] = mul_ins->src[2]; + + flattener->instruction_count += 2; + return true; +} + static enum vkd3d_result cf_flattener_iterate_instruction_array(struct cf_flattener *flattener, struct vkd3d_shader_message_context *message_context) { @@ -2375,6 +2425,12 @@ static enum vkd3d_result cf_flattener_iterate_instruction_array(struct cf_flatte main_block_open = false; break;
+ /* Handle precise MAD here because it requires insertion of an extra instruction. */ + case VKD3DSIH_MAD: + if (!cf_flattener_handle_precise_mad(flattener, instruction)) + return VKD3D_ERROR_OUT_OF_MEMORY; + break; + default: if (!cf_flattener_copy_instruction(flattener, instruction)) return VKD3D_ERROR_OUT_OF_MEMORY; diff --git a/libs/vkd3d-shader/vkd3d_shader_private.h b/libs/vkd3d-shader/vkd3d_shader_private.h index b07a7bff7..f401fd7a3 100644 --- a/libs/vkd3d-shader/vkd3d_shader_private.h +++ b/libs/vkd3d-shader/vkd3d_shader_private.h @@ -1761,6 +1761,31 @@ static inline unsigned int vkd3d_compact_swizzle(uint32_t swizzle, uint32_t writ return compacted_swizzle; }
+static inline uint32_t vsir_swizzle_from_writemask(unsigned int writemask) +{ + static const unsigned int swizzles[16] = + { + 0, + VKD3D_SHADER_SWIZZLE(X, X, X, X), + VKD3D_SHADER_SWIZZLE(Y, Y, Y, Y), + VKD3D_SHADER_SWIZZLE(X, Y, X, X), + VKD3D_SHADER_SWIZZLE(Z, Z, Z, Z), + VKD3D_SHADER_SWIZZLE(X, Z, X, X), + VKD3D_SHADER_SWIZZLE(Y, Z, X, X), + VKD3D_SHADER_SWIZZLE(X, Y, Z, X), + VKD3D_SHADER_SWIZZLE(W, W, W, W), + VKD3D_SHADER_SWIZZLE(X, W, X, X), + VKD3D_SHADER_SWIZZLE(Y, W, X, X), + VKD3D_SHADER_SWIZZLE(X, Y, W, X), + VKD3D_SHADER_SWIZZLE(Z, W, X, X), + VKD3D_SHADER_SWIZZLE(X, Z, W, X), + VKD3D_SHADER_SWIZZLE(Y, Z, W, X), + VKD3D_SHADER_SWIZZLE(X, Y, Z, W), + }; + + return swizzles[writemask & 0xf]; +} + struct vkd3d_struct { enum vkd3d_shader_structure_type type; diff --git a/tests/hlsl/arithmetic-float-uniform.shader_test b/tests/hlsl/arithmetic-float-uniform.shader_test index 8bc3992e7..c6b7caaae 100644 --- a/tests/hlsl/arithmetic-float-uniform.shader_test +++ b/tests/hlsl/arithmetic-float-uniform.shader_test @@ -121,7 +121,7 @@ uniform 0 float4 1.00000007 -42.1 4.0 45.0 uniform 4 float4 1.625 -5.0 4.125 5.0 uniform 8 float4 1.00000007 -1.0 0.5 -0.5 todo(sm<6) draw quad -todo probe all rgba (2.62500048, 209.5, 17.0, 224.5) +probe all rgba (2.62500048, 209.5, 17.0, 224.5)
[require] shader model >= 5.0
I don't think the flattener should be "hijacked" for this pass, it is already complicated enough. It's probably appropriate to have a single pass to collect all the little local operations like this one, instead of iterating over the whole instruction array every time, but it should be another one. As Henri notices, the same pass might do this and lowering texkills (and probably removing DCL_TEMPS too).
On Wed Apr 10 14:22:13 2024 +0000, Giovanni Mascellani wrote:
I don't think the flattener should be "hijacked" for this pass, it is already complicated enough. It's probably appropriate to have a single pass to collect all the little local operations like this one, instead of iterating over the whole instruction array every time, but it should be another one. As Henri notices, the same pass might do this and lowering texkills (and probably removing DCL_TEMPS too).
We have no other passes which may need to insert instructions. It's a compromise between adding yet another instruction copying pass, versus favouring performance even if it's a little messy. If the former is the priority I'll add a pass for all the small operations.