-- v2: tests: Add a test for fmod() with vector arguments. vkd3d-shader: Use ternary operator in fmod() implementation. vkd3d-shader/tpf: Use 'movc' to implement ternary operator.
From: Nikolay Sivov nsivov@codeweavers.com
Signed-off-by: Nikolay Sivov nsivov@codeweavers.com --- libs/vkd3d-shader/hlsl.c | 2 ++ libs/vkd3d-shader/hlsl.h | 2 ++ libs/vkd3d-shader/hlsl.y | 6 +++- libs/vkd3d-shader/hlsl_codegen.c | 50 ++++++++++++++++++++++++++++++++ libs/vkd3d-shader/tpf.c | 25 ++++++++++++++++ 5 files changed, 84 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/hlsl.c b/libs/vkd3d-shader/hlsl.c index 290af22f..8bf3db8c 100644 --- a/libs/vkd3d-shader/hlsl.c +++ b/libs/vkd3d-shader/hlsl.c @@ -2467,6 +2467,8 @@ const char *debug_hlsl_expr_op(enum hlsl_ir_expr_op op)
[HLSL_OP3_DP2ADD] = "dp2add", [HLSL_OP3_LERP] = "lerp", + [HLSL_OP3_MOVC] = "movc", + [HLSL_OP3_TERNARY] = "ternary", };
return op_names[op]; diff --git a/libs/vkd3d-shader/hlsl.h b/libs/vkd3d-shader/hlsl.h index 42ad23a4..65fd10a1 100644 --- a/libs/vkd3d-shader/hlsl.h +++ b/libs/vkd3d-shader/hlsl.h @@ -548,6 +548,8 @@ enum hlsl_ir_expr_op
HLSL_OP3_DP2ADD, HLSL_OP3_LERP, + HLSL_OP3_MOVC, + HLSL_OP3_TERNARY, };
#define HLSL_MAX_OPERANDS 3 diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index 84c9cd49..1f4f229b 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -6425,6 +6425,7 @@ conditional_expr: struct hlsl_ir_node *cond = node_from_block($1); struct hlsl_ir_node *first = node_from_block($3); struct hlsl_ir_node *second = node_from_block($5); + struct hlsl_ir_node *args[HLSL_MAX_OPERANDS] = { 0 }; struct hlsl_type *common_type;
hlsl_block_add_block($1, $3); @@ -6441,7 +6442,10 @@ conditional_expr: if (!(second = add_implicit_conversion(ctx, block_to_list($1), second, common_type, &@5))) YYABORT;
- if (!hlsl_add_conditional(ctx, $1, cond, first, second)) + args[0] = cond; + args[1] = first; + args[2] = second; + if (!add_expr(ctx, block_to_list($1), HLSL_OP3_TERNARY, args, common_type, &@1)) YYABORT; $$ = $1; } diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 25b5ca2f..5100065b 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -2355,6 +2355,54 @@ static bool lower_round(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void * return true; }
+/* Use 'movc' for the ternary operator. */ +static bool lower_ternary(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) +{ + struct hlsl_ir_node *operands[HLSL_MAX_OPERANDS], *replacement; + struct hlsl_ir_node *zero, *cond, *first, *second; + struct hlsl_constant_value zero_value = { 0 }; + struct hlsl_ir_expr *expr; + struct hlsl_type *type; + + if (instr->type != HLSL_IR_EXPR) + return false; + + expr = hlsl_ir_expr(instr); + if (expr->op != HLSL_OP3_TERNARY) + return false; + + cond = expr->operands[0].node; + first = expr->operands[1].node; + second = expr->operands[2].node; + + if (cond->data_type->base_type == HLSL_TYPE_FLOAT) + { + if (!(zero = hlsl_new_constant(ctx, cond->data_type, &zero_value, &instr->loc))) + return false; + list_add_tail(&instr->entry, &zero->entry); + + memset(operands, 0, sizeof(operands)); + operands[0] = zero; + operands[1] = cond; + type = cond->data_type; + type = hlsl_get_numeric_type(ctx, type->class, HLSL_TYPE_BOOL, type->dimx, type->dimy); + if (!(cond = hlsl_new_expr(ctx, HLSL_OP2_NEQUAL, operands, type, &instr->loc))) + return false; + list_add_before(&instr->entry, &cond->entry); + } + + memset(operands, 0, sizeof(operands)); + operands[0] = cond; + operands[1] = first; + operands[2] = second; + if (!(replacement = hlsl_new_expr(ctx, HLSL_OP3_MOVC, operands, first->data_type, &instr->loc))) + return false; + list_add_before(&instr->entry, &replacement->entry); + + hlsl_replace_node(instr, replacement); + return true; +} + static bool lower_casts_to_bool(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) { struct hlsl_type *type = instr->data_type, *arg_type; @@ -4314,6 +4362,8 @@ int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry hlsl_transform_ir(ctx, lower_combined_samples, body, NULL); hlsl_transform_ir(ctx, track_object_components_usage, body, NULL);
+ if (profile->major_version >= 4) + hlsl_transform_ir(ctx, lower_ternary, body, NULL); if (profile->major_version < 4) { hlsl_transform_ir(ctx, lower_division, body, NULL); diff --git a/libs/vkd3d-shader/tpf.c b/libs/vkd3d-shader/tpf.c index 4be2f8b8..577bcad8 100644 --- a/libs/vkd3d-shader/tpf.c +++ b/libs/vkd3d-shader/tpf.c @@ -4454,11 +4454,32 @@ static void write_sm4_store_uav_typed(struct hlsl_ctx *ctx, struct vkd3d_bytecod write_sm4_instruction(buffer, &instr); }
+static void write_sm4_movc(struct vkd3d_bytecode_buffer *buffer, enum vkd3d_sm4_opcode opcode, + const struct hlsl_ir_node *dst, const struct hlsl_ir_node *src1, const struct hlsl_ir_node *src2, + const struct hlsl_ir_node *src3) +{ + struct sm4_instruction instr; + + memset(&instr, 0, sizeof(instr)); + instr.opcode = opcode; + + sm4_dst_from_node(&instr.dsts[0], dst); + instr.dst_count = 1; + + sm4_src_from_node(&instr.srcs[0], src1, instr.dsts[0].writemask); + sm4_src_from_node(&instr.srcs[1], src2, instr.dsts[0].writemask); + sm4_src_from_node(&instr.srcs[2], src3, instr.dsts[0].writemask); + instr.src_count = 3; + + write_sm4_instruction(buffer, &instr); +} + static void write_sm4_expr(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, const struct hlsl_ir_expr *expr) { const struct hlsl_ir_node *arg1 = expr->operands[0].node; const struct hlsl_ir_node *arg2 = expr->operands[1].node; + const struct hlsl_ir_node *arg3 = expr->operands[2].node; const struct hlsl_type *dst_type = expr->node.data_type; struct vkd3d_string_buffer *dst_type_string;
@@ -4884,6 +4905,10 @@ static void write_sm4_expr(struct hlsl_ctx *ctx, &expr->node, arg1, arg2); break;
+ case HLSL_OP3_MOVC: + write_sm4_movc(buffer, VKD3D_SM4_OP_MOVC, &expr->node, arg1, arg2, arg3); + break; + default: hlsl_fixme(ctx, &expr->node.loc, "SM4 %s expression.", debug_hlsl_expr_op(expr->op)); }
From: Nikolay Sivov nsivov@codeweavers.com
Signed-off-by: Nikolay Sivov nsivov@codeweavers.com --- libs/vkd3d-shader/hlsl.y | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index 1f4f229b..bd7c4908 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -2838,6 +2838,7 @@ static bool intrinsic_fmod(struct hlsl_ctx *ctx, const struct parse_initializer const struct vkd3d_shader_location *loc) { struct hlsl_ir_node *x, *y, *div, *abs, *frac, *neg_frac, *ge, *select, *zero; + struct hlsl_ir_node *operands[HLSL_MAX_OPERANDS] = { 0 }; static const struct hlsl_constant_value zero_value;
if (!(x = intrinsic_float_convert_arg(ctx, params, params->args[0], loc))) @@ -2865,7 +2866,10 @@ static bool intrinsic_fmod(struct hlsl_ctx *ctx, const struct parse_initializer if (!(ge = add_binary_comparison_expr(ctx, params->instrs, HLSL_OP2_GEQUAL, div, zero, loc))) return false;
- if (!(select = hlsl_add_conditional(ctx, params->instrs, ge, frac, neg_frac))) + operands[0] = ge; + operands[1] = frac; + operands[2] = neg_frac; + if (!(select = add_expr(ctx, block_to_list(params->instrs), HLSL_OP3_TERNARY, operands, x->data_type, loc))) return false;
return !!add_binary_arithmetic_expr(ctx, params->instrs, HLSL_OP2_MUL, select, y, loc);
From: Nikolay Sivov nsivov@codeweavers.com
Signed-off-by: Nikolay Sivov nsivov@codeweavers.com --- tests/hlsl/fmod.shader_test | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/tests/hlsl/fmod.shader_test b/tests/hlsl/fmod.shader_test index 4c9ff988..9eb69c3b 100644 --- a/tests/hlsl/fmod.shader_test +++ b/tests/hlsl/fmod.shader_test @@ -11,3 +11,17 @@ probe all rgba (-0.5, 0.0, 0.0, 0.0) 4 uniform 0 float4 1.1 0.3 0.0 0.0 draw quad probe all rgba (0.2, 0.0, 0.0, 0.0) 4 + +[pixel shader] +float4 main(uniform float4 u) : sv_target +{ + return float4(fmod(u.xy, u.z), 0, 0); +} + +[test] +uniform 0 float4 -0.5 6.5 2.0 0.0 +draw quad +probe all rgba (-0.5, 0.5, 0.0, 0.0) 4 +uniform 0 float4 1.1 0.3 3.0 0.0 +draw quad +probe all rgba (1.1, 0.3, 0.0, 0.0) 4
Is there any user of hlsl_add_conditional() that *doesn't* want it to be per-component? As far as I can tell the answer is no, so we may as well just change it there.
Is there an advantage to keeping separate TERNARY and MOVC ops? Can we just emit TERNARY directly for sm4?
Is there any user of hlsl_add_conditional() that *doesn't* want it to be per-component? As far as I can tell the answer is no, so we may as well just change it there.
Maybe we can. For lit() it's always a scalar, I think, but it should not hurt.
Is there an advantage to keeping separate TERNARY and MOVC ops? Can we just emit TERNARY directly for sm4?
I was thinking of turning it into movc/cmp/slt/sge conditionally, and those have different meanings for the condition argument.
Is there an advantage to keeping separate TERNARY and MOVC ops? Can we just emit TERNARY directly for sm4?
I was thinking turning it to movc/cmp/slt/sge conditionally, and those have different meaning for condition argument.
I don't think I understand. Currently we have OP3_TERNARY which evaluates whether its argument is nonzero, and OP3_MOVC which requires a bool. We could certainly introduce OP3_CMP which is the result of "raising" a sequence like OP2_GEQUAL + OP3_TERNARY, but I don't see why we need OP3_MOVC?
On Wed Jul 26 21:30:25 2023 +0000, Zebediah Figura wrote:
Is there an advantage to keeping separate TERNARY and MOVC ops? Can
we just emit TERNARY directly for sm4?
I was thinking turning it to movc/cmp/slt/sge conditionally, and those
have different meaning for condition argument. I don't think I understand. Currently we have OP3_TERNARY which evaluates whether its argument is nonzero, and OP3_MOVC which requires a bool. We could certainly introduce OP3_CMP which is the result of "raising" a sequence like OP2_GEQUAL + OP3_TERNARY, but I don't see why we need OP3_MOVC?
OP3_TERNARY does not evaluate anything. MOVC does not require a bool; a comparison is added for floats, but everything else, like bool/uint/int, is passed directly as a 'movc' argument. CMP uses a >= 0.0 condition.
OP3_TERNARY does not evaluate anything. MOVC does not require a bool, a comparison is added for floats, but everything else like bool/uint/int is passed directly as 'movc' argument. CMP is using >= 0.0 condition.
Sorry, let me try rephrasing. In the parser this patch series translates ?: to OP3_TERNARY, and it takes its initial argument verbatim, without trying to cast it to bool / compare it to zero first. Then, later, you lower that to OP3_MOVC, apparently to flush the condition argument to bool (although it doesn't handle int/uint for some reason). My question is, why do we need two different instructions? Why can't the sm4 backend just translate OP3_TERNARY directly?
On Tue Aug 8 20:57:14 2023 +0000, Zebediah Figura wrote:
OP3_TERNARY does not evaluate anything. MOVC does not require a bool,
a comparison is added for floats, but everything else like bool/uint/int is passed directly as 'movc' argument. CMP is using >= 0.0 condition. Sorry, let me try rephrasing. In the parser this patch series translates ?: to OP3_TERNARY, and it takes its initial argument verbatim, without trying to cast it to bool / compare it to zero first. Then, later, you lower that to OP3_MOVC, apparently to flush the condition argument to bool (although it doesn't handle int/uint for some reason). My question is, why do we need two different instructions? Why can't the sm4 backend just translate OP3_TERNARY directly?
There was a suggestion made before to have each operation use predictable semantics that do not depend on profile, for example. MOVC will be used for TERNARY when it's available (sm4+), and another instruction will be used for older profiles. It does not handle int/uint because that's not necessary; these types are passed directly to movc. It can translate TERNARY directly, but it has to handle the float case somewhere.
There was a suggestion made before to have each operation use predictable semantics that do not depend on profile, for example. MOVC will be used for TERNARY when it's available (sm4+), another instruction will be used for older profiles. It does not handle int/uint because it's not necessary, these types are passed directly to movc. It can translate TERNARY directly, but it has to handle float case somewhere.
I'm not understanding why the semantics of TERNARY would depend on profile. If it means "select first argument if the condition is a nonzero expression of any type", why would the semantics be any different for sm1?
I'm not saying we should get rid of CMP, but I don't see why we need *three* different instructions.
On Tue Aug 8 21:12:37 2023 +0000, Zebediah Figura wrote:
There was a suggestion made before to have each operation use
predictable semantics that do not depend on profile, for example. MOVC will be used for TERNARY when it's available (sm4+), another instruction will be used for older profiles. It does not handle int/uint because it's not necessary, these types are passed directly to movc. It can translate TERNARY directly, but it has to handle float case somewhere. I'm not understanding why the semantics of TERNARY would depend on profile. If it means "select first argument if the condition is a nonzero expression of any type", why would the semantics be any different for sm1? I'm not saying we should get rid of CMP, but I don't see why we need *three* different instructions.
I was under the impression that CMP uses nonnegative, while MOVC uses nonzero.
On Tue Aug 8 21:17:07 2023 +0000, Nikolay Sivov wrote:
I was under impression that CMP uses nonnegative, while MOVC is nonzero.
Okay, after working out the source of the confusion offline, I see the problem I was missing.
The issue is that sm4 MOVC does what's effectively an integer comparison to zero, but the ternary operator does a per-type comparison to zero. I was figuring that we could reuse one or the other since they're the same thing, but they're not: ternary treats floating point negative zero as false, but MOVC treats it as true (since it has a nonzero bit representation). So with that in mind, our semantics are essentially non-overlapping: TERNARY means per-type compare to zero, MOVC means bitwise compare to zero, and CMP is of course compare >= 0.
--
One thing that did come out of that discussion is that lowering passes over the HLSL IR may not be the best way to do these transformations. In general, for a translation like these, or (say) lower_division(), there are three options:
(1) Generate multiple low-level instructions. This makes less sense if the lowering needs to be done per-backend. But we do this in other cases, such as subtraction.
(2) Generate a high-level instruction, and use a pass to lower it to another HLSL instruction.
(3) Generate a high-level instruction, and translate it to multiple low-level instructions when translating to another form of IR (or directly to the bytecode).
Historically, I've done (2) because it seemed like the most obvious thing to do. It does have the disadvantage that there are more ops in the HLSL, although honestly having a proliferation of expr ops isn't that bad. It is pretty much universally more code, though, so in general we may want to start leaning toward (1) or (3).
Anyway, I don't see this as a reason to object to this patch series; I think it's ultimately not going in a wrong direction. But it's something we may want to think about in the future and try to inform the way we design IR.
On Sun Aug 13 22:07:38 2023 +0000, Zebediah Figura wrote:
Okay, after working out the source of the confusion offline, I see the problem I was missing. The issue is that sm4 MOVC does what's effectively an integer comparison to zero, but the ternary operator does a per-type comparison to zero. I was figuring that we could reuse one or the other since they're the same thing, but they're not: ternary treats floating point negative zero as false, but MOVC treats it as true (since it has a nonzero bit representation). So with that in mind, our semantics are essentially non-overlapping: TERNARY means per-type compare to zero, MOVC means bitwise compare to zero, and CMP is of course compare >= 0. -- One thing that did come out of that discussion is that lowering passes over the HLSL IR may not be the best way to do these transformations. In general, for a translation like these, or (say) lower_division(), there are three options: (1) Generate multiple low-level instructions. This makes less sense if the lowering needs to be done per-backend. But we do this in other cases, such as subtraction. (2) Generate a high-level instruction, and use a pass to lower it to another HLSL instruction. (3) Generate a high-level instruction, and translate it to multiple low-level instructions when translating to another form of IR (or directly to the bytecode). Historically, I've done (2) because it seemed like the most obvious thing to do. It does have the disadvantage that there are more ops in the HLSL, although honestly having a proliferation of expr ops isn't that bad. It is pretty much universally more code, though, so in general we may want to start leaning toward (1) or (3). Anyway, I don't see this as a reason to object to this patch series; I think it's ultimately not going in a wrong direction. But it's something we may want to think about in the future and try to inform the way we design IR.
This does, however, need a rebase.