From: Nikolay Sivov nsivov@codeweavers.com
--- libs/vkd3d-shader/hlsl.y | 11 +++++++ libs/vkd3d-shader/hlsl_sm1.c | 63 +++++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index a4de0edd..bedddf00 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -1650,6 +1650,17 @@ static struct hlsl_ir_node *add_binary_dot_expr(struct hlsl_ctx *ctx, struct lis if (!(args[1] = add_implicit_conversion(ctx, instrs, arg2, common_type, loc))) return NULL;
+ if (ctx->profile->major_version < 4 && dim == 2) + { + struct hlsl_ir_constant *zero; + + if (!(zero = hlsl_new_float_constant(ctx, 0.0f, loc))) + return NULL; + + if (!(args[2] = add_implicit_conversion(ctx, instrs, &zero->node, common_type, loc))) + return NULL; + } + return add_expr(ctx, instrs, op, args, ret_type, loc); }
diff --git a/libs/vkd3d-shader/hlsl_sm1.c b/libs/vkd3d-shader/hlsl_sm1.c index a66d4028..3af2b09c 100644 --- a/libs/vkd3d-shader/hlsl_sm1.c +++ b/libs/vkd3d-shader/hlsl_sm1.c @@ -422,7 +422,7 @@ struct sm1_instruction D3DSHADER_PARAM_SRCMOD_TYPE mod; unsigned int swizzle; uint32_t reg; - } srcs[2]; + } srcs[HLSL_MAX_OPERANDS]; unsigned int src_count;
unsigned int has_dst; @@ -459,6 +459,33 @@ static void write_sm1_instruction(struct hlsl_ctx *ctx, struct vkd3d_bytecode_bu write_sm1_src_register(buffer, &instr->srcs[i], instr->dst.writemask); };
+static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, + D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, + const struct hlsl_reg *src1, const struct hlsl_reg *src2, const struct hlsl_reg *src3) +{ + const struct sm1_instruction instr = + { + .opcode = opcode, + + .dst.type = D3DSPR_TEMP, + .dst.writemask = dst->writemask, + .dst.reg = dst->id, + .has_dst = 1, + + .srcs[0].type = D3DSPR_TEMP, + .srcs[0].swizzle = hlsl_swizzle_from_writemask(src1->writemask), + .srcs[0].reg = src1->id, + .srcs[1].type = D3DSPR_TEMP, + .srcs[1].swizzle = hlsl_swizzle_from_writemask(src2->writemask), + .srcs[1].reg = src2->id, + .srcs[2].type = D3DSPR_TEMP, + .srcs[2].swizzle = hlsl_swizzle_from_writemask(src3->writemask), + .srcs[2].reg = src3->id, + .src_count = 3, + }; + write_sm1_instruction(ctx, buffer, &instr); +} + static void write_sm1_binary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, const struct hlsl_reg *src1, const struct hlsl_reg *src2) @@ -631,9 +658,15 @@ static void write_sm1_expr(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *b struct hlsl_ir_expr *expr = hlsl_ir_expr(instr); struct hlsl_ir_node *arg1 = expr->operands[0].node; struct hlsl_ir_node *arg2 = expr->operands[1].node; + struct hlsl_ir_node *arg3 = expr->operands[2].node; + const struct hlsl_type *dst_type = expr->node.data_type; + struct vkd3d_string_buffer *dst_type_string;
assert(instr->reg.allocated);
+ if (!(dst_type_string = hlsl_type_to_string(ctx, dst_type))) + return; + if (instr->data_type->base_type != HLSL_TYPE_FLOAT) { /* These need to be lowered. */ @@ -679,6 +712,34 @@ static void write_sm1_expr(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *b write_sm1_unary_op(ctx, buffer, D3DSIO_FRC, &instr->reg, &arg1->reg, D3DSPSM_NONE); break;
+ case HLSL_OP2_DOT: + switch (dst_type->base_type) + { + case HLSL_TYPE_FLOAT: + switch (arg1->data_type->dimx) + { + case 4: + write_sm1_binary_op(ctx, buffer, D3DSIO_DP4, &instr->reg, &arg1->reg, &arg2->reg); + break; + + case 3: + write_sm1_binary_op(ctx, buffer, D3DSIO_DP3, &instr->reg, &arg1->reg, &arg2->reg); + break; + + case 2: + write_sm1_ternary_op(ctx, buffer, D3DSIO_DP2ADD, &instr->reg, &arg1->reg, &arg2->reg, &arg3->reg); + break; + + default: + vkd3d_unreachable(); + } + break; + + default: + hlsl_fixme(ctx, &expr->node.loc, "SM1 %s dot expression.", dst_type_string->buffer); + } + break; + default: hlsl_fixme(ctx, &instr->loc, "SM1 "%s" expression.", debug_hlsl_expr_op(expr->op)); break;
Francisco Casas (@fcasas) commented about libs/vkd3d-shader/hlsl.y:
if (!(args[1] = add_implicit_conversion(ctx, instrs, arg2, common_type, loc))) return NULL;
- if (ctx->profile->major_version < 4 && dim == 2)
- {
struct hlsl_ir_constant *zero;
if (!(zero = hlsl_new_float_constant(ctx, 0.0f, loc)))
return NULL;
if (!(args[2] = add_implicit_conversion(ctx, instrs, &zero->node, common_type, loc)))
return NULL;
- }
I don't think it is a good idea to keep the zero constant as a hidden third parameter of `HLSL_OP2_DOT` during the whole compilation.
Others may disagree, but I think it may be better to introduce it in a new compilation pass in `hlsl_emit_bytecode()` inside this block:
```c if (profile->major_version < 4) { transform_ir(ctx, lower_division, body, NULL); transform_ir(ctx, lower_sqrt, body, NULL); } ```
On Fri Jan 27 16:18:29 2023 +0000, Francisco Casas wrote:
I don't think it is a good idea to keep the zero constant as a hidden third parameter of `HLSL_OP2_DOT` during the whole compilation. Others may disagree, but I think it may be better to introduce it in a new compilation pass in `hlsl_emit_bytecode()` inside this block:
if (profile->major_version < 4) { transform_ir(ctx, lower_division, body, NULL); transform_ir(ctx, lower_sqrt, body, NULL); }
This is of course possible, but that would mean we'll have to introduce specific HLSL_OP2_DP2..4, or maybe DP2 and DOT that looks less consistent. I used SM4 variant as an example, but if suggested way is desirable, I can do that.
If we're going to do it this way, it should be a separate DP2ADD operation.
That does not exist for SM4+, I was under impression HLSL_OP* were supposed to depend on profile.
I don't think we want the same op to have different semantics depending on backend.
The policy isn't really set in stone yet, but for the moment I think adding backend-specific HLSL_OP_* is fine.
That does not exist for SM4+, I was under impression HLSL_OP* were supposed to depend on profile.
I meant to say "were NOT supposed to depend" of course.
In principle we could avoid touching the IR and do everything while lowering to SM1. The only difficulty is AFAIK you cannot write literal constants in SM1, so you'd need to ensure that you have an available zero constant to stick in when calling `write_sm1_ternary_op()`, and that's probably uselessly complicated. So I agree that the best solution seems adding `HLSL_OP_DP2ADD` and a lowering pass.
There is of course another option: translate this to mul+mad. And that's something we'd have to do anyway; dp2add is only supported in shader model 2 and up for pixel shaders, and not at all for vertex shaders.
On Mon Jan 30 14:57:33 2023 +0000, Henri Verbeet wrote:
There is of course another option: translate this to mul+mad. And that's something we'd have to do anyway; dp2add is only supported in shader model 2 and up for pixel shaders, and not at all for vertex shaders.
This would need to be a lowering pass, because otherwise you would have to add a new temp in the bytecode. It can work but I am not sure if there will be discrepancies in precision, or if they would matter.
I would go for creating the new `DP2ADD` op (maybe it should be called `HLSL_OP3_DP2ADD`?) and adding the lowering pass for SM1.
On Mon Jan 30 14:57:33 2023 +0000, Francisco Casas wrote:
This would need to be a lowering pass, because otherwise you would have to add a new temp in the bytecode. It can work but I am not sure if there will be discrepancies in precision, or if they would matter. I would go for creating the new `DP2ADD` op (maybe it should be called `HLSL_OP3_DP2ADD`?) and adding the lowering pass for SM1.
huh, I just realized that you could use the same temp for storing the result of both instructions, so it wouldn't require a new temp. I don't think we do that somewhere else though.
On Mon Jan 30 15:02:57 2023 +0000, Francisco Casas wrote:
huh, I just realized that you could use the same temp for storing the result of both instructions, so it wouldn't require a new temp. I don't think we do that somewhere else though.
small correction: it seems that in vertex shaders the dot product is translated to mul+add (not mul+mad):
https://shader-playground.timjones.io/5a12e96e87f2170080123a81305b8444
I would suggest lowering dp2 to mul+add or dp2add, according to the shader model, in the same lowering pass.
I would suggest lowering dot with dimx=2 to mul+add or dp2add, according to the shader model, in the same lowering pass.
Possibly, although I don't know that we want to spend too much effort on matching native instructions exactly.
dp2add is only supported in shader model 2 and up for pixel shaders, and not at all for vertex shaders.
Hmm, that's a good point, I hadn't thought of that before.
I did a brief look to see if we were using any other instructions that weren't supported on all models. It seems we are using several instructions that are missing on ps_1_* but present everywhere else: exp, rcp, rsq, max, min, frc.
We're also missing checks for a lot of sm5 (and sm4.1?) features. We should probably start implementing those at some point; it's not great that we're emitting invalid bytecode.
On Mon Jan 30 21:00:41 2023 +0000, Zebediah Figura wrote:
I would suggest lowering dot with dimx=2 to mul+add or dp2add,
according to the shader model, in the same lowering pass. Possibly, although I don't know that we want to spend too much effort on matching native instructions exactly.
Yep, now I think it would be fine to always lower to mul+add for SM1 (if dimx=2), as long as we keep the `FIXME` specifically for `ps_3_0`, in case it ever matters.