Indexing with non-constant offsets requires relative addressing in SM4. In assembly, this is written as in the following example:
```
x1[r1.x + 3]
```
This first part of the patch series only includes support for indexing vectors with non-constant indexes.
The following patches can be found in https://gitlab.winehq.org/fcasas/vkd3d/-/commits/nonconst-offsets-3.
---
Non-constant indexing of vectors cannot be implemented with relative addressing in SM4, because this indexing cannot be performed at the level of register components, only of whole registers.
Mathematical operations must be used instead.
For floats, the native compiler seems to index an identity matrix to get the i-th column, and then computes the dot product of that column with the vector. For ints, bit operations seem to be used.
While probably less efficient, this implementation complies with the type-checking at the IR level and when writing bytecode.
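To illustrate, the float case amounts to something like the following HLSL (a hand-written sketch of the lowering's semantics, not the literal compiler output; `vec` and `idx` are placeholder names):

```hlsl
// Sketch: extracting vec[idx] for a float4 vec and a non-constant index.
// A one-hot mask selects the idx-th component, and a sum of products
// (i.e. a dot product) collapses it to a scalar.
float index_vector(float4 vec, uint idx)
{
    float4 mask = uint4(idx, idx, idx, idx) == uint4(0, 1, 2, 3); // 1.0 where equal, 0.0 elsewhere
    float4 prod = vec * mask;
    return prod.x + prod.y + prod.z + prod.w;                     // equivalent to dot(vec, mask)
}
```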
From: Francisco Casas <fcasas@codeweavers.com>
```
---
 Makefile.am                        |  1 +
 tests/array-index-expr.shader_test | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 tests/array-index-expr.shader_test
```
```
diff --git a/Makefile.am b/Makefile.am
index e06d0eeb..b5f7159e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,6 +50,7 @@ vkd3d_shader_tests = \
 tests/arithmetic-int.shader_test \
 tests/arithmetic-int-uniform.shader_test \
 tests/arithmetic-uint.shader_test \
+ tests/array-index-expr.shader_test \
 tests/array-parameters.shader_test \
 tests/asfloat.shader_test \
 tests/asuint.shader_test \
diff --git a/tests/array-index-expr.shader_test b/tests/array-index-expr.shader_test
new file mode 100644
index 00000000..92ab1ab6
--- /dev/null
+++ b/tests/array-index-expr.shader_test
@@ -0,0 +1,22 @@
+[pixel shader todo]
+uniform float i;
+
+float4 main() : SV_TARGET
+{
+    float4 arr = float4(11.0, 12.0, 13.0, 14.0);
+    return arr[i];
+}
+
+[test]
+uniform 0 float 0
+todo draw quad
+todo probe all rgba (11.0, 11.0, 11.0, 11.0)
+uniform 0 float 1
+todo draw quad
+todo probe all rgba (12.0, 12.0, 12.0, 12.0)
+uniform 0 float 2
+todo draw quad
+todo probe all rgba (13.0, 13.0, 13.0, 13.0)
+uniform 0 float 3
+todo draw quad
+todo probe all rgba (14.0, 14.0, 14.0, 14.0)
```
From: Francisco Casas <fcasas@codeweavers.com>
```
---
 tests/array-index-expr.shader_test | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
```
```
diff --git a/tests/array-index-expr.shader_test b/tests/array-index-expr.shader_test
index 92ab1ab6..675e5e46 100644
--- a/tests/array-index-expr.shader_test
+++ b/tests/array-index-expr.shader_test
@@ -20,3 +20,28 @@ todo probe all rgba (13.0, 13.0, 13.0, 13.0)
 uniform 0 float 3
 todo draw quad
 todo probe all rgba (14.0, 14.0, 14.0, 14.0)
+
+
+[pixel shader todo]
+uniform float2 i;
+
+float4 main() : sv_target
+{
+    float4 f[3] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+
+    return f[i.x + i.y];
+}
+
+[test]
+uniform 0 float4 0 0 0 0
+todo draw quad
+todo probe all rgba (1.0, 2.0, 3.0, 4.0)
+uniform 0 float4 1 0 0 0
+todo draw quad
+todo probe all rgba (5.0, 6.0, 7.0, 8.0)
+uniform 0 float4 0 1 0 0
+todo draw quad
+todo probe all rgba (5.0, 6.0, 7.0, 8.0)
+uniform 0 float4 1 1 0 0
+todo draw quad
+todo probe all rgba (9.0, 10.0, 11.0, 12.0)
```
From: Francisco Casas <fcasas@codeweavers.com>
```
---
 libs/vkd3d-shader/hlsl_codegen.c | 77 ++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 33 deletions(-)
```
```
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 638dab6e..6a99c33e 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -123,15 +123,14 @@ static struct hlsl_ir_node *new_offset_instr_from_deref(struct hlsl_ctx *ctx, st
 }
 
 /* TODO: remove when no longer needed, only used for transform_deref_paths_into_offsets() */
-static void replace_deref_path_with_offset(struct hlsl_ctx *ctx, struct hlsl_deref *deref,
+static bool replace_deref_path_with_offset(struct hlsl_ctx *ctx, struct hlsl_deref *deref,
         struct hlsl_ir_node *instr)
 {
     const struct hlsl_type *type;
     struct hlsl_ir_node *offset;
     struct hlsl_block block;
 
-    if (!deref->var)
-        return;
+    assert(deref->var);
 
     /* register offsets shouldn't be used before this point is reached. */
     assert(!deref->offset.node);
@@ -143,45 +142,19 @@ static void replace_deref_path_with_offset(struct hlsl_ctx *ctx, struct hlsl_der
 
     if (type->class == HLSL_CLASS_STRUCT || type->class == HLSL_CLASS_ARRAY)
     {
         hlsl_cleanup_deref(deref);
-        return;
+        return true;
     }
 
     deref->offset_regset = hlsl_type_get_regset(type);
 
     if (!(offset = new_offset_instr_from_deref(ctx, &block, deref, &instr->loc)))
-        return;
+        return false;
     list_move_before(&instr->entry, &block.instrs);
 
     hlsl_cleanup_deref(deref);
     hlsl_src_from_node(&deref->offset, offset);
-}
-
-/* TODO: remove when no longer needed. */
-static bool transform_deref_paths_into_offsets(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context)
-{
-    switch(instr->type)
-    {
-        case HLSL_IR_LOAD:
-            replace_deref_path_with_offset(ctx, &hlsl_ir_load(instr)->src, instr);
-            return true;
-
-        case HLSL_IR_STORE:
-            replace_deref_path_with_offset(ctx, &hlsl_ir_store(instr)->lhs, instr);
-            return true;
-
-        case HLSL_IR_RESOURCE_LOAD:
-            replace_deref_path_with_offset(ctx, &hlsl_ir_resource_load(instr)->resource, instr);
-            replace_deref_path_with_offset(ctx, &hlsl_ir_resource_load(instr)->sampler, instr);
-            return true;
-
-        case HLSL_IR_RESOURCE_STORE:
-            replace_deref_path_with_offset(ctx, &hlsl_ir_resource_store(instr)->resource, instr);
-            return true;
-
-        default:
-            return false;
-    }
-    return false;
+    return true;
 }
 
 /* Split uniforms into two variables representing the constant and temp
@@ -600,6 +573,44 @@ bool hlsl_transform_ir(struct hlsl_ctx *ctx, bool (*func)(struct hlsl_ctx *ctx,
     return progress;
 }
 
+static bool transform_instr_derefs(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context)
+{
+    bool res;
+    bool (*func)(struct hlsl_ctx *ctx, struct hlsl_deref *, struct hlsl_ir_node *) = context;
+
+    switch(instr->type)
+    {
+        case HLSL_IR_LOAD:
+            res = func(ctx, &hlsl_ir_load(instr)->src, instr);
+            return res;
+
+        case HLSL_IR_STORE:
+            res = func(ctx, &hlsl_ir_store(instr)->lhs, instr);
+            return res;
+
+        case HLSL_IR_RESOURCE_LOAD:
+            res = func(ctx, &hlsl_ir_resource_load(instr)->resource, instr);
+            if (hlsl_ir_resource_load(instr)->sampler.var)
+                res |= func(ctx, &hlsl_ir_resource_load(instr)->sampler, instr);
+            return res;
+
+        case HLSL_IR_RESOURCE_STORE:
+            res = func(ctx, &hlsl_ir_resource_store(instr)->resource, instr);
+            return res;
+
+        default:
+            return false;
+    }
+    return false;
+}
+
+static bool transform_derefs(struct hlsl_ctx *ctx,
+        bool (*func)(struct hlsl_ctx *ctx, struct hlsl_deref *, struct hlsl_ir_node *),
+        struct hlsl_block *block)
+{
+    return hlsl_transform_ir(ctx, transform_instr_derefs, block, func);
+}
+
 struct recursive_call_ctx
 {
     const struct hlsl_ir_function_decl **backtrace;
@@ -3944,7 +3955,7 @@ int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry
     hlsl_transform_ir(ctx, track_object_components_usage, body, NULL);
 
     /* TODO: move forward, remove when no longer needed */
-    hlsl_transform_ir(ctx, transform_deref_paths_into_offsets, body, NULL);
+    transform_derefs(ctx, replace_deref_path_with_offset, body);
     while (hlsl_transform_ir(ctx, hlsl_fold_constant_exprs, body, NULL));
 
     do
```
From: Francisco Casas <fcasas@codeweavers.com>
Non-constant vector indexing cannot be solved with relative addressing in the register indexes, because this indexing cannot be performed at the level of register components.
Mathematical operations must be used instead.
For floats, the native compiler seems to index an identity matrix to get the i-th column, and then computes the dot product of that column with the vector. For ints, bit operations seem to be used.
While probably less efficient, this implementation complies with the type-checking at the IR level and when writing bytecode.

```
---
 libs/vkd3d-shader/hlsl.c                     | 13 +++
 libs/vkd3d-shader/hlsl.h                     |  2 +
 libs/vkd3d-shader/hlsl_codegen.c             | 81 +++++++++++++++++++
 tests/array-index-expr.shader_test           | 18 ++---
 tests/expr-indexing.shader_test              | 22 ++---
 tests/hlsl-matrix-indexing.shader_test       |  6 +-
 .../hlsl-vector-indexing-uniform.shader_test |  4 +-
 7 files changed, 121 insertions(+), 25 deletions(-)
```
```
diff --git a/libs/vkd3d-shader/hlsl.c b/libs/vkd3d-shader/hlsl.c
index 617aef30..1cdc358d 100644
--- a/libs/vkd3d-shader/hlsl.c
+++ b/libs/vkd3d-shader/hlsl.c
@@ -1287,6 +1287,19 @@ struct hlsl_ir_load *hlsl_new_load_index(struct hlsl_ctx *ctx, const struct hlsl
     return load;
 }
 
+struct hlsl_ir_load *hlsl_new_load_partial_path(struct hlsl_ctx *ctx, const struct hlsl_deref *deref,
+        unsigned int length, const struct vkd3d_shader_location *loc)
+{
+    /* This deref can only exists temporarily because it is not the real owner of its members. */
+    struct hlsl_deref tmp_deref;
+
+    assert(length <= deref->path_len);
+
+    tmp_deref = *deref;
+    tmp_deref.path_len = length;
+    return hlsl_new_load_index(ctx, &tmp_deref, NULL, loc);
+}
+
 struct hlsl_ir_load *hlsl_new_var_load(struct hlsl_ctx *ctx, struct hlsl_ir_var *var,
         const struct vkd3d_shader_location *loc)
 {
diff --git a/libs/vkd3d-shader/hlsl.h b/libs/vkd3d-shader/hlsl.h
index 6a4e314d..a59fa36d 100644
--- a/libs/vkd3d-shader/hlsl.h
+++ b/libs/vkd3d-shader/hlsl.h
@@ -1119,6 +1119,8 @@ struct hlsl_ir_load *hlsl_new_var_load(struct hlsl_ctx *ctx, struct hlsl_ir_var
         const struct vkd3d_shader_location *loc);
 struct hlsl_ir_load *hlsl_new_load_index(struct hlsl_ctx *ctx, const struct hlsl_deref *deref,
         struct hlsl_ir_node *idx, const struct vkd3d_shader_location *loc);
+struct hlsl_ir_load *hlsl_new_load_partial_path(struct hlsl_ctx *ctx, const struct hlsl_deref *deref,
+        unsigned int length, const struct vkd3d_shader_location *loc);
 struct hlsl_ir_node *hlsl_new_load_component(struct hlsl_ctx *ctx, struct hlsl_block *block,
         const struct hlsl_deref *deref, unsigned int comp, const struct vkd3d_shader_location *loc);
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 6a99c33e..ed301952 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -1992,6 +1992,86 @@ static bool remove_trivial_swizzles(struct hlsl_ctx *ctx, struct hlsl_ir_node *i
     return true;
 }
 
+static bool lower_nonconstant_vector_derefs(struct hlsl_ctx *ctx, struct hlsl_deref *deref,
+        struct hlsl_ir_node *instr)
+{
+    struct hlsl_ir_node *idx;
+    struct hlsl_type *type;
+    unsigned int i;
+
+    assert(deref->var);
+
+    if (deref->path_len == 0)
+        return false;
+
+    type = deref->var->data_type;
+
+    for (i = 0; i < deref->path_len - 1; ++i)
+        type = hlsl_get_element_type_from_path_index(ctx, type, deref->path[i].node);
+
+    idx = deref->path[deref->path_len - 1].node;
+
+    if (type->class == HLSL_CLASS_VECTOR && idx->type != HLSL_IR_CONSTANT)
+    {
+        struct hlsl_ir_node *eq, *res, *mult, *swizzle, *comps[4] = {0}, *operands[HLSL_MAX_OPERANDS] = {0};
+        struct hlsl_ir_load *vector_load;
+        struct hlsl_ir_constant *c;
+
+        if (!(vector_load = hlsl_new_load_partial_path(ctx, deref, deref->path_len - 1, &instr->loc)))
+            return false;
+        list_add_before(&instr->entry, &vector_load->node.entry);
+
+        if (!(swizzle = hlsl_new_swizzle(ctx, HLSL_SWIZZLE(X, X, X, X), type->dimx, idx, &instr->loc)))
+            return false;
+        list_add_before(&instr->entry, &swizzle->entry);
+
+        if (!(c = hlsl_new_constant(ctx, hlsl_get_vector_type(ctx, HLSL_TYPE_UINT, type->dimx), &instr->loc)))
+            return false;
+        c->value.u[0].u = 0;
+        c->value.u[1].u = 1;
+        c->value.u[2].u = 2;
+        c->value.u[3].u = 3;
+        list_add_before(&instr->entry, &c->node.entry);
+
+        operands[0] = swizzle;
+        operands[1] = &c->node;
+        if (!(eq = hlsl_new_expr(ctx, HLSL_OP2_EQUAL, operands,
+                hlsl_get_vector_type(ctx, HLSL_TYPE_BOOL, type->dimx), &instr->loc)))
+            return false;
+        list_add_before(&instr->entry, &eq->entry);
+
+        if (!(eq = hlsl_new_cast(ctx, eq, type, &instr->loc)))
+            return false;
+        list_add_before(&instr->entry, &eq->entry);
+
+        if (!(mult = hlsl_new_binary_expr(ctx, HLSL_OP2_MUL, &vector_load->node, eq)))
+            return false;
+        list_add_before(&instr->entry, &mult->entry);
+
+        for (i = 0; i < type->dimx; ++i)
+        {
+            unsigned int s = hlsl_swizzle_from_writemask(1 << i);
+
+            if (!(comps[i] = hlsl_new_swizzle(ctx, s, 1, mult, &instr->loc)))
+                return false;
+            list_add_before(&instr->entry, &comps[i]->entry);
+        }
+
+        res = comps[0];
+        for (i = 1; i < type->dimx; ++i)
+        {
+            if (!(res = hlsl_new_binary_expr(ctx, HLSL_OP2_ADD, res, comps[i])))
+                return false;
+            list_add_before(&instr->entry, &res->entry);
+        }
+
+        hlsl_replace_node(instr, res);
+        return true;
+    }
+
+    return false;
+}
+
 /* Lower DIV to RCP + MUL. */
 static bool lower_division(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context)
 {
@@ -3937,6 +4017,7 @@ int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry
         progress |= hlsl_transform_ir(ctx, remove_trivial_swizzles, body, NULL);
     }
     while (progress);
+    transform_derefs(ctx, lower_nonconstant_vector_derefs, body);
 
     if (profile->major_version < 4)
     {
diff --git a/tests/array-index-expr.shader_test b/tests/array-index-expr.shader_test
index 675e5e46..1c469e8c 100644
--- a/tests/array-index-expr.shader_test
+++ b/tests/array-index-expr.shader_test
@@ -1,4 +1,4 @@
-[pixel shader todo]
+[pixel shader]
 uniform float i;
 
 float4 main() : SV_TARGET
@@ -9,17 +9,17 @@ float4 main() : SV_TARGET
 
 [test]
 uniform 0 float 0
-todo draw quad
-todo probe all rgba (11.0, 11.0, 11.0, 11.0)
+draw quad
+probe all rgba (11.0, 11.0, 11.0, 11.0)
 uniform 0 float 1
-todo draw quad
-todo probe all rgba (12.0, 12.0, 12.0, 12.0)
+draw quad
+probe all rgba (12.0, 12.0, 12.0, 12.0)
 uniform 0 float 2
-todo draw quad
-todo probe all rgba (13.0, 13.0, 13.0, 13.0)
+draw quad
+probe all rgba (13.0, 13.0, 13.0, 13.0)
 uniform 0 float 3
-todo draw quad
-todo probe all rgba (14.0, 14.0, 14.0, 14.0)
+draw quad
+probe all rgba (14.0, 14.0, 14.0, 14.0)
 
 
 [pixel shader todo]
diff --git a/tests/expr-indexing.shader_test b/tests/expr-indexing.shader_test
index 83a63d67..3dcc5727 100644
--- a/tests/expr-indexing.shader_test
+++ b/tests/expr-indexing.shader_test
@@ -13,7 +13,7 @@ draw quad
 probe all rgba (8.0, 8.0, 8.0, 8.0)
 
 
-[pixel shader todo]
+[pixel shader]
 float4 a, b;
 float i;
 
@@ -26,8 +26,8 @@ float4 main() : sv_target
 uniform 0 float4 1.0 2.0 3.0 4.0
 uniform 4 float4 5.0 6.0 7.0 8.0
 uniform 8 float 2
-todo draw quad
-todo probe all rgba (10.0, 10.0, 10.0, 10.0)
+draw quad
+probe all rgba (10.0, 10.0, 10.0, 10.0)
 
 
 [pixel shader]
@@ -44,7 +44,7 @@ draw quad
 probe all rgba (3.0, 3.0, 3.0, 3.0)
 
 
-[pixel shader todo]
+[pixel shader]
 float4 a;
 float i;
 
@@ -56,11 +56,11 @@ float4 main() : sv_target
 [test]
 uniform 0 float4 1.0 2.0 3.0 4.0
 uniform 4 float 0
-todo draw quad
-todo probe all rgba (4.0, 4.0, 4.0, 4.0)
+draw quad
+probe all rgba (4.0, 4.0, 4.0, 4.0)
 uniform 4 float 2
-todo draw quad
-todo probe all rgba (1.0, 1.0, 1.0, 1.0)
+draw quad
+probe all rgba (1.0, 1.0, 1.0, 1.0)
 
 
 [pixel shader]
@@ -82,7 +82,7 @@ draw quad
 probe all rgba (4.0, 4.0, 4.0, 4.0)
 
 
-[pixel shader todo]
+[pixel shader]
 float4 a;
 float i;
 
@@ -99,5 +99,5 @@ float4 main() : sv_target
 [test]
 uniform 0 float4 1.0 2.0 3.0 4.0
 uniform 4 float 1
-todo draw quad
-todo probe all rgba (2.0, 2.0, 2.0, 2.0)
+draw quad
+probe all rgba (2.0, 2.0, 2.0, 2.0)
diff --git a/tests/hlsl-matrix-indexing.shader_test b/tests/hlsl-matrix-indexing.shader_test
index c3d08296..a57d8fb8 100644
--- a/tests/hlsl-matrix-indexing.shader_test
+++ b/tests/hlsl-matrix-indexing.shader_test
@@ -108,7 +108,7 @@ draw quad
 probe all rgba (3.0, 4.0, 50.0, 60.0)
 
 
-[pixel shader todo]
+[pixel shader]
 uniform float i;
 
 float4 main() : sv_target
@@ -120,8 +120,8 @@ float4 main() : sv_target
 
 [test]
 uniform 0 float 2
-todo draw quad
-todo probe all rgba (8, 9, 10, 11)
+draw quad
+probe all rgba (8, 9, 10, 11)
 
 
 [pixel shader todo]
diff --git a/tests/hlsl-vector-indexing-uniform.shader_test b/tests/hlsl-vector-indexing-uniform.shader_test
index 968f570b..e5ffbdd0 100644
--- a/tests/hlsl-vector-indexing-uniform.shader_test
+++ b/tests/hlsl-vector-indexing-uniform.shader_test
@@ -1,6 +1,6 @@
 % Use a uniform to prevent the compiler from optimizing.
 
-[pixel shader todo]
+[pixel shader]
 uniform float i;
 float4 main() : SV_TARGET
 {
@@ -12,5 +12,5 @@ float4 main() : SV_TARGET
 
 [test]
 uniform 0 float 2
-todo draw quad
+draw quad
 probe all rgba (0.5, 0.3, 0.8, 0.2)
```
Ethan Lee (@flibitijibibo) commented about tests/array-index-expr.shader_test:
```
+draw quad
+probe all rgba (12.0, 12.0, 12.0, 12.0)
+uniform 0 float 2
+draw quad
+probe all rgba (13.0, 13.0, 13.0, 13.0)
+uniform 0 float 3
+draw quad
+probe all rgba (14.0, 14.0, 14.0, 14.0)

+[pixel shader todo]
+uniform float2 i;

+float4 main() : sv_target
+{
+    float4 f[3] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
```
Does it matter if this is a const array or a uniform? I suppose either way we can obsolete !189 with this MR, whether we need a separate test or not.
On Fri May 12 17:34:57 2023 +0000, Ethan Lee wrote:
> Does it matter if this is a const array or a uniform? I suppose either way we can obsolete !189 with this MR, whether we need a separate test or not.
Currently it doesn't matter, but only because our strategy is to copy everything into temps at the beginning of the shader and (if they are output semantics or uniforms) to copy from the temps back to the variables at the end of the shader.
However, I think it is good to retain both tests, merging them depending on which MR goes first.
Giovanni Mascellani (@giomasce) commented about libs/vkd3d-shader/hlsl_codegen.c:
```
for (i = 0; i < type->dimx; ++i)
{
    unsigned int s = hlsl_swizzle_from_writemask(1 << i);

    if (!(comps[i] = hlsl_new_swizzle(ctx, s, 1, mult, &instr->loc)))
        return false;
    list_add_before(&instr->entry, &comps[i]->entry);
}

res = comps[0];
for (i = 1; i < type->dimx; ++i)
{
    if (!(res = hlsl_new_binary_expr(ctx, HLSL_OP2_ADD, res, comps[i])))
        return false;
    list_add_before(&instr->entry, &res->entry);
}
```
I suppose that you're spelling out the dot product so that it works with any numeric type? I'd rather emit an actual DOT here and then implement DOT lowering for integral types: you make the code cleaner and implement another feature for free.
Also, notice that the current code triggers an assertion when indexing `bool4`. I think that's due to the introduction of a `bool` cast once they're assumed to have already been lowered. I guess this pass should happen a bit earlier (though, to be fair, it's always difficult to track the pass dependencies and assumptions).
Giovanni Mascellani (@giomasce) commented about tests/array-index-expr.shader_test:
```
+[pixel shader todo]
+uniform float i;

+float4 main() : SV_TARGET
```
Can you also test indexing an `int4`?
> I suppose that you're spelling out the dot product so that it works with any numeric type? I'd rather emit an actual DOT here and then implement DOT lowering for integral types: you make the code cleaner and implement another feature for free.
That is a good idea! I did just that. Note that HLSL_OP2_DOT is not defined for bools, so bool vectors are a special case I had to handle with HLSL_OP2_LOGIC_AND and HLSL_OP2_LOGIC_OR.
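In HLSL terms, the bool case amounts to something like this sketch (hypothetical helper and parameter names; the actual pass builds the equivalent IR nodes directly, and the exact reduction order may differ):

```hlsl
// Sketch: since HLSL_OP2_DOT is not defined for bools, the bool4 case
// ANDs the vector with the one-hot mask and then OR-reduces the result.
bool index_bool_vector(bool4 vec, uint idx)
{
    bool4 mask = uint4(idx, idx, idx, idx) == uint4(0, 1, 2, 3); // one-hot selection mask
    bool4 sel = vec && mask;                                     // HLSL_OP2_LOGIC_AND
    return sel.x || sel.y || sel.z || sel.w;                     // HLSL_OP2_LOGIC_OR reduction
}
```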
> Also, notice that the current code triggers an assertion when indexing `bool4`. I think that's due to the introduction of a `bool` cast once they're assumed to have already been lowered. I guess this pass should happen a bit earlier (though, to be fair, it's always difficult to track the pass dependencies and assumptions).
I moved lower_nonconstant_vector_derefs() up. Also, I realized that this shouldn't be a transform_derefs(·) pass, but just an hlsl_transform_ir(·) pass on hlsl_ir_loads. transform_derefs(·) will still be used in following patches, though, so I will keep its introduction patch here.