Signed-off-by: Giovanni Mascellani gmascellani@codeweavers.com --- libs/vkd3d-shader/hlsl_codegen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 24b8205c..0ee8ab55 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -204,7 +204,7 @@ static bool transform_ir(struct hlsl_ctx *ctx, bool (*func)(struct hlsl_ctx *ctx struct hlsl_block *block, void *context) { struct hlsl_ir_node *instr, *next; - bool progress = 0; + bool progress = false;
LIST_FOR_EACH_ENTRY_SAFE(instr, next, &block->instrs, struct hlsl_ir_node, entry) {
Signed-off-by: Giovanni Mascellani gmascellani@codeweavers.com --- Pretty sure this (and the following ones) will require some more tweaking before being accepted! :-)
libs/vkd3d-shader/hlsl_codegen.c | 254 ++++++++++++++++++++++++++++++- 1 file changed, 253 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 0ee8ab55..8db8bfc9 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -237,6 +237,253 @@ static void replace_node(struct hlsl_ir_node *old, struct hlsl_ir_node *new) hlsl_free_instr(old); }
+/* struct copy_propagation_state represents the accumulated knowledge + * of the copy propagation pass while it scans through the code. Field + * "variables" is a tree whose elements have type struct + * copy_propagation_varible, and represent each of the variables the + * pass has already encountered (except those with special semantics, + * which are ignored). For each variable, the array "values" (whose + * length is the register size of the variable) represent which node + * and which index inside that node (i.e., which of the at most four + * entries of a vector) provided that value last time. Field "node" + * can be NULL, meaning that the pass was not able to statically + * determine the node. + */ + +struct copy_propagation_value +{ + struct hlsl_ir_node *node; + unsigned int index; +}; + +struct copy_propagation_variable +{ + struct rb_entry entry; + struct hlsl_ir_var *var; + struct copy_propagation_value *values; +}; + +struct copy_propagation_state +{ + struct rb_tree variables; +}; + +static int copy_propagation_variable_compare(const void *key, const struct rb_entry *entry) +{ + struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry); + uintptr_t key_int = (uintptr_t)key, entry_int = (uintptr_t)variable->var; + + if (key_int < entry_int) + return -1; + else if (key_int > entry_int) + return 1; + else + return 0; +} + +static void copy_propagation_variable_destroy(struct rb_entry *entry, void *context) +{ + struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry); + + vkd3d_free(variable); +} + +static struct copy_propagation_variable *copy_propagation_get_variable(struct hlsl_ctx *ctx, + struct copy_propagation_state *state, struct hlsl_ir_var *var) +{ + struct rb_entry *entry = rb_get(&state->variables, var); + struct copy_propagation_variable *variable; + int res; + + if (entry) + return RB_ENTRY_VALUE(entry, struct copy_propagation_variable, 
entry); + + variable = hlsl_alloc(ctx, sizeof(*variable)); + if (!variable) + return NULL; + + variable->var = var; + variable->values = hlsl_alloc(ctx, sizeof(*variable->values) * var->data_type->reg_size); + if (!variable->values) + { + vkd3d_free(variable); + return NULL; + } + + res = rb_put(&state->variables, var, &variable->entry); + assert(!res); + + return variable; +} + +static void copy_propagation_set_value(struct copy_propagation_variable *variable, unsigned int offset, + unsigned char writemask, struct hlsl_ir_node *node) +{ + unsigned int index; + + for (index = 0; index < 4; ++index) + { + if (writemask & (1u << index)) + { + if (TRACE_ON()) + { + char buf[32]; + if (!node) + sprintf(buf, "(nil)"); + else if (node->index) + sprintf(buf, "@%u", node->index); + else + sprintf(buf, "%p", node); + TRACE("variable %s[%d] is written by %p[%d]\n", variable->var->name, offset + index, buf, index); + } + variable->values[offset + index].node = node; + variable->values[offset + index].index = index; + } + } +} + +/* Check if locations [offset, offset+count) in variable were all + * written from the same node. If so return the node the corresponding + * indices, otherwise return NULL (and undefined indices). 
*/ +static struct hlsl_ir_node *copy_propagation_reconstruct_node(struct copy_propagation_variable *variable, + unsigned int offset, unsigned int count, unsigned int indices[4]) +{ + struct hlsl_ir_node *node = NULL; + unsigned int i; + + assert(offset + count <= variable->var->data_type->reg_size); + + for (i = 0; i < count; ++i) + { + if (!node) + node = variable->values[offset + i].node; + else if (node != variable->values[offset + i].node) + return NULL; + indices[i] = variable->values[offset + i].index; + } + + return node; +} + +static bool copy_propagation_load(struct hlsl_ctx *ctx, struct hlsl_ir_load *load, + struct copy_propagation_state *state) +{ + struct hlsl_ir_node *node = &load->node, *new_node; + struct copy_propagation_variable *variable; + struct hlsl_type *type = node->data_type; + unsigned int offset, indices[4] = {}; + struct hlsl_deref *src = &load->src; + struct hlsl_ir_var *var = src->var; + struct hlsl_ir_swizzle *swizzle; + DWORD s; + + if (var->is_input_semantic || var->is_output_semantic || var->is_uniform) + return false; + + if (type->type != HLSL_CLASS_SCALAR && type->type != HLSL_CLASS_VECTOR) + return false; + + offset = hlsl_offset_from_deref(src); + + variable = copy_propagation_get_variable(ctx, state, var); + if (!variable) + return false; + + new_node = copy_propagation_reconstruct_node(variable, offset, type->dimx, indices); + + if (TRACE_ON()) + { + char buf[32]; + if (!new_node) + sprintf(buf, "(nil)"); + else if (new_node->index) + sprintf(buf, "@%u", new_node->index); + else + sprintf(buf, "%p", new_node); + TRACE("load from %s[%d-%d] reconstructed to %s[%d %d %d %d]\n", var->name, offset, + offset + type->dimx, buf, indices[0], indices[1], indices[2], indices[3]); + } + + if (!new_node) + return false; + + s = indices[0] | indices[1] << 2 | indices[2] << 4 | indices[3] << 6; + if (!(swizzle = hlsl_new_swizzle(ctx, s, type->dimx, new_node, &node->loc))) + return false; + list_add_before(&node->entry, 
&swizzle->node.entry); + + replace_node(node, &swizzle->node); + + return true; +} + +static bool copy_propagation_store(struct hlsl_ctx *ctx, struct hlsl_ir_store *store, + struct copy_propagation_state *state) +{ + struct copy_propagation_variable *variable; + struct hlsl_deref *lhs = &store->lhs; + struct hlsl_ir_var *var = lhs->var; + + if (var->is_input_semantic || var->is_output_semantic || var->is_uniform) + return false; + + variable = copy_propagation_get_variable(ctx, state, var); + if (!variable) + return false; + + copy_propagation_set_value(variable, hlsl_offset_from_deref(lhs), store->writemask, store->rhs.node); + + return false; +} + +static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block *block, + struct copy_propagation_state *state) +{ + struct hlsl_ir_node *instr, *next; + bool progress = false; + + LIST_FOR_EACH_ENTRY_SAFE(instr, next, &block->instrs, struct hlsl_ir_node, entry) + { + switch (instr->type) + { + case HLSL_IR_LOAD: + progress |= copy_propagation_load(ctx, hlsl_ir_load(instr), state); + break; + + case HLSL_IR_STORE: + progress |= copy_propagation_store(ctx, hlsl_ir_store(instr), state); + break; + + case HLSL_IR_IF: + FIXME("Copy propagation doesn't support conditionals yet, leaving.\n"); + return progress; + + case HLSL_IR_LOOP: + FIXME("Copy propagation doesn't support loops yet, leaving.\n"); + return progress; + + default: + break; + } + } + + return progress; +} + +static bool copy_propagation_pass(struct hlsl_ctx *ctx, struct hlsl_block *block) +{ + struct copy_propagation_state state; + bool progress; + + rb_init(&state.variables, copy_propagation_variable_compare); + + progress = copy_propagation_recursive(ctx, block, &state); + + rb_destroy(&state.variables, copy_propagation_variable_destroy, NULL); + + return progress; +} + static bool is_vec1(const struct hlsl_type *type) { return (type->type == HLSL_CLASS_SCALAR) || (type->type == HLSL_CLASS_VECTOR && type->dimx == 1); @@ -1354,7 +1601,12 @@ 
int hlsl_emit_dxbc(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_fun progress |= transform_ir(ctx, split_struct_copies, body, NULL); } while (progress); - while (transform_ir(ctx, fold_constants, body, NULL)); + do + { + progress = transform_ir(ctx, fold_constants, body, NULL); + progress |= copy_propagation_pass(ctx, body); + } + while (progress);
if (ctx->profile->major_version < 4) transform_ir(ctx, lower_division, body, NULL);
On 11/9/21 3:44 AM, Giovanni Mascellani wrote:
Signed-off-by: Giovanni Mascellani gmascellani@codeweavers.com
Pretty sure this (and the following ones) will require some more tweaking before being accepted! :-)
libs/vkd3d-shader/hlsl_codegen.c | 254 ++++++++++++++++++++++++++++++- 1 file changed, 253 insertions(+), 1 deletion(-)
Overall this patch looks a lot better than I was afraid of. It's a lot of code that's intimidating to review, but once you ignore the rbtree boilerplate it's simple enough and seems about in line with what I expect. There's quite a few things that I think can be simplified, but the basic structure seems sound, so nice work.
The CF part is probably going to be a lot trickier, but fortunately I think that this patch alone is the one that really matters. Frankly, if we were to take this patch, and then a second patch that does copy-prop on interior CF blocks with a fresh copy_propagation_state(), we might even cover enough that it's not even worrying about CF...
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 0ee8ab55..8db8bfc9 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -237,6 +237,253 @@ static void replace_node(struct hlsl_ir_node *old, struct hlsl_ir_node *new) hlsl_free_instr(old); }
+/* struct copy_propagation_state represents the accumulated knowledge
- of the copy propagation pass while it scans through the code. Field
- "variables" is a tree whose elements have type struct
- copy_propagation_variable, and represent each of the variables the
- pass has already encountered (except those with special semantics,
- which are ignored). For each variable, the array "values" (whose
- length is the register size of the variable) represents which node
- and which index inside that node (i.e., which of the at most four
- entries of a vector) provided that value last time. Field "node"
- can be NULL, meaning that the pass was not able to statically
- determine the node.
- */
This comment feels a bit too low-level? I dunno, comments like this are hard to review, but my inclination is to describe what you're doing at a high level, along the lines of "we track the last known value of each component of each variable", and let the code itself explain the details.
+struct copy_propagation_value +{
- struct hlsl_ir_node *node;
- unsigned int index;
I think the term we want is "component", not "index".
+};
+struct copy_propagation_variable +{
- struct rb_entry entry;
- struct hlsl_ir_var *var;
- struct copy_propagation_value *values;
+};
+struct copy_propagation_state +{
- struct rb_tree variables;
+};
+static int copy_propagation_variable_compare(const void *key, const struct rb_entry *entry) +{
- struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
- uintptr_t key_int = (uintptr_t)key, entry_int = (uintptr_t)variable->var;
- if (key_int < entry_int)
return -1;
- else if (key_int > entry_int)
return 1;
- else
return 0;
Could we just modify the rbtree implementation to use uintptr_t instead, and then do a direct subtraction?
+}
+static void copy_propagation_variable_destroy(struct rb_entry *entry, void *context) +{
- struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
- vkd3d_free(variable);
+}
+static struct copy_propagation_variable *copy_propagation_get_variable(struct hlsl_ctx *ctx,
struct copy_propagation_state *state, struct hlsl_ir_var *var)
+{
- struct rb_entry *entry = rb_get(&state->variables, var);
- struct copy_propagation_variable *variable;
- int res;
- if (entry)
return RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
- variable = hlsl_alloc(ctx, sizeof(*variable));
- if (!variable)
return NULL;
- variable->var = var;
- variable->values = hlsl_alloc(ctx, sizeof(*variable->values) * var->data_type->reg_size);
- if (!variable->values)
- {
vkd3d_free(variable);
return NULL;
- }
- res = rb_put(&state->variables, var, &variable->entry);
- assert(!res);
Although this is a bit awkward because most of the function is only relevant for stores, not loads, and I think it would actually be better to reflect that in the code, one way or another (so we can do less effort).
- return variable;
+}
+static void copy_propagation_set_value(struct copy_propagation_variable *variable, unsigned int offset,
unsigned char writemask, struct hlsl_ir_node *node)
+{
- unsigned int index;
- for (index = 0; index < 4; ++index)
- {
if (writemask & (1u << index))
{
if (TRACE_ON())
{
char buf[32];
if (!node)
sprintf(buf, "(nil)");
Can that happen?
else if (node->index)
sprintf(buf, "@%u", node->index);
else
sprintf(buf, "%p", node);
Can that happen?
TRACE("variable %s[%d] is written by %p[%d]\n", variable->var->name, offset + index, buf, index);
This trace doesn't really match the usual format, but more saliently, the hardcoded buffer is ugly. Assuming the other two cases really can happen, I'd rather see individual traces spelled out.
Same thing below.
}
variable->values[offset + index].node = node;
variable->values[offset + index].index = index;
}
- }
+}
+/* Check if locations [offset, offset+count) in variable were all
- written from the same node. If so return the node and the corresponding
- indices, otherwise return NULL (and undefined indices). */
+static struct hlsl_ir_node *copy_propagation_reconstruct_node(struct copy_propagation_variable *variable,
unsigned int offset, unsigned int count, unsigned int indices[4])
"indices" is really just a swizzle; can we return it as such?
+{
- struct hlsl_ir_node *node = NULL;
- unsigned int i;
- assert(offset + count <= variable->var->data_type->reg_size);
- for (i = 0; i < count; ++i)
- {
if (!node)
node = variable->values[offset + i].node;
else if (node != variable->values[offset + i].node)
return NULL;
indices[i] = variable->values[offset + i].index;
- }
- return node;
+}
This is really obviously the right thing to do, yet it still confused me a *lot* when trying to review it.
That might just be me, but if not, I guess the function name and comment could use work. Explaining *why* the function is necessary would probably help.
+static bool copy_propagation_load(struct hlsl_ctx *ctx, struct hlsl_ir_load *load,
Can you please try to name functions using a verb? There's three more examples below.
struct copy_propagation_state *state)
+{
- struct hlsl_ir_node *node = &load->node, *new_node;
- struct copy_propagation_variable *variable;
- struct hlsl_type *type = node->data_type;
- unsigned int offset, indices[4] = {};
{} isn't portable, unfortunately.
- struct hlsl_deref *src = &load->src;
- struct hlsl_ir_var *var = src->var;
- struct hlsl_ir_swizzle *swizzle;
- DWORD s;
- if (var->is_input_semantic || var->is_output_semantic || var->is_uniform)
return false;
For input semantics and uniforms: yeah, but do we need to? The important question is "can we reconstruct a store for this variable", and this should already be false.
For output semantics, we should never get here in the first place.
- if (type->type != HLSL_CLASS_SCALAR && type->type != HLSL_CLASS_VECTOR)
return false;
- offset = hlsl_offset_from_deref(src);
The problem with hlsl_offset_from_deref(), and the reason I probably should have fought against 62b25bc52b in its current form, is that it not only doesn't deal with non-constant offsets, but doesn't really give the caller a way to bail either. We should probably be returning bool from it, and then aborting here.
- variable = copy_propagation_get_variable(ctx, state, var);
- if (!variable)
return false;
- new_node = copy_propagation_reconstruct_node(variable, offset, type->dimx, indices);
- if (TRACE_ON())
- {
char buf[32];
if (!new_node)
sprintf(buf, "(nil)");
Is this useful to trace?
else if (new_node->index)
sprintf(buf, "@%u", new_node->index);
else
sprintf(buf, "%p", new_node);
Can this happen?
TRACE("load from %s[%d-%d] reconstructed to %s[%d %d %d %d]\n", var->name, offset,
offset + type->dimx, buf, indices[0], indices[1], indices[2], indices[3]);
- }
- if (!new_node)
return false;
- s = indices[0] | indices[1] << 2 | indices[2] << 4 | indices[3] << 6;
- if (!(swizzle = hlsl_new_swizzle(ctx, s, type->dimx, new_node, &node->loc)))
return false;
- list_add_before(&node->entry, &swizzle->node.entry);
- replace_node(node, &swizzle->node);
- return true;
+}
+static bool copy_propagation_store(struct hlsl_ctx *ctx, struct hlsl_ir_store *store,
struct copy_propagation_state *state)
+{
- struct copy_propagation_variable *variable;
- struct hlsl_deref *lhs = &store->lhs;
- struct hlsl_ir_var *var = lhs->var;
- if (var->is_input_semantic || var->is_output_semantic || var->is_uniform)
return false;
Same deal as in copy_propagation_load(), but inverted.
- variable = copy_propagation_get_variable(ctx, state, var);
- if (!variable)
return false;
- copy_propagation_set_value(variable, hlsl_offset_from_deref(lhs), store->writemask, store->rhs.node);
- return false;
+}
Shouldn't this function just return void?
+static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block *block,
struct copy_propagation_state *state)
+{
- struct hlsl_ir_node *instr, *next;
- bool progress = false;
- LIST_FOR_EACH_ENTRY_SAFE(instr, next, &block->instrs, struct hlsl_ir_node, entry)
- {
switch (instr->type)
{
case HLSL_IR_LOAD:
progress |= copy_propagation_load(ctx, hlsl_ir_load(instr), state);
break;
case HLSL_IR_STORE:
progress |= copy_propagation_store(ctx, hlsl_ir_store(instr), state);
break;
case HLSL_IR_IF:
FIXME("Copy propagation doesn't support conditionals yet, leaving.\n");
return progress;
case HLSL_IR_LOOP:
FIXME("Copy propagation doesn't support loops yet, leaving.\n");
return progress;
default:
break;
}
- }
- return progress;
+}
+static bool copy_propagation_pass(struct hlsl_ctx *ctx, struct hlsl_block *block) +{
- struct copy_propagation_state state;
- bool progress;
- rb_init(&state.variables, copy_propagation_variable_compare);
- progress = copy_propagation_recursive(ctx, block, &state);
- rb_destroy(&state.variables, copy_propagation_variable_destroy, NULL);
- return progress;
+}
- static bool is_vec1(const struct hlsl_type *type) { return (type->type == HLSL_CLASS_SCALAR) || (type->type == HLSL_CLASS_VECTOR && type->dimx == 1);
@@ -1354,7 +1601,12 @@ int hlsl_emit_dxbc(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_fun progress |= transform_ir(ctx, split_struct_copies, body, NULL); } while (progress);
- while (transform_ir(ctx, fold_constants, body, NULL));
- do
- {
progress = transform_ir(ctx, fold_constants, body, NULL);
progress |= copy_propagation_pass(ctx, body);
This probably isn't worth examining, but I'm curious why constant folding is useful after copy-prop.
}
while (progress);
if (ctx->profile->major_version < 4) transform_ir(ctx, lower_division, body, NULL);
Hi,
On 09/11/21 23:21, Zebediah Figura wrote:
Overall this patch looks a lot better than I was afraid of. It's a lot of code that's intimidating to review, but once you ignore the rbtree boilerplate it's simple enough and seems about in line with what I expect. There's quite a few things that I think can be simplified, but the basic structure seems sound, so nice work.
Great, that's good! Thanks for your review, I think most if not all of the points you wrote should be easy to fix.
The CF part is probably going to be a lot trickier, but fortunately I think that this patch alone is the one that really matters. Frankly, if we were to take this patch, and then a second patch that does copy-prop on interior CF blocks with a fresh copy_propagation_state(), we might even cover enough that it's not even worrying about CF...
Mmh, note that you have to handle control flow in some way or another, you cannot just ignore it. This patch doesn't know about control flow, but the only way it can do it is by bailing out at the first control flow instruction in the program. It's not enough to ignore the control flow block and resume processing after it, because the inner block might invalidate some of your knowledge about the program state (in 3/5 patch this is done by copy_propagation_invalidate_from_block).
+/* struct copy_propagation_state represents the accumulated knowledge
- of the copy propagation pass while it scans through the code. Field
- "variables" is a tree whose elements have type struct
- copy_propagation_variable, and represent each of the variables the
- pass has already encountered (except those with special semantics,
- which are ignored). For each variable, the array "values" (whose
- length is the register size of the variable) represents which node
- and which index inside that node (i.e., which of the at most four
- entries of a vector) provided that value last time. Field "node"
- can be NULL, meaning that the pass was not able to statically
- determine the node.
- */
This comment feels a bit too low-level? I dunno, comments like this are hard to review, but my inclination is to describe what you're doing at a high level, along the lines of "we track the last known value of each component of each variable", and let the code itself explain the details.
Personally I would have loved to meet comments like mine in other areas of Wine: knowing what data read or written by an algorithm are meant to represent makes it much easier to understand why the algorithm does what it does. I agree that this is not the most complicated case around, but I thought this would be helpful for someone not already knowing my code. OTOH I understand that also a high-level description is useful: would you consider it satisfying to have both?
+static int copy_propagation_variable_compare(const void *key, const struct rb_entry *entry) +{ +Â Â Â struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry); +Â Â Â uintptr_t key_int = (uintptr_t)key, entry_int = (uintptr_t)variable->var;
+Â Â Â if (key_int < entry_int) +Â Â Â Â Â Â Â return -1; +Â Â Â else if (key_int > entry_int) +Â Â Â Â Â Â Â return 1; +Â Â Â else +Â Â Â Â Â Â Â return 0;
Could we just modify the rbtree implementation to use uintptr_t instead, and then do a direct subtraction?
I am not sure of what you mean: the pointer in struct copy_propagation_variable is used as a pointer, it would not be very logical to store it as an integer.
Also, do you get the right thing if you subtract two unsigned integers? For one thing the result is unsigned, but even if you cast it to signed you don't get an order, do you?
Even if you could, you would get a shorter code, but I am not sure it would be a more legible one.
+static struct copy_propagation_variable *copy_propagation_get_variable(struct hlsl_ctx *ctx, +Â Â Â Â Â Â Â struct copy_propagation_state *state, struct hlsl_ir_var *var) +{ +Â Â Â struct rb_entry *entry = rb_get(&state->variables, var); +Â Â Â struct copy_propagation_variable *variable; +Â Â Â int res;
+Â Â Â if (entry) +Â Â Â Â Â Â Â return RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
+Â Â Â variable = hlsl_alloc(ctx, sizeof(*variable)); +Â Â Â if (!variable) +Â Â Â Â Â Â Â return NULL;
+Â Â Â variable->var = var; +Â Â Â variable->values = hlsl_alloc(ctx, sizeof(*variable->values) * var->data_type->reg_size); +Â Â Â if (!variable->values) +Â Â Â { +Â Â Â Â Â Â Â vkd3d_free(variable); +Â Â Â Â Â Â Â return NULL; +Â Â Â }
+Â Â Â res = rb_put(&state->variables, var, &variable->entry); +Â Â Â assert(!res);
Although this is a bit awkward because most of the function is only relevant for stores, not loads, and I think it would actually be better to reflect that in the code, one way or another (so we can do less effort).
Do less effort at what? It's true that when loading I could return NULL when the variable has never been seen before (meaning that we don't know anything about that variable, so the load cannot be removed) instead of creating an "empty" variable placeholder and return that one, but I don't really see an advantage doing that. It would seem more effort rather than less to me, because there are more possible branches to check.
(BTW, that would also mean that the variable is read before being written, which is something that shouldn't normally happen, but I don't think it's sensible to count on that)
+Â Â Â Â Â Â Â Â Â Â Â if (TRACE_ON()) +Â Â Â Â Â Â Â Â Â Â Â { +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â char buf[32]; +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â if (!node) +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â sprintf(buf, "(nil)");
Can that happen?
Not yet, but it will as soon as loops and conditionals enter the scene, because they can cause the last store to a variable to become unknown (at compilation time).
Now, I know that usually we don't like to have dead code around. I think this case could be accepted, since this code is just a line long and becomes alive at 3/5. Also, I find it easier to review this piece of code at once, instead of first reviewing a part, checking that the missing case will never happen and then review the missing case and checking that it fits well in the already-reviewed part.
But if you really don't like it, I can defer the NULL case to 3/5.
+Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â else if (node->index) +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â sprintf(buf, "@%u", node->index); +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â else +Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â sprintf(buf, "%p", node);
Can that happen?
Definitely. Arguably, it might be the common case, given that the earlier optimization passes (including this pass itself, given that it might be executed more than once) might synthesize a lot of nodes, which won't have an index until compute_liveness is executed.
+Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â TRACE("variable %s[%d] is written by %p[%d]\n", variable->var->name, offset + index, buf, index);
This trace doesn't really match the usual format, but more saliently, the hardcoded buffer is ugly. Assuming the other two cases really can happen, I'd rather see individual traces spelled out.
Same thing below.
Ok. As usual, I find that uglier, but I can live with that.
BTW, having the traces is not a condicio sine qua non for me. They were useful for me to develop the patch, and I left them because I figured that they might also be useful in the future, but we can happily get rid of them.
+Â Â Â struct hlsl_deref *src = &load->src; +Â Â Â struct hlsl_ir_var *var = src->var; +Â Â Â struct hlsl_ir_swizzle *swizzle; +Â Â Â DWORD s;
+Â Â Â if (var->is_input_semantic || var->is_output_semantic || var->is_uniform) +Â Â Â Â Â Â Â return false;
For input semantics and uniforms: yeah, but do we need to? The important question is "can we reconstruct a store for this variable", and this should already be false.
For output semantics, we should never get here in the first place.
I think you're right. I had put this check initially because I was concentrated on other aspects of the algorithm, but it's true that it is overly cautious.
+Â Â Â if (type->type != HLSL_CLASS_SCALAR && type->type != HLSL_CLASS_VECTOR) +Â Â Â Â Â Â Â return false;
+Â Â Â offset = hlsl_offset_from_deref(src);
The problem with hlsl_offset_from_deref(), and the reason I probably should have fought against 62b25bc52b in its current form, is that it not only doesn't deal with non-constant offsets, but doesn't really give the caller a way to bail either. We should probably be returning bool from it, and then aborting here.
I agree. Actually, the first version of my patch set had a variant of hlsl_offset_from_deref that acknowledged the possibility that it could not be possible to statically determine the offset of a deref. This condition is difficult to handle at code generation time, but it is not particularly problematic here: at load time just ignore the load, and at store time invalidate the whole variable.
I can totally fix hlsl_offset_from_deref so that this happens.
(speaking of constant expressions in the code, I am a bit bothered that evaluate_array_dimension is not able to figure out the value of constant expressions that are not immediate constants, and unfortunately fixing that is not as easy as running a fold_constants pass; I don't have in mind a clean solution for that that doesn't require re-implementing constant folding there; that's another matter, of course)
+Â Â Â variable = copy_propagation_get_variable(ctx, state, var); +Â Â Â if (!variable) +Â Â Â Â Â Â Â return false;
+Â Â Â new_node = copy_propagation_reconstruct_node(variable, offset, type->dimx, indices);
+Â Â Â if (TRACE_ON()) +Â Â Â { +Â Â Â Â Â Â Â char buf[32]; +Â Â Â Â Â Â Â if (!new_node) +Â Â Â Â Â Â Â Â Â Â Â sprintf(buf, "(nil)");
Is this useful to trace?
As before, it is a matter of tastes, I believe. I can drop it, just as I can drop the traces above.
+Â Â Â Â Â Â Â else if (new_node->index) +Â Â Â Â Â Â Â Â Â Â Â sprintf(buf, "@%u", new_node->index); +Â Â Â Â Â Â Â else +Â Â Â Â Â Â Â Â Â Â Â sprintf(buf, "%p", new_node);
Can this happen?
Yes, just as above.
+Â Â Â variable = copy_propagation_get_variable(ctx, state, var); +Â Â Â if (!variable) +Â Â Â Â Â Â Â return false;
+Â Â Â copy_propagation_set_value(variable, hlsl_offset_from_deref(lhs), store->writemask, store->rhs.node);
+Â Â Â return false; +}
Shouldn't this function just return void?
It's a matter of philosophy. If you ask me, the switch in copy_propagation_recursive should be rather oblivious of what the various helpers actually do, it shouldn't have an insight that copy_propagation_store never returns true. For its point of view, any helper can potentially modify the code. Though I can change to void if you prefer.
+static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block *block, +Â Â Â Â Â Â Â struct copy_propagation_state *state)
So, no comment for a function called "recursive" which is not (yet) actually recursive? :-P
@@ -1354,7 +1601,12 @@ int hlsl_emit_dxbc(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_fun          progress |= transform_ir(ctx, split_struct_copies, body, NULL);      }      while (progress); -   while (transform_ir(ctx, fold_constants, body, NULL)); +   do +   { +       progress = transform_ir(ctx, fold_constants, body, NULL); +       progress |= copy_propagation_pass(ctx, body);
This probably isn't worth examining, but I'm curious why constant folding is useful after copy-prop.
Copy propagation might enable some additional steps of constant folding, if a constant is first stored, then loaded and then an operation is carried on it. OTOH, constant folding can enable some additional steps of copy propagation if it makes an offset statically visible.
Thanks, Giovanni.
Hi,
On 10/11/21 10:57, Giovanni Mascellani wrote:
The CF part is probably going to be a lot trickier, but fortunately I think that this patch alone is the one that really matters. Frankly, if we were to take this patch, and then a second patch that does copy-prop on interior CF blocks with a fresh copy_propagation_state(), we might even cover enough that it's not even worrying about CF...
Mmh, note that you have to handle control flow in some way or another, you cannot just ignore it. This patch doesn't know about control flow, but the only way it can do it is by bailing out at the first control flow instruction in the program. It's not enough to ignore the control flow block and resume processing after it, because the inner block might invalidate some of your knowledge about the program state (in 3/5 patch this is done by copy_propagation_invalidate_from_block).
I think I misread your comment, even though my point is still valid: in your model, you have to refresh copy_propagation_state not only when you enter a block, but also when you leave it (actually, you might avoid refreshing when entering a conditional, though not when entering a loop). I think this means dropping quite a lot of useful information, and given that I don't find it particularly hard to avoid dropping it, it seems quite a waste.
Notice, in particular, that if you count on this copy propagation pass to fix texture loading, then you need to ensure, at a minimum, that all loads from a variable that is stored once at the beginning of the program (before any control flow) are correctly recognized and simplified. This does not happen with your proposal, I believe, while it happens with my 3/5 (which currently has a bug which makes it unsound, but I will fix it in my next submission).
Well, technically at this point my proposal wouldn't work either, because it ignores object variables, but that can be addressed.
Giovanni.
On 11/10/21 03:57, Giovanni Mascellani wrote:
Hi,
On 09/11/21 23:21, Zebediah Figura wrote:
Overall this patch looks a lot better than I was afraid of. It's a lot of code that's intimidating to review, but once you ignore the rbtree boilerplate it's simple enough and seems about in line with what I expect. There's quite a few things that I think can be simplified, but the basic structure seems sound, so nice work.
Great, that's good! Thanks for your review, I think most if not all of the points you wrote should be easy to fix.
The CF part is probably going to be a lot trickier, but fortunately I think that this patch alone is the one that really matters. Frankly, if we were to take this patch, and then a second patch that does copy-prop on interior CF blocks with a fresh copy_propagation_state(), we might even cover enough that it's not even worrying about CF...
Mmh, note that you have to handle control flow in some way or another, you cannot just ignore it. This patch doesn't know about control flow, but the only way it can do it is by bailing out at the first control flow instruction in the program. It's not enough to ignore the control flow block and resume processing after it, because the inner block might invalidate some of your knowledge about the program state (in 3/5 patch this is done by copy_propagation_invalidate_from_block).
Right. I should have said to invalidate the whole copyprop state both when entering a CF block and when leaving one. The point is that we can do copyprop within every BB, not just the first one, without having to worry about things like partial invalidation.
+/* struct copy_propagation_state represents the accumulated knowledge
- of the copy propagation pass while it scans through the code. Field
- "variables" is a tree whose elements have type struct
- copy_propagation_varible, and represent each of the variables the
- pass has already encountered (except those with special semantics,
- which are ignored). For each variable, the array "values" (whose
- length is the register size of the variable) represent which node
- and which index inside that node (i.e., which of the at most four
- entries of a vector) provided that value last time. Field "node"
- can be NULL, meaning that the pass was not able to statically
- determine the node.
- */
This comment feels a bit too low-level? I dunno, comments like this are hard to review, but my inclination is to describe what you're doing at a high level, along the lines of "we track the last known value of each component of each variable", and let the code itself explain the details.
Personally I would have loved to meet comments like mine in other areas of Wine: knowing what data read or written by an algorithm are meant to represent makes it much easier to understand why the algorithm does what it does. I agree that this is not the most complicated case around, but I thought this would be helpful for someone not already knowing my code. OTOH I understand that also a high-level description is useful: would you consider it satisfying to have both?
Eh, maybe. I would at least start with the high-level explanation. When I read the above comment my eyes kind of glaze over, and it doesn't tell me anything that the code already doesn't (even the struct definitions alone tell me as much without checking how they're used.)
I dunno, I'll see what you end up coming up with.
BTW, I notice a spelling error ("copy_propagation_varible").
+static int copy_propagation_variable_compare(const void *key, const struct rb_entry *entry)
+{
+    struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
+    uintptr_t key_int = (uintptr_t)key, entry_int = (uintptr_t)variable->var;
+    if (key_int < entry_int)
+        return -1;
+    else if (key_int > entry_int)
+        return 1;
+    else
+        return 0;
Could we just modify the rbtree implementation to use uintptr_t instead, and then do a direct subtraction?
I am not sure of what you mean: the pointer in struct copy_propagation_variable is used as a pointer, it would not be very logical to store it as an integer.
Also, do you get the right thing if you subtract two unsigned integers? For one thing the result is unsigned, but even if you cast it to signed you don't get an order, do you?
Even if you could, you would get a shorter code, but I am not sure it would be a more legible one.
Er, sorry, I should have said intptr_t. I mean to return intptr_t from the rbtree compare callback, and then all you need is
return (intptr_t)key - (intptr_t) variable->var;
+static struct copy_propagation_variable *copy_propagation_get_variable(struct hlsl_ctx *ctx,
+        struct copy_propagation_state *state, struct hlsl_ir_var *var)
+{
+    struct rb_entry *entry = rb_get(&state->variables, var);
+    struct copy_propagation_variable *variable;
+    int res;
+    if (entry)
+        return RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
+    variable = hlsl_alloc(ctx, sizeof(*variable));
+    if (!variable)
+        return NULL;
+    variable->var = var;
+    variable->values = hlsl_alloc(ctx, sizeof(*variable->values) * var->data_type->reg_size);
+    if (!variable->values)
+    {
+        vkd3d_free(variable);
+        return NULL;
+    }
+    res = rb_put(&state->variables, var, &variable->entry);
+    assert(!res);
Although this is a bit awkward because most of the function is only relevant for stores, not loads, and I think it would actually be better to reflect that in the code, one way or another (so we can do less effort).
Do less effort at what? It's true that when loading I could return NULL when the variable has never been seen before (meaning that we don't know anything about that variable, so the load cannot be removed) instead of creating an "empty" variable placeholder and return that one, but I don't really see an advantage doing that. It would seem more effort rather than less to me, because there are more possible branches to check.
Less effort done by the code, I mean, i.e. we use less memory that way and spend less time searching it. It requires a bit more code, but I think that if you slightly reorganize the callbacks it's still perfectly idiomatic.
(BTW, that would also mean that the variable is read before being written, which is something that shouldn't normally happen, but I don't think it's sensible to count on that)
I don't understand what you mean by this; can you please clarify?
+            if (TRACE_ON())
+            {
+                char buf[32];
+                if (!node)
+                    sprintf(buf, "(nil)");
Can that happen?
Not yet, but it will as soon as loops and conditionals enter the scene, because they can cause the last store to a variable to become unknown (at compilation time).
Ah, of course, that makes sense.
Now, I know that usually we don't like to have dead code around. I think this case could be accepted, since this code is just a line long and becomes alive at 3/5. Also, I find it easier to review this piece of code at once, instead of first reviewing a part, checking that the missing case will never happen and then review the missing case and checking that it fits well in the already-reviewed part.
But if you really don't like it, I can defer the NULL case to 3/5.
Yeah, even here I find it easier to review if this code is moved to 3/5.
+                else if (node->index)
+                    sprintf(buf, "@%u", node->index);
+                else
+                    sprintf(buf, "%p", node);
Can that happen?
Definitely. Arguably, it might be the common case, given that the earlier optimization passes (including this pass itself, given that it might be executed more than once) might synthesize a lot of nodes, which won't have an index until compute_liveness is executed.
Ah. I assumed that you would actually be indexing right before copyprop, and didn't check whether that was actually the case.
Although on reflection now I'm not sure if we *should* index. We only dump the IR once, at the end, and I feel like we risk confusion by printing instruction indices that won't match. Hmm, maybe the code makes sense as it is...
+                TRACE("variable %s[%d] is written by %p[%d]\n", variable->var->name, offset + index, buf, index);
This trace doesn't really match the usual format, but more saliently, the hardcoded buffer is ugly. Assuming the other two cases really can happen, I'd rather see individual traces spelled out.
Same thing below.
Ok. As usual, I find that uglier, but I can live with that.
BTW, having the traces is not a condicio sine qua non for me. They were useful for me to develop the patch, and I left them because I figured that they might also be useful in the future, but we can happily get rid of them.
I'm typically in favor of leaving in traces if they were helpful during debugging.
For what it's worth, I'd also be in favor of a "debugstr_node" helper or something. Maybe even if it uses a fixed-size buffer, although I'm not sure that Matteo or Henri will agree with that ;-)
+    struct hlsl_deref *src = &load->src;
+    struct hlsl_ir_var *var = src->var;
+    struct hlsl_ir_swizzle *swizzle;
+    DWORD s;
+    if (var->is_input_semantic || var->is_output_semantic || var->is_uniform)
+        return false;
For input semantics and uniforms: yeah, but do we need to? The important question is "can we reconstruct a store for this variable", and this should already be false.
For output semantics, we should never get here in the first place.
I think you're right. I had put this check initially because I was concentrated on other aspects of the algorithm, but it's true that it is overly cautious.
+    if (type->type != HLSL_CLASS_SCALAR && type->type != HLSL_CLASS_VECTOR)
+        return false;
+    offset = hlsl_offset_from_deref(src);
The problem with hlsl_offset_from_deref(), and the reason I probably should have fought against 62b25bc52b in its current form, is that it not only doesn't deal with non-constant offsets, but doesn't really give the caller a way to bail either. We should probably be returning bool from it, and then aborting here.
I agree. Actually, the first version of my patch set had a variant of hlsl_offset_from_deref that acknowledged the possibility that it could not be possible to statically determine the offset of a deref. This condition is difficult to handle at code generation time, but it is not particularly problematic here: at load time just ignore the load, and at store time invalidate the whole variable.
I can totally fix hlsl_offset_from_deref so that this happens.
(speaking of constant expressions in the code, I am a bit bothered that evaluate_array_dimension is not able to figure out the value of constant expressions that are not immediate constants, and unfortunately fixing that is not as easy as running a fold_constants pass; I don't have in mind a clean solution for that that doesn't require re-implementing constant folding there; that's another matter, of course)
Yeah, that one is worrying...
I think we might have to reimplement constant folding. Probably we can at least use a common helper, though.
+    variable = copy_propagation_get_variable(ctx, state, var);
+    if (!variable)
+        return false;
+    new_node = copy_propagation_reconstruct_node(variable, offset, type->dimx, indices);
+    if (TRACE_ON())
+    {
+        char buf[32];
+        if (!new_node)
+            sprintf(buf, "(nil)");
Is this useful to trace?
As before, it is a matter of tastes, I believe. I can drop it, just as I can drop the traces above.
If it helps, feel free to leave it in. FWIW, I'm mostly talking about the "null" case, not the others.
+        else if (new_node->index)
+            sprintf(buf, "@%u", new_node->index);
+        else
+            sprintf(buf, "%p", new_node);
Can this happen?
Yes, just as above.
+    variable = copy_propagation_get_variable(ctx, state, var);
+    if (!variable)
+        return false;
+    copy_propagation_set_value(variable, hlsl_offset_from_deref(lhs), store->writemask, store->rhs.node);
+    return false;
+}
Shouldn't this function just return void?
It's a matter of philosophy. If you ask me, the switch in copy_propagation_recursive should be rather oblivious of what the various helpers actually do, it shouldn't have an insight that copy_propagation_store never returns true. For its point of view, any helper can potentially modify the code. Though I can change to void if you prefer.
It's a fair point, but I guess I look at copy_propagation_recursive() and think "recording stores should never make any progress; all the magic happens in rewriting loads". This becomes especially true if this function gets renamed to copy_prop_record_store() or something, which I think it should.
+static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block *block,
+        struct copy_propagation_state *state)
So, no comment for a function called "recursive" which is not (yet) actually recursive? :-P
I'll just go with "I requested the functions' names be changed anyway" ;-)
I don't particularly mind this being called copy_prop_recurse(), even if it doesn't recurse yet, since I think it probably should (even if it's just as simple as invalidating the whole state on entry and exit of a CF block). On the other hand renaming to copy_prop_transform_block() or something would also be reasonable.
@@ -1354,7 +1601,12 @@ int hlsl_emit_dxbc(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_fun          progress |= transform_ir(ctx, split_struct_copies, body, NULL);      }      while (progress); -   while (transform_ir(ctx, fold_constants, body, NULL)); +   do +   { +       progress = transform_ir(ctx, fold_constants, body, NULL); +       progress |= copy_propagation_pass(ctx, body);
This probably isn't worth examining, but I'm curious why constant folding is useful after copy-prop.
Copy propagation might enable some additional steps of constant folding, if a constant is first stored, then loaded and then an operation is carried on it. OTOH, constant folding can enable some additional steps of copy propagation if it makes an offset statically visible.
Ah, of course, that should have been obvious...
Hi,
On 10/11/21 17:33, Zebediah Figura (she/her) wrote:
Mmh, note that you have to handle control flow in some way or another, you cannot just ignore it. This patch doesn't know about control flow, but the only way it can do it is by bailing out at the first control flow instruction in the program. It's not enough to ignore the control flow block and resume processing after it, because the inner block might invalidate some of your knowledge about the program state (in 3/5 patch this is done by copy_propagation_invalidate_from_block).
Right. I should have said to invalidate the whole copyprop state both when entering a CF block and when leaving one. The point is that we can do copyprop within every BB, not just the first one, without having to worry about things like partial invalidation.
I guess that by now you've already read the other mail I sent on that, but the point is that it wouldn't be enough to get textures working, I think.
Personally I would have loved to meet comments like mine in other areas of Wine: knowing what data read or written by an algorithm are meant to represent makes it much easier to understand why the algorithm does what it does. I agree that this is not the most complicated case around, but I thought this would be helpful for someone not already knowing my code. OTOH I understand that also a high-level description is useful: would you consider it satisfying to have both?
Eh, maybe. I would at least start with the high-level explanation. When I read the above comment my eyes kind of glaze over, and it doesn't tell me anything that the code already doesn't (even the struct definitions alone tell me as much without checking how they're used.)
I dunno, I'll see what you end up coming up with.
Ok, I'll do my best.
Er, sorry, I should have said intptr_t. I mean to return intptr_t from the rbtree compare callback, and then all you need is
return (intptr_t)key - (intptr_t) variable->var;
Would that really work? For one thing, signed overflow is undefined behavior in C (except possibly very recent versions which we're definitely not using). It doesn't seem we're using -fwrapv, does it?
Even assuming two's complement semantics, this doesn't seem to give an order: for example, 0 > (INT_MIN + 1), because 0 - (INT_MIN + 1) = INT_MAX > 0 and INT_MAX > 0, because INT_MAX - 0 = INT_MAX > 0, but (INT_MIN + 1) - INT_MAX = 2 > 0, therefore you'd have (INT_MIN + 1) > INT_MAX. Therefore the relationship is not transitive, which I am pretty sure the red-black tree implementation won't like.
Am I missing something?
Ah. I assumed that you would actually be indexing right before copyprop, and didn't check whether that was actually the case.
Although on reflection now I'm not sure if we *should* index. We only dump the IR once, at the end, and I feel like we risk confusion by printing instruction indices that won't match. Hmm, maybe the code makes sense as it is...
You're right, I'd say at this point it's better to just ignore the index and always use the pointer, so there is no confusion with the indices in the final dump. This also means that I can put the %p directly in the TRACE without the additional buffer that you didn't like (and removing like 10 lines of code used just for tracing).
I'm typically in favor of leaving in traces if they were helpful during debugging.
For what it's worth, I'd also be in favor of a "debugstr_node" helper or something. Maybe even if it uses a fixed-size buffer, although I'm not sure that Matteo or Henri will agree with that ;-)
If you accept my solution above, this has just been swept away.
Shouldn't this function just return void?
It's a matter of philosophy. If you ask me, the switch in copy_propagation_recursive should be rather oblivious of what the various helpers actually do, it shouldn't have an insight that copy_propagation_store never returns true. For its point of view, any helper can potentially modify the code. Though I can change to void if you prefer.
It's a fair point, but I guess I look at copy_propagation_recursive() and think "recording stores should never make any progress; all the magic happens in rewriting loads". This becomes especially true if this function gets renamed to copy_prop_record_store() or something, which I think it should.
Ok, you win on this one.
Thanks again, Giovanni.
On 11/11/21 3:18 AM, Giovanni Mascellani wrote:
Hi,
On 10/11/21 17:33, Zebediah Figura (she/her) wrote:
Mmh, note that you have to handle control flow in some way or another, you cannot just ignore it. This patch doesn't know about control flow, but the only way it can do it is by bailing out at the first control flow instruction in the program. It's not enough to ignore the control flow block and resume processing after it, because the inner block might invalidate some of your knowledge about the program state (in 3/5 patch this is done by copy_propagation_invalidate_from_block).
Right. I should have said to invalidate the whole copyprop state both when entering a CF block and when leaving one. The point is that we can do copyprop within every BB, not just the first one, without having to worry about things like partial invalidation.
I guess that by now you've already read the other mail I sent on that, but the point is that it wouldn't be enough to get textures working, I think.
It should at least be enough for our unit tests, but yeah, if there's shaders that you've encountered that do texture access from within CF, I guess we'll need more sophistication.
Fortunately it's also something that's relatively easy to do one step at a time.
Personally I would have loved to meet comments like mine in other areas of Wine: knowing what data read or written by an algorithm are meant to represent makes it much easier to understand why the algorithm does what it does. I agree that this is not the most complicated case around, but I thought this would be helpful for someone not already knowing my code. OTOH I understand that also a high-level description is useful: would you consider it satisfying to have both?
Eh, maybe. I would at least start with the high-level explanation. When I read the above comment my eyes kind of glaze over, and it doesn't tell me anything that the code already doesn't (even the struct definitions alone tell me as much without checking how they're used.)
I dunno, I'll see what you end up coming up with.
Ok, I'll do my best.
Er, sorry, I should have said intptr_t. I mean to return intptr_t from the rbtree compare callback, and then all you need is
return (intptr_t)key - (intptr_t) variable->var;
Would that really work? For one thing, signed overflow is undefined behavior in C (except possibly very recent versions which we're definitely not using). It doesn't seem we're using -fwrapv, does it?
Even assuming two's complement semantics, this doesn't seem to give an order: for example, 0 > (INT_MIN + 1), because 0 - (INT_MIN + 1) = INT_MAX > 0 and INT_MAX > 0, because INT_MAX - 0 = INT_MAX > 0, but (INT_MIN + 1) - INT_MAX = 2 > 0, therefore you'd have (INT_MIN + 1) > INT_MAX. Therefore the relationship is not transitive, which I am pretty sure the red-black tree implementation won't like.
Am I missing something?
Actually I think you're right, but I didn't realize because I've seen that idiom used elsewhere. Specifically, compare_register_range() and compare_descriptor_range() in libs/vkd3d/state.c are definitely broken, and we have a few other comparison functions which are only not broken by virtue of implicit bounds.
Unless we're both missing something...
Ah. I assumed that you would actually be indexing right before copyprop, and didn't check whether that was actually the case.
Although on reflection now I'm not sure if we *should* index. We only dump the IR once, at the end, and I feel like we risk confusion by printing instruction indices that won't match. Hmm, maybe the code makes sense as it is...
You're right, I'd say at this point it's better to just ignore the index and always use the pointer, so there is no confusion with the indices in the final dump. This also means that I can put the %p directly in the TRACE without the additional buffer that you didn't like (and removing like 10 lines of code used just for tracing).
I'm typically in favor of leaving in traces if they were helpful during debugging.
For what it's worth, I'd also be in favor of a "debugstr_node" helper or something. Maybe even if it uses a fixed-size buffer, although I'm not sure that Matteo or Henri will agree with that ;-)
If you accept my solution above, this has just been swept away.
Eh, I guess. The nice thing about printing indices is that you can just add an hlsl_dump_function() call right before the relevant function, and then it's a lot easier to see what the nodes are (whereas currently we basically never print node pointers).
I'm not sure we want that call to be in by default, but if someone needs to debug copyprop it'd be very easy to add.
Shouldn't this function just return void?
It's a matter of philosophy. If you ask me, the switch in copy_propagation_recursive should be rather oblivious of what the various helpers actually do, it shouldn't have an insight that copy_propagation_store never returns true. For its point of view, any helper can potentially modify the code. Though I can change to void if you prefer.
It's a fair point, but I guess I look at copy_propagation_recursive() and think "recording stores should never make any progress; all the magic happens in rewriting loads". This becomes especially true if this function gets renamed to copy_prop_record_store() or something, which I think it should.
Ok, you win on this one.
Thanks again, Giovanni.
Hi,
On 11/11/21 22:24, Zebediah Figura wrote:
Actually I think you're right, but I didn't realize because I've seen that idiom used elsewhere. Specifically, compare_register_range() and compare_descriptor_range() in libs/vkd3d/state.c are definitely broken, and we have a few other comparison functions which are only not broken by virtue of implicit bounds.
The idiom is correct if it is assumed that the integers you're trying to compare are smaller than INT_MAX/2 (possibly - 1, because I didn't do the math throughout). I don't know that code, but this assumption seems plausible in that case, while it certainly is not for pointers.
Unless we're both missing something...
It seems that Matteo agrees with us. Has it ever happened that three Wine developers agreed on something and they were wrong? :-P
Giovanni.
On Thu, 11 Nov 2021 at 22:24, Zebediah Figura zfigura@codeweavers.com wrote:
On 11/11/21 3:18 AM, Giovanni Mascellani wrote:
On 10/11/21 17:33, Zebediah Figura (she/her) wrote:
Er, sorry, I should have said intptr_t. I mean to return intptr_t from the rbtree compare callback, and then all you need is
return (intptr_t)key - (intptr_t) variable->var;
Would that really work? For one thing, signed overflow is undefined behavior in C (except possibly very recent versions which we're definitely not using). It doesn't seem we're using -fwrapv, does it?
Even assuming two's complement semantics, this doesn't seem to give an order: for example, 0 > (INT_MIN + 1), because 0 - (INT_MIN + 1) = INT_MAX > 0 and INT_MAX > 0, because INT_MAX - 0 = INT_MAX > 0, but (INT_MIN + 1) - INT_MAX = 2 > 0, therefore you'd have (INT_MIN + 1) > INT_MAX. Therefore the relationship is not transitive, which I am pretty sure the red-black tree implementation won't like.
Am I missing something?
Actually I think you're right, but I didn't realize because I've seen that idiom used elsewhere. Specifically, compare_register_range() and compare_descriptor_range() in libs/vkd3d/state.c are definitely broken, and we have a few other comparison functions which are only not broken by virtue of implicit bounds.
Unless we're both missing something...
I think all three of you are right. :) I think I somehow managed to convince myself that it's safe, while clearly it isn't in all cases. Unfortunately, between vkd3d and wined3d I've managed to introduce quite a few instances of this. My suggested replacement would be to introduce a helper like this:
static int vkd3d_compare_uint32(uint32_t x, uint32_t y) { return (x > y) - (x < y); }
and then use that helper in the places where we currently have the subtraction.
On Wed, Nov 10, 2021 at 5:33 PM Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 11/10/21 03:57, Giovanni Mascellani wrote:
Hi,
On 09/11/21 23:21, Zebediah Figura wrote:
Overall this patch looks a lot better than I was afraid of. It's a lot of code that's intimidating to review, but once you ignore the rbtree boilerplate it's simple enough and seems about in line with what I expect. There's quite a few things that I think can be simplified, but the basic structure seems sound, so nice work.
Great, that's good! Thanks for your review, I think most if not all of the points you wrote should be easy to fix.
The CF part is probably going to be a lot trickier, but fortunately I think that this patch alone is the one that really matters. Frankly, if we were to take this patch, and then a second patch that does copy-prop on interior CF blocks with a fresh copy_propagation_state(), we might even cover enough that it's not even worrying about CF...
Mmh, note that you have to handle control flow in some way or another, you cannot just ignore it. This patch doesn't know about control flow, but the only way it can do it is by bailing out at the first control flow instruction in the program. It's not enough to ignore the control flow block and resume processing after it, because the inner block might invalidate some of your knowledge about the program state (in 3/5 patch this is done by copy_propagation_invalidate_from_block).
Right. I should have said to invalidate the whole copyprop state both when entering a CF block and when leaving one. The point is that we can do copyprop within every BB, not just the first one, without having to worry about things like partial invalidation.
Yeah, that would probably be good enough for our purposes for a long time.
+/* struct copy_propagation_state represents the accumulated knowledge
- of the copy propagation pass while it scans through the code. Field
- "variables" is a tree whose elements have type struct
- copy_propagation_varible, and represent each of the variables the
- pass has already encountered (except those with special semantics,
- which are ignored). For each variable, the array "values" (whose
- length is the register size of the variable) represent which node
- and which index inside that node (i.e., which of the at most four
- entries of a vector) provided that value last time. Field "node"
- can be NULL, meaning that the pass was not able to statically
- determine the node.
- */
This comment feels a bit too low-level? I dunno, comments like this are hard to review, but my inclination is to describe what you're doing at a high level, along the lines of "we track the last known value of each component of each variable", and let the code itself explain the details.
Personally I would have loved to meet comments like mine in other areas of Wine: knowing what data read or written by an algorithm are meant to represent makes it much easier to understand why the algorithm does what it does. I agree that this is not the most complicated case around, but I thought this would be helpful for someone not already knowing my code. OTOH I understand that also a high-level description is useful: would you consider it satisfying to have both?
Eh, maybe. I would at least start with the high-level explanation. When I read the above comment my eyes kind of glaze over, and it doesn't tell me anything that the code already doesn't (even the struct definitions alone tell me as much without checking how they're used.)
Same, really.
if (TRACE_ON())
{
char buf[32];
if (!node)
sprintf(buf, "(nil)");
Can that happen?
Not yet, but it will as soon as loops and conditionals enter the scene, because they can cause the last store to a variable to become unknown (at compilation time).
Ah, of course, that makes sense.
Now, I know that usually we don't like to have dead code around. I think this case could be accepted, since this code is just a line long and becomes alive at 3/5. Also, I find it easier to review this piece of code at once, instead of first reviewing a part, checking that the missing case will never happen and then review the missing case and checking that it fits well in the already-reviewed part.
But if you really don't like it, I can defer the NULL case to 3/5.
Yeah, even here I find it easier to review if this code is moved to 3/5.
Agreed, if it's not a big hassle I'd prefer the same.
else if (node->index)
sprintf(buf, "@%u", node->index);
else
sprintf(buf, "%p", node);
Can that happen?
Definitely. Arguably, it might be the common case, given that the earlier optimization passes (including this pass itself, given that it might be executed more than once) might synthesize a lot of nodes, which won't have an index until compute_liveness is executed.
Ah. I assumed that you would actually be indexing right before copyprop, and didn't check whether that was actually the case.
Although on reflection now I'm not sure if we *should* index. We only dump the IR once, at the end, and I feel like we risk confusion by printing instruction indices that won't match. Hmm, maybe the code makes sense as it is...
FWIW I have a local patch dumping the initial IR and sometimes I add more for intermediate steps while reviewing. I'd be in favor of forcing reindexing in hlsl_dump_function() itself, except that would be a bit of an unexpected side effect from calling a debug function.
IIRC NIR does have some kind of "metadata lifetime management" thingie where each transformation pass flags if some kind of metadata is not valid anymore after it's done its job. The next pass will automatically get the relevant metadata regenerated if it's going to use it. Not saying that we necessarily need something like that but let's keep the idea in mind.
(speaking of constant expressions in the code, I am a bit bothered that evaluate_array_dimension is not able to figure out the value of constant expressions that are not immediate constants, and unfortunately fixing that is not as easy as running a fold_constants pass; I don't have in mind a clean solution for that which doesn't require re-implementing constant folding there; that's another matter, of course)
Yeah, that one is worrying...
I think we might have to reimplement constant folding. Probably we can at least use a common helper, though.
Is it that hard? I guess I should try to do it, on the off-chance that it isn't :D
On Tue, Nov 9, 2021 at 11:21 PM Zebediah Figura zfigura@codeweavers.com wrote:
On 11/9/21 3:44 AM, Giovanni Mascellani wrote:
Signed-off-by: Giovanni Mascellani gmascellani@codeweavers.com
Pretty sure this (and the following ones) will require some more tweaking before being accepted! :-)
libs/vkd3d-shader/hlsl_codegen.c | 254 ++++++++++++++++++++++++++++++- 1 file changed, 253 insertions(+), 1 deletion(-)
Overall this patch looks a lot better than I was afraid of. It's a lot of code that's intimidating to review, but once you ignore the rbtree boilerplate it's simple enough and seems about in line with what I expect. There's quite a few things that I think can be simplified, but the basic structure seems sound, so nice work.
The CF part is probably going to be a lot trickier, but fortunately I think that this patch alone is the one that really matters. Frankly, if we were to take this patch, and then a second patch that does copy-prop on interior CF blocks with a fresh copy_propagation_state(), we might even cover enough that it's not even worrying about CF...
Basically agree with all the above. In particular I feared that handling individual components right from the start would make things overly convoluted but it doesn't seem to be the case, so good job :)
I also generally agree with the rest of the review. A few notes below.
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 0ee8ab55..8db8bfc9 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -237,6 +237,253 @@ static void replace_node(struct hlsl_ir_node *old, struct hlsl_ir_node *new) hlsl_free_instr(old); }
+/* struct copy_propagation_state represents the accumulated knowledge
- of the copy propagation pass while it scans through the code. Field
- "variables" is a tree whose elements have type struct
- copy_propagation_varible, and represent each of the variables the
- pass has already encountered (except those with special semantics,
- which are ignored). For each variable, the array "values" (whose
- length is the register size of the variable) represent which node
- and which index inside that node (i.e., which of the at most four
- entries of a vector) provided that value last time. Field "node"
- can be NULL, meaning that the pass was not able to statically
- determine the node.
- */
This comment feels a bit too low-level? I dunno, comments like this are hard to review, but my inclination is to describe what you're doing at a high level, along the lines of "we track the last known value of each component of each variable", and let the code itself explain the details.
Yes. An ideal comment says why you're doing something, not what. The latter should be clear from the code; if you feel you need a comment to explain the nooks and crannies of some code path in detail, chances are that the code itself needs some more thought.
+struct copy_propagation_value +{
- struct hlsl_ir_node *node;
- unsigned int index;
I think the term we want is "component", not "index".
+};
+struct copy_propagation_variable +{
- struct rb_entry entry;
- struct hlsl_ir_var *var;
- struct copy_propagation_value *values;
+};
I think just hardcoding an array of 4 for values (and getting rid of struct copy_propagation_value altogether) would make things quite a bit nicer.
+struct copy_propagation_state +{
- struct rb_tree variables;
+};
+static int copy_propagation_variable_compare(const void *key, const struct rb_entry *entry) +{
- struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
- uintptr_t key_int = (uintptr_t)key, entry_int = (uintptr_t)variable->var;
- if (key_int < entry_int)
return -1;
- else if (key_int > entry_int)
return 1;
- else
return 0;
Could we just modify the rbtree implementation to use uintptr_t instead, and then do a direct subtraction?
That's dangerous, overflow / underflow can break the ordering guarantees.
+}
+static void copy_propagation_variable_destroy(struct rb_entry *entry, void *context) +{
- struct copy_propagation_variable *variable = RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
- vkd3d_free(variable);
+}
+static struct copy_propagation_variable *copy_propagation_get_variable(struct hlsl_ctx *ctx,
struct copy_propagation_state *state, struct hlsl_ir_var *var)
+{
- struct rb_entry *entry = rb_get(&state->variables, var);
- struct copy_propagation_variable *variable;
- int res;
- if (entry)
return RB_ENTRY_VALUE(entry, struct copy_propagation_variable, entry);
- variable = hlsl_alloc(ctx, sizeof(*variable));
- if (!variable)
return NULL;
- variable->var = var;
- variable->values = hlsl_alloc(ctx, sizeof(*variable->values) * var->data_type->reg_size);
- if (!variable->values)
- {
vkd3d_free(variable);
return NULL;
- }
- res = rb_put(&state->variables, var, &variable->entry);
- assert(!res);
Although this is a bit awkward because most of the function is only relevant for stores, not loads, and I think it would actually be better to reflect that in the code, one way or another (so we can do less effort).
Right. It might be just a matter of naming the function differently. This returns a struct copy_propagation_variable which "collides" in my mind with struct hlsl_ir_var. Probably it's worth renaming struct copy_propagation_variable too.
I guess what Zeb's getting at is to have a "get" function that only does the rb_get() + return existing entry and a separate function for inserting a new replacement in the tree.
- return variable;
+}
+static void copy_propagation_set_value(struct copy_propagation_variable *variable, unsigned int offset,
unsigned char writemask, struct hlsl_ir_node *node)
+{
- unsigned int index;
- for (index = 0; index < 4; ++index)
- {
if (writemask & (1u << index))
{
if (TRACE_ON())
{
char buf[32];
You generally want to use a struct vkd3d_string_buffer instead of a fixed-size string buffer. For this specific case maybe we can rework the trace along the lines of Zeb's idea.
if (!node)
sprintf(buf, "(nil)");
Can that happen?
else if (node->index)
sprintf(buf, "@%u", node->index);
else
sprintf(buf, "%p", node);
Can that happen?
TRACE("variable %s[%d] is written by %p[%d]\n", variable->var->name, offset + index, buf, index);
This trace doesn't really match the usual format, but more saliently, the hardcoded buffer is ugly. Assuming the other two cases really can happen, I'd rather see individual traces spelled out.
Same thing below.
}
variable->values[offset + index].node = node;
variable->values[offset + index].index = index;
}
- }
+}
+/* Check if locations [offset, offset+count) in variable were all
- written from the same node. If so return the node the corresponding
- indices, otherwise return NULL (and undefined indices). */
+static struct hlsl_ir_node *copy_propagation_reconstruct_node(struct copy_propagation_variable *variable,
unsigned int offset, unsigned int count, unsigned int indices[4])
"indices" is really just a swizzle; can we return it as such?
I think we can drop the comment by renaming the function to "copy_propagation_get_replacement" or something along those lines.
In general I'd rather not add new references to "node". Call it instruction, if you need to use some term for it.
+{
- struct hlsl_ir_node *node = NULL;
- unsigned int i;
- assert(offset + count <= variable->var->data_type->reg_size);
- for (i = 0; i < count; ++i)
- {
if (!node)
node = variable->values[offset + i].node;
else if (node != variable->values[offset + i].node)
return NULL;
indices[i] = variable->values[offset + i].index;
- }
- return node;
+}
This is really obviously the right thing to do, yet it still confused me a *lot* when trying to review it.
That might just be me, but if not, I guess the function name and comment could use work. Explaining *why* the function is necessary would probably help.
+static bool copy_propagation_load(struct hlsl_ctx *ctx, struct hlsl_ir_load *load,
Can you please try to name functions using a verb? There's three more examples below.
struct copy_propagation_state *state)
+{
- struct hlsl_ir_node *node = &load->node, *new_node;
- struct copy_propagation_variable *variable;
- struct hlsl_type *type = node->data_type;
- unsigned int offset, indices[4] = {};
{} isn't portable, unfortunately.
- struct hlsl_deref *src = &load->src;
- struct hlsl_ir_var *var = src->var;
- struct hlsl_ir_swizzle *swizzle;
- DWORD s;
- if (var->is_input_semantic || var->is_output_semantic || var->is_uniform)
return false;
For input semantics and uniforms: yeah, but do we need to? The important question is "can we reconstruct a store for this variable", and this should already be false.
For output semantics, we should never get here in the first place.
- if (type->type != HLSL_CLASS_SCALAR && type->type != HLSL_CLASS_VECTOR)
return false;
- offset = hlsl_offset_from_deref(src);
The problem with hlsl_offset_from_deref(), and the reason I probably should have fought against 62b25bc52b in its current form, is that it not only doesn't deal with non-constant offsets, but doesn't really give the caller a way to bail either. We should probably be returning bool from it, and then aborting here.
- variable = copy_propagation_get_variable(ctx, state, var);
- if (!variable)
return false;
- new_node = copy_propagation_reconstruct_node(variable, offset, type->dimx, indices);
- if (TRACE_ON())
- {
char buf[32];
if (!new_node)
sprintf(buf, "(nil)");
Is this useful to trace?
else if (new_node->index)
sprintf(buf, "@%u", new_node->index);
else
sprintf(buf, "%p", new_node);
Can this happen?
TRACE("load from %s[%d-%d] reconstructed to %s[%d %d %d %d]\n", var->name, offset,
offset + type->dimx, buf, indices[0], indices[1], indices[2], indices[3]);
- }
- if (!new_node)
return false;
- s = indices[0] | indices[1] << 2 | indices[2] << 4 | indices[3] << 6;
I guess we could introduce a small helper (hlsl_make_swizzle()?) for this.
Hi,
On 11/11/21 11:18, Matteo Bruni wrote:
Yes. An ideal comment says why you're doing something, not what. The latter should be clear from the code; if you feel you need a comment to explain the nooks and crannies of some code path in detail, chances are that the code itself needs some more thought.
This, as a blanket statement, seems a bit excessive to me. As I said, when reading code in places like user32 and winex11.drv I'd be very happy to have comments, even hard to read or getting in the specific details of something. Or, as I meant my comment to be initially, describing what data structures are supposed to represent.
That said, the revised patch set that I sent two seconds before receiving this email should have been improved on that side (and, hopefully, many other).
I think just hardcoding an array of 4 for values (and getting rid of struct copy_propagation_value altogether) would make things quite a bit nicer.
Notice that variables can have more than four components. Matrices can have up to 16 and arrays even more.
Thanks, Giovanni.
On Thu, Nov 11, 2021 at 11:40 AM Giovanni Mascellani gmascellani@codeweavers.com wrote:
Hi,
On 11/11/21 11:18, Matteo Bruni wrote:
Yes. An ideal comment says why you're doing something, not what. The latter should be clear from the code; if you feel you need a comment to explain the nooks and crannies of some code path in detail, chances are that the code itself needs some more thought.
This, as a blanket statement, seems a bit excessive to me. As I said, when reading code in places like user32 and winex11.drv I'd be very happy to have comments, even hard to read or getting in the specific details of something. Or, as I meant my comment to be initially, describing what data structures are supposed to represent.
I accept that in practice sometimes a comment explaining at a high level what's happening can be good, especially if there is some non-trivial algorithm involved. Same for comments on some very specific technicality of the code in question. Those cases should be the exception though.
That said, the revised patch set that I sent two seconds before receiving this email should have been improved on that side (and, hopefully, many other).
I think just hardcoding an array of 4 for values (and getting rid of struct copy_propagation_value altogether) would make things quite a bit nicer.
Notice that variables can have more than four components. Matrices can have up to 16 and arrays even more.
Right, but we probably don't want or need to do copy propagation on those i.e. copy propagation should probably happen after matrix / struct / array splitting.
Hi,
On 11/11/21 12:14, Matteo Bruni wrote:
Notice that variables can have more than four components. Matrices can have up to 16 and arrays even more.
Right, but we probably don't want or need to do copy propagation on those i.e. copy propagation should probably happen after matrix / struct / array splitting.
Mmh, then there is something about splitting that I'm not understanding.
My understanding so far was that variables themselves are not split: they are just there, and do not appear in the code as themselves. What gets split are the temporaries that appear when some piece of code actually does something with (say) a matrix. So, for example, if you have this fragment of code:
--- float4x4 a; float4x4 b; float4x4 c; c = a + b; ---
the compiler first naively represents it as:
--- float4x4 a float4x4 b float4x4 c @1 = load(a) of type float4x4 @2 = load(b) of type float4x4 @3 = + (@1 @2) of type float4x4 store(c, @3) ---
and then this gets split as:
--- float4x4 a float4x4 b float4x4 c @1 = load(a, 0) of type float4 @2 = load(b, 0) of type float4 @3 = + (@1 @2) of type float4 store(c, 0, @3) @5 = load(a, 4) of type float4 @6 = load(b, 4) of type float4 @7 = + (@5 @6) of type float4 store(c, 4, @7) ... ---
That is, the variables keep their type, even though the accesses (loads and stores) to the variables have a smaller type. That's my understanding of what we want. My code mirrors this, therefore allows a variable to have more than four registers.
What is the advantage of splitting variables themselves?
In the specific case of my copy propagation pass, this would make things more complicated. For example, if right now I cannot reconstruct the offset of a store, I can just invalidate the whole variable. In your model, as I get it, I'd have to also invalidate other variables, that are unrelated by that point.
Thanks, Giovanni.
On Thu, Nov 11, 2021 at 12:50 PM Giovanni Mascellani gmascellani@codeweavers.com wrote:
Hi,
On 11/11/21 12:14, Matteo Bruni wrote:
Notice that variables can have more than four components. Matrices can have up to 16 and arrays even more.
Right, but we probably don't want or need to do copy propagation on those i.e. copy propagation should probably happen after matrix / struct / array splitting.
Mmh, then there is something about splitting that I'm not understanding.
My understanding so far was that variables themselves are not split: they are just there, and do not appear in the code as themselves. What gets split are the temporaries that appear when some piece of code actually does something with (say) a matrix. So, for example, if you have this fragment of code:
float4x4 a; float4x4 b; float4x4 c; c = a + b;
the compiler first naively represents it as:
float4x4 a float4x4 b float4x4 c @1 = load(a) of type float4x4 @2 = load(b) of type float4x4 @3 = + (@1 @2) of type float4x4 store(c, @3)
and then this gets split as:
float4x4 a float4x4 b float4x4 c @1 = load(a, 0) of type float4 @2 = load(b, 0) of type float4 @3 = + (@1 @2) of type float4 store(c, 0, @3) @5 = load(a, 4) of type float4 @6 = load(b, 4) of type float4 @7 = + (@5 @6) of type float4 store(c, 4, @7) ...
That is, the variables keep their type, even though the accesses (loads and stores) to the variables have a smaller type. That's my understanding of what we want. My code mirrors this, therefore allows a variable to have more than four registers.
What is the advantage of splitting variables themselves?
Continuing from your example: assuming a, b and c are temporaries, you split them into 4 vectors each and update the LOAD and STORE instructions to point to the specific vector. Once that is done, it becomes explicit that those groups of 4 instructions (LOAD x2, ADD, STORE) are in fact entirely independent from each other. That alone might help further transformations down the road. It's also pretty nice for register allocation, as it's easier to allocate 4 groups of 4 registers rather than a single contiguous group of 16. Sometimes you can even find out that whole rows / columns are unused and drop them altogether.
The same applies to all the complex types of course, not just matrices. There is a complication with the above in that sometimes it can be impossible to split the vars. That is, when the load / store offset is not always known at compile time. That's a bit unfortunate but it should also be pretty rare in HLSL shaders. I think it's worthwhile to optimize for the common case and accept that we won't necessarily have the best code when things align badly for us.
With all that said: WRT copy propagation and this patch specifically, I think it's a good idea to only handle vector variables if it makes things easier (as it should). Notice that you don't have to bail out entirely even in the "bad" case, as a non-vector is perfectly fine as a "value". It's only when the complex variable is the destination of a store that we're in trouble.
In the specific case of my copy propagation pass, this would make things more complicated. For example, if right now I cannot reconstruct the offset of a store, I can just invalidate the whole variable. In your model, as I get it, I'd have to also invalidate other variables, that are unrelated by that point.
I don't think that's the case? A STORE is always directed to a specific temporary variable and will affect that one alone. I guess you were thinking of a model where you always split variables into vectors no matter what, in which case you're right, it quickly becomes a mess...
Hi,
On 11/11/21 14:43, Matteo Bruni wrote:
Continuing from your example: assuming a, b and c are temporaries, you split them into 4 vectors each and update the LOAD and STORE instructions to point to the specific vector. Once that is done, it becomes explicit that those groups of 4 instructions (LOAD x2, ADD, STORE) are in fact entirely independent from each other. That alone might help further transformations down the road. It's also pretty nice for register allocation, as it's easier to allocate 4 groups of 4 registers rather than a single contiguous group of 16. Sometimes you can even find out that whole rows / columns are unused and drop them altogether.
The same applies to all the complex types of course, not just matrices. There is a complication with the above in that sometimes it can be impossible to split the vars. That is, when the load / store offset is not always known at compile time. That's a bit unfortunate but it should also be pretty rare in HLSL shaders. I think it's worthwhile to optimize for the common case and accept that we won't necessarily have the best code when things align badly for us.
Ok, that is the part I was missing: it's possible that a variable cannot be split, and we want to handle that case as well (meaning that it's sensible to optimize for the common case, but we also want to be correct in _all_ cases).
With all that said: WRT copy propagation and this patch specifically, I think it's a good idea to only handle vector variables if it makes things easier (as it should). Notice that you don't have to bail out entirely even in the "bad" case, as a non-vector is perfectly fine as a "value". It's only when the complex variable is the destination of a store that we're in trouble.
Though I don't understand why we should arbitrarily ignore non-vector variables. They don't require any special handling at all, just using a dynamically-sized array instead of a static one. All the rest is perfectly identical; there is no simplification in the algorithm itself. Arguably, we would make it more complicated, because we would arbitrarily drop some cases, which would make a reader wonder why we are doing that.
In the specific case of my copy propagation pass, this would make things more complicated. For example, if right now I cannot reconstruct the offset of a store, I can just invalidate the whole variable. In your model, as I get it, I'd have to also invalidate other variables, that are unrelated by that point.
I don't think that's the case? A STORE is always directed to a specific temporary variable and will affect that one alone. I guess you were thinking of a model where you always split variables into vectors no matter what, in which case you're right, it quickly becomes a mess...
Yeah, I had understood it that way.
Thanks, Giovanni.
On Thu, Nov 11, 2021 at 4:41 PM Giovanni Mascellani gmascellani@codeweavers.com wrote:
Hi,
On 11/11/21 14:43, Matteo Bruni wrote:
Continuing from your example: assuming a, b and c are temporaries, you split them into 4 vectors each and update the LOAD and STORE instructions to point to the specific vector. Once that is done, it becomes explicit that those groups of 4 instructions (LOAD x2, ADD, STORE) are in fact entirely independent from each other. That alone might help further transformations down the road. It's also pretty nice for register allocation, as it's easier to allocate 4 groups of 4 registers rather than a single contiguous group of 16. Sometimes you can even find out that whole rows / columns are unused and drop them altogether.
The same applies to all the complex types of course, not just matrices. There is a complication with the above in that sometimes it can be impossible to split the vars. That is, when the load / store offset is not always known at compile time. That's a bit unfortunate but it should also be pretty rare in HLSL shaders. I think it's worthwhile to optimize for the common case and accept that we won't necessarily have the best code when things align badly for us.
Ok, that is the part I was missing: it's possible that a variable cannot be split, and we want to handle that case as well (meaning that it's sensible to optimize for the common case, but we also want to be correct in _all_ cases).
Exactly. At least that's how I see it.
With all that said: WRT copy propagation and this patch specifically, I think it's a good idea to only handle vector variables if it makes things easier (as it should). Notice that you don't have to bail out entirely even in the "bad" case, as a non-vector is perfectly fine as a "value". It's only when the complex variable is the destination of a store that we're in trouble.
Though I don't understand why we should arbitrarily ignore non-vector variables. They don't require any special handling at all, just using a dynamically-sized array instead of a static one. All the rest is perfectly identical; there is no simplification in the algorithm itself. Arguably, we would make it more complicated, because we would arbitrarily drop some cases, which would make a reader wonder why we are doing that.
I think focusing on vectors is a good general direction and keeping that in mind might lead to simpler passes down the line. I guess it's not so useful when taken into consideration after the fact, like in this particular case. It's not a big deal either way.
In the specific case of my copy propagation pass, this would make things more complicated. For example, if right now I cannot reconstruct the offset of a store, I can just invalidate the whole variable. In your model, as I get it, I'd have to also invalidate other variables, that are unrelated by that point.
I don't think that's the case? A STORE is always directed to a specific temporary variable and will affect that one alone. I guess you were thinking of a model where you always split variables into vectors no matter what, in which case you're right, it quickly becomes a mess...
Yeah, I had understood it that way.
Thanks, Giovanni.
On 11/11/21 10:40, Matteo Bruni wrote:
On Thu, Nov 11, 2021 at 4:41 PM Giovanni Mascellani gmascellani@codeweavers.com wrote:
Hi,
On 11/11/21 14:43, Matteo Bruni wrote:
Continuing from your example: assuming a, b and c are temporaries, you split them into 4 vectors each and update the LOAD and STORE instructions to point to the specific vector. Once that is done, it becomes explicit that those groups of 4 instructions (LOAD x2, ADD, STORE) are in fact entirely independent from each other. That alone might help further transformations down the road. It's also pretty nice for register allocation, as it's easier to allocate 4 groups of 4 registers rather than a single contiguous group of 16. Sometimes you can even find out that whole rows / columns are unused and drop them altogether.
The same applies to all the complex types of course, not just matrices. There is a complication with the above in that sometimes it can be impossible to split the vars. That is, when the load / store offset is not always known at compile time. That's a bit unfortunate but it should also be pretty rare in HLSL shaders. I think it's worthwhile to optimize for the common case and accept that we won't necessarily have the best code when things align badly for us.
Ok, that is the part I was missing: it's possible that a variable cannot be split, and we want to handle that case as well (meaning that it's sensible to optimize for the common case, but we also want to be correct in _all_ cases).
Exactly. At least that's how I see it.
I think I'm missing something, or have misread part of this, but assuming that we want to go right ahead and split types into vectors where possible, I think it only makes sense to handle vectors in copyprop? If a type is bigger than that and can't be split, I think that means it necessarily has non-uniform access, which means we can't really perform copyprop on it anyway.
Or maybe you're already saying exactly that.
On Thu, Nov 11, 2021 at 10:33 PM Zebediah Figura (she/her) zfigura@codeweavers.com wrote:
On 11/11/21 10:40, Matteo Bruni wrote:
On Thu, Nov 11, 2021 at 4:41 PM Giovanni Mascellani gmascellani@codeweavers.com wrote:
Hi,
On 11/11/21 14:43, Matteo Bruni wrote:
Continuing from your example: assuming a, b and c are temporaries, you split them into 4 vectors each and update the LOAD and STORE instructions to point to the specific vector. Once that is done, it becomes explicit that those groups of 4 instructions (LOAD x2, ADD, STORE) are in fact entirely independent from each other. That alone might help further transformations down the road. It's also pretty nice for register allocation, as it's easier to allocate 4 groups of 4 registers rather than a single contiguous group of 16. Sometimes you can even find out that whole rows / columns are unused and drop them altogether.
The same applies to all the complex types of course, not just matrices. There is a complication with the above in that sometimes it can be impossible to split the vars. That is, when the load / store offset is not always known at compile time. That's a bit unfortunate but it should also be pretty rare in HLSL shaders. I think it's worthwhile to optimize for the common case and accept that we won't necessarily have the best code when things align badly for us.
Ok, that is the part I was missing: it's possible that a variable cannot be split, and we want to handle that case as well (meaning that it's sensible to optimize for the common case, but we also want to be correct in _all_ cases).
Exactly. At least that's how I see it.
I think I'm missing something, or have misread part of this, but assuming that we want to go right ahead and split types into vectors where possible, I think it only makes sense to handle vectors in copyprop? If a type is bigger than that and can't be split, I think that means it necessarily has non-uniform access, which means we can't really perform copyprop on it anyway.
Or maybe you're already saying exactly that.
I'm thinking that, although I'm not sure that I said that :D But yes, that's how I see things working eventually. There are a few missing pieces at the moment and if e.g. we want to go through with copy prop right away (which is absolutely fine), we either have to handle complex types in copy prop or miss some copy propagation opportunities until the other shoe drops. I feel like the latter is easier but I'm okay with the former too.
Hi,
On 11/11/21 23:11, Matteo Bruni wrote:
I think I'm missing something, or have misread part of this, but assuming that we want to go right ahead and split types into vectors where possible, I think it only makes sense to handle vectors in copyprop? If a type is bigger than that and can't be split, I think that means it necessarily has non-uniform access, which means we can't really perform copyprop on it anyway.
Or maybe you're already saying exactly that.
I'm thinking that, although I'm not sure that I said that :D But yes, that's how I see things working eventually. There are a few missing pieces at the moment and if e.g. we want to go through with copy prop right away (which is absolutely fine), we either have to handle complex types in copy prop or miss some copy propagation opportunities until the other shoe drops. I feel like the latter is easier but I'm okay with the former too.
I think my copy propagation patches already handle complex types (i.e., handle variables which have complex types; they don't handle loads or stores that have complex types, but neither does code generation, so there is no loss there), and they don't require jumping through any hoops to do that (unless you count dynamically allocating an array as jumping through hoops, but I don't think so).
When at some point we'll split variables to simple types, the copy propagation pass should keep working just as well, provided that the splitting correctly preserve semantics (but that has to happen anyway).
Notice that having a variable with an offset which cannot be known at compile time doesn't necessarily mean that copy propagation cannot do anything: a variable could have both accesses with statically-known offsets and non-statically-known offsets, and the copy propagation pass will still use the information it can pinpoint. This might not be the common case, sure, but given that nothing has to be done to support it, I see no reason to willingly break it.
Giovanni.
On 11/11/21 04:18, Matteo Bruni wrote:
TRACE("load from %s[%d-%d] reconstructed to %s[%d %d %d %d]\n", var->name, offset,
offset + type->dimx, buf, indices[0], indices[1], indices[2], indices[3]);
- }
- if (!new_node)
return false;
- s = indices[0] | indices[1] << 2 | indices[2] << 4 | indices[3] << 6;
I guess we could introduce a small helper (hlsl_make_swizzle()?) for this.
Yeah, we could probably use this, although it doesn't need to block this patch.
Signed-off-by: Giovanni Mascellani gmascellani@codeweavers.com --- libs/vkd3d-shader/hlsl_codegen.c | 138 +++++++++++++++++++++++++++---- 1 file changed, 123 insertions(+), 15 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 8db8bfc9..78bcd079 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -239,15 +239,17 @@ static void replace_node(struct hlsl_ir_node *old, struct hlsl_ir_node *new)
/* struct copy_propagation_state represents the accumulated knowledge * of the copy propagation pass while it scans through the code. Field - * "variables" is a tree whose elements have type struct - * copy_propagation_varible, and represent each of the variables the - * pass has already encountered (except those with special semantics, - * which are ignored). For each variable, the array "values" (whose - * length is the register size of the variable) represent which node - * and which index inside that node (i.e., which of the at most four - * entries of a vector) provided that value last time. Field "node" - * can be NULL, meaning that the pass was not able to statically - * determine the node. + * "variables" is an stack of trees: each stack entry represents an + * embedded block (of type either HLSL_IR_IF or HLSL_IR_TREE); the + * elements of the tree have type struct copy_propagation_variable, + * and represent each of the variables the pass has already + * encountered (except those with special semantics, which are + * ignored). For each variable, the array "values" (whose length is + * the register size of the variable) represent which node and which + * index inside that node (i.e., which of the at most four entries of + * a vector) provided that value last time. Field "node" can be NULL, + * meaning that the pass was not able to statically determine the + * node. */
struct copy_propagation_value @@ -265,7 +267,9 @@ struct copy_propagation_variable
struct copy_propagation_state { - struct rb_tree variables; + struct rb_tree *variables; + unsigned int depth; + unsigned int capacity; };
static int copy_propagation_variable_compare(const void *key, const struct rb_entry *entry) @@ -291,7 +295,7 @@ static void copy_propagation_variable_destroy(struct rb_entry *entry, void *cont static struct copy_propagation_variable *copy_propagation_get_variable(struct hlsl_ctx *ctx, struct copy_propagation_state *state, struct hlsl_ir_var *var) { - struct rb_entry *entry = rb_get(&state->variables, var); + struct rb_entry *entry = rb_get(&state->variables[state->depth], var); struct copy_propagation_variable *variable; int res;
@@ -310,7 +314,7 @@ static struct copy_propagation_variable *copy_propagation_get_variable(struct hl return NULL; }
- res = rb_put(&state->variables, var, &variable->entry); + res = rb_put(&state->variables[state->depth], var, &variable->entry); assert(!res);
return variable; @@ -342,6 +346,71 @@ static void copy_propagation_set_value(struct copy_propagation_variable *variabl } }
+static void copy_propagation_invalidate_from_block(struct hlsl_ctx *ctx, struct copy_propagation_state *state, + struct hlsl_block *block) +{ + struct hlsl_ir_node *instr; + + LIST_FOR_EACH_ENTRY(instr, &block->instrs, struct hlsl_ir_node, entry) + { + if (instr->type == HLSL_IR_STORE) + { + struct hlsl_ir_store *store = hlsl_ir_store(instr); + struct copy_propagation_variable *variable; + struct hlsl_deref *lhs = &store->lhs; + struct hlsl_ir_var *var = lhs->var; + + if (var->is_input_semantic || var->is_output_semantic || var->is_uniform) + continue; + + variable = copy_propagation_get_variable(ctx, state, var); + if (!variable) + continue; + + copy_propagation_set_value(variable, hlsl_offset_from_deref(lhs), store->writemask, NULL); + } + } +} + +static void copy_propagation_pop(struct copy_propagation_state *state) +{ + assert(state->depth > 0); + rb_destroy(&state->variables[state->depth], copy_propagation_variable_destroy, NULL); + --state->depth; +} + +static bool copy_propagation_duplicate(struct hlsl_ctx *ctx, struct copy_propagation_state *state) +{ + struct copy_propagation_variable *var; + + if (state->depth + 1 == state->capacity) + { + unsigned int new_capacity = 2 * state->capacity; + struct rb_tree *new_vars; + + new_vars = hlsl_realloc(ctx, state->variables, sizeof(*state->variables) * new_capacity); + if (!new_vars) + return false; + state->capacity = new_capacity; + state->variables = new_vars; + } + ++state->depth; + + rb_init(&state->variables[state->depth], copy_propagation_variable_compare); + + RB_FOR_EACH_ENTRY(var, &state->variables[state->depth - 1], struct copy_propagation_variable, entry) + { + struct copy_propagation_variable *new_var = copy_propagation_get_variable(ctx, state, var->var); + + if (!new_var) + continue; + + memcpy(new_var->values, var->values, sizeof(*var->values) * var->var->data_type->reg_size); + } + + return true; +} + /* Check if locations [offset, offset+count) in variable were all * written from the same node. 
If so return the node the corresponding * indices, otherwise return NULL (and undefined indices). */ @@ -436,6 +505,38 @@ static bool copy_propagation_store(struct hlsl_ctx *ctx, struct hlsl_ir_store *s return false; }
+static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block *block, + struct copy_propagation_state *state); + +/* Both branches can inherit the variable state available when + * entering the "if". After the "if", all variables that might have + * been written in either branch must be invalidated, because we don't + * know which branch has executed. */ +static bool copy_propagation_if(struct hlsl_ctx *ctx, struct hlsl_ir_if *iff, + struct copy_propagation_state *state) +{ + bool progress = false; + + if (!copy_propagation_duplicate(ctx, state)) + goto end; + + progress |= copy_propagation_recursive(ctx, &iff->then_instrs, state); + + copy_propagation_pop(state); + if (!copy_propagation_duplicate(ctx, state)) + goto end; + + progress |= copy_propagation_recursive(ctx, &iff->else_instrs, state); + + copy_propagation_pop(state); + +end: + copy_propagation_invalidate_from_block(ctx, state, &iff->then_instrs); + copy_propagation_invalidate_from_block(ctx, state, &iff->else_instrs); + + return progress; +} + static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block *block, struct copy_propagation_state *state) { @@ -455,7 +556,7 @@ static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block * break;
case HLSL_IR_IF: - FIXME("Copy propagation doesn't support conditionals yet, leaving.\n"); + progress |= copy_propagation_if(ctx, hlsl_ir_if(instr), state); return progress;
case HLSL_IR_LOOP: @@ -475,11 +576,18 @@ static bool copy_propagation_pass(struct hlsl_ctx *ctx, struct hlsl_block *block struct copy_propagation_state state; bool progress;
- rb_init(&state.variables, copy_propagation_variable_compare); + state.depth = 0; + state.capacity = 1; + state.variables = hlsl_alloc(ctx, sizeof(*state.variables) * state.capacity); + if (!state.variables) + return false; + rb_init(&state.variables[state.depth], copy_propagation_variable_compare);
progress = copy_propagation_recursive(ctx, block, &state);
- rb_destroy(&state.variables, copy_propagation_variable_destroy, NULL); + assert(state.depth == 0); + rb_destroy(&state.variables[state.depth], copy_propagation_variable_destroy, NULL); + vkd3d_free(state.variables);
return progress; }
Signed-off-by: Giovanni Mascellani gmascellani@codeweavers.com --- libs/vkd3d-shader/hlsl_codegen.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 78bcd079..fe6a9908 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -537,6 +537,28 @@ end: return progress; }
+/* The loop body can inherit the variable state available when entering + * the "loop", except that all variables that are written in the body + * must be invalidated (because at the beginning of each execution we + * don't know whether the body has already executed or not). The same + * is valid after the "loop", because we don't know whether the + * body has executed or not. */ +static bool copy_propagation_loop(struct hlsl_ctx *ctx, struct hlsl_ir_loop *loop, + struct copy_propagation_state *state) +{ + bool progress = false; + + copy_propagation_invalidate_from_block(ctx, state, &loop->body); + if (!copy_propagation_duplicate(ctx, state)) + return progress; + + progress |= copy_propagation_recursive(ctx, &loop->body, state); + + copy_propagation_pop(state); + + return progress; +} + static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block *block, struct copy_propagation_state *state) { @@ -560,7 +582,7 @@ static bool copy_propagation_recursive(struct hlsl_ctx *ctx, struct hlsl_block * return progress;
case HLSL_IR_LOOP: - FIXME("Copy propagation doesn't support loops yet, leaving.\n"); + progress |= copy_propagation_loop(ctx, hlsl_ir_loop(instr), state); return progress;
default:
Signed-off-by: Giovanni Mascellani gmascellani@codeweavers.com --- libs/vkd3d-shader/hlsl_codegen.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index fe6a9908..2cadbdba 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -852,6 +852,27 @@ static bool fold_constants(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, voi return true; }
+static bool remove_trivial_swizzles(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) +{ + struct hlsl_ir_swizzle *swizzle; + unsigned int i; + + if (instr->type != HLSL_IR_SWIZZLE) + return false; + swizzle = hlsl_ir_swizzle(instr); + + if (instr->data_type->dimx != swizzle->val.node->data_type->dimx) + return false; + + for (i = 0; i < instr->data_type->dimx; ++i) + if (((swizzle->swizzle >> (2 * i)) & 3) != i) + return false; + + replace_node(instr, swizzle->val.node); + + return true; +} + /* Lower DIV to RCP + MUL. */ static bool lower_division(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) { @@ -1737,6 +1758,7 @@ int hlsl_emit_dxbc(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_fun progress |= copy_propagation_pass(ctx, body); } while (progress); + transform_ir(ctx, remove_trivial_swizzles, body, NULL);
if (ctx->profile->major_version < 4) transform_ir(ctx, lower_division, body, NULL);
This patch looks good, and is valuable on its own, but it depends on the previous ones...