--
v2: vkd3d-shader/hlsl: Map the colour output for ps_1_* to r0.
    vkd3d-shader/hlsl: Rewrite the register allocator to allow allocating in multiple passes.
    vkd3d-shader/hlsl: Avoid leaking the allocator register map in allocate_const_registers().
    vkd3d-shader/hlsl: Rename struct liveness to struct register_allocator.
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Rename struct liveness to struct register_allocator.
---
 libs/vkd3d-shader/hlsl_codegen.c | 99 ++++++++++++++++----------------
 1 file changed, 51 insertions(+), 48 deletions(-)

diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 42f8ab3b..95406c43 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2557,7 +2557,7 @@ static void compute_liveness(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl
     compute_liveness_recurse(&entry_func->body, 0, 0);
 }
 
-struct liveness
+struct register_allocator
 {
     size_t size;
     uint32_t reg_count;
@@ -2568,14 +2568,14 @@ struct liveness
     } *regs;
 };
 
-static unsigned int get_available_writemask(struct liveness *liveness,
+static unsigned int get_available_writemask(struct register_allocator *allocator,
         unsigned int first_write, unsigned int component_idx, unsigned int reg_size)
 {
     unsigned int i, writemask = 0, count = 0;
 
     for (i = 0; i < 4; ++i)
     {
-        if (liveness->regs[component_idx + i].last_read <= first_write)
+        if (allocator->regs[component_idx + i].last_read <= first_write)
         {
             writemask |= 1u << i;
             if (++count == reg_size)
@@ -2586,22 +2586,22 @@ static unsigned int get_available_writemask(struct liveness,
     return 0;
 }
 
-static bool resize_liveness(struct hlsl_ctx *ctx, struct liveness *liveness, size_t new_count)
+static bool resize_liveness(struct hlsl_ctx *ctx, struct register_allocator *allocator, size_t new_count)
 {
-    size_t old_capacity = liveness->size;
+    size_t old_capacity = allocator->size;
 
-    if (!hlsl_array_reserve(ctx, (void **)&liveness->regs, &liveness->size, new_count, sizeof(*liveness->regs)))
+    if (!hlsl_array_reserve(ctx, (void **)&allocator->regs, &allocator->size, new_count, sizeof(*allocator->regs)))
         return false;
 
-    if (liveness->size > old_capacity)
-        memset(liveness->regs + old_capacity, 0, (liveness->size - old_capacity) * sizeof(*liveness->regs));
+    if (allocator->size > old_capacity)
+        memset(allocator->regs + old_capacity, 0, (allocator->size - old_capacity) * sizeof(*allocator->regs));
     return true;
 }
 
 /* reg_size is the number of register components to be reserved, while component_count is the number
  * of components for the register's writemask. In SM1, floats and vectors allocate the whole
  * register, even if they don't use it completely. */
-static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness *liveness,
+static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, unsigned int reg_size,
         unsigned int component_count)
 {
@@ -2610,74 +2610,74 @@ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness *
 
     assert(component_count <= reg_size);
 
-    for (component_idx = 0; component_idx < liveness->size; component_idx += 4)
+    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
     {
-        if ((writemask = get_available_writemask(liveness, first_write, component_idx, reg_size)))
+        if ((writemask = get_available_writemask(allocator, first_write, component_idx, reg_size)))
             break;
     }
-    if (component_idx == liveness->size)
+    if (component_idx == allocator->size)
    {
-        if (!resize_liveness(ctx, liveness, component_idx + 4))
+        if (!resize_liveness(ctx, allocator, component_idx + 4))
             return ret;
         writemask = (1u << reg_size) - 1;
     }
     for (i = 0; i < 4; ++i)
     {
         if (writemask & (1u << i))
-            liveness->regs[component_idx + i].last_read = last_read;
+            allocator->regs[component_idx + i].last_read = last_read;
     }
     ret.id = component_idx / 4;
     ret.writemask = hlsl_combine_writemasks(writemask, (1u << component_count) - 1);
     ret.allocated = true;
-    liveness->reg_count = max(liveness->reg_count, ret.id + 1);
+    allocator->reg_count = max(allocator->reg_count, ret.id + 1);
     return ret;
 }
 
-static bool is_range_available(struct liveness *liveness, unsigned int first_write,
+static bool is_range_available(struct register_allocator *allocator, unsigned int first_write,
         unsigned int component_idx, unsigned int reg_size)
 {
     unsigned int i;
 
     for (i = 0; i < reg_size; i += 4)
     {
-        if (!get_available_writemask(liveness, first_write, component_idx + i, 4))
+        if (!get_available_writemask(allocator, first_write, component_idx + i, 4))
             return false;
     }
     return true;
 }
 
-static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct liveness *liveness,
+static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, unsigned int reg_size)
 {
     unsigned int i, component_idx;
     struct hlsl_reg ret = {0};
 
-    for (component_idx = 0; component_idx < liveness->size; component_idx += 4)
+    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
     {
-        if (is_range_available(liveness, first_write, component_idx,
-                min(reg_size, liveness->size - component_idx)))
+        if (is_range_available(allocator, first_write, component_idx,
+                min(reg_size, allocator->size - component_idx)))
             break;
     }
-    if (!resize_liveness(ctx, liveness, component_idx + reg_size))
+    if (!resize_liveness(ctx, allocator, component_idx + reg_size))
         return ret;
 
     for (i = 0; i < reg_size; ++i)
-        liveness->regs[component_idx + i].last_read = last_read;
+        allocator->regs[component_idx + i].last_read = last_read;
     ret.id = component_idx / 4;
     ret.allocated = true;
-    liveness->reg_count = max(liveness->reg_count, ret.id + align(reg_size, 4));
+    allocator->reg_count = max(allocator->reg_count, ret.id + align(reg_size, 4));
     return ret;
 }
 
-static struct hlsl_reg allocate_numeric_registers_for_type(struct hlsl_ctx *ctx, struct liveness *liveness,
+static struct hlsl_reg allocate_numeric_registers_for_type(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, const struct hlsl_type *type)
 {
     unsigned int reg_size = type->reg_size[HLSL_REGSET_NUMERIC];
 
     if (type->class <= HLSL_CLASS_VECTOR)
-        return allocate_register(ctx, liveness, first_write, last_read, reg_size, type->dimx);
+        return allocate_register(ctx, allocator, first_write, last_read, reg_size, type->dimx);
     else
-        return allocate_range(ctx, liveness, first_write, last_read, reg_size);
+        return allocate_range(ctx, allocator, first_write, last_read, reg_size);
 }
 
 static const char *debug_register(char class, struct hlsl_reg reg, const struct hlsl_type *type)
@@ -2696,14 +2696,15 @@ static const char *debug_register(char class, struct hlsl_reg reg, const struct
     return vkd3d_dbg_sprintf("%c%u%s", class, reg.id, debug_hlsl_writemask(reg.writemask));
 }
 
-static void allocate_variable_temp_register(struct hlsl_ctx *ctx, struct hlsl_ir_var *var, struct liveness *liveness)
+static void allocate_variable_temp_register(struct hlsl_ctx *ctx,
+        struct hlsl_ir_var *var, struct register_allocator *allocator)
 {
     if (var->is_input_semantic || var->is_output_semantic || var->is_uniform)
         return;
 
     if (!var->regs[HLSL_REGSET_NUMERIC].allocated && var->last_read)
     {
-        var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, liveness,
+        var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, allocator,
                 var->first_write, var->last_read, var->data_type);
 
         TRACE("Allocated %s to %s (liveness %u-%u).\n", var->name, debug_register('r',
@@ -2711,7 +2712,8 @@ static void allocate_variable_temp_register(struct hlsl_ctx *ctx, struct hlsl_ir
     }
 }
 
-static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_block *block, struct liveness *liveness)
+static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx,
+        struct hlsl_block *block, struct register_allocator *allocator)
 {
     struct hlsl_ir_node *instr;
 
@@ -2719,7 +2721,7 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
     {
         if (!instr->reg.allocated && instr->last_read)
         {
-            instr->reg = allocate_numeric_registers_for_type(ctx, liveness, instr->index, instr->last_read,
+            instr->reg = allocate_numeric_registers_for_type(ctx, allocator, instr->index, instr->last_read,
                     instr->data_type);
             TRACE("Allocated anonymous expression @%u to %s (liveness %u-%u).\n", instr->index,
                     debug_register('r', instr->reg, instr->data_type), instr->index, instr->last_read);
@@ -2730,8 +2732,8 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
             case HLSL_IR_IF:
             {
                 struct hlsl_ir_if *iff = hlsl_ir_if(instr);
-                allocate_temp_registers_recurse(ctx, &iff->then_block, liveness);
-                allocate_temp_registers_recurse(ctx, &iff->else_block, liveness);
+                allocate_temp_registers_recurse(ctx, &iff->then_block, allocator);
+                allocate_temp_registers_recurse(ctx, &iff->else_block, allocator);
                 break;
             }
 
@@ -2740,21 +2742,21 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
             {
                 struct hlsl_ir_load *load = hlsl_ir_load(instr);
                 /* We need to at least allocate a variable for undefs.
                  * FIXME: We should probably find a way to remove them instead. */
-                allocate_variable_temp_register(ctx, load->src.var, liveness);
+                allocate_variable_temp_register(ctx, load->src.var, allocator);
                 break;
             }
 
             case HLSL_IR_LOOP:
             {
                 struct hlsl_ir_loop *loop = hlsl_ir_loop(instr);
-                allocate_temp_registers_recurse(ctx, &loop->body, liveness);
+                allocate_temp_registers_recurse(ctx, &loop->body, allocator);
                 break;
             }
 
             case HLSL_IR_STORE:
             {
                 struct hlsl_ir_store *store = hlsl_ir_store(instr);
-                allocate_variable_temp_register(ctx, store->lhs.var, liveness);
+                allocate_variable_temp_register(ctx, store->lhs.var, allocator);
                 break;
             }
 
@@ -2764,7 +2766,8 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
     }
 }
 
-static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_block *block, struct liveness *liveness)
+static void allocate_const_registers_recurse(struct hlsl_ctx *ctx,
+        struct hlsl_block *block, struct register_allocator *allocator)
 {
     struct hlsl_constant_defs *defs = &ctx->constant_defs;
     struct hlsl_ir_node *instr;
@@ -2780,7 +2783,7 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b
                 unsigned int x, y, i, writemask, end_reg;
                 unsigned int reg_size = type->reg_size[HLSL_REGSET_NUMERIC];
 
-                constant->reg = allocate_numeric_registers_for_type(ctx, liveness, 1, UINT_MAX, type);
+                constant->reg = allocate_numeric_registers_for_type(ctx, allocator, 1, UINT_MAX, type);
                 TRACE("Allocated constant @%u to %s.\n", instr->index, debug_register('c', constant->reg, type));
 
                 if (!hlsl_array_reserve(ctx, (void **)&defs->values, &defs->size,
@@ -2845,15 +2848,15 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b
             case HLSL_IR_IF:
             {
                 struct hlsl_ir_if *iff = hlsl_ir_if(instr);
-                allocate_const_registers_recurse(ctx, &iff->then_block, liveness);
-                allocate_const_registers_recurse(ctx, &iff->else_block, liveness);
+                allocate_const_registers_recurse(ctx, &iff->then_block, allocator);
+                allocate_const_registers_recurse(ctx, &iff->else_block, allocator);
                 break;
             }
 
             case HLSL_IR_LOOP:
            {
                 struct hlsl_ir_loop *loop = hlsl_ir_loop(instr);
-                allocate_const_registers_recurse(ctx, &loop->body, liveness);
+                allocate_const_registers_recurse(ctx, &loop->body, allocator);
                 break;
             }
 
@@ -2865,10 +2868,10 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b
 
 static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func)
 {
-    struct liveness liveness = {0};
+    struct register_allocator allocator = {0};
     struct hlsl_ir_var *var;
 
-    allocate_const_registers_recurse(ctx, &entry_func->body, &liveness);
+    allocate_const_registers_recurse(ctx, &entry_func->body, &allocator);
 
     LIST_FOR_EACH_ENTRY(var, &ctx->extern_vars, struct hlsl_ir_var, extern_entry)
     {
@@ -2879,7 +2882,7 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
             if (reg_size == 0)
                 continue;
 
-            var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, &liveness,
+            var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, &allocator,
                     1, UINT_MAX, var->data_type);
             TRACE("Allocated %s to %s.\n", var->name,
                     debug_register('c', var->regs[HLSL_REGSET_NUMERIC], var->data_type));
@@ -2893,10 +2896,10 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
  * does not handle constants. */
 static void allocate_temp_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func)
 {
-    struct liveness liveness = {0};
-    allocate_temp_registers_recurse(ctx, &entry_func->body, &liveness);
-    ctx->temp_count = liveness.reg_count;
-    vkd3d_free(liveness.regs);
+    struct register_allocator allocator = {0};
+    allocate_temp_registers_recurse(ctx, &entry_func->body, &allocator);
+    ctx->temp_count = allocator.reg_count;
+    vkd3d_free(allocator.regs);
 }
 
 static void allocate_semantic_register(struct hlsl_ctx *ctx, struct hlsl_ir_var *var, unsigned int *counter, bool output)
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Avoid leaking the allocator register map in allocate_const_registers().
---
 libs/vkd3d-shader/hlsl_codegen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 95406c43..e06b6c0f 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2888,6 +2888,8 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
                     debug_register('c', var->regs[HLSL_REGSET_NUMERIC], var->data_type));
         }
     }
+
+    vkd3d_free(allocator.regs);
 }
 
 /* Simple greedy temporary register allocation pass that just assigns a unique
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Rewrite the register allocator to allow allocating in multiple passes.
We will need this in order to allocate some "special" registers: ps_1_* output, sincos output, etc.
---
 libs/vkd3d-shader/hlsl_codegen.c | 129 +++++++++++++++++--------------
 1 file changed, 71 insertions(+), 58 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index e06b6c0f..1081422e 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2559,43 +2559,61 @@ static void compute_liveness(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl
 
 struct register_allocator
 {
-    size_t size;
-    uint32_t reg_count;
-    struct
+    size_t count, capacity;
+
+    /* Highest register index that has been allocated.
+     * Used to declare sm4 temp count. */
+    uint32_t max_reg;
+
+    struct allocation
     {
-        /* 0 if not live yet. */
-        unsigned int last_read;
-    } *regs;
+        uint32_t reg;
+        unsigned int writemask;
+        unsigned int first_write, last_read;
+    } *allocations;
 };
 
-static unsigned int get_available_writemask(struct register_allocator *allocator,
-        unsigned int first_write, unsigned int component_idx, unsigned int reg_size)
+static unsigned int get_available_writemask(const struct register_allocator *allocator,
+        unsigned int first_write, unsigned int last_read, uint32_t reg_idx)
 {
-    unsigned int i, writemask = 0, count = 0;
+    unsigned int writemask = VKD3DSP_WRITEMASK_ALL;
+    size_t i;
 
-    for (i = 0; i < 4; ++i)
+    for (i = 0; i < allocator->count; ++i)
     {
-        if (allocator->regs[component_idx + i].last_read <= first_write)
-        {
-            writemask |= 1u << i;
-            if (++count == reg_size)
-                return writemask;
-        }
+        const struct allocation *allocation = &allocator->allocations[i];
+
+        /* We do not overlap if first write == last read:
+         * this is the case where we are allocating the result of that
+         * expression, e.g. "add r0, r0, r1". */
+
+        if (allocation->reg == reg_idx
+                && first_write < allocation->last_read && last_read > allocation->first_write)
+            writemask &= ~allocation->writemask;
+
+        if (!writemask)
+            break;
     }
 
-    return 0;
+    return writemask;
 }
 
-static bool resize_liveness(struct hlsl_ctx *ctx, struct register_allocator *allocator, size_t new_count)
+static void record_allocation(struct hlsl_ctx *ctx, struct register_allocator *allocator,
+        uint32_t reg_idx, unsigned int writemask, unsigned int first_write, unsigned int last_read)
 {
-    size_t old_capacity = allocator->size;
+    struct allocation *allocation;
 
-    if (!hlsl_array_reserve(ctx, (void **)&allocator->regs, &allocator->size, new_count, sizeof(*allocator->regs)))
-        return false;
+    if (!hlsl_array_reserve(ctx, (void **)&allocator->allocations, &allocator->capacity,
+            allocator->count + 1, sizeof(*allocator->allocations)))
+        return;
 
-    if (allocator->size > old_capacity)
-        memset(allocator->regs + old_capacity, 0, (allocator->size - old_capacity) * sizeof(*allocator->regs));
-    return true;
+    allocation = &allocator->allocations[allocator->count++];
+    allocation->reg = reg_idx;
+    allocation->writemask = writemask;
+    allocation->first_write = first_write;
+    allocation->last_read = last_read;
+
+    allocator->max_reg = max(allocator->max_reg, reg_idx);
 }
 
 /* reg_size is the number of register components to be reserved, while component_count is the number
@@ -2605,42 +2623,39 @@ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct register_a
         unsigned int first_write, unsigned int last_read, unsigned int reg_size,
         unsigned int component_count)
 {
-    unsigned int component_idx, writemask, i;
     struct hlsl_reg ret = {0};
+    unsigned int writemask;
+    uint32_t reg_idx;
 
     assert(component_count <= reg_size);
 
-    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
+    for (reg_idx = 0;; ++reg_idx)
     {
-        if ((writemask = get_available_writemask(allocator, first_write, component_idx, reg_size)))
+        writemask = get_available_writemask(allocator, first_write, last_read, reg_idx);
+
+        if (vkd3d_popcount(writemask) >= reg_size)
+        {
+            writemask = hlsl_combine_writemasks(writemask, (1u << reg_size) - 1);
             break;
+        }
     }
-    if (component_idx == allocator->size)
-    {
-        if (!resize_liveness(ctx, allocator, component_idx + 4))
-            return ret;
-        writemask = (1u << reg_size) - 1;
-    }
-    for (i = 0; i < 4; ++i)
-    {
-        if (writemask & (1u << i))
-            allocator->regs[component_idx + i].last_read = last_read;
-    }
-    ret.id = component_idx / 4;
+
+    record_allocation(ctx, allocator, reg_idx, writemask, first_write, last_read);
+
+    ret.id = reg_idx;
     ret.writemask = hlsl_combine_writemasks(writemask, (1u << component_count) - 1);
     ret.allocated = true;
-    allocator->reg_count = max(allocator->reg_count, ret.id + 1);
     return ret;
 }
 
-static bool is_range_available(struct register_allocator *allocator, unsigned int first_write,
-        unsigned int component_idx, unsigned int reg_size)
+static bool is_range_available(const struct register_allocator *allocator,
+        unsigned int first_write, unsigned int last_read, uint32_t reg_idx, unsigned int reg_size)
 {
-    unsigned int i;
+    uint32_t i;
 
-    for (i = 0; i < reg_size; i += 4)
+    for (i = 0; i < (reg_size / 4); ++i)
     {
-        if (!get_available_writemask(allocator, first_write, component_idx + i, 4))
+        if (get_available_writemask(allocator, first_write, last_read, reg_idx + i) != VKD3DSP_WRITEMASK_ALL)
             return false;
     }
     return true;
@@ -2649,23 +2664,21 @@ static bool is_range_available(struct register_allocator *allocator, unsigned in
 static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, unsigned int reg_size)
 {
-    unsigned int i, component_idx;
     struct hlsl_reg ret = {0};
+    uint32_t reg_idx;
+    unsigned int i;
 
-    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
+    for (reg_idx = 0;; ++reg_idx)
     {
-        if (is_range_available(allocator, first_write, component_idx,
-                min(reg_size, allocator->size - component_idx)))
+        if (is_range_available(allocator, first_write, last_read, reg_idx, reg_size))
             break;
     }
-    if (!resize_liveness(ctx, allocator, component_idx + reg_size))
-        return ret;
 
-    for (i = 0; i < reg_size; ++i)
-        allocator->regs[component_idx + i].last_read = last_read;
-    ret.id = component_idx / 4;
+    for (i = 0; i < reg_size / 4; ++i)
+        record_allocation(ctx, allocator, reg_idx + i, VKD3DSP_WRITEMASK_ALL, first_write, last_read);
+
+    ret.id = reg_idx;
     ret.allocated = true;
-    allocator->reg_count = max(allocator->reg_count, ret.id + align(reg_size, 4));
     return ret;
 }
 
@@ -2889,7 +2902,7 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
         }
     }
 
-    vkd3d_free(allocator.regs);
+    vkd3d_free(allocator.allocations);
 }
 
 /* Simple greedy temporary register allocation pass that just assigns a unique
@@ -2900,8 +2913,8 @@ static void allocate_temp_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functio
 {
     struct register_allocator allocator = {0};
     allocate_temp_registers_recurse(ctx, &entry_func->body, &allocator);
-    ctx->temp_count = allocator.reg_count;
-    vkd3d_free(allocator.regs);
+    ctx->temp_count = allocator.max_reg + 1;
+    vkd3d_free(allocator.allocations);
 }
 
 static void allocate_semantic_register(struct hlsl_ctx *ctx, struct hlsl_ir_var *var, unsigned int *counter, bool output)
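As an aside for readers who want the shape of the new scheme without the diff noise: availability is now computed by scanning a flat list of allocation records and masking out every record whose live range overlaps the query. A condensed, standalone C sketch of that idea follows — vkd3d helpers such as hlsl_array_reserve() and the ctx/error handling are replaced with plain C, so this is an illustration of the technique, not the literal implementation:

```
#include <stdint.h>
#include <stdlib.h>

#define WRITEMASK_ALL 0xfu /* xyzw */

struct allocation
{
    uint32_t reg;
    unsigned int writemask;
    unsigned int first_write, last_read;
};

struct register_allocator
{
    struct allocation *allocations;
    size_t count;
    uint32_t max_reg; /* highest register index allocated so far */
};

/* Which components of reg_idx are free over the whole [first_write, last_read)
 * range? Every recorded allocation whose range overlaps masks out its
 * components; ranges that merely touch (one ends exactly where the other
 * starts) do not conflict, which is what allows "add r0, r0, r1". */
static unsigned int get_available_writemask(const struct register_allocator *allocator,
        unsigned int first_write, unsigned int last_read, uint32_t reg_idx)
{
    unsigned int writemask = WRITEMASK_ALL;
    size_t i;

    for (i = 0; i < allocator->count; ++i)
    {
        const struct allocation *a = &allocator->allocations[i];

        if (a->reg == reg_idx && first_write < a->last_read && last_read > a->first_write)
            writemask &= ~a->writemask;
    }
    return writemask;
}

static void record_allocation(struct register_allocator *allocator, uint32_t reg_idx,
        unsigned int writemask, unsigned int first_write, unsigned int last_read)
{
    /* Allocation-failure handling elided; the real code uses hlsl_array_reserve(). */
    struct allocation a = {reg_idx, writemask, first_write, last_read};

    allocator->allocations = realloc(allocator->allocations,
            (allocator->count + 1) * sizeof(*allocator->allocations));
    allocator->allocations[allocator->count++] = a;
    if (reg_idx > allocator->max_reg)
        allocator->max_reg = reg_idx;
}
```

Since there is no preallocated bitmap to size or zero up front, a caller can record an allocation at any point — for instance pinning r0 before the greedy pass runs, as the ps_1_* patch below does — which is exactly the "multiple passes" property the commit message refers to.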
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Map the colour output for ps_1_* to r0.
---
 libs/vkd3d-shader/d3dbc.c        |  7 ++++++-
 libs/vkd3d-shader/hlsl_codegen.c | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/d3dbc.c b/libs/vkd3d-shader/d3dbc.c
index 14268440..011d967f 100644
--- a/libs/vkd3d-shader/d3dbc.c
+++ b/libs/vkd3d-shader/d3dbc.c
@@ -1838,7 +1838,12 @@ static void write_sm1_store(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *
 
     if (store->lhs.var->is_output_semantic)
     {
-        if (!hlsl_sm1_register_from_semantic(ctx, &store->lhs.var->semantic,
+        if (ctx->profile->type == VKD3D_SHADER_TYPE_PIXEL && ctx->profile->major_version == 1)
+        {
+            sm1_instr.dst.type = D3DSPR_TEMP;
+            sm1_instr.dst.reg = 0;
+        }
+        else if (!hlsl_sm1_register_from_semantic(ctx, &store->lhs.var->semantic,
                 true, &sm1_instr.dst.type, &sm1_instr.dst.reg))
         {
             assert(reg.allocated);
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 1081422e..d3dac815 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2912,6 +2912,24 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
 static void allocate_temp_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func)
 {
     struct register_allocator allocator = {0};
+
+    /* ps_1_* outputs are special and go in temp register 0. */
+    if (ctx->profile->major_version == 1 && ctx->profile->type == VKD3D_SHADER_TYPE_PIXEL)
+    {
+        size_t i;
+
+        for (i = 0; i < entry_func->parameters.count; ++i)
+        {
+            const struct hlsl_ir_var *var = entry_func->parameters.vars[i];
+
+            if (var->is_output_semantic)
+            {
+                record_allocation(ctx, &allocator, 0, VKD3DSP_WRITEMASK_ALL, var->first_write, var->last_read);
+                break;
+            }
+        }
+    }
+
     allocate_temp_registers_recurse(ctx, &entry_func->body, &allocator);
     ctx->temp_count = allocator.max_reg + 1;
     vkd3d_free(allocator.allocations);
@@ -2940,6 +2958,10 @@ static void allocate_semantic_register(struct hlsl_ctx *ctx, struct hlsl_ir_var
     D3DDECLUSAGE usage;
     uint32_t usage_idx;
 
+    /* ps_1_* outputs are special and go in temp register 0. */
+    if (ctx->profile->major_version == 1 && output && ctx->profile->type == VKD3D_SHADER_TYPE_PIXEL)
+        return;
+
     builtin = hlsl_sm1_register_from_semantic(ctx, &var->semantic, output, &type, &reg);
     if (!builtin && !hlsl_sm1_usage_from_semantic(&var->semantic, &usage, &usage_idx))
     {
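To make the reservation's effect concrete, here is a toy model of the allocator (hypothetical names, whole registers only, no writemasks — not the vkd3d code): once r0 is recorded for the output's live range, a temp with an overlapping range is pushed to r1, while a range that only touches the reservation could still reuse r0:

```
#include <stdio.h>

struct span { unsigned int reg, first_write, last_read; };

/* Greedy scan: return the lowest register whose recorded spans do not
 * overlap [first_write, last_read). */
static unsigned int pick_reg(const struct span *taken, size_t count,
        unsigned int first_write, unsigned int last_read)
{
    unsigned int reg = 0;
    size_t i;

    for (;; ++reg)
    {
        int free = 1;

        for (i = 0; i < count; ++i)
        {
            if (taken[i].reg == reg && first_write < taken[i].last_read
                    && last_read > taken[i].first_write)
                free = 0;
        }
        if (free)
            return reg;
    }
}

int main(void)
{
    /* The ps_1_* output is "allocated" to r0 for its whole live range. */
    struct span taken[] = {{0, 1, 10}};

    /* A temp live inside that range is pushed to r1... */
    printf("overlapping temp -> r%u\n", pick_reg(taken, 1, 2, 5));
    /* ...while one first written at the output's last read may reuse r0. */
    printf("touching temp    -> r%u\n", pick_reg(taken, 1, 10, 12));
    return 0;
}
```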
On Thu Apr 20 21:00:31 2023 +0000, Matteo Bruni wrote:
> In any case, I don't think you need to do anything about it.
I felt sort of the same confusion. Part of it is that "We do not overlap" can mean two things: "There is no overlap when fw == lr, so we can proceed" (the intended meaning) and "We (the programmers) do not want to create an overlap by accepting this case in which fw == lr, so we cannot proceed". I guess this might be a general problem with ergative verbs and an unclear pronoun (and in program comments I generally interpret "we" as the programmers).
"There is no overlap" feels much clearer to me.
The second instance of "we" ("we are allocating") is probably clearer, but maybe "we are" could be dropped there too, without sacrificing grammaticality and without requiring the casual reader to figure out who "we" is.
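For reference, the condition in the patch treats live ranges as half-open intervals, so first_write == last_read never counts as an overlap. A minimal standalone restatement of the check (names shortened; the patch inlines this rather than using a helper):

```
#include <stdbool.h>

/* Half-open ranges [first_write, last_read) conflict only if each one starts
 * strictly before the other ends. If a source's last read coincides with the
 * destination's first write -- as in "add r0, r0, r1" -- there is no overlap,
 * and the component can be reused for the result. */
static bool ranges_overlap(unsigned int first_write1, unsigned int last_read1,
        unsigned int first_write2, unsigned int last_read2)
{
    return first_write1 < last_read2 && first_write2 < last_read1;
}
```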
The MR is ok for me, except for the little remark about comment wording.

I just wanted to note that the new algorithm has higher computational complexity than before, because `get_available_writemask()` used to be constant time and it's now linear in the number of register allocations. This already causes a measurable performance hit in a synthetic but still relatively simple shader such as this:

```
uniform float4x4 x;
uniform float4x4 y;

float4 main(float4 pos : sv_position) : sv_target
{
    float4x4 a = mul(mul(y, x), mul(x, x));
    float4x4 b = mul(mul(y, x), mul(y, x));
    float4x4 c = mul(mul(y, y), mul(x, x));
    float4x4 d = mul(mul(y, y), mul(y, x));

    float4 ret = 0.0;
    ret += a[0] - b[0] * c[0] / d[0];
    ret += a[1] - b[1] * c[1] / d[1];
    ret += a[2] - b[2] * c[2] / d[2];
    ret += a[3] - b[3] * c[3] / d[3];

    return ret;
}
```
Here I am leveraging `mul()` to create a lot of temporaries and summing everything to prevent DCE from optimizing too much. On my computer a shader runner that just compiles this (doesn't execute it) takes 0.1 seconds before this MR and 0.11 seconds after it.
I don't claim any significance for my random microbenchmark experiment, so I don't think it's necessary to change the MR, but if and when we go hunting for performance in the HLSL compiler, let's remember to have a look here.
This merge request was approved by Giovanni Mascellani.
This merge request was approved by Henri Verbeet.
On Wed May 3 16:08:53 2023 +0000, Giovanni Mascellani wrote:
> The MR is ok for me, except for the little remark about comment wording.
>
> I just wanted to note that the new algorithm has higher computational complexity than before, because `get_available_writemask()` used to be constant time and it's now linear in the number of register allocations. This already causes a measurable performance hit in a synthetic but still relatively simple shader such as this:
>
>     uniform float4x4 x;
>     uniform float4x4 y;
>
>     float4 main(float4 pos : sv_position) : sv_target
>     {
>         float4x4 a = mul(mul(y, x), mul(x, x));
>         float4x4 b = mul(mul(y, x), mul(y, x));
>         float4x4 c = mul(mul(y, y), mul(x, x));
>         float4x4 d = mul(mul(y, y), mul(y, x));
>
>         float4 ret = 0.0;
>         ret += a[0] - b[0] * c[0] / d[0];
>         ret += a[1] - b[1] * c[1] / d[1];
>         ret += a[2] - b[2] * c[2] / d[2];
>         ret += a[3] - b[3] * c[3] / d[3];
>
>         return ret;
>     }
>
> Here I am leveraging `mul()` to create a lot of temporaries and summing everything to prevent DCE from optimizing too much. On my computer a shader runner that just compiles this (doesn't execute it) takes 0.1 seconds before this MR and 0.11 seconds after it. I don't claim any significance for my random microbenchmark experiment, so I don't think it's necessary to change the MR, but if and when we go hunting for performance in the HLSL compiler, let's remember to have a look here.
We could probably do better by just recording allocations for the few cases where we need to reserve, and then using the old pass for everything else. But it's probably not worth rewriting this again until we see evidence it matters.
On Wed May 3 16:08:53 2023 +0000, Zebediah Figura wrote:
> We could probably do better by just recording allocations for the few cases where we need to reserve, and then using the old pass for everything else. But it's probably not worth rewriting this again until we see evidence it matters.
Or simply use a more efficient data structure for recording allocations. I agree there is no need to do that now.
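One hypothetical shape such a structure could take — sketched here for illustration, not something proposed concretely in the thread — is to bucket the records by register index, so a query scans only the allocations already made on that one register rather than all of them:

```
#include <stddef.h>
#include <stdint.h>

#define WRITEMASK_ALL 0xfu

struct allocation
{
    unsigned int writemask;
    unsigned int first_write, last_read;
};

struct reg_bucket
{
    struct allocation *allocations;
    size_t count;
};

struct register_allocator
{
    struct reg_bucket *regs; /* indexed by register index */
    size_t reg_count;
};

static unsigned int get_available_writemask(const struct register_allocator *allocator,
        unsigned int first_write, unsigned int last_read, uint32_t reg_idx)
{
    unsigned int writemask = WRITEMASK_ALL;
    size_t i;

    /* Registers nothing has been recorded on are entirely free. */
    if (reg_idx >= allocator->reg_count)
        return writemask;

    /* Only this register's records are scanned, so the cost of a query is
     * bounded by the pressure on one register, not by the total number of
     * allocations in the shader. */
    for (i = 0; i < allocator->regs[reg_idx].count; ++i)
    {
        const struct allocation *a = &allocator->regs[reg_idx].allocations[i];

        if (first_write < a->last_read && last_read > a->first_write)
            writemask &= ~a->writemask;
    }
    return writemask;
}
```

A per-register interval tree (or a sorted span list plus binary search) would tighten the per-query cost further, but for the register counts real SM1/SM4 shaders hit, simple buckets would likely be enough.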