--
v2: vkd3d-shader/hlsl: Map the colour output for ps_1_* to r0.
    vkd3d-shader/hlsl: Rewrite the register allocator to allow allocating in multiple passes.
    vkd3d-shader/hlsl: Avoid leaking the allocator register map in allocate_const_registers().
    vkd3d-shader/hlsl: Rename struct liveness to struct register_allocator.
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Rename struct liveness to struct register_allocator.
---
 libs/vkd3d-shader/hlsl_codegen.c | 99 ++++++++++++++++----------------
 1 file changed, 51 insertions(+), 48 deletions(-)

diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 42f8ab3b..95406c43 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2557,7 +2557,7 @@ static void compute_liveness(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl
     compute_liveness_recurse(&entry_func->body, 0, 0);
 }
 
-struct liveness
+struct register_allocator
 {
     size_t size;
     uint32_t reg_count;
@@ -2568,14 +2568,14 @@ struct liveness
     } *regs;
 };
 
-static unsigned int get_available_writemask(struct liveness *liveness,
+static unsigned int get_available_writemask(struct register_allocator *allocator,
         unsigned int first_write, unsigned int component_idx, unsigned int reg_size)
 {
     unsigned int i, writemask = 0, count = 0;
 
     for (i = 0; i < 4; ++i)
     {
-        if (liveness->regs[component_idx + i].last_read <= first_write)
+        if (allocator->regs[component_idx + i].last_read <= first_write)
         {
             writemask |= 1u << i;
             if (++count == reg_size)
@@ -2586,22 +2586,22 @@ static unsigned int get_available_writemask(struct liveness,
     return 0;
 }
 
-static bool resize_liveness(struct hlsl_ctx *ctx, struct liveness *liveness, size_t new_count)
+static bool resize_liveness(struct hlsl_ctx *ctx, struct register_allocator *allocator, size_t new_count)
 {
-    size_t old_capacity = liveness->size;
+    size_t old_capacity = allocator->size;
 
-    if (!hlsl_array_reserve(ctx, (void **)&liveness->regs, &liveness->size, new_count, sizeof(*liveness->regs)))
+    if (!hlsl_array_reserve(ctx, (void **)&allocator->regs, &allocator->size, new_count, sizeof(*allocator->regs)))
         return false;
 
-    if (liveness->size > old_capacity)
-        memset(liveness->regs + old_capacity, 0, (liveness->size - old_capacity) * sizeof(*liveness->regs));
+    if (allocator->size > old_capacity)
+        memset(allocator->regs + old_capacity, 0, (allocator->size - old_capacity) * sizeof(*allocator->regs));
     return true;
 }
 
 /* reg_size is the number of register components to be reserved, while component_count is the number
  * of components for the register's writemask. In SM1, floats and vectors allocate the whole
  * register, even if they don't use it completely. */
-static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness *liveness,
+static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, unsigned int reg_size,
         unsigned int component_count)
 {
@@ -2610,74 +2610,74 @@ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness *
 
     assert(component_count <= reg_size);
 
-    for (component_idx = 0; component_idx < liveness->size; component_idx += 4)
+    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
     {
-        if ((writemask = get_available_writemask(liveness, first_write, component_idx, reg_size)))
+        if ((writemask = get_available_writemask(allocator, first_write, component_idx, reg_size)))
             break;
     }
-    if (component_idx == liveness->size)
+    if (component_idx == allocator->size)
    {
-        if (!resize_liveness(ctx, liveness, component_idx + 4))
+        if (!resize_liveness(ctx, allocator, component_idx + 4))
             return ret;
         writemask = (1u << reg_size) - 1;
     }
     for (i = 0; i < 4; ++i)
     {
         if (writemask & (1u << i))
-            liveness->regs[component_idx + i].last_read = last_read;
+            allocator->regs[component_idx + i].last_read = last_read;
     }
     ret.id = component_idx / 4;
     ret.writemask = hlsl_combine_writemasks(writemask, (1u << component_count) - 1);
     ret.allocated = true;
-    liveness->reg_count = max(liveness->reg_count, ret.id + 1);
+    allocator->reg_count = max(allocator->reg_count, ret.id + 1);
     return ret;
 }
 
-static bool is_range_available(struct liveness *liveness, unsigned int first_write,
+static bool is_range_available(struct register_allocator *allocator, unsigned int first_write,
         unsigned int component_idx, unsigned int reg_size)
 {
     unsigned int i;
 
     for (i = 0; i < reg_size; i += 4)
     {
-        if (!get_available_writemask(liveness, first_write, component_idx + i, 4))
+        if (!get_available_writemask(allocator, first_write, component_idx + i, 4))
             return false;
     }
     return true;
 }
 
-static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct liveness *liveness,
+static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, unsigned int reg_size)
 {
     unsigned int i, component_idx;
     struct hlsl_reg ret = {0};
 
-    for (component_idx = 0; component_idx < liveness->size; component_idx += 4)
+    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
     {
-        if (is_range_available(liveness, first_write, component_idx,
-                min(reg_size, liveness->size - component_idx)))
+        if (is_range_available(allocator, first_write, component_idx,
+                min(reg_size, allocator->size - component_idx)))
             break;
     }
-    if (!resize_liveness(ctx, liveness, component_idx + reg_size))
+    if (!resize_liveness(ctx, allocator, component_idx + reg_size))
         return ret;
 
     for (i = 0; i < reg_size; ++i)
-        liveness->regs[component_idx + i].last_read = last_read;
+        allocator->regs[component_idx + i].last_read = last_read;
     ret.id = component_idx / 4;
     ret.allocated = true;
-    liveness->reg_count = max(liveness->reg_count, ret.id + align(reg_size, 4));
+    allocator->reg_count = max(allocator->reg_count, ret.id + align(reg_size, 4));
     return ret;
 }
 
-static struct hlsl_reg allocate_numeric_registers_for_type(struct hlsl_ctx *ctx, struct liveness *liveness,
+static struct hlsl_reg allocate_numeric_registers_for_type(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, const struct hlsl_type *type)
 {
     unsigned int reg_size = type->reg_size[HLSL_REGSET_NUMERIC];
 
     if (type->class <= HLSL_CLASS_VECTOR)
-        return allocate_register(ctx, liveness, first_write, last_read, reg_size, type->dimx);
+        return allocate_register(ctx, allocator, first_write, last_read, reg_size, type->dimx);
     else
-        return allocate_range(ctx, liveness, first_write, last_read, reg_size);
+        return allocate_range(ctx, allocator, first_write, last_read, reg_size);
 }
 
 static const char *debug_register(char class, struct hlsl_reg reg, const struct hlsl_type *type)
@@ -2696,14 +2696,15 @@ static const char *debug_register(char class, struct hlsl_reg reg, const struct
     return vkd3d_dbg_sprintf("%c%u%s", class, reg.id, debug_hlsl_writemask(reg.writemask));
 }
 
-static void allocate_variable_temp_register(struct hlsl_ctx *ctx, struct hlsl_ir_var *var, struct liveness *liveness)
+static void allocate_variable_temp_register(struct hlsl_ctx *ctx,
+        struct hlsl_ir_var *var, struct register_allocator *allocator)
 {
     if (var->is_input_semantic || var->is_output_semantic || var->is_uniform)
         return;
 
     if (!var->regs[HLSL_REGSET_NUMERIC].allocated && var->last_read)
     {
-        var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, liveness,
+        var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, allocator,
                 var->first_write, var->last_read, var->data_type);
 
         TRACE("Allocated %s to %s (liveness %u-%u).\n", var->name, debug_register('r',
@@ -2711,7 +2712,8 @@ static void allocate_variable_temp_register(struct hlsl_ctx *ctx, struct hlsl_ir
     }
 }
 
-static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_block *block, struct liveness *liveness)
+static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx,
+        struct hlsl_block *block, struct register_allocator *allocator)
 {
     struct hlsl_ir_node *instr;
 
@@ -2719,7 +2721,7 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
     {
         if (!instr->reg.allocated && instr->last_read)
         {
-            instr->reg = allocate_numeric_registers_for_type(ctx, liveness, instr->index, instr->last_read,
+            instr->reg = allocate_numeric_registers_for_type(ctx, allocator, instr->index, instr->last_read,
                     instr->data_type);
             TRACE("Allocated anonymous expression @%u to %s (liveness %u-%u).\n", instr->index,
                     debug_register('r', instr->reg, instr->data_type), instr->index, instr->last_read);
@@ -2730,8 +2732,8 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
             case HLSL_IR_IF:
             {
                 struct hlsl_ir_if *iff = hlsl_ir_if(instr);
-                allocate_temp_registers_recurse(ctx, &iff->then_block, liveness);
-                allocate_temp_registers_recurse(ctx, &iff->else_block, liveness);
+                allocate_temp_registers_recurse(ctx, &iff->then_block, allocator);
+                allocate_temp_registers_recurse(ctx, &iff->else_block, allocator);
                 break;
             }
 
@@ -2740,21 +2742,21 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
             {
                 struct hlsl_ir_load *load = hlsl_ir_load(instr);
                 /* We need to at least allocate a variable for undefs.
                  * FIXME: We should probably find a way to remove them instead. */
-                allocate_variable_temp_register(ctx, load->src.var, liveness);
+                allocate_variable_temp_register(ctx, load->src.var, allocator);
                 break;
             }
 
             case HLSL_IR_LOOP:
             {
                 struct hlsl_ir_loop *loop = hlsl_ir_loop(instr);
-                allocate_temp_registers_recurse(ctx, &loop->body, liveness);
+                allocate_temp_registers_recurse(ctx, &loop->body, allocator);
                 break;
             }
 
             case HLSL_IR_STORE:
             {
                 struct hlsl_ir_store *store = hlsl_ir_store(instr);
-                allocate_variable_temp_register(ctx, store->lhs.var, liveness);
+                allocate_variable_temp_register(ctx, store->lhs.var, allocator);
                 break;
             }
 
@@ -2764,7 +2766,8 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl
     }
 }
 
-static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_block *block, struct liveness *liveness)
+static void allocate_const_registers_recurse(struct hlsl_ctx *ctx,
+        struct hlsl_block *block, struct register_allocator *allocator)
 {
     struct hlsl_constant_defs *defs = &ctx->constant_defs;
     struct hlsl_ir_node *instr;
@@ -2780,7 +2783,7 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b
                 unsigned int x, y, i, writemask, end_reg;
                 unsigned int reg_size = type->reg_size[HLSL_REGSET_NUMERIC];
 
-                constant->reg = allocate_numeric_registers_for_type(ctx, liveness, 1, UINT_MAX, type);
+                constant->reg = allocate_numeric_registers_for_type(ctx, allocator, 1, UINT_MAX, type);
                 TRACE("Allocated constant @%u to %s.\n", instr->index, debug_register('c', constant->reg, type));
 
                 if (!hlsl_array_reserve(ctx, (void **)&defs->values, &defs->size,
@@ -2845,15 +2848,15 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b
             case HLSL_IR_IF:
             {
                 struct hlsl_ir_if *iff = hlsl_ir_if(instr);
-                allocate_const_registers_recurse(ctx, &iff->then_block, liveness);
-                allocate_const_registers_recurse(ctx, &iff->else_block, liveness);
+                allocate_const_registers_recurse(ctx, &iff->then_block, allocator);
+                allocate_const_registers_recurse(ctx, &iff->else_block, allocator);
                 break;
             }
 
             case HLSL_IR_LOOP:
            {
                 struct hlsl_ir_loop *loop = hlsl_ir_loop(instr);
-                allocate_const_registers_recurse(ctx, &loop->body, liveness);
+                allocate_const_registers_recurse(ctx, &loop->body, allocator);
                 break;
             }
 
@@ -2865,10 +2868,10 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b
 
 static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func)
 {
-    struct liveness liveness = {0};
+    struct register_allocator allocator = {0};
     struct hlsl_ir_var *var;
 
-    allocate_const_registers_recurse(ctx, &entry_func->body, &liveness);
+    allocate_const_registers_recurse(ctx, &entry_func->body, &allocator);
 
     LIST_FOR_EACH_ENTRY(var, &ctx->extern_vars, struct hlsl_ir_var, extern_entry)
     {
@@ -2879,7 +2882,7 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
             if (reg_size == 0)
                 continue;
 
-            var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, &liveness,
+            var->regs[HLSL_REGSET_NUMERIC] = allocate_numeric_registers_for_type(ctx, &allocator,
                     1, UINT_MAX, var->data_type);
             TRACE("Allocated %s to %s.\n", var->name,
                     debug_register('c', var->regs[HLSL_REGSET_NUMERIC], var->data_type));
@@ -2893,10 +2896,10 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
  * does not handle constants. */
 static void allocate_temp_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func)
 {
-    struct liveness liveness = {0};
-    allocate_temp_registers_recurse(ctx, &entry_func->body, &liveness);
-    ctx->temp_count = liveness.reg_count;
-    vkd3d_free(liveness.regs);
+    struct register_allocator allocator = {0};
+    allocate_temp_registers_recurse(ctx, &entry_func->body, &allocator);
+    ctx->temp_count = allocator.reg_count;
+    vkd3d_free(allocator.regs);
 }
 
 static void allocate_semantic_register(struct hlsl_ctx *ctx, struct hlsl_ir_var *var, unsigned int *counter, bool output)
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Avoid leaking the allocator register map in allocate_const_registers().
---
 libs/vkd3d-shader/hlsl_codegen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 95406c43..e06b6c0f 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2888,6 +2888,8 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
                     debug_register('c', var->regs[HLSL_REGSET_NUMERIC], var->data_type));
         }
     }
+
+    vkd3d_free(allocator.regs);
 }
 
 /* Simple greedy temporary register allocation pass that just assigns a unique
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Rewrite the register allocator to allow allocating in multiple passes.
We will need this in order to allocate some "special" registers: ps_1_* output, sincos output, etc.
---
 libs/vkd3d-shader/hlsl_codegen.c | 129 +++++++++++++++++--------------
 1 file changed, 71 insertions(+), 58 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index e06b6c0f..1081422e 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2559,43 +2559,61 @@ static void compute_liveness(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl
 
 struct register_allocator
 {
-    size_t size;
-    uint32_t reg_count;
-    struct
+    size_t count, capacity;
+
+    /* Highest register index that has been allocated.
+     * Used to declare sm4 temp count. */
+    uint32_t max_reg;
+
+    struct allocation
     {
-        /* 0 if not live yet. */
-        unsigned int last_read;
-    } *regs;
+        uint32_t reg;
+        unsigned int writemask;
+        unsigned int first_write, last_read;
+    } *allocations;
 };
 
-static unsigned int get_available_writemask(struct register_allocator *allocator,
-        unsigned int first_write, unsigned int component_idx, unsigned int reg_size)
+static unsigned int get_available_writemask(const struct register_allocator *allocator,
+        unsigned int first_write, unsigned int last_read, uint32_t reg_idx)
 {
-    unsigned int i, writemask = 0, count = 0;
+    unsigned int writemask = VKD3DSP_WRITEMASK_ALL;
+    size_t i;
 
-    for (i = 0; i < 4; ++i)
+    for (i = 0; i < allocator->count; ++i)
     {
-        if (allocator->regs[component_idx + i].last_read <= first_write)
-        {
-            writemask |= 1u << i;
-            if (++count == reg_size)
-                return writemask;
-        }
+        const struct allocation *allocation = &allocator->allocations[i];
+
+        /* We do not overlap if first write == last read:
+         * this is the case where we are allocating the result of that
+         * expression, e.g. "add r0, r0, r1". */
+
+        if (allocation->reg == reg_idx
+                && first_write < allocation->last_read && last_read > allocation->first_write)
+            writemask &= ~allocation->writemask;
+
+        if (!writemask)
+            break;
     }
 
-    return 0;
+    return writemask;
 }
 
-static bool resize_liveness(struct hlsl_ctx *ctx, struct register_allocator *allocator, size_t new_count)
+static void record_allocation(struct hlsl_ctx *ctx, struct register_allocator *allocator,
+        uint32_t reg_idx, unsigned int writemask, unsigned int first_write, unsigned int last_read)
 {
-    size_t old_capacity = allocator->size;
+    struct allocation *allocation;
 
-    if (!hlsl_array_reserve(ctx, (void **)&allocator->regs, &allocator->size, new_count, sizeof(*allocator->regs)))
-        return false;
+    if (!hlsl_array_reserve(ctx, (void **)&allocator->allocations, &allocator->capacity,
+            allocator->count + 1, sizeof(*allocator->allocations)))
+        return;
 
-    if (allocator->size > old_capacity)
-        memset(allocator->regs + old_capacity, 0, (allocator->size - old_capacity) * sizeof(*allocator->regs));
-    return true;
+    allocation = &allocator->allocations[allocator->count++];
+    allocation->reg = reg_idx;
+    allocation->writemask = writemask;
+    allocation->first_write = first_write;
+    allocation->last_read = last_read;
+
+    allocator->max_reg = max(allocator->max_reg, reg_idx);
 }
 
 /* reg_size is the number of register components to be reserved, while component_count is the number
@@ -2605,42 +2623,39 @@ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct register_a
         unsigned int first_write, unsigned int last_read, unsigned int reg_size,
         unsigned int component_count)
 {
-    unsigned int component_idx, writemask, i;
     struct hlsl_reg ret = {0};
+    unsigned int writemask;
+    uint32_t reg_idx;
 
     assert(component_count <= reg_size);
 
-    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
+    for (reg_idx = 0;; ++reg_idx)
     {
-        if ((writemask = get_available_writemask(allocator, first_write, component_idx, reg_size)))
+        writemask = get_available_writemask(allocator, first_write, last_read, reg_idx);
+
+        if (vkd3d_popcount(writemask) >= reg_size)
+        {
+            writemask = hlsl_combine_writemasks(writemask, (1u << reg_size) - 1);
             break;
+        }
     }
-    if (component_idx == allocator->size)
-    {
-        if (!resize_liveness(ctx, allocator, component_idx + 4))
-            return ret;
-        writemask = (1u << reg_size) - 1;
-    }
-    for (i = 0; i < 4; ++i)
-    {
-        if (writemask & (1u << i))
-            allocator->regs[component_idx + i].last_read = last_read;
-    }
-    ret.id = component_idx / 4;
+
+    record_allocation(ctx, allocator, reg_idx, writemask, first_write, last_read);
+
+    ret.id = reg_idx;
     ret.writemask = hlsl_combine_writemasks(writemask, (1u << component_count) - 1);
     ret.allocated = true;
-    allocator->reg_count = max(allocator->reg_count, ret.id + 1);
     return ret;
 }
 
-static bool is_range_available(struct register_allocator *allocator, unsigned int first_write,
-        unsigned int component_idx, unsigned int reg_size)
+static bool is_range_available(const struct register_allocator *allocator,
+        unsigned int first_write, unsigned int last_read, uint32_t reg_idx, unsigned int reg_size)
 {
-    unsigned int i;
+    uint32_t i;
 
-    for (i = 0; i < reg_size; i += 4)
+    for (i = 0; i < (reg_size / 4); ++i)
     {
-        if (!get_available_writemask(allocator, first_write, component_idx + i, 4))
+        if (get_available_writemask(allocator, first_write, last_read, reg_idx + i) != VKD3DSP_WRITEMASK_ALL)
             return false;
     }
     return true;
@@ -2649,23 +2664,21 @@ static bool is_range_available(struct register_allocator *allocator, unsigned in
 static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct register_allocator *allocator,
         unsigned int first_write, unsigned int last_read, unsigned int reg_size)
 {
-    unsigned int i, component_idx;
     struct hlsl_reg ret = {0};
+    uint32_t reg_idx;
+    unsigned int i;
 
-    for (component_idx = 0; component_idx < allocator->size; component_idx += 4)
+    for (reg_idx = 0;; ++reg_idx)
     {
-        if (is_range_available(allocator, first_write, component_idx,
-                min(reg_size, allocator->size - component_idx)))
+        if (is_range_available(allocator, first_write, last_read, reg_idx, reg_size))
             break;
     }
-    if (!resize_liveness(ctx, allocator, component_idx + reg_size))
-        return ret;
 
-    for (i = 0; i < reg_size; ++i)
-        allocator->regs[component_idx + i].last_read = last_read;
-    ret.id = component_idx / 4;
+    for (i = 0; i < reg_size / 4; ++i)
+        record_allocation(ctx, allocator, reg_idx + i, VKD3DSP_WRITEMASK_ALL, first_write, last_read);
+
+    ret.id = reg_idx;
     ret.allocated = true;
-    allocator->reg_count = max(allocator->reg_count, ret.id + align(reg_size, 4));
     return ret;
 }
 
@@ -2889,7 +2902,7 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
         }
     }
 
-    vkd3d_free(allocator.regs);
+    vkd3d_free(allocator.allocations);
 }
 
 /* Simple greedy temporary register allocation pass that just assigns a unique
@@ -2900,8 +2913,8 @@ static void allocate_temp_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functio
 {
     struct register_allocator allocator = {0};
     allocate_temp_registers_recurse(ctx, &entry_func->body, &allocator);
-    ctx->temp_count = allocator.reg_count;
-    vkd3d_free(allocator.regs);
+    ctx->temp_count = allocator.max_reg + 1;
+    vkd3d_free(allocator.allocations);
 }
 
 static void allocate_semantic_register(struct hlsl_ctx *ctx, struct hlsl_ir_var *var, unsigned int *counter, bool output)
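As an aside for readers who want the shape of the new scheme without the diff noise: availability is now computed by scanning a flat list of allocation records and masking out every record whose live range overlaps the query. A condensed, standalone C sketch of that idea follows — vkd3d helpers such as hlsl_array_reserve() and the ctx/error handling are replaced with plain C, so this is an illustration of the technique, not the literal implementation:

```
#include <stdint.h>
#include <stdlib.h>

#define WRITEMASK_ALL 0xfu /* xyzw */

struct allocation
{
    uint32_t reg;
    unsigned int writemask;
    unsigned int first_write, last_read;
};

struct register_allocator
{
    struct allocation *allocations;
    size_t count;
    uint32_t max_reg; /* highest register index allocated so far */
};

/* Which components of reg_idx are free over the whole [first_write, last_read)
 * range? Every recorded allocation whose range overlaps masks out its
 * components; ranges that merely touch (one ends exactly where the other
 * starts) do not conflict, which is what allows "add r0, r0, r1". */
static unsigned int get_available_writemask(const struct register_allocator *allocator,
        unsigned int first_write, unsigned int last_read, uint32_t reg_idx)
{
    unsigned int writemask = WRITEMASK_ALL;
    size_t i;

    for (i = 0; i < allocator->count; ++i)
    {
        const struct allocation *a = &allocator->allocations[i];

        if (a->reg == reg_idx && first_write < a->last_read && last_read > a->first_write)
            writemask &= ~a->writemask;
    }
    return writemask;
}

static void record_allocation(struct register_allocator *allocator, uint32_t reg_idx,
        unsigned int writemask, unsigned int first_write, unsigned int last_read)
{
    /* Allocation-failure handling elided; the real code uses hlsl_array_reserve(). */
    struct allocation a = {reg_idx, writemask, first_write, last_read};

    allocator->allocations = realloc(allocator->allocations,
            (allocator->count + 1) * sizeof(*allocator->allocations));
    allocator->allocations[allocator->count++] = a;
    if (reg_idx > allocator->max_reg)
        allocator->max_reg = reg_idx;
}
```

Since there is no preallocated bitmap to size or zero up front, a caller can record an allocation at any point — for instance pinning r0 before the greedy pass runs, as the ps_1_* patch below does — which is exactly the "multiple passes" property the commit message refers to.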
From: Zebediah Figura <zfigura@codeweavers.com>
Subject: vkd3d-shader/hlsl: Map the colour output for ps_1_* to r0.
---
 libs/vkd3d-shader/d3dbc.c        |  7 ++++++-
 libs/vkd3d-shader/hlsl_codegen.c | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/d3dbc.c b/libs/vkd3d-shader/d3dbc.c
index 14268440..011d967f 100644
--- a/libs/vkd3d-shader/d3dbc.c
+++ b/libs/vkd3d-shader/d3dbc.c
@@ -1838,7 +1838,12 @@ static void write_sm1_store(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *
 
     if (store->lhs.var->is_output_semantic)
     {
-        if (!hlsl_sm1_register_from_semantic(ctx, &store->lhs.var->semantic,
+        if (ctx->profile->type == VKD3D_SHADER_TYPE_PIXEL && ctx->profile->major_version == 1)
+        {
+            sm1_instr.dst.type = D3DSPR_TEMP;
+            sm1_instr.dst.reg = 0;
+        }
+        else if (!hlsl_sm1_register_from_semantic(ctx, &store->lhs.var->semantic,
                 true, &sm1_instr.dst.type, &sm1_instr.dst.reg))
         {
             assert(reg.allocated);
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 1081422e..d3dac815 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2912,6 +2912,24 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi
 static void allocate_temp_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func)
 {
     struct register_allocator allocator = {0};
+
+    /* ps_1_* outputs are special and go in temp register 0. */
+    if (ctx->profile->major_version == 1 && ctx->profile->type == VKD3D_SHADER_TYPE_PIXEL)
+    {
+        size_t i;
+
+        for (i = 0; i < entry_func->parameters.count; ++i)
+        {
+            const struct hlsl_ir_var *var = entry_func->parameters.vars[i];
+
+            if (var->is_output_semantic)
+            {
+                record_allocation(ctx, &allocator, 0, VKD3DSP_WRITEMASK_ALL, var->first_write, var->last_read);
+                break;
+            }
+        }
+    }
+
     allocate_temp_registers_recurse(ctx, &entry_func->body, &allocator);
     ctx->temp_count = allocator.max_reg + 1;
     vkd3d_free(allocator.allocations);
@@ -2940,6 +2958,10 @@ static void allocate_semantic_register(struct hlsl_ctx *ctx, struct hlsl_ir_var
     D3DDECLUSAGE usage;
     uint32_t usage_idx;
 
+    /* ps_1_* outputs are special and go in temp register 0. */
+    if (ctx->profile->major_version == 1 && output && ctx->profile->type == VKD3D_SHADER_TYPE_PIXEL)
+        return;
+
     builtin = hlsl_sm1_register_from_semantic(ctx, &var->semantic, output, &type, &reg);
     if (!builtin && !hlsl_sm1_usage_from_semantic(&var->semantic, &usage, &usage_idx))
     {
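To make the reservation's effect concrete, here is a toy model of the allocator (hypothetical names, whole registers only, no writemasks — not the vkd3d code): once r0 is recorded for the output's live range, a temp with an overlapping range is pushed to r1, while a range that only touches the reservation could still reuse r0:

```
#include <stdio.h>

struct span { unsigned int reg, first_write, last_read; };

/* Greedy scan: return the lowest register whose recorded spans do not
 * overlap [first_write, last_read). */
static unsigned int pick_reg(const struct span *taken, size_t count,
        unsigned int first_write, unsigned int last_read)
{
    unsigned int reg = 0;
    size_t i;

    for (;; ++reg)
    {
        int free = 1;

        for (i = 0; i < count; ++i)
        {
            if (taken[i].reg == reg && first_write < taken[i].last_read
                    && last_read > taken[i].first_write)
                free = 0;
        }
        if (free)
            return reg;
    }
}

int main(void)
{
    /* The ps_1_* output is "allocated" to r0 for its whole live range. */
    struct span taken[] = {{0, 1, 10}};

    /* A temp live inside that range is pushed to r1... */
    printf("overlapping temp -> r%u\n", pick_reg(taken, 1, 2, 5));
    /* ...while one first written at the output's last read may reuse r0. */
    printf("touching temp    -> r%u\n", pick_reg(taken, 1, 10, 12));
    return 0;
}
```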
On Thu Apr 20 21:00:31 2023 +0000, Matteo Bruni wrote:
> In any case, I don't think you need to do anything about it.
I felt sort of the same confusion. Part of it is that "We do not overlap" can mean two things: "There is no overlap when fw == lr, so we can proceed" (the intended meaning) and "We (the programmers) do not want to create an overlap by accepting this case in which fw == lr, so we cannot proceed". I guess this might be a general problem with ergative verbs and an unclear pronoun (and in program comments I generally interpret "we" as the programmers).
"There is no overlap" feels much clearer to me.
The second instance of "we" ("we are allocating") is probably clearer, but maybe "we are" could be dropped there too, without sacrificing grammaticality and without requiring the casual reader to figure out who "we" is.
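For reference, the condition in the patch treats live ranges as half-open intervals, so first_write == last_read never counts as an overlap. A minimal standalone restatement of the check (names shortened; the patch inlines this rather than using a helper):

```
#include <stdbool.h>

/* Half-open ranges [first_write, last_read) conflict only if each one starts
 * strictly before the other ends. If a source's last read coincides with the
 * destination's first write -- as in "add r0, r0, r1" -- there is no overlap,
 * and the component can be reused for the result. */
static bool ranges_overlap(unsigned int first_write1, unsigned int last_read1,
        unsigned int first_write2, unsigned int last_read2)
{
    return first_write1 < last_read2 && first_write2 < last_read1;
}
```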
The MR is ok for me, except for the little remark about comment wording.

I just wanted to note that the new algorithm has higher computational complexity than before, because `get_available_writemask()` used to be constant time and it's now linear in the number of register allocations. This already causes a measurable performance hit in a synthetic but still relatively simple shader such as this:

```
uniform float4x4 x;
uniform float4x4 y;

float4 main(float4 pos : sv_position) : sv_target
{
    float4x4 a = mul(mul(y, x), mul(x, x));
    float4x4 b = mul(mul(y, x), mul(y, x));
    float4x4 c = mul(mul(y, y), mul(x, x));
    float4x4 d = mul(mul(y, y), mul(y, x));

    float4 ret = 0.0;
    ret += a[0] - b[0] * c[0] / d[0];
    ret += a[1] - b[1] * c[1] / d[1];
    ret += a[2] - b[2] * c[2] / d[2];
    ret += a[3] - b[3] * c[3] / d[3];

    return ret;
}
```
Here I am leveraging `mul()` to create a lot of temporaries and summing everything to prevent DCE from optimizing too much. On my computer a shader runner that just compiles this (doesn't execute it) takes 0.1 seconds before this MR and 0.11 seconds after it.
I don't claim any significance for my random microbenchmark experiment, so I don't think it's necessary to change the MR, but if and when we go hunting for performance in the HLSL compiler, let's remember to have a look here.
This merge request was approved by Giovanni Mascellani.
This merge request was approved by Henri Verbeet.
On Wed May 3 16:08:53 2023 +0000, Giovanni Mascellani wrote:
> The MR is ok for me, except for the little remark about comment wording.
>
> I just wanted to note that the new algorithm has higher computational complexity than before, because `get_available_writemask()` used to be constant time and it's now linear in the number of register allocations. This already causes a measurable performance hit in a synthetic but still relatively simple shader such as this:
>
>     uniform float4x4 x;
>     uniform float4x4 y;
>
>     float4 main(float4 pos : sv_position) : sv_target
>     {
>         float4x4 a = mul(mul(y, x), mul(x, x));
>         float4x4 b = mul(mul(y, x), mul(y, x));
>         float4x4 c = mul(mul(y, y), mul(x, x));
>         float4x4 d = mul(mul(y, y), mul(y, x));
>
>         float4 ret = 0.0;
>         ret += a[0] - b[0] * c[0] / d[0];
>         ret += a[1] - b[1] * c[1] / d[1];
>         ret += a[2] - b[2] * c[2] / d[2];
>         ret += a[3] - b[3] * c[3] / d[3];
>
>         return ret;
>     }
>
> Here I am leveraging `mul()` to create a lot of temporaries and summing everything to prevent DCE from optimizing too much. On my computer a shader runner that just compiles this (doesn't execute it) takes 0.1 seconds before this MR and 0.11 seconds after it. I don't claim any significance for my random microbenchmark experiment, so I don't think it's necessary to change the MR, but if and when we go hunting for performance in the HLSL compiler, let's remember to have a look here.
We could probably do better by just recording allocations for the few cases where we need to reserve, and then using the old pass for everything else. But it's probably not worth rewriting this again until we see evidence it matters.
On Wed May 3 16:08:53 2023 +0000, Zebediah Figura wrote:
> We could probably do better by just recording allocations for the few cases where we need to reserve, and then using the old pass for everything else. But it's probably not worth rewriting this again until we see evidence it matters.
Or simply use a more efficient data structure for recording allocations. I agree there is no need to do that now.
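One hypothetical shape such a structure could take — sketched here for illustration, not something proposed concretely in the thread — is to bucket the records by register index, so a query scans only the allocations already made on that one register rather than all of them:

```
#include <stddef.h>
#include <stdint.h>

#define WRITEMASK_ALL 0xfu

struct allocation
{
    unsigned int writemask;
    unsigned int first_write, last_read;
};

struct reg_bucket
{
    struct allocation *allocations;
    size_t count;
};

struct register_allocator
{
    struct reg_bucket *regs; /* indexed by register index */
    size_t reg_count;
};

static unsigned int get_available_writemask(const struct register_allocator *allocator,
        unsigned int first_write, unsigned int last_read, uint32_t reg_idx)
{
    unsigned int writemask = WRITEMASK_ALL;
    size_t i;

    /* Registers nothing has been recorded on are entirely free. */
    if (reg_idx >= allocator->reg_count)
        return writemask;

    /* Only this register's records are scanned, so the cost of a query is
     * bounded by the pressure on one register, not by the total number of
     * allocations in the shader. */
    for (i = 0; i < allocator->regs[reg_idx].count; ++i)
    {
        const struct allocation *a = &allocator->regs[reg_idx].allocations[i];

        if (first_write < a->last_read && last_read > a->first_write)
            writemask &= ~a->writemask;
    }
    return writemask;
}
```

A per-register interval tree (or a sorted span list plus binary search) would tighten the per-query cost further, but for the register counts real SM1/SM4 shaders hit, simple buckets would likely be enough.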