Because of the change introduced in f21693b2, SM1 scalars and vectors were not longer getting the correct writemask when they are allocated, so this is fixed.
Also, the mapping of sm1 src register swizzles is moved outside `write_sm1_instruction()` since there are some instructions that don't do this, remarkably dp2add. This is fixed.
Before the last patch we are writing the operation as: ``` dp2add r0.x, r1.x, r0.x, r2.x ```
and now it is: ``` dp2add r0.x, r1.xyxx, r0.xyxx, r2.x ```
dp2add now has its own function, `write_sm1_dp2add()`, since it seems to be the only instruction with this structure.
Ideally we would be using the default swizzles for the first two src arguments: ``` dp2add r0.x, r1, r0, r2.x ``` since, according to native's documentation, these are supported for all sm < 4.
But using default swizzles whenever is possible -- along with following the conversion of repeating the last component of the swizzle when fewer than 4 components are to be specified -- has a higher scope. Probably would involve modifying `hlsl_swizzle_from_writemask()` and `hlsl_map_swizzle()`.
-- v3: vkd3d-shader/hlsl: Fix SM1 dp2add swizzles. vkd3d-shader/hlsl: Map SM1 src swizzles outside write_sm1_instruction(). vkd3d-shader/hlsl: Set writemasks correctly for SM1 scalar and vector types. vkd3d-shader/hlsl: Expect component count in allocate_register(). vkd3d-shader/hlsl: Rename 'component_count' arguments to 'reg_size'.
From: Francisco Casas fcasas@codeweavers.com
component_count will be used in the next patch for the actual number of components of the type and not its register size. --- libs/vkd3d-shader/hlsl_codegen.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 308e3dae..1ccb006d 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -2131,7 +2131,7 @@ struct liveness };
static unsigned int get_available_writemask(struct liveness *liveness, - unsigned int first_write, unsigned int component_idx, unsigned int component_count) + unsigned int first_write, unsigned int component_idx, unsigned int reg_size) { unsigned int i, writemask = 0, count = 0;
@@ -2140,7 +2140,7 @@ static unsigned int get_available_writemask(struct liveness *liveness, if (liveness->regs[component_idx + i].last_read <= first_write) { writemask |= 1u << i; - if (++count == component_count) + if (++count == reg_size) return writemask; } } @@ -2161,21 +2161,21 @@ static bool resize_liveness(struct hlsl_ctx *ctx, struct liveness *liveness, siz }
static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness *liveness, - unsigned int first_write, unsigned int last_read, unsigned int component_count) + unsigned int first_write, unsigned int last_read, unsigned int reg_size) { unsigned int component_idx, writemask, i; struct hlsl_reg ret = {0};
for (component_idx = 0; component_idx < liveness->size; component_idx += 4) { - if ((writemask = get_available_writemask(liveness, first_write, component_idx, component_count))) + if ((writemask = get_available_writemask(liveness, first_write, component_idx, reg_size))) break; } if (component_idx == liveness->size) { if (!resize_liveness(ctx, liveness, component_idx + 4)) return ret; - writemask = (1u << component_count) - 1; + writemask = (1u << reg_size) - 1; } for (i = 0; i < 4; ++i) { @@ -2190,11 +2190,11 @@ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness * }
static bool is_range_available(struct liveness *liveness, unsigned int first_write, - unsigned int component_idx, unsigned int component_count) + unsigned int component_idx, unsigned int reg_size) { unsigned int i;
- for (i = 0; i < component_count; i += 4) + for (i = 0; i < reg_size; i += 4) { if (!get_available_writemask(liveness, first_write, component_idx + i, 4)) return false; @@ -2203,7 +2203,7 @@ static bool is_range_available(struct liveness *liveness, unsigned int first_wri }
static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct liveness *liveness, - unsigned int first_write, unsigned int last_read, unsigned int component_count) + unsigned int first_write, unsigned int last_read, unsigned int reg_size) { unsigned int i, component_idx; struct hlsl_reg ret = {0}; @@ -2211,17 +2211,17 @@ static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct liveness *liv for (component_idx = 0; component_idx < liveness->size; component_idx += 4) { if (is_range_available(liveness, first_write, component_idx, - min(component_count, liveness->size - component_idx))) + min(reg_size, liveness->size - component_idx))) break; } - if (!resize_liveness(ctx, liveness, component_idx + component_count)) + if (!resize_liveness(ctx, liveness, component_idx + reg_size)) return ret;
- for (i = 0; i < component_count; ++i) + for (i = 0; i < reg_size; ++i) liveness->regs[component_idx + i].last_read = last_read; ret.id = component_idx / 4; ret.allocated = true; - liveness->reg_count = max(liveness->reg_count, ret.id + align(component_count, 4)); + liveness->reg_count = max(liveness->reg_count, ret.id + align(reg_size, 4)); return ret; }
From: Francisco Casas fcasas@codeweavers.com
This in order to set the correct writemasks for SM1 registers. --- libs/vkd3d-shader/hlsl_codegen.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 1ccb006d..83b4c6b6 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -2160,12 +2160,18 @@ static bool resize_liveness(struct hlsl_ctx *ctx, struct liveness *liveness, siz return true; }
+/* reg_size is the number of register components to be reserved, while component_count is the number + * of components for the register's writemask. In SM1, floats and vectors allocate the whole + * register, even if they don't use it completely. */ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness *liveness, - unsigned int first_write, unsigned int last_read, unsigned int reg_size) + unsigned int first_write, unsigned int last_read, unsigned int reg_size, + unsigned int component_count) { unsigned int component_idx, writemask, i; struct hlsl_reg ret = {0};
+ assert(component_count <= reg_size); + for (component_idx = 0; component_idx < liveness->size; component_idx += 4) { if ((writemask = get_available_writemask(liveness, first_write, component_idx, reg_size))) @@ -2183,7 +2189,7 @@ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness * liveness->regs[component_idx + i].last_read = last_read; } ret.id = component_idx / 4; - ret.writemask = writemask; + ret.writemask = hlsl_combine_writemasks(writemask, (1u << component_count) - 1); ret.allocated = true; liveness->reg_count = max(liveness->reg_count, ret.id + 1); return ret; @@ -2253,7 +2259,7 @@ static void allocate_variable_temp_register(struct hlsl_ctx *ctx, struct hlsl_ir var->last_read, var->data_type->reg_size); else var->reg = allocate_register(ctx, liveness, var->first_write, - var->last_read, var->data_type->reg_size); + var->last_read, var->data_type->reg_size, var->data_type->reg_size); TRACE("Allocated %s to %s (liveness %u-%u).\n", var->name, debug_register('r', var->reg, var->data_type), var->first_write, var->last_read); } @@ -2272,7 +2278,7 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl instr->last_read, instr->data_type->reg_size); else instr->reg = allocate_register(ctx, liveness, instr->index, - instr->last_read, instr->data_type->reg_size); + instr->last_read, instr->data_type->reg_size, instr->data_type->reg_size); TRACE("Allocated anonymous expression @%u to %s (liveness %u-%u).\n", instr->index, debug_register('r', instr->reg, instr->data_type), instr->index, instr->last_read); } @@ -2335,7 +2341,7 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b if (reg_size > 4) constant->reg = allocate_range(ctx, liveness, 1, UINT_MAX, reg_size); else - constant->reg = allocate_register(ctx, liveness, 1, UINT_MAX, reg_size); + constant->reg = allocate_register(ctx, liveness, 1, UINT_MAX, reg_size, reg_size); TRACE("Allocated constant @%u to %s.\n", instr->index, debug_register('c', constant->reg, type));
if (!hlsl_array_reserve(ctx, (void **)&defs->values, &defs->size, @@ -2435,10 +2441,7 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi if (var->data_type->reg_size > 4) var->reg = allocate_range(ctx, &liveness, 1, UINT_MAX, var->data_type->reg_size); else - { - var->reg = allocate_register(ctx, &liveness, 1, UINT_MAX, 4); - var->reg.writemask = (1u << var->data_type->reg_size) - 1; - } + var->reg = allocate_register(ctx, &liveness, 1, UINT_MAX, 4, var->data_type->reg_size); TRACE("Allocated %s to %s.\n", var->name, debug_register('c', var->reg, var->data_type)); } }
From: Francisco Casas fcasas@codeweavers.com
Because of the change introduced in
f21693b2 vkd3d-shader/hlsl: Use reg_size as component count when allocating a single register.
SM1 scalars and vectors were not longer getting the correct writemask when they are allocated.
This happened because they have to reserve the whole register even if they only use some of its components, so their reg_size may differ from the number of components.
This commit fixes that. --- libs/vkd3d-shader/hlsl_codegen.c | 36 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index 83b4c6b6..7c95ebd1 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -2231,6 +2231,15 @@ static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct liveness *liv return ret; }
+static struct hlsl_reg allocate_numeric_registers_for_type(struct hlsl_ctx *ctx, struct liveness *liveness, + unsigned int first_write, unsigned int last_read, const struct hlsl_type *type) +{ + if (type->type <= HLSL_CLASS_VECTOR) + return allocate_register(ctx, liveness, first_write, last_read, type->reg_size, type->dimx); + else + return allocate_range(ctx, liveness, first_write, last_read, type->reg_size); +} + static const char *debug_register(char class, struct hlsl_reg reg, const struct hlsl_type *type) { static const char writemask_offset[] = {'w','x','y','z'}; @@ -2254,12 +2263,9 @@ static void allocate_variable_temp_register(struct hlsl_ctx *ctx, struct hlsl_ir
if (!var->reg.allocated && var->last_read) { - if (var->data_type->reg_size > 4) - var->reg = allocate_range(ctx, liveness, var->first_write, - var->last_read, var->data_type->reg_size); - else - var->reg = allocate_register(ctx, liveness, var->first_write, - var->last_read, var->data_type->reg_size, var->data_type->reg_size); + var->reg = allocate_numeric_registers_for_type(ctx, liveness, var->first_write, var->last_read, + var->data_type); + TRACE("Allocated %s to %s (liveness %u-%u).\n", var->name, debug_register('r', var->reg, var->data_type), var->first_write, var->last_read); } @@ -2273,12 +2279,8 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl { if (!instr->reg.allocated && instr->last_read) { - if (instr->data_type->reg_size > 4) - instr->reg = allocate_range(ctx, liveness, instr->index, - instr->last_read, instr->data_type->reg_size); - else - instr->reg = allocate_register(ctx, liveness, instr->index, - instr->last_read, instr->data_type->reg_size, instr->data_type->reg_size); + instr->reg = allocate_numeric_registers_for_type(ctx, liveness, instr->index, instr->last_read, + instr->data_type); TRACE("Allocated anonymous expression @%u to %s (liveness %u-%u).\n", instr->index, debug_register('r', instr->reg, instr->data_type), instr->index, instr->last_read); } @@ -2338,10 +2340,7 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b unsigned int x, y, i, writemask, end_reg; unsigned int reg_size = type->reg_size;
- if (reg_size > 4) - constant->reg = allocate_range(ctx, liveness, 1, UINT_MAX, reg_size); - else - constant->reg = allocate_register(ctx, liveness, 1, UINT_MAX, reg_size, reg_size); + constant->reg = allocate_numeric_registers_for_type(ctx, liveness, 1, UINT_MAX, type); TRACE("Allocated constant @%u to %s.\n", instr->index, debug_register('c', constant->reg, type));
if (!hlsl_array_reserve(ctx, (void **)&defs->values, &defs->size, @@ -2438,10 +2437,7 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi if (var->data_type->reg_size == 0) continue;
- if (var->data_type->reg_size > 4) - var->reg = allocate_range(ctx, &liveness, 1, UINT_MAX, var->data_type->reg_size); - else - var->reg = allocate_register(ctx, &liveness, 1, UINT_MAX, 4, var->data_type->reg_size); + var->reg = allocate_numeric_registers_for_type(ctx, &liveness, 1, UINT_MAX, var->data_type); TRACE("Allocated %s to %s.\n", var->name, debug_register('c', var->reg, var->data_type)); } }
From: Francisco Casas fcasas@codeweavers.com
Not every instruction expects src swizzles to be mapped according to the dst writemasks, so this logic must be outside this function. --- libs/vkd3d-shader/hlsl_sm1.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_sm1.c b/libs/vkd3d-shader/hlsl_sm1.c index 60b54a42..ebeaadb1 100644 --- a/libs/vkd3d-shader/hlsl_sm1.c +++ b/libs/vkd3d-shader/hlsl_sm1.c @@ -435,11 +435,9 @@ static void write_sm1_dst_register(struct vkd3d_bytecode_buffer *buffer, const s }
static void write_sm1_src_register(struct vkd3d_bytecode_buffer *buffer, - const struct sm1_src_register *reg, unsigned int dst_writemask) + const struct sm1_src_register *reg) { - unsigned int swizzle = hlsl_map_swizzle(reg->swizzle, dst_writemask); - - put_u32(buffer, (1u << 31) | sm1_encode_register_type(reg->type) | reg->mod | (swizzle << 16) | reg->reg); + put_u32(buffer, (1u << 31) | sm1_encode_register_type(reg->type) | reg->mod | (reg->swizzle << 16) | reg->reg); }
static void write_sm1_instruction(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, @@ -456,14 +454,19 @@ static void write_sm1_instruction(struct hlsl_ctx *ctx, struct vkd3d_bytecode_bu write_sm1_dst_register(buffer, &instr->dst);
for (i = 0; i < instr->src_count; ++i) - write_sm1_src_register(buffer, &instr->srcs[i], instr->dst.writemask); + write_sm1_src_register(buffer, &instr->srcs[i]); };
+static void sm1_map_src_swizzle(struct sm1_src_register *src, unsigned int map_writemask) +{ + src->swizzle = hlsl_map_swizzle(src->swizzle, map_writemask); +} + static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, const struct hlsl_reg *src1, const struct hlsl_reg *src2, const struct hlsl_reg *src3) { - const struct sm1_instruction instr = + struct sm1_instruction instr = { .opcode = opcode,
@@ -483,6 +486,10 @@ static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buf .srcs[2].reg = src3->id, .src_count = 3, }; + + sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); + sm1_map_src_swizzle(&instr.srcs[1], instr.dst.writemask); + sm1_map_src_swizzle(&instr.srcs[2], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -490,7 +497,7 @@ static void write_sm1_binary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buff D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, const struct hlsl_reg *src1, const struct hlsl_reg *src2) { - const struct sm1_instruction instr = + struct sm1_instruction instr = { .opcode = opcode,
@@ -507,6 +514,9 @@ static void write_sm1_binary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buff .srcs[1].reg = src2->id, .src_count = 2, }; + + sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); + sm1_map_src_swizzle(&instr.srcs[1], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -514,7 +524,7 @@ static void write_sm1_unary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffe D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, const struct hlsl_reg *src, D3DSHADER_PARAM_SRCMOD_TYPE src_mod, D3DSHADER_PARAM_DSTMOD_TYPE dst_mod) { - const struct sm1_instruction instr = + struct sm1_instruction instr = { .opcode = opcode,
@@ -530,6 +540,8 @@ static void write_sm1_unary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffe .srcs[0].mod = src_mod, .src_count = 1, }; + + sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -634,6 +646,7 @@ static void write_sm1_constant(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffe
assert(instr->reg.allocated); assert(constant->reg.allocated); + sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
@@ -777,6 +790,7 @@ static void write_sm1_load(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *b sm1_instr.srcs[0].swizzle = hlsl_swizzle_from_writemask((1 << load->src.var->data_type->dimx) - 1); }
+ sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
@@ -822,6 +836,7 @@ static void write_sm1_store(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer * else assert(reg.allocated);
+ sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
@@ -848,6 +863,7 @@ static void write_sm1_swizzle(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer
assert(instr->reg.allocated); assert(val->reg.allocated); + sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
From: Francisco Casas fcasas@codeweavers.com
SM1 dp2add doesn't map src swizzles to the dst writemask, also it expects the last argument to have a replicate swizzle.
Before this patch we were writing the operation as: ``` dp2add r0.x, r1.x, r0.x, r2.x ```
and now it is: ``` dp2add r0.x, r1.xyxx, r0.xyxx, r2.x ```
dp2add now has its own function, write_sm1_dp2add(), since it seems to be the only instruction with this structure.
Ideally we would be using the default swizzles for the first two src arguments: ``` dp2add r0.x, r1, r0, r2.x ``` since, according to native's documentation, these are supported for all sm < 4.
But this change -- along with following the convention of repeating the last component of the swizzle when fewer than 4 components are to be specified -- would require more global changes, probably in hlsl_swizzle_from_writemask() and hlsl_map_swizzle(). --- libs/vkd3d-shader/hlsl_sm1.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_sm1.c b/libs/vkd3d-shader/hlsl_sm1.c index ebeaadb1..34bead54 100644 --- a/libs/vkd3d-shader/hlsl_sm1.c +++ b/libs/vkd3d-shader/hlsl_sm1.c @@ -462,13 +462,13 @@ static void sm1_map_src_swizzle(struct sm1_src_register *src, unsigned int map_w src->swizzle = hlsl_map_swizzle(src->swizzle, map_writemask); }
-static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, - D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, - const struct hlsl_reg *src1, const struct hlsl_reg *src2, const struct hlsl_reg *src3) +static void write_sm1_dp2add(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, + const struct hlsl_reg *dst, const struct hlsl_reg *src1, const struct hlsl_reg *src2, + const struct hlsl_reg *src3) { struct sm1_instruction instr = { - .opcode = opcode, + .opcode = D3DSIO_DP2ADD,
.dst.type = D3DSPR_TEMP, .dst.writemask = dst->writemask, @@ -487,9 +487,6 @@ static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buf .src_count = 3, };
- sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); - sm1_map_src_swizzle(&instr.srcs[1], instr.dst.writemask); - sm1_map_src_swizzle(&instr.srcs[2], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -742,7 +739,7 @@ static void write_sm1_expr(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *b break;
case HLSL_OP3_DP2ADD: - write_sm1_ternary_op(ctx, buffer, D3DSIO_DP2ADD, &instr->reg, &arg1->reg, &arg2->reg, &arg3->reg); + write_sm1_dp2add(ctx, buffer, &instr->reg, &arg1->reg, &arg2->reg, &arg3->reg); break;
default:
Rebased. There didn't seem to be a conflict with !79.
On Tue Feb 7 23:04:03 2023 +0000, Francisco Casas wrote:
Rebased. There didn't seem to be a conflict with !79.
I mean, that the code that resulted after auto-rebasing is correct.
This merge request was approved by Henri Verbeet.