Because of the change introduced in f21693b2, SM1 scalars and vectors were not longer getting the correct writemask when they are allocated, so this is fixed.
Also, the mapping of sm1 src register swizzles is moved outside `write_sm1_instruction()` since there are some instructions that don't do this, remarkably dp2add. This is fixed.
Before the last patch we are writing the operation as: ``` dp2add r0.x, r1.x, r0.x, r2.x ```
and now it is: ``` dp2add r0.x, r1.xyxx, r0.xyxx, r2.x ```
dp2add now has its own function, write_sm1_dp2add(), since it seems to be the only instruction with this structure.
Ideally we would be using the default swizzles for the first two src arguments: ``` dp2add r0.x, r1, r0, r2.x ``` since, according to native's documentation, these are supported for all sm < 4.
But using default swizzles whenever is possible -- along with following the conversion of repeating the last component of the swizzle when fewer than 4 components are to be specified -- has a higher scope. Probably would involve modifying `hlsl_swizzle_from_writemask()` and `hlsl_map_swizzle()`.
From: Francisco Casas fcasas@codeweavers.com
Because of the change introduced in
f21693b2 vkd3d-shader/hlsl: Use reg_size as component count when allocating a single register.
SM1 scalars and vectors were not longer getting the correct writemask when they are allocated.
This happened because they have to reserve the whole register even if they only use some of its components, so their reg_size may differ from the number of components.
This commit fixes that. --- libs/vkd3d-shader/hlsl_codegen.c | 49 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 25 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index aa950e35..f0d73743 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -2160,12 +2160,18 @@ static bool resize_liveness(struct hlsl_ctx *ctx, struct liveness *liveness, siz return true; }
+/* component_count is the number of register components to be reserved, while dimx is the number of + * components for the register's writemask. In SM1, floats and vectors allocate the whole register, + * even if they don't use it completely. */ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness *liveness, - unsigned int first_write, unsigned int last_read, unsigned int component_count) + unsigned int first_write, unsigned int last_read, unsigned int component_count, + unsigned int dimx) { unsigned int component_idx, writemask, i; struct hlsl_reg ret = {0};
+ assert(dimx <= component_count); + for (component_idx = 0; component_idx < liveness->size; component_idx += 4) { if ((writemask = get_available_writemask(liveness, first_write, component_idx, component_count))) @@ -2183,7 +2189,7 @@ static struct hlsl_reg allocate_register(struct hlsl_ctx *ctx, struct liveness * liveness->regs[component_idx + i].last_read = last_read; } ret.id = component_idx / 4; - ret.writemask = writemask; + ret.writemask = hlsl_combine_writemasks(writemask, (1u << dimx) - 1); ret.allocated = true; liveness->reg_count = max(liveness->reg_count, ret.id + 1); return ret; @@ -2225,6 +2231,15 @@ static struct hlsl_reg allocate_range(struct hlsl_ctx *ctx, struct liveness *liv return ret; }
+static struct hlsl_reg allocate_numeric_registers_for_type(struct hlsl_ctx *ctx, struct liveness *liveness, + unsigned int first_write, unsigned int last_read, const struct hlsl_type *type) +{ + if (type->type <= HLSL_CLASS_VECTOR) + return allocate_register(ctx, liveness, first_write, last_read, type->reg_size, type->dimx); + else + return allocate_range(ctx, liveness, first_write, last_read, type->reg_size); +} + static const char *debug_register(char class, struct hlsl_reg reg, const struct hlsl_type *type) { static const char writemask_offset[] = {'w','x','y','z'}; @@ -2248,12 +2263,9 @@ static void allocate_variable_temp_register(struct hlsl_ctx *ctx, struct hlsl_ir
if (!var->reg.allocated && var->last_read) { - if (var->data_type->reg_size > 4) - var->reg = allocate_range(ctx, liveness, var->first_write, - var->last_read, var->data_type->reg_size); - else - var->reg = allocate_register(ctx, liveness, var->first_write, - var->last_read, var->data_type->reg_size); + var->reg = allocate_numeric_registers_for_type(ctx, liveness, var->first_write, var->last_read, + var->data_type); + TRACE("Allocated %s to %s (liveness %u-%u).\n", var->name, debug_register('r', var->reg, var->data_type), var->first_write, var->last_read); } @@ -2267,12 +2279,8 @@ static void allocate_temp_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_bl { if (!instr->reg.allocated && instr->last_read) { - if (instr->data_type->reg_size > 4) - instr->reg = allocate_range(ctx, liveness, instr->index, - instr->last_read, instr->data_type->reg_size); - else - instr->reg = allocate_register(ctx, liveness, instr->index, - instr->last_read, instr->data_type->reg_size); + instr->reg = allocate_numeric_registers_for_type(ctx, liveness, instr->index, instr->last_read, + instr->data_type); TRACE("Allocated anonymous expression @%u to %s (liveness %u-%u).\n", instr->index, debug_register('r', instr->reg, instr->data_type), instr->index, instr->last_read); } @@ -2332,10 +2340,7 @@ static void allocate_const_registers_recurse(struct hlsl_ctx *ctx, struct hlsl_b unsigned int x, y, i, writemask, end_reg; unsigned int reg_size = type->reg_size;
- if (reg_size > 4) - constant->reg = allocate_range(ctx, liveness, 1, UINT_MAX, reg_size); - else - constant->reg = allocate_register(ctx, liveness, 1, UINT_MAX, reg_size); + constant->reg = allocate_numeric_registers_for_type(ctx, liveness, 1, UINT_MAX, type); TRACE("Allocated constant @%u to %s.\n", instr->index, debug_register('c', constant->reg, type));
if (!hlsl_array_reserve(ctx, (void **)&defs->values, &defs->size, @@ -2432,13 +2437,7 @@ static void allocate_const_registers(struct hlsl_ctx *ctx, struct hlsl_ir_functi if (var->data_type->reg_size == 0) continue;
- if (var->data_type->reg_size > 4) - var->reg = allocate_range(ctx, &liveness, 1, UINT_MAX, var->data_type->reg_size); - else - { - var->reg = allocate_register(ctx, &liveness, 1, UINT_MAX, 4); - var->reg.writemask = (1u << var->data_type->reg_size) - 1; - } + var->reg = allocate_numeric_registers_for_type(ctx, &liveness, 1, UINT_MAX, var->data_type); TRACE("Allocated %s to %s.\n", var->name, debug_register('c', var->reg, var->data_type)); } }
From: Francisco Casas fcasas@codeweavers.com
Not every instruction expects src swizzles to be mapped according to the dst writemasks, so this logic must be outside this function. --- libs/vkd3d-shader/hlsl_sm1.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_sm1.c b/libs/vkd3d-shader/hlsl_sm1.c index a09d7566..66731265 100644 --- a/libs/vkd3d-shader/hlsl_sm1.c +++ b/libs/vkd3d-shader/hlsl_sm1.c @@ -435,11 +435,9 @@ static void write_sm1_dst_register(struct vkd3d_bytecode_buffer *buffer, const s }
static void write_sm1_src_register(struct vkd3d_bytecode_buffer *buffer, - const struct sm1_src_register *reg, unsigned int dst_writemask) + const struct sm1_src_register *reg) { - unsigned int swizzle = hlsl_map_swizzle(reg->swizzle, dst_writemask); - - put_u32(buffer, (1u << 31) | sm1_encode_register_type(reg->type) | reg->mod | (swizzle << 16) | reg->reg); + put_u32(buffer, (1u << 31) | sm1_encode_register_type(reg->type) | reg->mod | (reg->swizzle << 16) | reg->reg); }
static void write_sm1_instruction(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, @@ -456,14 +454,19 @@ static void write_sm1_instruction(struct hlsl_ctx *ctx, struct vkd3d_bytecode_bu write_sm1_dst_register(buffer, &instr->dst);
for (i = 0; i < instr->src_count; ++i) - write_sm1_src_register(buffer, &instr->srcs[i], instr->dst.writemask); + write_sm1_src_register(buffer, &instr->srcs[i]); };
+static void sm1_map_src_swizzle(struct sm1_src_register *src, unsigned int map_writemask) +{ + src->swizzle = hlsl_map_swizzle(src->swizzle, map_writemask); +} + static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, const struct hlsl_reg *src1, const struct hlsl_reg *src2, const struct hlsl_reg *src3) { - const struct sm1_instruction instr = + struct sm1_instruction instr = { .opcode = opcode,
@@ -483,6 +486,10 @@ static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buf .srcs[2].reg = src3->id, .src_count = 3, }; + + sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); + sm1_map_src_swizzle(&instr.srcs[1], instr.dst.writemask); + sm1_map_src_swizzle(&instr.srcs[2], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -490,7 +497,7 @@ static void write_sm1_binary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buff D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, const struct hlsl_reg *src1, const struct hlsl_reg *src2) { - const struct sm1_instruction instr = + struct sm1_instruction instr = { .opcode = opcode,
@@ -507,6 +514,9 @@ static void write_sm1_binary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buff .srcs[1].reg = src2->id, .src_count = 2, }; + + sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); + sm1_map_src_swizzle(&instr.srcs[1], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -514,7 +524,7 @@ static void write_sm1_unary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffe D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, const struct hlsl_reg *src, D3DSHADER_PARAM_SRCMOD_TYPE src_mod) { - const struct sm1_instruction instr = + struct sm1_instruction instr = { .opcode = opcode,
@@ -529,6 +539,8 @@ static void write_sm1_unary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffe .srcs[0].mod = src_mod, .src_count = 1, }; + + sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -633,6 +645,7 @@ static void write_sm1_constant(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffe
assert(instr->reg.allocated); assert(constant->reg.allocated); + sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
@@ -772,6 +785,7 @@ static void write_sm1_load(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *b sm1_instr.srcs[0].swizzle = hlsl_swizzle_from_writemask((1 << load->src.var->data_type->dimx) - 1); }
+ sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
@@ -817,6 +831,7 @@ static void write_sm1_store(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer * else assert(reg.allocated);
+ sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
@@ -843,6 +858,7 @@ static void write_sm1_swizzle(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer
assert(instr->reg.allocated); assert(val->reg.allocated); + sm1_map_src_swizzle(&sm1_instr.srcs[0], sm1_instr.dst.writemask); write_sm1_instruction(ctx, buffer, &sm1_instr); }
From: Francisco Casas fcasas@codeweavers.com
SM1 dp2add doesn't map src swizzles to the dst writemask, also it expects the last argument to have a replicate swizzle.
Before this patch we were writing the operation as: ``` dp2add r0.x, r1.x, r0.x, r2.x ```
and now it is: ``` dp2add r0.x, r1.xyxx, r0.xyxx, r2.x ```
dp2add now has its own function, write_sm1_dp2add(), since it seems to be the only instruction with this structure.
Ideally we would be using the default swizzles for the first two src arguments: ``` dp2add r0.x, r1, r0, r2.x ``` since, according to native's documentation, these are supported for all sm < 4.
But this change -- along with following the conversion of repeating the last component of the swizzle when fewer than 4 components are to be specified -- would require more global changes, probably in hlsl_swizzle_from_writemask() and hlsl_map_swizzle(). --- libs/vkd3d-shader/hlsl_sm1.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/libs/vkd3d-shader/hlsl_sm1.c b/libs/vkd3d-shader/hlsl_sm1.c index 66731265..b9ead0f3 100644 --- a/libs/vkd3d-shader/hlsl_sm1.c +++ b/libs/vkd3d-shader/hlsl_sm1.c @@ -462,13 +462,13 @@ static void sm1_map_src_swizzle(struct sm1_src_register *src, unsigned int map_w src->swizzle = hlsl_map_swizzle(src->swizzle, map_writemask); }
-static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, - D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode, const struct hlsl_reg *dst, - const struct hlsl_reg *src1, const struct hlsl_reg *src2, const struct hlsl_reg *src3) +static void write_sm1_dp2add(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *buffer, + const struct hlsl_reg *dst, const struct hlsl_reg *src1, const struct hlsl_reg *src2, + const struct hlsl_reg *src3) { struct sm1_instruction instr = { - .opcode = opcode, + .opcode = D3DSIO_DP2ADD,
.dst.type = D3DSPR_TEMP, .dst.writemask = dst->writemask, @@ -487,9 +487,6 @@ static void write_sm1_ternary_op(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buf .src_count = 3, };
- sm1_map_src_swizzle(&instr.srcs[0], instr.dst.writemask); - sm1_map_src_swizzle(&instr.srcs[1], instr.dst.writemask); - sm1_map_src_swizzle(&instr.srcs[2], instr.dst.writemask); write_sm1_instruction(ctx, buffer, &instr); }
@@ -737,7 +734,7 @@ static void write_sm1_expr(struct hlsl_ctx *ctx, struct vkd3d_bytecode_buffer *b break;
case HLSL_OP3_DP2ADD: - write_sm1_ternary_op(ctx, buffer, D3DSIO_DP2ADD, &instr->reg, &arg1->reg, &arg2->reg, &arg3->reg); + write_sm1_dp2add(ctx, buffer, &instr->reg, &arg1->reg, &arg2->reg, &arg3->reg); break;
default:
Patch 1/3 does two things at once [introduces a helper and changes allocate_register()], which made it somewhat harder to review. Anyway, the concept seems fine, but "dimx" and "component_count" aren't great names. Honestly I'd use "component_count" and "reg_size" respectively.
Patch 2/3 is fine, although I'd prefer something closer to the sm4 architecture.
But this change -- along with following the conversion of repeating the last component of the swizzle when fewer than 4 components are to be specified -- would require more global changes, probably in hlsl_swizzle_from_writemask() and hlsl_map_swizzle().
Yeah, and while I wouldn't object to it on the grounds of making it a bit easier to compare output by eye, I don't think it's necessary.