-- v4: vkd3d-shader/dxil: Implement DX instruction LoadInput. vkd3d-shader/dxil: Declare shader inputs. vkd3d-shader/dxbc: Load input signatures also from ISG1 chunks. vkd3d-shader/spirv: Build undefined values once. vkd3d-shader/spirv: Introduce a Static Single Assignment register type.
From: Conor McCarthy cmccarthy@codeweavers.com
The allocator is used for DXIL input/output parameter arrays. --- libs/vkd3d-shader/vkd3d_shader_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/vkd3d_shader_main.c b/libs/vkd3d-shader/vkd3d_shader_main.c index 273a543a..2e2569d2 100644 --- a/libs/vkd3d-shader/vkd3d_shader_main.c +++ b/libs/vkd3d-shader/vkd3d_shader_main.c @@ -1748,7 +1748,7 @@ static struct vkd3d_shader_param_node *shader_param_allocator_node_create( static void shader_param_allocator_init(struct vkd3d_shader_param_allocator *allocator, unsigned int count, unsigned int stride) { - allocator->count = max(count, 4); + allocator->count = max(count, MAX_REG_OUTPUT); allocator->stride = stride; allocator->head = NULL; allocator->current = NULL;
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d-shader/d3d_asm.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/libs/vkd3d-shader/d3d_asm.c b/libs/vkd3d-shader/d3d_asm.c index d72402eb..5eddb6fe 100644 --- a/libs/vkd3d-shader/d3d_asm.c +++ b/libs/vkd3d-shader/d3d_asm.c @@ -1066,6 +1066,10 @@ static void shader_dump_register(struct vkd3d_d3d_asm_compiler *compiler, const shader_addline(buffer, "oStencilRef"); break;
+ case VKD3DSPR_UNDEF: + shader_addline(buffer, "undef"); + break; + default: shader_addline(buffer, "<unhandled_rtype(%#x)>", reg->type); break;
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d-shader/d3d_asm.c | 6 ++- libs/vkd3d-shader/spirv.c | 50 +++++++++++++++++++++++- libs/vkd3d-shader/vkd3d_shader_private.h | 4 ++ 3 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d-shader/d3d_asm.c b/libs/vkd3d-shader/d3d_asm.c index 5eddb6fe..dea35941 100644 --- a/libs/vkd3d-shader/d3d_asm.c +++ b/libs/vkd3d-shader/d3d_asm.c @@ -1070,6 +1070,10 @@ static void shader_dump_register(struct vkd3d_d3d_asm_compiler *compiler, const shader_addline(buffer, "undef"); break;
+ case VKD3DSPR_SSA: + shader_addline(buffer, "sr"); + break; + default: shader_addline(buffer, "<unhandled_rtype(%#x)>", reg->type); break; @@ -1185,7 +1189,7 @@ static void shader_dump_register(struct vkd3d_d3d_asm_compiler *compiler, const { shader_print_subscript_range(compiler, reg->idx[1].offset, reg->idx[2].offset); } - else + else if (reg->type != VKD3DSPR_SSA) { /* For descriptors in sm < 5.1 we move the reg->idx values up one slot * to normalise with 5.1. diff --git a/libs/vkd3d-shader/spirv.c b/libs/vkd3d-shader/spirv.c index f93960d6..3215e41c 100644 --- a/libs/vkd3d-shader/spirv.c +++ b/libs/vkd3d-shader/spirv.c @@ -2323,6 +2323,9 @@ struct spirv_compiler bool write_tess_geom_point_size;
struct vkd3d_string_buffer_cache string_buffers; + + uint32_t *ssa_register_ids; + unsigned int ssa_register_count; };
static bool is_in_default_phase(const struct spirv_compiler *compiler) @@ -2370,6 +2373,8 @@ static void spirv_compiler_destroy(struct spirv_compiler *compiler) shader_signature_cleanup(&compiler->output_signature); shader_signature_cleanup(&compiler->patch_constant_signature);
+ vkd3d_free(compiler->ssa_register_ids); + vkd3d_free(compiler); }
@@ -3682,6 +3687,23 @@ static uint32_t spirv_compiler_emit_load_scalar(struct spirv_compiler *compiler, return val_id; }
+static inline uint32_t spirv_compiler_get_ssa_register_id(const struct spirv_compiler *compiler, + const struct vkd3d_shader_register *reg) +{ + assert(reg->idx[0].offset < compiler->ssa_register_count); + assert(reg->idx_count == 1); + return compiler->ssa_register_ids[reg->idx[0].offset]; +} + +static uint32_t spirv_compiler_emit_load_ssa_reg(struct spirv_compiler *compiler, + const struct vkd3d_shader_register *reg, enum vkd3d_shader_component_type component_type, + unsigned int swizzle, unsigned int write_mask) +{ + uint32_t val_id = spirv_compiler_get_ssa_register_id(compiler, reg); + assert(val_id); + return val_id; +} + static uint32_t spirv_compiler_emit_load_reg(struct spirv_compiler *compiler, const struct vkd3d_shader_register *reg, DWORD swizzle, DWORD write_mask) { @@ -3701,6 +3723,10 @@ static uint32_t spirv_compiler_emit_load_reg(struct spirv_compiler *compiler,
component_count = vkd3d_write_mask_component_count(write_mask); component_type = vkd3d_component_type_from_data_type(reg->data_type); + + if (reg->type == VKD3DSPR_SSA) + return spirv_compiler_emit_load_ssa_reg(compiler, reg, component_type, swizzle, write_mask); + if (!spirv_compiler_get_register_info(compiler, reg, ®_info)) { type_id = vkd3d_spirv_get_type_id(builder, component_type, component_count); @@ -3912,6 +3938,13 @@ static void spirv_compiler_emit_store_reg(struct spirv_compiler *compiler,
assert(!register_is_constant_or_undef(reg));
+ if (reg->type == VKD3DSPR_SSA) + { + assert(reg->idx[0].offset < compiler->ssa_register_count); + compiler->ssa_register_ids[reg->idx[0].offset] = val_id; + return; + } + if (!spirv_compiler_get_register_info(compiler, reg, ®_info)) return; spirv_compiler_emit_dereference_register(compiler, reg, ®_info); @@ -5375,6 +5408,18 @@ static void spirv_compiler_emit_temps(struct spirv_compiler *compiler, uint32_t vkd3d_spirv_end_function_stream_insertion(builder); }
+static void spirv_compiler_emit_ssas(struct spirv_compiler *compiler, unsigned int count) +{ + assert(!compiler->ssa_register_ids); + if (!(compiler->ssa_register_ids = vkd3d_calloc(count, sizeof(*compiler->ssa_register_ids)))) + { + ERR("Failed to allocate SSA register value id array, count %u.\n", count); + spirv_compiler_error(compiler, VKD3D_SHADER_ERROR_SPV_OUT_OF_MEMORY, + "Failed to allocate SSA register value id array of count %u.", count); + } + compiler->ssa_register_count = count; +} + static void spirv_compiler_emit_dcl_indexable_temp(struct spirv_compiler *compiler, const struct vkd3d_shader_instruction *instruction) { @@ -6692,7 +6737,8 @@ static void spirv_compiler_emit_mov(struct spirv_compiler *compiler, uint32_t components[VKD3D_VEC4_SIZE]; unsigned int i, component_count;
- if (register_is_constant_or_undef(&src->reg) || dst->modifiers || src->modifiers) + if (register_is_constant_or_undef(&src->reg) || src->reg.type == VKD3DSPR_SSA || dst->reg.type == VKD3DSPR_SSA + || dst->modifiers || src->modifiers) goto general_implementation;
spirv_compiler_get_register_info(compiler, &dst->reg, &dst_reg_info); @@ -9541,6 +9587,8 @@ static int spirv_compiler_generate_spirv(struct spirv_compiler *compiler,
if (parser->shader_desc.temp_count) spirv_compiler_emit_temps(compiler, parser->shader_desc.temp_count); + if (parser->shader_desc.ssa_count) + spirv_compiler_emit_ssas(compiler, parser->shader_desc.ssa_count);
spirv_compiler_emit_descriptor_declarations(compiler);
diff --git a/libs/vkd3d-shader/vkd3d_shader_private.h b/libs/vkd3d-shader/vkd3d_shader_private.h index eab1c730..22645e2e 100644 --- a/libs/vkd3d-shader/vkd3d_shader_private.h +++ b/libs/vkd3d-shader/vkd3d_shader_private.h @@ -92,6 +92,7 @@ enum vkd3d_shader_error VKD3D_SHADER_ERROR_SPV_INVALID_DESCRIPTOR_BINDING = 2002, VKD3D_SHADER_ERROR_SPV_DESCRIPTOR_IDX_UNSUPPORTED = 2003, VKD3D_SHADER_ERROR_SPV_STENCIL_EXPORT_UNSUPPORTED = 2004, + VKD3D_SHADER_ERROR_SPV_OUT_OF_MEMORY = 2005,
VKD3D_SHADER_WARNING_SPV_INVALID_SWIZZLE = 2300,
@@ -523,6 +524,7 @@ enum vkd3d_shader_register_type VKD3DSPR_RASTERIZER, VKD3DSPR_OUTSTENCILREF, VKD3DSPR_UNDEF, + VKD3DSPR_SSA,
VKD3DSPR_COUNT,
@@ -738,6 +740,7 @@ struct vkd3d_shader_register uint64_t immconst_uint64[VKD3D_DVEC2_SIZE]; double immconst_double[VKD3D_DVEC2_SIZE]; unsigned fp_body_idx; + enum vkd3d_data_type dcl_data_type; } u; };
@@ -872,6 +875,7 @@ struct vkd3d_shader_desc struct shader_signature patch_constant_signature;
uint32_t temp_count; + unsigned int ssa_count;
struct {
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d-shader/spirv.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/libs/vkd3d-shader/spirv.c b/libs/vkd3d-shader/spirv.c index 3215e41c..3dd81def 100644 --- a/libs/vkd3d-shader/spirv.c +++ b/libs/vkd3d-shader/spirv.c @@ -1215,10 +1215,14 @@ static uint32_t vkd3d_spirv_build_op_function_call(struct vkd3d_spirv_builder *b SpvOpFunctionCall, result_type, function_id, arguments, argument_count); }
-static uint32_t vkd3d_spirv_build_op_undef(struct vkd3d_spirv_builder *builder, - struct vkd3d_spirv_stream *stream, uint32_t type_id) +static uint32_t vkd3d_spirv_build_op_undef(struct vkd3d_spirv_builder *builder, uint32_t type_id) { - return vkd3d_spirv_build_op_tr(builder, stream, SpvOpUndef, type_id); + return vkd3d_spirv_build_op_tr(builder, &builder->global_stream, SpvOpUndef, type_id); +} + +static uint32_t vkd3d_spirv_get_op_undef(struct vkd3d_spirv_builder *builder, uint32_t type_id) +{ + return vkd3d_spirv_build_once1(builder, SpvOpUndef, type_id, vkd3d_spirv_build_op_undef); }
static uint32_t vkd3d_spirv_build_op_access_chain(struct vkd3d_spirv_builder *builder, @@ -2855,7 +2859,7 @@ static uint32_t spirv_compiler_get_constant(struct spirv_compiler *compiler, break; default: FIXME("Unhandled component_type %#x.\n", component_type); - return vkd3d_spirv_build_op_undef(builder, &builder->global_stream, type_id); + return vkd3d_spirv_get_op_undef(builder, type_id); }
if (component_count == 1) @@ -2884,7 +2888,7 @@ static uint32_t spirv_compiler_get_constant64(struct spirv_compiler *compiler, if (component_type != VKD3D_SHADER_COMPONENT_DOUBLE) { FIXME("Unhandled component_type %#x.\n", component_type); - return vkd3d_spirv_build_op_undef(builder, &builder->global_stream, type_id); + return vkd3d_spirv_get_op_undef(builder, type_id); }
if (component_count == 1) @@ -3636,7 +3640,7 @@ static uint32_t spirv_compiler_emit_load_undef(struct spirv_compiler *compiler, assert(reg->type == VKD3DSPR_UNDEF);
type_id = vkd3d_spirv_get_type_id_for_data_type(builder, reg->data_type, component_count); - return vkd3d_spirv_build_op_undef(builder, &builder->global_stream, type_id); + return vkd3d_spirv_get_op_undef(builder, type_id); }
static uint32_t spirv_compiler_emit_load_scalar(struct spirv_compiler *compiler, @@ -3730,7 +3734,7 @@ static uint32_t spirv_compiler_emit_load_reg(struct spirv_compiler *compiler, if (!spirv_compiler_get_register_info(compiler, reg, ®_info)) { type_id = vkd3d_spirv_get_type_id(builder, component_type, component_count); - return vkd3d_spirv_build_op_undef(builder, &builder->global_stream, type_id); + return vkd3d_spirv_get_op_undef(builder, type_id); } assert(reg_info.component_type != VKD3D_SHADER_COMPONENT_DOUBLE); spirv_compiler_emit_dereference_register(compiler, reg, ®_info);
From: Conor McCarthy cmccarthy@codeweavers.com
When DXBC contains DXIL code it uses ISG1 signatures. --- libs/vkd3d-shader/dxbc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/vkd3d-shader/dxbc.c b/libs/vkd3d-shader/dxbc.c index cedc3da4..52234872 100644 --- a/libs/vkd3d-shader/dxbc.c +++ b/libs/vkd3d-shader/dxbc.c @@ -429,7 +429,7 @@ static int isgn_handler(const struct vkd3d_shader_dxbc_section_desc *section, { struct shader_signature *is = ctx;
- if (section->tag != TAG_ISGN) + if (section->tag != TAG_ISGN && section->tag != TAG_ISG1) return VKD3D_OK;
if (is->elements)
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d-shader/dxil.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/libs/vkd3d-shader/dxil.c b/libs/vkd3d-shader/dxil.c index 666d8b08..6c8d6ec5 100644 --- a/libs/vkd3d-shader/dxil.c +++ b/libs/vkd3d-shader/dxil.c @@ -296,6 +296,7 @@ struct sm6_parser size_t global_symbol_count;
struct vkd3d_shader_dst_param *output_params; + struct vkd3d_shader_dst_param *input_params;
struct sm6_function *functions; size_t function_count; @@ -2112,6 +2113,8 @@ static void sm6_parser_emit_signature(struct sm6_parser *sm6, const struct shade param = &ins->declaration.dst; }
+ /* TODO: set the interpolation mode when signatures are loaded from DXIL metadata. */ + ins->flags = (handler_idx == VKD3DSIH_DCL_INPUT_PS) ? VKD3DSIM_LINEAR_NOPERSPECTIVE : 0; *param = params[i]; } } @@ -2123,11 +2126,24 @@ static void sm6_parser_init_output_signature(struct sm6_parser *sm6, const struc sm6->output_params); }
+static void sm6_parser_init_input_signature(struct sm6_parser *sm6, const struct shader_signature *input_signature) +{ + sm6_parser_init_signature(sm6, input_signature, VKD3DSPR_INPUT, sm6->input_params); +} + static void sm6_parser_emit_output_signature(struct sm6_parser *sm6, const struct shader_signature *output_signature) { sm6_parser_emit_signature(sm6, output_signature, VKD3DSIH_DCL_OUTPUT, VKD3DSIH_DCL_OUTPUT_SIV, sm6->output_params); }
+static void sm6_parser_emit_input_signature(struct sm6_parser *sm6, const struct shader_signature *input_signature) +{ + sm6_parser_emit_signature(sm6, input_signature, + (sm6->p.shader_version.type == VKD3D_SHADER_TYPE_PIXEL) ? VKD3DSIH_DCL_INPUT_PS : VKD3DSIH_DCL_INPUT, + (sm6->p.shader_version.type == VKD3D_SHADER_TYPE_PIXEL) ? VKD3DSIH_DCL_INPUT_PS_SIV : VKD3DSIH_DCL_INPUT_SIV, + sm6->input_params); +} + static const struct sm6_value *sm6_parser_next_function_definition(struct sm6_parser *sm6) { size_t i, count = sm6->function_count; @@ -2683,6 +2699,7 @@ static enum vkd3d_result sm6_parser_init(struct sm6_parser *sm6, const uint32_t const char *source_name, struct vkd3d_shader_message_context *message_context) { const struct shader_signature *output_signature = &sm6->p.shader_desc.output_signature; + const struct shader_signature *input_signature = &sm6->p.shader_desc.input_signature; const struct vkd3d_shader_location location = {.source_name = source_name}; uint32_t version_token, dxil_version, token_count, magic; unsigned int chunk_offset, chunk_size; @@ -2838,7 +2855,8 @@ static enum vkd3d_result sm6_parser_init(struct sm6_parser *sm6, const uint32_t return ret; }
- if (!(sm6->output_params = shader_parser_get_dst_params(&sm6->p, output_signature->element_count))) + if (!(sm6->output_params = shader_parser_get_dst_params(&sm6->p, output_signature->element_count)) + || !(sm6->input_params = shader_parser_get_dst_params(&sm6->p, input_signature->element_count))) { ERR("Failed to allocate output parameters.\n"); vkd3d_shader_error(message_context, &location, VKD3D_SHADER_ERROR_DXIL_OUT_OF_MEMORY, @@ -2877,6 +2895,7 @@ static enum vkd3d_result sm6_parser_init(struct sm6_parser *sm6, const uint32_t }
sm6_parser_init_output_signature(sm6, output_signature); + sm6_parser_init_input_signature(sm6, input_signature);
if ((ret = sm6_parser_module_init(sm6, &sm6->root_block, 0)) < 0) { @@ -2889,13 +2908,14 @@ static enum vkd3d_result sm6_parser_init(struct sm6_parser *sm6, const uint32_t return ret; }
- if (!sm6_parser_require_space(sm6, output_signature->element_count)) + if (!sm6_parser_require_space(sm6, output_signature->element_count + input_signature->element_count)) { vkd3d_shader_error(message_context, &location, VKD3D_SHADER_ERROR_DXIL_OUT_OF_MEMORY, "Out of memory emitting shader signature declarations."); return VKD3D_ERROR_OUT_OF_MEMORY; } sm6_parser_emit_output_signature(sm6, output_signature); + sm6_parser_emit_input_signature(sm6, input_signature);
for (i = 0; i < sm6->function_count; ++i) {
From: Conor McCarthy cmccarthy@codeweavers.com
--- libs/vkd3d-shader/dxil.c | 103 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+)
diff --git a/libs/vkd3d-shader/dxil.c b/libs/vkd3d-shader/dxil.c index 6c8d6ec5..a304bd97 100644 --- a/libs/vkd3d-shader/dxil.c +++ b/libs/vkd3d-shader/dxil.c @@ -141,6 +141,7 @@ enum bitcode_value_symtab_code
enum dx_intrinsic_opcode { + DX_LOAD_INPUT = 4, DX_STORE_OUTPUT = 5, };
@@ -305,6 +306,7 @@ struct sm6_parser size_t value_count; size_t value_capacity; size_t cur_max_value; + unsigned int ssa_next_id;
struct vkd3d_shader_parser p; }; @@ -1548,6 +1550,11 @@ static inline unsigned int sm6_value_get_constant_uint(const struct sm6_value *v return register_get_uint_value(&value->u.reg); }
+static inline unsigned int sm6_parser_alloc_ssa_id(struct sm6_parser *sm6) +{ + return sm6->ssa_next_id++; +} + static struct vkd3d_shader_src_param *instruction_src_params_alloc(struct vkd3d_shader_instruction *ins, unsigned int count, struct sm6_parser *sm6) { @@ -1580,6 +1587,24 @@ static struct vkd3d_shader_dst_param *instruction_dst_params_alloc(struct vkd3d_ return params; }
+static void register_init_with_id(struct vkd3d_shader_register *reg, + enum vkd3d_shader_register_type reg_type, enum vkd3d_data_type data_type, unsigned int index) +{ + shader_register_init(reg, reg_type, data_type, 1); + reg->idx[0].offset = index; +} + +static void register_init_ssa_vector(struct vkd3d_shader_register *reg, enum vkd3d_data_type data_type, + unsigned int component_count, struct sm6_parser *sm6) +{ + unsigned int id; + + id = sm6_parser_alloc_ssa_id(sm6); + register_init_with_id(reg, VKD3DSPR_SSA, data_type, id); + reg->u.dcl_data_type = data_type; + reg->immconst_type = component_count > 1 ? VKD3D_IMMCONST_VEC4 : VKD3D_IMMCONST_SCALAR; +} + static enum vkd3d_data_type vkd3d_data_type_from_sm6_type(const struct sm6_type *type) { if (type->class == TYPE_CLASS_INTEGER) @@ -1613,6 +1638,19 @@ static enum vkd3d_data_type vkd3d_data_type_from_sm6_type(const struct sm6_type return VKD3D_DATA_UINT; }
+static inline void register_init_ssa_scalar(struct vkd3d_shader_register *reg, const struct sm6_type *type, + struct sm6_parser *sm6) +{ + register_init_ssa_vector(reg, vkd3d_data_type_from_sm6_type(sm6_type_get_scalar_type(type, 0)), 1, sm6); +} + +static inline void dst_param_init(struct vkd3d_shader_dst_param *param) +{ + param->write_mask = VKD3DSP_WRITEMASK_0; + param->modifiers = 0; + param->shift = 0; +} + static inline void dst_param_init_scalar(struct vkd3d_shader_dst_param *param, unsigned int component_idx) { param->write_mask = 1u << component_idx; @@ -1620,12 +1658,25 @@ static inline void dst_param_init_scalar(struct vkd3d_shader_dst_param *param, u param->shift = 0; }
+static inline void dst_param_init_ssa_scalar(struct vkd3d_shader_dst_param *param, const struct sm6_type *type, + struct sm6_parser *sm6) +{ + dst_param_init(param); + register_init_ssa_scalar(¶m->reg, type, sm6); +} + static inline void src_param_init(struct vkd3d_shader_src_param *param) { param->swizzle = VKD3D_SHADER_SWIZZLE(X, X, X, X); param->modifiers = VKD3DSPSM_NONE; }
+static inline void src_param_init_scalar(struct vkd3d_shader_src_param *param, unsigned int component_idx) +{ + param->swizzle = vkd3d_shader_create_swizzle(component_idx, component_idx, component_idx, component_idx); + param->modifiers = VKD3DSPSM_NONE; +} + static void src_param_init_from_value(struct vkd3d_shader_src_param *param, const struct sm6_value *src) { src_param_init(param); @@ -1654,6 +1705,22 @@ static void register_address_init(struct vkd3d_shader_register *reg, const struc } }
+static void instruction_dst_param_init_ssa_scalar_component(struct vkd3d_shader_instruction *ins, + unsigned int component_idx, struct sm6_parser *sm6) +{ + struct vkd3d_shader_dst_param *param = instruction_dst_params_alloc(ins, 1, sm6); + struct sm6_value *dst = sm6_parser_get_current_value(sm6); + + dst_param_init_ssa_scalar(param, dst->type, sm6); + param->write_mask = VKD3DSP_WRITEMASK_0 << component_idx; + dst->u.reg = param->reg; +} + +static inline void instruction_dst_param_init_ssa_scalar(struct vkd3d_shader_instruction *ins, struct sm6_parser *sm6) +{ + return instruction_dst_param_init_ssa_scalar_component(ins, 0, sm6); +} + /* Recurse through the block tree while maintaining a current value count. The current * count is the sum of the global count plus all declarations within the current function. * Store into value_capacity the highest count seen. */ @@ -2166,6 +2233,38 @@ static struct sm6_block *sm6_block_create() return block; }
+static void sm6_parser_emit_dx_load_input(struct sm6_parser *sm6, struct sm6_block *code_block, + enum dx_intrinsic_opcode op, const struct sm6_value **operands, struct vkd3d_shader_instruction *ins) +{ + struct vkd3d_shader_src_param *src_param; + const struct shader_signature *signature; + unsigned int row_index, column_index; + const struct signature_element *e; + + row_index = sm6_value_get_constant_uint(operands[0]); + column_index = sm6_value_get_constant_uint(operands[2]); + + shader_instruction_init(ins, VKD3DSIH_MOV); + + signature = &sm6->p.shader_desc.input_signature; + if (row_index >= signature->element_count) + { + WARN("Invalid row index %u.\n", row_index); + vkd3d_shader_parser_error(&sm6->p, VKD3D_SHADER_ERROR_DXIL_INVALID_OPERAND, + "Invalid input row index %u.", row_index); + return; + } + e = &signature->elements[row_index]; + + src_param = instruction_src_params_alloc(ins, 1, sm6); + src_param->reg = sm6->input_params[row_index].reg; + src_param_init_scalar(src_param, column_index); + if (e->register_count > 1) + register_address_init(&src_param->reg, operands[1], 0, sm6); + + instruction_dst_param_init_ssa_scalar(ins, sm6); +} + static void sm6_parser_emit_dx_store_output(struct sm6_parser *sm6, struct sm6_block *code_block, enum dx_intrinsic_opcode op, const struct sm6_value **operands, struct vkd3d_shader_instruction *ins) { @@ -2235,6 +2334,7 @@ struct sm6_dx_opcode_info */ static const struct sm6_dx_opcode_info sm6_dx_op_table[] = { + [DX_LOAD_INPUT ] = {'o', "ii8i", sm6_parser_emit_dx_load_input}, [DX_STORE_OUTPUT ] = {'v', "ii8o", sm6_parser_emit_dx_store_output}, };
@@ -2887,6 +2987,7 @@ static enum vkd3d_result sm6_parser_init(struct sm6_parser *sm6, const uint32_t "Out of memory allocating DXIL value array."); return VKD3D_ERROR_OUT_OF_MEMORY; } + sm6->ssa_next_id = 1;
if ((ret = sm6_parser_globals_init(sm6)) < 0) { @@ -2917,6 +3018,8 @@ static enum vkd3d_result sm6_parser_init(struct sm6_parser *sm6, const uint32_t sm6_parser_emit_output_signature(sm6, output_signature); sm6_parser_emit_input_signature(sm6, input_signature);
+ sm6->p.shader_desc.ssa_count = sm6->ssa_next_id; + for (i = 0; i < sm6->function_count; ++i) { if (!sm6_block_emit_instructions(sm6->functions[i].blocks[0], sm6))
I removed most of the code from `spirv_compiler_emit_load_ssa_reg()` as we don't need it yet.
The sm6_rebase branch has some draft changes to shader runner. It introduces an `[enable]` directive where specifying `dxil` runs the test in SM 6. If this is missing the test is skipped for SM 6. Some tests can't be compiled with DXC. Currently it uses an environment variable to access DXC, but as was discussed some time ago, calling DXC's C++ library functions from C doesn't work. I'll revisit this to see if I can hack a way to match up the parameters.
On Wed Sep 6 10:55:59 2023 +0000, Henri Verbeet wrote:
How valuable is that? If it's significant, we may want to consider
converting TPF to SSA form as well in the SPIR-V backend.
It seems to me quite likely that translating sm4 to SSA would be more
expensive than passing it to the driver (and letting the driver translate it to SSA internally, probably), but translating sm6 to temps would be more expensive than passing it through to the driver. To be clear, my expectation would be that these redundant OpStore and OpLoad instructions largely don't matter. I could very well be wrong about that, of course. And perhaps there are other reasons to avoid using temps here, but I haven't seen those. So what I imagine we want to do, is to start with naively translating DXIL SSA values to vsir temp assignments. That should unblock upstreaming plenty of bits. We don't have control flow yet, we don't have phi instructions yet either, and it's not clear to me that there's a fundamental reason e.g. vsir phi instructions couldn't operate on vsir temps anyway. I think there are a few scenarios that could happen from there:
- It's fine.
- We introduce vsir SSA registers for some other reason, e.g. because
the HLSL compiler wants to do optimisations in that form; DXIL/SPIR-V could then just take advantage of that.
- We encounter some issue that can't reasonably be resolved with vsir
temps. We discuss it when we get there, instead of now in the abstract.
- We do some benchmarking/profiling and find that there are either
advantages in compile time or run time to introducing a separate register type. We discuss it when we get there, with the hard data to back it up, instead of speculating about it now. And perhaps this is also a good time to reflect on the broader upstreaming strategy. On a very broad level, the sequence that I would have hoped/expected to see would be something along these lines:
- Some shader_runner infrastructure to compile and run DXIL shaders.
- The most basic, straightforward implementation of the bits required
to make a fairly minimal test pass. E.g., tests/hlsl/swizzles.shader_test.
- Basic, straightforward implementations of features required to make
the rest of the tests pass.
- Features required by applications, but not covered by the tests.
Writing tests for these as they're implemented.
- Optimisations and other complications.
Also, I'd like to stress this because it has come up before for other MRs, please don't pre-emptively add complications for issues that will only come up later; whether that's in a later patch in the same series or 400 patches later in the branch you're upstreaming from. Reviewers generally can't or don't want to look that far ahead, and it just ends up slowing the entire process down. Patches should make sense in isolation, at the point in time where they're introduced.
Just to explain my acceptance, I think too that the ideal flow would be to first convert SSA to temporary load and store, and introduce later SSA if there is an advantage. Dealing with loads and stores is something I expect downstream compilers to do pretty well (even the HLSL compiler does that, to a certain extent!), so I wouldn't stress too much over that. Phi nodes shouldn't be too much of a hassle either: just before emitting a jump you look at the phi nodes of the target block and convert them to assignments, unless I am missing something.
That said, I don't think it is unreasonable to go with SSA from the get-go, so I accept this MR and leave to Henri to do the actual software architecture part.
I tested an incrementing temp register index for each value. For a simple pass-through pixel shader, here is the SPIR-V function with SSA:
```%main = OpFunction %void None %3 %4 = OpLabel %13 = OpLoad %v4float %gl_FragCoord %14 = OpCompositeExtract %float %13 3 %16 = OpFDiv %float %float_1 %14 %17 = OpCompositeInsert %v4float %16 %13 3 OpStore %v0 %17 %21 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_0 %22 = OpLoad %float %21 %24 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_1 %25 = OpLoad %float %24 %27 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_2 %28 = OpLoad %float %27 %30 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_3 %31 = OpLoad %float %30 %33 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_0 OpStore %33 %22 %34 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_1 OpStore %34 %25 %35 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_2 OpStore %35 %28 %36 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_3 OpStore %36 %31 OpReturn OpFunctionEnd ```
And with temps:
```%main = OpFunction %void None %3 %4 = OpLabel %17 = OpLoad %v4float %gl_FragCoord %18 = OpCompositeExtract %float %17 3 %20 = OpFDiv %float %float_1 %18 %21 = OpCompositeInsert %v4float %20 %17 3 OpStore %v0 %21 %25 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_0 %26 = OpLoad %float %25 %27 = OpInBoundsAccessChain %_ptr_Private_float %r0 %uint_0 OpStore %27 %26 %29 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_1 %30 = OpLoad %float %29 %31 = OpInBoundsAccessChain %_ptr_Private_float %r1 %uint_0 OpStore %31 %30 %33 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_2 %34 = OpLoad %float %33 %35 = OpInBoundsAccessChain %_ptr_Private_float %r2 %uint_0 OpStore %35 %34 %37 = OpInBoundsAccessChain %_ptr_Private_float %v0 %uint_3 %38 = OpLoad %float %37 %39 = OpInBoundsAccessChain %_ptr_Private_float %r3 %uint_0 OpStore %39 %38 %40 = OpInBoundsAccessChain %_ptr_Private_float %r0 %uint_0 %41 = OpLoad %float %40 %43 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_0 OpStore %43 %41 %44 = OpInBoundsAccessChain %_ptr_Private_float %r1 %uint_0 %45 = OpLoad %float %44 %46 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_1 OpStore %46 %45 %47 = OpInBoundsAccessChain %_ptr_Private_float %r2 %uint_0 %48 = OpLoad %float %47 %49 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_2 OpStore %49 %48 %50 = OpInBoundsAccessChain %_ptr_Private_float %r3 %uint_0 %51 = OpLoad %float %50 %52 = OpInBoundsAccessChain %_ptr_Output_float %o0 %uint_3 OpStore %52 %51 OpReturn OpFunctionEnd ```
I can make it like this if necessary, but IMO using temps this way is an ugly hack which makes debugging SPIR-V traces harder. Once 1.9 is out, temps must be replaced with SSA because using them is a dead end. We should not attempt to implement a temp register allocator which works on DXIL control flow graphs.
Also, I'd like to stress this because it has come up before for other MRs, please don't pre-emptively add complications for issues that will only come up later; whether that's in a later patch in the same series or 400 patches later in the branch you're upstreaming from. Reviewers generally can't or don't want to look that far ahead, and it just ends up slowing the entire process down. Patches should make sense in isolation, at the point in time where they're introduced.
I agree strongly with this in general. I find myself often asking people to do this. But—and I'm sure I'm missing some perspective here—I don't see it as being a problem in this specific case.
Prematurely adding complications is a problem, I think, when I (as a reviewer) can't see how they're going to be used, and when I feel that there might be a better, simpler option that will possibly preclude the more complex code from *ever* being necessary.
I don't see an issue with the first part; I have the barest of knowledge about sm6 but I can tell that if it has SSA nodes, then we're introducing a vsir SSA node to translate from sm6 SSA nodes and that it's going to be used to translate to spirv SSA nodes.
As for whether there's a better, simpler option—I have a hard time personally seeing temps as simpler. This is partly because the SSA register type is just not complicated to begin with, and I don't expect the sm6 -> vsir or vsir -> spirv translations thereof to be complicated either. But it's also because passes on vsir (both the ones I'm anticipating related to sm1/sm4 output, and the ones Conor is talking about) are going to be distinctly harder to do on temps than on SSA nodes. [Some of this is armchair analysis, some of this may be bias from having just written the code, but some of it is influenced by comparing copy-prop to our other HLSL passes. Copy-prop is more complex and harder to reason with in general, and I think roughly models how we'd have to do passes on non-SSA.]
(And, well, I'm more than a little concerned about compilation speed, and compiled shader size, which makes me think that we likely *will* SSA nodes in vsir at some point. But I don't have any data to back that up, so I'll let go of that concern for now...)
Also, I'd like to stress this because it has come up before for other MRs, please don't pre-emptively add complications for issues that will only come up later; whether that's in a later patch in the same series or 400 patches later in the branch you're upstreaming from. Reviewers generally can't or don't want to look that far ahead, and it just ends up slowing the entire process down. Patches should make sense in isolation, at the point in time where they're introduced.
I agree strongly with this in general. I find myself often asking people to do this. But—and I'm sure I'm missing some perspective here—I don't see it as being a problem in this specific case.
Well, part of the context is at which point in the release process we are. We were about a week into the freeze when this was first submitted, and we're about a week from the release now. If this had been submitted in e.g. the first month or so after the 1.8 release, I would probably still have grumbled about the process and posted a comment along the lines of "please do better", but it may have ultimately gone in. As it is, I don't think so, and for the most part that's simply because the MR is touching things outside dxil.c that it doesn't strictly have to touch.
Prematurely adding complications is a problem, I think, when I (as a reviewer) can't see how they're going to be used, and when I feel that there might be a better, simpler option that will possibly preclude the more complex code from *ever* being necessary.
I don't see an issue with the first part; I have the barest of knowledge about sm6 but I can tell that if it has SSA nodes, then we're introducing a vsir SSA node to translate from sm6 SSA nodes and that it's going to be used to translate to spirv SSA nodes.
As for whether there's a better, simpler option—I have a hard time personally seeing temps as simpler.
Which parts of the system are you considering when you make that evaluation? Introducing SSA nodes is certainly simpler if you consider dxil.c in isolation, or even the dxil -> spirv path in isolation. That's (trivially) not true when you consider e.g. d3d_asm.c or spirv.c in isolation. Similarly, SSA nodes are extra work for someone adding e.g. a MSL or GLSL backend. What about transformation passes in ir.c? Are those potentially going to need to handle VKD3DSPR_SSA? We don't run any of the existing ones on dxil sources, but that doesn't make the consideration just go away.
It gets more complicated when you consider vkd3d-shader as a whole. We add some complexity in some places, but avoid some in others. We probably also eventually want SSA nodes for the HLSL compiler anyway. Is that worth it in the end? Well, probably.
What we're concerned about here though, is review/upstreaming complexity. And it's not even close there; SSA nodes introduce the evaluation above, as well as e.g. considerations about whether it's prudent to introduce these at this point in the release process, while for changes confined to dxil.c it would be only slight more complicated than "Well, we didn't support dxil before, so it's not terribly likely to make things worse."
This is partly because the SSA register type is just not complicated to begin with, and I don't expect the sm6 -> vsir or vsir -> spirv translations thereof to be complicated either. But it's also because passes on vsir (both the ones I'm anticipating related to sm1/sm4 output, and the ones Conor is talking about) are going to be distinctly harder to do on temps than on SSA nodes. [Some of this is armchair analysis, some of this may be bias from having just written the code, but some of it is influenced by comparing copy-prop to our other HLSL passes. Copy-prop is more complex and harder to reason with in general, and I think roughly models how we'd have to do passes on non-SSA.]
(And, well, I'm more than a little concerned about compilation speed, and compiled shader size, which makes me think that we likely *will* SSA nodes in vsir at some point. But I don't have any data to back that up, so I'll let go of that concern for now...)
Sure; I don't think anyone raised concerns about introducing SSA nodes per se. The issue is mostly about whether the rest of this MR (and by extension future dxil patches) should depend on that (and by extension the 1.9 release).
Sure; I don't think anyone raised concerns about introducing SSA nodes per se. The issue is mostly about whether the rest of this MR (and by extension future dxil patches) should depend on that (and by extension the 1.9 release).
That makes more sense to me. Personally I wasn't even considering 1.9, and if I was, I would figure that if we're late enough in the release process to be that concerned, we wouldn't want to accept this anyway. I can see why we'd accept isolated changes to the sm6 stuff, though.
[I also did not consider GLSL or MSL, although having done so, I feel like we can translate those to an appropriately-namespaced temp? I.e. it shouldn't be that hard? It's not taking advantage of the SSA part, but it'd at least work.]
I think there is a bit of a tradeoff here, if we do use SSA eventually... namely, the unused complexity it adds to vsir, versus the cost of asking Conor to rewrite his patches just so he can un-rewrite them later. I'm still inclined to say the latter outweighs the former, and if that means we have to wait for 1.9 so be it. But I won't belabor the argument.
Which parts of the system are you considering when you make that evaluation? Introducing SSA nodes is certainly simpler if you consider dxil.c in isolation, or even the dxil -> spirv path in isolation. That's (trivially) not true when you consider e.g. d3d_asm.c or spirv.c in isolation. Similarly, SSA nodes are extra work for someone adding e.g. a MSL or GLSL backend. What about transformation passes in ir.c? Are those potentially going to need to handle VKD3DSPR_SSA? We don't run any of the existing ones on dxil sources, but that doesn't make the consideration just go away.
As I said, I agree with Henri that it would have been sensible to first use temporaries, but as a side consideration if we eventually introduce SSA I don't expect that necessarily every consumer of VSIR is expected to handle it. We could internally have a few different dialects, for example one that allows SSA and one that doesn't. And maybe a pass that lowers SSA to temporaries, so that a SSA-less consumer can be chained with a SSA-ful producer. Maybe even a pass that raises temporaries to SSA (i.e., a copy propagation pass), if that turns out to be useful.
I think there is a bit of a tradeoff here, if we do use SSA eventually... namely, the unused complexity it adds to vsir, versus the cost of asking Conor to rewrite his patches just so he can un-rewrite them later. I'm still inclined to say the latter outweighs the former, and if that means we have to wait for 1.9 so be it. But I won't belabor the argument.
Right, at this point the likely scenario is that we'll release 1.9 without dxil support, and then commit this MR more or less as-is after the release. That's unfortunate though, and I don't think it necessarily had to be that way. Perhaps more importantly, I'd also like to make sure we try to avoid similar situations going forward.
I didn't see DXIL support as being particularly relevant to 1.9 because it would be the barest minimum, but I realise it's meaningful as a milestone to have a test pass. Are there other reasons I'm not aware of?
Having just dealt with temps in CFGs, I think SSA -> temp conversion may be reasonable to implement, with a risk of getting caught up in edge cases. It would allow deletion of all the phi complications in the structuriser, which may be more than enough to justify it. It could also break the structuriser and make the project take a lot longer than expected. Also, we could clean up the SPIR-V code by emitting all temp access chains in the entry block with a build-once function, and name them, e.g. `r0.x_ptr`. Then optimise away many loads/stores by tracking and reusing result values within blocks. This is just to make debugging easier.
If I test this before the 1.9 release and it works out okay then SSA can be removed from the MR and it will be ready right after 1.9. But the shader runner tests come first.
I didn't see DXIL support as being particularly relevant to 1.9 because it would be the barest minimum, but I realise it's meaningful as a milestone to have a test pass. Are there other reasons I'm not aware of?
Fundamentally, we'd like development to happen upstream as much as possible. There are situations where developing/prototyping something on a branch first makes sense, and this is certainly one of them, but there are costs to maintaining branches. The most obvious of those is perhaps the time spent rebasing to account for upstream changes, but you can also think about e.g. the branch diverging from what's acceptable upstream, people spending time debugging issues that are already fixed on the branch, different people independently writing the same patches, and so on. I.e., we'd simply like to minimise the set of changes that's not upstream as soon as possible. And to give some sense of the kind of upstreaming rate I'd look for, I think for about 80 percent or so of this branch, about 20 patches a week on average should be achievable.