Signed-off-by: Rémi Bernon rbernon@codeweavers.com ---
The shader_sm4_read_instruction function shows up in perf report when running SOTTR on Intel because of this loop.
The commented enumeration entries are mostly here to help the reader validate that the opcode_table matches the enumeration.
libs/vkd3d-shader/dxbc.c | 54 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-)
diff --git a/libs/vkd3d-shader/dxbc.c b/libs/vkd3d-shader/dxbc.c index 98c51e42..794c6c66 100644 --- a/libs/vkd3d-shader/dxbc.c +++ b/libs/vkd3d-shader/dxbc.c @@ -117,10 +117,13 @@
enum vkd3d_sm4_opcode { + VKD3D_SM4_OP_INVALID = -1, VKD3D_SM4_OP_ADD = 0x00, VKD3D_SM4_OP_AND = 0x01, VKD3D_SM4_OP_BREAK = 0x02, VKD3D_SM4_OP_BREAKC = 0x03, + /* VKD3D_SM4_OP_? = 0x04, */ + /* VKD3D_SM4_OP_? = 0x05, */ VKD3D_SM4_OP_CASE = 0x06, VKD3D_SM4_OP_CONTINUE = 0x07, VKD3D_SM4_OP_CONTINUEC = 0x08, @@ -135,6 +138,7 @@ enum vkd3d_sm4_opcode VKD3D_SM4_OP_DP4 = 0x11, VKD3D_SM4_OP_ELSE = 0x12, VKD3D_SM4_OP_EMIT = 0x13, + /* VKD3D_SM4_OP_? = 0x14, */ VKD3D_SM4_OP_ENDIF = 0x15, VKD3D_SM4_OP_ENDLOOP = 0x16, VKD3D_SM4_OP_ENDSWITCH = 0x17, @@ -196,6 +200,7 @@ enum vkd3d_sm4_opcode VKD3D_SM4_OP_ULT = 0x4f, VKD3D_SM4_OP_UGE = 0x50, VKD3D_SM4_OP_UMUL = 0x51, + /* VKD3D_SM4_OP_? = 0x52, */ VKD3D_SM4_OP_UMAX = 0x53, VKD3D_SM4_OP_UMIN = 0x54, VKD3D_SM4_OP_USHR = 0x55, @@ -215,20 +220,24 @@ enum vkd3d_sm4_opcode VKD3D_SM4_OP_DCL_INPUT_PS_SGV = 0x63, VKD3D_SM4_OP_DCL_INPUT_PS_SIV = 0x64, VKD3D_SM4_OP_DCL_OUTPUT = 0x65, + /* VKD3D_SM4_OP_? = 0x66, */ VKD3D_SM4_OP_DCL_OUTPUT_SIV = 0x67, VKD3D_SM4_OP_DCL_TEMPS = 0x68, VKD3D_SM4_OP_DCL_INDEXABLE_TEMP = 0x69, VKD3D_SM4_OP_DCL_GLOBAL_FLAGS = 0x6a, + /* VKD3D_SM4_OP_? = 0x6b, */ VKD3D_SM4_OP_LOD = 0x6c, VKD3D_SM4_OP_GATHER4 = 0x6d, VKD3D_SM4_OP_SAMPLE_POS = 0x6e, VKD3D_SM4_OP_SAMPLE_INFO = 0x6f, + /* VKD3D_SM5_OP_? = 0x70, */ VKD3D_SM5_OP_HS_DECLS = 0x71, VKD3D_SM5_OP_HS_CONTROL_POINT_PHASE = 0x72, VKD3D_SM5_OP_HS_FORK_PHASE = 0x73, VKD3D_SM5_OP_HS_JOIN_PHASE = 0x74, VKD3D_SM5_OP_EMIT_STREAM = 0x75, VKD3D_SM5_OP_CUT_STREAM = 0x76, + /* VKD3D_SM5_OP_? = 0x77, */ VKD3D_SM5_OP_FCALL = 0x78, VKD3D_SM5_OP_BUFINFO = 0x79, VKD3D_SM5_OP_DERIV_RTX_COARSE = 0x7a, @@ -241,6 +250,8 @@ enum vkd3d_sm4_opcode VKD3D_SM5_OP_RCP = 0x81, VKD3D_SM5_OP_F32TOF16 = 0x82, VKD3D_SM5_OP_F16TOF32 = 0x83, + /* VKD3D_SM5_OP_? = 0x84, */ + /* VKD3D_SM5_OP_? 
= 0x85, */ VKD3D_SM5_OP_COUNTBITS = 0x86, VKD3D_SM5_OP_FIRSTBIT_HI = 0x87, VKD3D_SM5_OP_FIRSTBIT_LO = 0x88, @@ -298,6 +309,19 @@ enum vkd3d_sm4_opcode VKD3D_SM5_OP_IMM_ATOMIC_UMAX = 0xbc, VKD3D_SM5_OP_IMM_ATOMIC_UMIN = 0xbd, VKD3D_SM5_OP_SYNC = 0xbe, + /* VKD3D_SM5_OP_? = 0xbf, */ + /* VKD3D_SM5_OP_? = 0xc0, */ + /* VKD3D_SM5_OP_? = 0xc1, */ + /* VKD3D_SM5_OP_? = 0xc2, */ + /* VKD3D_SM5_OP_? = 0xc3, */ + /* VKD3D_SM5_OP_? = 0xc4, */ + /* VKD3D_SM5_OP_? = 0xc5, */ + /* VKD3D_SM5_OP_? = 0xc6, */ + /* VKD3D_SM5_OP_? = 0xc7, */ + /* VKD3D_SM5_OP_? = 0xc8, */ + /* VKD3D_SM5_OP_? = 0xc9, */ + /* VKD3D_SM5_OP_? = 0xca, */ + /* VKD3D_SM5_OP_? = 0xcb, */ VKD3D_SM5_OP_EVAL_SAMPLE_INDEX = 0xcc, VKD3D_SM5_OP_EVAL_CENTROID = 0xcd, VKD3D_SM5_OP_DCL_GS_INSTANCES = 0xce, @@ -947,6 +971,8 @@ static const struct vkd3d_sm4_opcode_info opcode_table[] = {VKD3D_SM4_OP_BREAK, VKD3DSIH_BREAK, "", ""}, {VKD3D_SM4_OP_BREAKC, VKD3DSIH_BREAKP, "", "u", shader_sm4_read_conditional_op}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM4_OP_CASE, VKD3DSIH_CASE, "", "u"}, {VKD3D_SM4_OP_CONTINUE, VKD3DSIH_CONTINUE, "", ""}, {VKD3D_SM4_OP_CONTINUEC, VKD3DSIH_CONTINUEP, "", "u", @@ -963,6 +989,7 @@ static const struct vkd3d_sm4_opcode_info opcode_table[] = {VKD3D_SM4_OP_DP4, VKD3DSIH_DP4, "f", "ff"}, {VKD3D_SM4_OP_ELSE, VKD3DSIH_ELSE, "", ""}, {VKD3D_SM4_OP_EMIT, VKD3DSIH_EMIT, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM4_OP_ENDIF, VKD3DSIH_ENDIF, "", ""}, {VKD3D_SM4_OP_ENDLOOP, VKD3DSIH_ENDLOOP, "", ""}, {VKD3D_SM4_OP_ENDSWITCH, VKD3DSIH_ENDSWITCH, "", ""}, @@ -1027,6 +1054,7 @@ static const struct vkd3d_sm4_opcode_info opcode_table[] = {VKD3D_SM4_OP_ULT, VKD3DSIH_ULT, "u", "uu"}, {VKD3D_SM4_OP_UGE, VKD3DSIH_UGE, "u", "uu"}, {VKD3D_SM4_OP_UMUL, VKD3DSIH_UMUL, "uu", "uu"}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM4_OP_UMAX, VKD3DSIH_UMAX, "u", "uu"}, {VKD3D_SM4_OP_UMIN, VKD3DSIH_UMIN, "u", "uu"}, 
{VKD3D_SM4_OP_USHR, VKD3DSIH_USHR, "u", "uu"}, @@ -1060,6 +1088,7 @@ static const struct vkd3d_sm4_opcode_info opcode_table[] = shader_sm4_read_dcl_input_ps_siv}, {VKD3D_SM4_OP_DCL_OUTPUT, VKD3DSIH_DCL_OUTPUT, "", "", shader_sm4_read_declaration_dst}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM4_OP_DCL_OUTPUT_SIV, VKD3DSIH_DCL_OUTPUT_SIV, "", "", shader_sm4_read_declaration_register_semantic}, {VKD3D_SM4_OP_DCL_TEMPS, VKD3DSIH_DCL_TEMPS, "", "", @@ -1068,16 +1097,19 @@ static const struct vkd3d_sm4_opcode_info opcode_table[] = shader_sm4_read_dcl_indexable_temp}, {VKD3D_SM4_OP_DCL_GLOBAL_FLAGS, VKD3DSIH_DCL_GLOBAL_FLAGS, "", "", shader_sm4_read_dcl_global_flags}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM4_OP_LOD, VKD3DSIH_LOD, "f", "fRS"}, {VKD3D_SM4_OP_GATHER4, VKD3DSIH_GATHER4, "u", "fRS"}, {VKD3D_SM4_OP_SAMPLE_POS, VKD3DSIH_SAMPLE_POS, "f", "Ru"}, {VKD3D_SM4_OP_SAMPLE_INFO, VKD3DSIH_SAMPLE_INFO, "f", "R"}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM5_OP_HS_DECLS, VKD3DSIH_HS_DECLS, "", ""}, {VKD3D_SM5_OP_HS_CONTROL_POINT_PHASE, VKD3DSIH_HS_CONTROL_POINT_PHASE, "", ""}, {VKD3D_SM5_OP_HS_FORK_PHASE, VKD3DSIH_HS_FORK_PHASE, "", ""}, {VKD3D_SM5_OP_HS_JOIN_PHASE, VKD3DSIH_HS_JOIN_PHASE, "", ""}, {VKD3D_SM5_OP_EMIT_STREAM, VKD3DSIH_EMIT_STREAM, "", "f"}, {VKD3D_SM5_OP_CUT_STREAM, VKD3DSIH_CUT_STREAM, "", "f"}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM5_OP_FCALL, VKD3DSIH_FCALL, "", "O", shader_sm5_read_fcall}, {VKD3D_SM5_OP_BUFINFO, VKD3DSIH_BUFINFO, "i", "U"}, @@ -1091,6 +1123,8 @@ static const struct vkd3d_sm4_opcode_info opcode_table[] = {VKD3D_SM5_OP_RCP, VKD3DSIH_RCP, "f", "f"}, {VKD3D_SM5_OP_F32TOF16, VKD3DSIH_F32TOF16, "u", "f"}, {VKD3D_SM5_OP_F16TOF32, VKD3DSIH_F16TOF32, "f", "u"}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM5_OP_COUNTBITS, VKD3DSIH_COUNTBITS, "u", "u"}, {VKD3D_SM5_OP_FIRSTBIT_HI, VKD3DSIH_FIRSTBIT_HI, "u", "u"}, 
{VKD3D_SM5_OP_FIRSTBIT_LO, VKD3DSIH_FIRSTBIT_LO, "u", "u"}, @@ -1168,6 +1202,19 @@ static const struct vkd3d_sm4_opcode_info opcode_table[] = {VKD3D_SM5_OP_IMM_ATOMIC_UMIN, VKD3DSIH_IMM_ATOMIC_UMIN, "uU", "iu"}, {VKD3D_SM5_OP_SYNC, VKD3DSIH_SYNC, "", "", shader_sm5_read_sync}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, + {VKD3D_SM4_OP_INVALID, VKD3DSIH_NOP, "", ""}, {VKD3D_SM5_OP_EVAL_SAMPLE_INDEX, VKD3DSIH_EVAL_SAMPLE_INDEX, "f", "fi"}, {VKD3D_SM5_OP_EVAL_CENTROID, VKD3DSIH_EVAL_CENTROID, "f", "f"}, {VKD3D_SM5_OP_DCL_GS_INSTANCES, VKD3DSIH_DCL_GS_INSTANCES, "", "", @@ -1220,11 +1267,10 @@ static const enum vkd3d_shader_register_type register_type_table[] =
static const struct vkd3d_sm4_opcode_info *get_opcode_info(enum vkd3d_sm4_opcode opcode) { - unsigned int i; - - for (i = 0; i < sizeof(opcode_table) / sizeof(*opcode_table); ++i) + if (opcode < ARRAY_SIZE(opcode_table) && opcode_table[opcode].opcode != VKD3D_SM4_OP_INVALID) { - if (opcode == opcode_table[i].opcode) return &opcode_table[i]; + assert(opcode_table[opcode].opcode == opcode); + return &opcode_table[opcode]; }
return NULL; -- 2.23.0
Signed-off-by: Rémi Bernon rbernon@codeweavers.com ---
Same thing, the vkd3d_spirv_build_word function shows up in perf report, and we can instead reserve space upfront.
libs/vkd3d-shader/spirv.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/libs/vkd3d-shader/spirv.c b/libs/vkd3d-shader/spirv.c index 75a3a4f3..d74d2d38 100644 --- a/libs/vkd3d-shader/spirv.c +++ b/libs/vkd3d-shader/spirv.c @@ -371,12 +371,17 @@ static uint32_t vkd3d_spirv_opcode_word(SpvOp op, unsigned int word_count) return (word_count << SpvWordCountShift) | op; }
-static void vkd3d_spirv_build_word(struct vkd3d_spirv_stream *stream, uint32_t word) +static void vkd3d_spirv_reserve_words(struct vkd3d_spirv_stream *stream, uint32_t count) { if (!vkd3d_array_reserve((void **)&stream->words, &stream->capacity, - stream->word_count + 1, sizeof(*stream->words))) - return; + stream->word_count + count, sizeof(*stream->words))) + ERR("Not enough memory to reserve words.\n"); + assert(stream->capacity >= stream->word_count + count); +}
+static void vkd3d_spirv_build_word(struct vkd3d_spirv_stream *stream, uint32_t word) +{ + assert(stream->capacity >= stream->word_count + 1); stream->words[stream->word_count++] = word; }
@@ -563,12 +568,14 @@ static uint32_t vkd3d_spirv_build_once7(struct vkd3d_spirv_builder *builder, */ static void vkd3d_spirv_build_op(struct vkd3d_spirv_stream *stream, SpvOp op) { + vkd3d_spirv_reserve_words(stream, 1); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(op, 1)); }
static void vkd3d_spirv_build_op1(struct vkd3d_spirv_stream *stream, SpvOp op, uint32_t operand) { + vkd3d_spirv_reserve_words(stream, 2); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(op, 2)); vkd3d_spirv_build_word(stream, operand); } @@ -577,6 +584,7 @@ static void vkd3d_spirv_build_op1v(struct vkd3d_spirv_stream *stream, SpvOp op, uint32_t operand0, const uint32_t *operands, unsigned int operand_count) { unsigned int i; + vkd3d_spirv_reserve_words(stream, 2 + operand_count); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(op, 2 + operand_count)); vkd3d_spirv_build_word(stream, operand0); for (i = 0; i < operand_count; ++i) @@ -588,6 +596,7 @@ static void vkd3d_spirv_build_op2v(struct vkd3d_spirv_stream *stream, const uint32_t *operands, unsigned int operand_count) { unsigned int i; + vkd3d_spirv_reserve_words(stream, 3 + operand_count); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(op, 3 + operand_count)); vkd3d_spirv_build_word(stream, operand0); vkd3d_spirv_build_word(stream, operand1); @@ -600,6 +609,7 @@ static void vkd3d_spirv_build_op3v(struct vkd3d_spirv_stream *stream, const uint32_t *operands, unsigned int operand_count) { unsigned int i; + vkd3d_spirv_reserve_words(stream, 4 + operand_count); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(op, 4 + operand_count)); vkd3d_spirv_build_word(stream, operand0); vkd3d_spirv_build_word(stream, operand1); @@ -712,6 +722,7 @@ static uint32_t vkd3d_spirv_build_op_tr2v(struct vkd3d_spirv_builder *builder, { uint32_t result_id = vkd3d_spirv_alloc_id(builder); unsigned int i; + vkd3d_spirv_reserve_words(stream, 5 + operand_count); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(op, 5 + operand_count)); vkd3d_spirv_build_word(stream, result_type); vkd3d_spirv_build_word(stream, result_id); @@ -775,6 +786,7 @@ static void vkd3d_spirv_build_op_extension(struct vkd3d_spirv_stream *stream, const char *name) { unsigned int name_size = vkd3d_spirv_string_word_count(name); + 
vkd3d_spirv_reserve_words(stream, 1 + name_size); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(SpvOpExtension, 1 + name_size)); vkd3d_spirv_build_string(stream, name, name_size); } @@ -783,6 +795,7 @@ static void vkd3d_spirv_build_op_ext_inst_import(struct vkd3d_spirv_stream *stre uint32_t result_id, const char *name) { unsigned int name_size = vkd3d_spirv_string_word_count(name); + vkd3d_spirv_reserve_words(stream, 2 + name_size); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(SpvOpExtInstImport, 2 + name_size)); vkd3d_spirv_build_word(stream, result_id); vkd3d_spirv_build_string(stream, name, name_size); @@ -807,6 +820,7 @@ static void vkd3d_spirv_build_op_entry_point(struct vkd3d_spirv_stream *stream, uint32_t *interface_list, unsigned int interface_size) { unsigned int i, name_size = vkd3d_spirv_string_word_count(name); + vkd3d_spirv_reserve_words(stream, 3 + name_size + interface_size); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(SpvOpEntryPoint, 3 + name_size + interface_size)); vkd3d_spirv_build_word(stream, model); vkd3d_spirv_build_word(stream, function_id); @@ -835,6 +849,7 @@ static void vkd3d_spirv_build_op_name(struct vkd3d_spirv_builder *builder, va_end(args);
name_size = vkd3d_spirv_string_word_count(name); + vkd3d_spirv_reserve_words(stream, 2 + name_size); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(SpvOpName, 2 + name_size)); vkd3d_spirv_build_word(stream, id); vkd3d_spirv_build_string(stream, name, name_size); @@ -854,6 +869,7 @@ static void vkd3d_spirv_build_op_member_name(struct vkd3d_spirv_builder *builder va_end(args);
name_size = vkd3d_spirv_string_word_count(name); + vkd3d_spirv_reserve_words(stream, 3 + name_size); vkd3d_spirv_build_word(stream, vkd3d_spirv_opcode_word(SpvOpMemberName, 3 + name_size)); vkd3d_spirv_build_word(stream, type_id); vkd3d_spirv_build_word(stream, member); @@ -1699,6 +1715,7 @@ static bool vkd3d_spirv_compile_module(struct vkd3d_spirv_builder *builder,
vkd3d_spirv_stream_init(&stream);
+ vkd3d_spirv_reserve_words(&stream, 5); vkd3d_spirv_build_word(&stream, SpvMagicNumber); vkd3d_spirv_build_word(&stream, VKD3D_SPIRV_VERSION); vkd3d_spirv_build_word(&stream, VKD3D_SPIRV_GENERATOR_MAGIC); -- 2.23.0
On Thu, 3 Oct 2019 at 20:42, Rémi Bernon rbernon@codeweavers.com wrote:
Same thing, the vkd3d_spirv_build_word function shows up in perf report, and we can instead reserve space upfront.
Likewise.
On Thu, 3 Oct 2019 at 20:42, Rémi Bernon rbernon@codeweavers.com wrote:
The shader_sm4_read_instruction function shows up in perf report when running SOTTR on Intel because of this loop.
That seems like a questionable claim. Does this actually improve things? Do you have numbers? Direct3D 12 applications should ideally not be creating pipeline states at all during rendering, but if they do, actual shader compilation is going to be much more expensive than anything we do here.
That's not to say this can't be improved though.
On 10/3/19 9:05 PM, Henri Verbeet wrote:
On Thu, 3 Oct 2019 at 20:42, Rémi Bernon rbernon@codeweavers.com wrote:
The shader_sm4_read_instruction function shows up in perf report when running SOTTR on Intel because of this loop.
That seems like a questionable claim. Does this actually improve things? Do you have numbers? Direct3D 12 applications should ideally not be creating pipeline states at all during rendering, but if they do, actual shader compilation is going to be much more expensive than anything we do here.
That's not to say this can't be improved though.
Yes, I did the measurements, and perf (with default settings) reports the function going from ~2.5% self overhead down to 0.6% with this patch. For the second patch, perf was reporting the other function at 1.7% self overhead, and it no longer reports it with the patch applied, because it gets inlined somewhere - but the sum of all the vkd3d_spirv function overhead is lowered.
My interpretation is that the shader compilation happens at startup but is CPU bound - and it is noticeable. I didn't let the game run for very long after that, but it is highly GPU bound afterwards, so nothing in particular shows up in perf.