2016-05-26 20:28 GMT+02:00 Paul Gofman gofmanp@gmail.com:
Signed-off-by: Paul Gofman gofmanp@gmail.com
dlls/d3dx9_36/d3dx9_private.h | 25 +++- dlls/d3dx9_36/preshader.c | 259 +++++++++++++++++++++++++++++++++++------- 2 files changed, 237 insertions(+), 47 deletions(-)
diff --git a/dlls/d3dx9_36/d3dx9_private.h b/dlls/d3dx9_36/d3dx9_private.h index 9938d83..970ef2f 100644 --- a/dlls/d3dx9_36/d3dx9_private.h +++ b/dlls/d3dx9_36/d3dx9_private.h @@ -129,14 +129,33 @@ enum pres_reg_tables PRES_REGTAB_FIRST_SHADER = PRES_REGTAB_CONST, };
+enum const_param_copy_state_type +{
- COPY_STATE_TYPE_PARAM,
- COPY_STATE_TYPE_TABLE,
- COPY_STATE_TYPE_PARAM_OFFSET,
- COPY_STATE_TYPE_TABLE_OFFSET,
- COPY_STATE_TYPE_COPY_COUNT
The name of the last enumerator is a bit ambiguous (it might be mistaken to be a _COUNT entry == last actual enumerator + 1). I would either change the name a bit or move it from the last place. Although this might be moot, see below.
+struct d3dx_const_copy_state
I'm not entirely happy with the name, maybe d3dx_const_param_eval_output or something in that fashion?
@@ -811,10 +820,10 @@ static void d3dx_free_const_tab(struct d3dx_const_tab *ctab) { HeapFree(GetProcessHeap(), 0, ctab->inputs); HeapFree(GetProcessHeap(), 0, ctab->inputs_param);
- if (ctab->ctab)
ID3DXConstantTable_Release(ctab->ctab);
- HeapFree(GetProcessHeap(), 0, ctab->const_set);
}
static void d3dx_free_preshader(struct d3dx_preshader *pres)
That extra blank line doesn't seem necessary.
@@ -835,14 +844,181 @@ void d3dx_free_param_eval(struct d3dx_param_eval *peval) HeapFree(GetProcessHeap(), 0, peval); }
-static HRESULT set_constants_param(struct d3dx_regstore *rs, struct d3dx_const_tab *const_tab,
D3DXHANDLE hc, struct d3dx_parameter *param)
+static void set_constants(struct d3dx_regstore *rs, struct d3dx_const_tab *const_tab) +{
- unsigned int i;
- struct d3dx_const_copy_state set_state;
- memset(&set_state, 0, sizeof(set_state));
- for (i = 0; i < const_tab->const_set_count; ++i)
- {
switch (const_tab->const_set[i].copy_state_type)
{
case COPY_STATE_TYPE_PARAM:
set_state.param = const_tab->const_set[i].state_change.param;
break;
case COPY_STATE_TYPE_TABLE:
set_state.table = const_tab->const_set[i].state_change.value;
break;
case COPY_STATE_TYPE_PARAM_OFFSET:
set_state.param_offset = const_tab->const_set[i].state_change.value;
break;
case COPY_STATE_TYPE_TABLE_OFFSET:
set_state.table_offset = const_tab->const_set[i].state_change.value;
break;
case COPY_STATE_TYPE_COPY_COUNT:
{
unsigned int count;
struct d3dx_parameter *param;
enum pres_value_type table_type;
unsigned int j;
count = const_tab->const_set[i].state_change.value;
param = set_state.param;
table_type = table_info[set_state.table].type;
if ((set_state.table_offset + count - 1) / table_info[set_state.table].reg_component_count
>= rs->table_sizes[set_state.table])
{
FIXME("Insufficient table space allocated.\n");
break;
}
if ((param->type == D3DXPT_FLOAT && table_type == PRES_VT_FLOAT)
|| (param->type == D3DXPT_INT && table_type == PRES_VT_INT)
|| (param->type == D3DXPT_BOOL && table_type == PRES_VT_BOOL))
{
regstore_set_values(rs, set_state.table, (unsigned int *)param->data + set_state.param_offset,
set_state.table_offset, count);
}
else
{
for (j = 0; j < count; ++j)
{
unsigned int out;
unsigned int *in = (unsigned int *)param->data + set_state.param_offset + j;
switch (table_info[set_state.table].type)
{
case PRES_VT_FLOAT: set_number(&out, D3DXPT_FLOAT, in, param->type); break;
case PRES_VT_INT: set_number(&out, D3DXPT_INT, in, param->type); break;
case PRES_VT_BOOL: set_number(&out, D3DXPT_BOOL, in, param->type); break;
default:
FIXME("Unexpected type %#x.\n", table_info[set_state.table].type);
break;
}
regstore_set_values(rs, set_state.table, &out, set_state.table_offset + j, 1);
}
}
set_state.param_offset += count;
set_state.table_offset += count;
break;
}
}
- }
+}
This seems overly complicated and fragile. Instead of this kind of "5 instructions bytecode" isn't it easier to just store and then lookup a struct with all the info you need? I guess the end result would be something akin to storing an array of struct d3dx_const_copy_state in place of struct d3dx_const_param_set. I don't know how the code would exactly look like but I would start by moving building all the struct d3dx_const_copy_param to init time. BTW the coalescing you currently do in add_const_set() could be potentially done as a separate pass after you generate all the "d3dx_const_copy_state-like" structures (but still at init time), which might be simpler.
Does this make sense? Am I missing something?
+static HRESULT add_const_set_value(struct d3dx_const_tab *const_tab, unsigned int value,
unsigned int *current_value, enum const_param_copy_state_type type)
+{
- struct d3dx_const_param_set set;
- HRESULT hr;
- if (*current_value != value)
- {
*current_value = value;
set.copy_state_type = type;
set.state_change.value = value;
if (FAILED(hr=append_const_set(const_tab, &set)))
Whitespace.
@@ -931,21 +1107,17 @@ static HRESULT set_constants_param(struct d3dx_regstore *rs, struct d3dx_const_t major_stride = max(minor, table_info[table].reg_component_count); n = min(major * major_stride, desc.RegisterCount * table_info[table].reg_component_count + major_stride - 1) / major_stride;
- for (i = 0; i < n; ++i) { for (j = 0; j < minor; ++j) {
unsigned int out;
unsigned int *in; unsigned int offset; offset = start_offset + i * major_stride + j;
if (offset / table_info[table].reg_component_count >= rs->table_sizes[table])
{
if (table_info[table].reg_component_count != 1)
FIXME("Output offset exceeds table size, name %s, component %u.\n", desc.Name, i);
if ((offset - start_offset) / table_info[table].reg_component_count >= desc.RegisterCount) break;
Maybe keeping a WARN or something makes sense here?
On 05/31/2016 12:24 AM, Matteo Bruni wrote:
+struct d3dx_const_copy_state
I'm not entirely happy with the name, maybe d3dx_const_param_eval_output or something in that fashion?
I will rename to d3dx_const_param_eval_output.
unsigned int out;
unsigned int *in = (unsigned int *)param->data + set_state.param_offset + j;
switch (table_info[set_state.table].type)
{
case PRES_VT_FLOAT: set_number(&out, D3DXPT_FLOAT, in, param->type); break;
case PRES_VT_INT: set_number(&out, D3DXPT_INT, in, param->type); break;
case PRES_VT_BOOL: set_number(&out, D3DXPT_BOOL, in, param->type); break;
default:
FIXME("Unexpected type %#x.\n", table_info[set_state.table].type);
break;
}
regstore_set_values(rs, set_state.table, &out, set_state.table_offset + j, 1);
}
}
set_state.param_offset += count;
set_state.table_offset += count;
break;
}
}
- }
+}
This seems overly complicated and fragile. Instead of this kind of "5 instructions bytecode" isn't it easier to just store and then lookup a struct with all the info you need?
Keeping a struct with offsets & count altogether is of course more straightforward, but I intentionally did it this way to save a lot of extra space and to introduce an easier coalescing. When there is a matrix transpose involved (which happens quite often in real apps as far as I have seen) the whole structure would be duplicated for every matrix element. In the current approach each element for transposed matrix has just one offset (destination offset changes sequentially and does not come to the array for each matrix element) and one count of 1. The other way to optimize this is to introduce a separate operation for transposed matrix copy, but will it really be simpler or nicer than it is now? Current approach is indeed a sort of encoding the data copy into a very simple bytecode. Keeping the whole struct and optimizing as a separate op for matrix transpose would be the same in principle, but I am afraid will result in a bit more code and a bit more mess as there will be more cases to handle. I thought the current approach is simpler as the data copy is encoded in a generic and simple way so on the actual copy I do not have to care for any specifics of initial parameter & constant layout, as well as in process of extending the array & coalescing.
I guess the end result would be something akin to storing an array of struct d3dx_const_copy_state in place of struct d3dx_const_param_set. I don't know how the code would exactly look like but I would start by moving building all the struct d3dx_const_copy_param to init time.
I am not sure I understand what you mean here, could you please clarify? In the variant currently suggested the whole building of structure responsible for parameter copy is already done solely at init time. Shader/preshader full constant structure is not stored anymore. There is still an array of parameters and constant description left which is currently used in effect.c code for setting sampler states. This could easily be optimized further by not storing the parameters other than samplers and storing just register index instead of the whole constant descs for them, but this looked as a separate step to me not fully related to this patch. Or probably I am misunderstanding your point here.
BTW the coalescing you currently do in add_const_set() could be potentially done as a separate pass after you generate all the "d3dx_const_copy_state-like" structures (but still at init time), which might be simpler.
It can be done as a separate pass, but why? In my understanding it will result just in bigger intermediate array and a few more lines of code. I will need to do all the same but not at once when getting an "copy request" but from scanning the pre-stored array. Or am I missing something?
Whitespace.
@@ -931,21 +1107,17 @@ static HRESULT set_constants_param(struct d3dx_regstore *rs, struct d3dx_const_t major_stride = max(minor, table_info[table].reg_component_count); n = min(major * major_stride, desc.RegisterCount * table_info[table].reg_component_count + major_stride - 1) / major_stride;
- for (i = 0; i < n; ++i) { for (j = 0; j < minor; ++j) {
unsigned int out;
unsigned int *in; unsigned int offset; offset = start_offset + i * major_stride + j;
if (offset / table_info[table].reg_component_count >= rs->table_sizes[table])
{
if (table_info[table].reg_component_count != 1)
FIXME("Output offset exceeds table size, name %s, component %u.\n", desc.Name, i);
if ((offset - start_offset) / table_info[table].reg_component_count >= desc.RegisterCount) break;
Maybe keeping a WARN or something makes sense here?
This break is just normal for boolean register set (it is covered by test case) and should not happen for vec4 register sets. This is because the inner loop can never be bound by desc.RegisterCount for vec4 registers (as nminor <= 4 and limiting just outer 'n' loop counter is sufficient), but can be bound for boolean single value registers.
I will get it back in the same way it was before (FIXME for vec4 register sets and nothing for boolean).
2016-05-31 11:48 GMT+02:00 Paul Gofman gofmanp@gmail.com:
On 05/31/2016 12:24 AM, Matteo Bruni wrote:
This seems overly complicated and fragile. Instead of this kind of "5 instructions bytecode" isn't it easier to just store and then lookup a struct with all the info you need?
Keeping a struct with offsets & count altogether is of course more straightforward, but I intentionally did it this way to save a lot of extra space and to introduce an easier coalescing. When there is a matrix transpose involved (which happens quite often in real apps as far as I have seen) the whole structure would be duplicated for every matrix element. In the current approach each element for transposed matrix has just one offset (destination offset changes sequentially and does not come to the array for each matrix element) and one count of 1. The other way to optimize this is to introduce a separate operation for transposed matrix copy, but will it really be simpler or nicer than it is now? Current approach is indeed a sort of encoding the data copy into a very simple bytecode. Keeping the whole struct and optimizing as a separate op for matrix transpose would be the same in principle, but I am afraid will result in a bit more code and a bit more mess as there will be more cases to handle. I thought the current approach is simpler as the data copy is encoded in a generic and simple way so on the actual copy I do not have to care for any specifics of initial parameter & constant layout, as well as in process of extending the array & coalescing.
I don't see why just moving the part of set_constants() which generates the set_state structures for each "copy" to init time would be more complicated than the current solution. It might take some more memory (not a whole lot more, AFAICS) but other than that it seems strictly nicer to me. Maybe it's a case of "beauty is in the eye of the beholder", I don't know... The issue with transposed matrices seems mostly orthogonal to the encoding of the constant uploads and I agree it's important. What about simply adding a transpose flag and the stride of the matrix and encode it all in one struct (or something like a COPY_STATE_TYPE_COPY_TRANSPOSED op for your approach)? It seems it should be good from a practical standpoint and it should reduce the number of operations (encoded either way) you have to store by a lot.
I guess the end result would be something akin to storing an array of struct d3dx_const_copy_state in place of struct d3dx_const_param_set. I don't know how the code would exactly look like but I would start by moving building all the struct d3dx_const_copy_param to init time.
I am not sure I understand what you mean here, could you please clarify? In the variant currently suggested the whole building of structure responsible for parameter copy is already done solely at init time. Shader/preshader full constant structure is not stored anymore. There is still an array of parameters and constant description left which is currently used in effect.c code for setting sampler states. This could easily be optimized further by not storing the parameters other than samplers and storing just register index instead of the whole constant descs for them, but this looked as a separate step to me not fully related to this patch. Or probably I am misunderstanding your point here.
Yeah, that can and should certainly be a separate patch. What I mean is storing the structs d3dx_const_copy_state you compute in set_constants() (as they are right before actually setting the values) in struct d3dx_const_tab in place of the structs d3dx_const_param_set you use to generate them. Does it make sense?
BTW the coalescing you currently do in add_const_set() could be potentially done as a separate pass after you generate all the "d3dx_const_copy_state-like" structures (but still at init time), which might be simpler.
It can be done as a separate pass, but why? In my understanding it will result just in bigger intermediate array and a few more lines of code. I will need to do all the same but not at once when getting an "copy request" but from scanning the pre-stored array. Or am I missing something?
No, that's right. It is just an alternative option which I mentioned because it might be simpler or more powerful (e.g. it makes possible to coalesce multiple copies which are not next to each other - it probably wouldn't help much at the moment since you can't coalesce accesses from multiple parameters) in some case. Feel free to take or ignore it.