Signed-off-by: Zebediah Figura z.figura12@gmail.com --- dlls/wined3d/cs.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index 00cf2f2f1d7..ce0417c9f71 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -3310,6 +3310,9 @@ struct wined3d_cs *wined3d_cs_create(struct wined3d_device *device, cs->c.device = device; cs->serialize_commands = TRACE_ON(d3d_sync) || wined3d_settings.cs_multithreaded & WINED3D_CSMT_SERIALIZE;
+ if (cs->serialize_commands) + ERR_(d3d_sync)("Forcing serialization of all command streams.\n"); + state_init(&cs->state, d3d_info, WINED3D_STATE_NO_REF | WINED3D_STATE_INIT_DEFAULT, cs->c.state->feature_level);
cs->data_size = WINED3D_INITIAL_CS_SIZE;
Signed-off-by: Zebediah Figura z.figura12@gmail.com --- dlls/wined3d/adapter_vk.c | 2 +- dlls/wined3d/context_gl.c | 4 ++-- dlls/wined3d/cs.c | 2 +- dlls/wined3d/device.c | 2 +- dlls/wined3d/directx.c | 2 +- dlls/wined3d/swapchain.c | 2 +- dlls/wined3d/wined3d_private.h | 8 ++++---- 7 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/dlls/wined3d/adapter_vk.c b/dlls/wined3d/adapter_vk.c index 850e2a6dfdb..d5001c1296b 100644 --- a/dlls/wined3d/adapter_vk.c +++ b/dlls/wined3d/adapter_vk.c @@ -541,7 +541,7 @@ static struct wined3d_context *adapter_vk_acquire_context(struct wined3d_device { TRACE("device %p, texture %p, sub_resource_idx %u.\n", device, texture, sub_resource_idx);
- wined3d_from_cs(device->cs); + wined3d_from_cs(device);
if (!device->context_count) return NULL; diff --git a/dlls/wined3d/context_gl.c b/dlls/wined3d/context_gl.c index 474003d7553..33cb76343cd 100644 --- a/dlls/wined3d/context_gl.c +++ b/dlls/wined3d/context_gl.c @@ -2295,7 +2295,7 @@ void wined3d_context_gl_destroy(struct wined3d_context_gl *context_gl)
TRACE("Destroying context %p.\n", context_gl);
- wined3d_from_cs(device->cs); + wined3d_from_cs(device);
/* We delay destroying a context when it is active. The context_release() * function invokes wined3d_context_gl_destroy() again while leaving the @@ -4229,7 +4229,7 @@ struct wined3d_context_gl *wined3d_context_gl_reacquire(struct wined3d_context_g return NULL;
device = context_gl->c.device; - wined3d_from_cs(device->cs); + wined3d_from_cs(device);
if (context_gl->c.current_rt.texture) { diff --git a/dlls/wined3d/cs.c b/dlls/wined3d/cs.c index ce0417c9f71..3acc41f93f5 100644 --- a/dlls/wined3d/cs.c +++ b/dlls/wined3d/cs.c @@ -3047,7 +3047,7 @@ static const struct wined3d_device_context_ops wined3d_cs_st_ops =
static BOOL wined3d_cs_queue_is_empty(const struct wined3d_cs *cs, const struct wined3d_cs_queue *queue) { - wined3d_from_cs(cs); + wined3d_from_cs(cs->c.device); return *(volatile LONG *)&queue->head == queue->tail; }
diff --git a/dlls/wined3d/device.c b/dlls/wined3d/device.c index 5a2a95ebe96..55879e546f2 100644 --- a/dlls/wined3d/device.c +++ b/dlls/wined3d/device.c @@ -5765,7 +5765,7 @@ void device_invalidate_state(const struct wined3d_device *device, unsigned int s unsigned int representative, i, idx, shift; struct wined3d_context *context;
- wined3d_from_cs(device->cs); + wined3d_from_cs(device);
if (STATE_IS_COMPUTE(state_id)) { diff --git a/dlls/wined3d/directx.c b/dlls/wined3d/directx.c index a39c6d75750..b87efaddaff 100644 --- a/dlls/wined3d/directx.c +++ b/dlls/wined3d/directx.c @@ -2694,7 +2694,7 @@ static struct wined3d_context *adapter_no3d_acquire_context(struct wined3d_devic { TRACE("device %p, texture %p, sub_resource_idx %u.\n", device, texture, sub_resource_idx);
- wined3d_from_cs(device->cs); + wined3d_from_cs(device);
if (!device->context_count) return NULL; diff --git a/dlls/wined3d/swapchain.c b/dlls/wined3d/swapchain.c index 693557448fe..fe1bcb3824a 100644 --- a/dlls/wined3d/swapchain.c +++ b/dlls/wined3d/swapchain.c @@ -1783,7 +1783,7 @@ static struct wined3d_context_gl *wined3d_swapchain_gl_create_context(struct win
TRACE("Creating a new context for swapchain %p, thread %u.\n", swapchain_gl, GetCurrentThreadId());
- wined3d_from_cs(device->cs); + wined3d_from_cs(device);
if (!(context_gl = heap_alloc_zero(sizeof(*context_gl)))) { diff --git a/dlls/wined3d/wined3d_private.h b/dlls/wined3d/wined3d_private.h index 80e03ad9212..36e1fa4ec7a 100644 --- a/dlls/wined3d/wined3d_private.h +++ b/dlls/wined3d/wined3d_private.h @@ -6034,10 +6034,10 @@ static inline void wined3d_insert_bits(DWORD *bitstream, } }
-static inline void wined3d_from_cs(const struct wined3d_cs *cs) +static inline void wined3d_from_cs(const struct wined3d_device *device) { - if (cs->thread) - assert(cs->thread_id == GetCurrentThreadId()); + if (device->cs->thread) + assert(device->cs->thread_id == GetCurrentThreadId()); }
static inline void wined3d_not_from_cs(const struct wined3d_device *device) @@ -6103,7 +6103,7 @@ void compute_normal_matrix(float *normal_matrix, BOOL legacy_lighting, static inline struct wined3d_context *context_acquire(struct wined3d_device *device, struct wined3d_texture *texture, unsigned int sub_resource_idx) { - wined3d_from_cs(device->cs); + wined3d_from_cs(device);
return device->adapter->adapter_ops->adapter_acquire_context(device, texture, sub_resource_idx); }
On Fri, 2 Jul 2021 at 00:35, Zebediah Figura z.figura12@gmail.com wrote:
static BOOL wined3d_cs_queue_is_empty(const struct wined3d_cs *cs, const struct wined3d_cs_queue *queue) {
- wined3d_from_cs(cs);
- wined3d_from_cs(cs->c.device); return *(volatile LONG *)&queue->head == queue->tail;
}
This introduces a race though. If wined3d_cs_run() gets to wined3d_cs_queue_is_empty() before wined3d_device_init() gets to the "device->cs" assignment, wined3d_from_cs() will dereference a NULL "device->cs".
On 7/2/21 8:48 AM, Henri Verbeet wrote:
On Fri, 2 Jul 2021 at 00:35, Zebediah Figura z.figura12@gmail.com wrote:
static BOOL wined3d_cs_queue_is_empty(const struct wined3d_cs *cs, const struct wined3d_cs_queue *queue) {
- wined3d_from_cs(cs);
- wined3d_from_cs(cs->c.device); return *(volatile LONG *)&queue->head == queue->tail; }
This introduces a race though. If wined3d_cs_run() gets to wined3d_cs_queue_is_empty() before wined3d_device_init() gets to the "device->cs" assignment, wined3d_from_cs() will dereference a NULL "device->cs".
Indeed. Maybe this assertion should just be removed; it's a little aggressive for a function that's only called from directly inside wined3d_cs_run()...
Hi,
Il 02/07/21 00:35, Zebediah Figura ha scritto:
static BOOL wined3d_cs_queue_is_empty(const struct wined3d_cs *cs, const struct wined3d_cs_queue *queue) {
- wined3d_from_cs(cs);
- wined3d_from_cs(cs->c.device); return *(volatile LONG *)&queue->head == queue->tail; }
That's not related to this patch, but it caught my attention: why are we using "volatile" here?
My understanding is that "volatile" should only be used to deal with special memory (like MMIO), but I don't think this it is the case here. (or is it?)
Here it seems that "volatile" is using to do inter-thread communication, but that's in general wrong: it doesn't protect from data races and it doesn't ensure atomicity.
What am I missing?
Giovanni.
That's not related to this patch, but it caught my attention: why are we using "volatile" here?
My understanding is that "volatile" should only be used to deal with special memory (like MMIO), but I don't think this it is the case here. (or is it?)
Here it seems that "volatile" is using to do inter-thread communication, but that's in general wrong: it doesn't protect from data races and it doesn't ensure atomicity.
What am I missing?
Giovanni.
I guess it’s to force a memory load on each call, which could be omitted if the function is inlined. X86 loads are atomic IIRC. - Jan
Am Freitag, 2. Juli 2021, 17:09:50 CEST schrieb Jan Sikorski:
I guess it’s to force a memory load on each call, which could be omitted if the function is inlined. X86 loads are atomic IIRC. - Jan
It was necessary to make ARM CPUs re-load the data from memory. gcc inlines the function and would put both head and tail in a register and loop forever.
Hi,
Il 03/07/21 17:42, Stefan Dösinger ha scritto:
Am Freitag, 2. Juli 2021, 17:09:50 CEST schrieb Jan Sikorski:
I guess it’s to force a memory load on each call, which could be omitted if the function is inlined. X86 loads are atomic IIRC. - Jan
It was necessary to make ARM CPUs re-load the data from memory. gcc inlines the function and would put both head and tail in a register and loop forever.
My understanding is that this solution is not correct. As far is I understand, the "volatile" qualifier doesn't force the generation of appropriate fencing and doesn't protect you from data races: if a thread writes the value at the same time another is reading or writing it, you are hitting undefined behavior.
I think that atomic values should be used here, and if there is no atomic support in the C version we are using, then we should manually use Interlocked* functions.
Giovanni.
On Sun, 4 Jul 2021 at 16:49, Giovanni Mascellani gmascellani@codeweavers.com wrote:
Il 03/07/21 17:42, Stefan Dösinger ha scritto:
Am Freitag, 2. Juli 2021, 17:09:50 CEST schrieb Jan Sikorski:
I guess it’s to force a memory load on each call, which could be omitted if the function is inlined. X86 loads are atomic IIRC. - Jan
It was necessary to make ARM CPUs re-load the data from memory. gcc inlines the function and would put both head and tail in a register and loop forever.
My understanding is that this solution is not correct. As far is I understand, the "volatile" qualifier doesn't force the generation of appropriate fencing and doesn't protect you from data races:
Well, yes, it doesn't. Are you saying we need those here though?
Hi,
Il 05/07/21 14:28, Henri Verbeet ha scritto:
Well, yes, it doesn't. Are you saying we need those here though?
I think so.
My understanding is that we are relying on undefined behavior, and I think we shouldn't do that, because it might break randomly at some point after unrelated changes, or with a different compiler; and it could result in a kind of bug that is very painful to debug.
I am not expert in compilers nor in this specific code, so I don't want to make an excessively strong opinion. But I think we might do a great favor to our future selves by avoiding the risk altogether.
Giovanni.
On Tue, 6 Jul 2021 at 15:32, Giovanni Mascellani gmascellani@codeweavers.com wrote:
Il 05/07/21 14:28, Henri Verbeet ha scritto:
Well, yes, it doesn't. Are you saying we need those here though?
I think so.
My understanding is that we are relying on undefined behavior, and I think we shouldn't do that, because it might break randomly at some point after unrelated changes, or with a different compiler; and it could result in a kind of bug that is very painful to debug.
C before C11 doesn't specify a whole lot about concurrency, so in that sense we're already in trouble simply for using threads; the C standard isn't going to help us much here.
As I understand it, we don't need a barrier here because this is an (aligned) CPU word sized variable that's only updated with InterlockedExchange(). I.e., the combination of InterlockedExchange() for stores and volatile for loads should be sufficient for preventing torn/fused/invented loads and stores, as well as any reordering we care about. I don't particularly mind being wrong about that, but I'd need something more concrete than (essentially) "volatile is not a full memory barrier".
On 7/6/21 6:43 PM, Henri Verbeet wrote:
On Tue, 6 Jul 2021 at 15:32, Giovanni Mascellani gmascellani@codeweavers.com wrote:
Il 05/07/21 14:28, Henri Verbeet ha scritto:
Well, yes, it doesn't. Are you saying we need those here though?
I think so.
My understanding is that we are relying on undefined behavior, and I think we shouldn't do that, because it might break randomly at some point after unrelated changes, or with a different compiler; and it could result in a kind of bug that is very painful to debug.
C before C11 doesn't specify a whole lot about concurrency, so in that sense we're already in trouble simply for using threads; the C standard isn't going to help us much here.
As I understand it, we don't need a barrier here because this is an (aligned) CPU word sized variable that's only updated with InterlockedExchange(). I.e., the combination of InterlockedExchange() for stores and volatile for loads should be sufficient for preventing torn/fused/invented loads and stores, as well as any reordering we care about. I don't particularly mind being wrong about that, but I'd need something more concrete than (essentially) "volatile is not a full memory barrier".
AFAIK volatile accesses are only compiler ordered wrt other volatile accesses, so it's generally not safe to only use a volatile variable as a barrier to read some non-volatile data written by another thread. I think [1] describes that for instance.
Then, on x86(_64), volatile are also execution order safe, because x86 has a very strong memory model, where although out of order execution is possible, the load and store operation ordering is almost completely ordered wrt each other, including from another core perspective.
This is not the case on other architectures, where you always need to use atomic operations (through compiler builtin) to get load / store order guarantee across threads, or use synchronization primitives.
Then I don't know about this specific case and what exactly is shared, but even with a single variable I think it's possible that on some arch (not x86) you would have local load cache where even a volatile read could return stale value compared to what an atomic read would do.
[1] https://gcc.gnu.org/onlinedocs/gcc/Volatiles.html
On 7/6/21 12:34 PM, Rémi Bernon wrote:
On 7/6/21 6:43 PM, Henri Verbeet wrote:
On Tue, 6 Jul 2021 at 15:32, Giovanni Mascellani gmascellani@codeweavers.com wrote:
Il 05/07/21 14:28, Henri Verbeet ha scritto:
Well, yes, it doesn't. Are you saying we need those here though?
I think so.
My understanding is that we are relying on undefined behavior, and I think we shouldn't do that, because it might break randomly at some point after unrelated changes, or with a different compiler; and it could result in a kind of bug that is very painful to debug.
C before C11 doesn't specify a whole lot about concurrency, so in that sense we're already in trouble simply for using threads; the C standard isn't going to help us much here.
As I understand it, we don't need a barrier here because this is an (aligned) CPU word sized variable that's only updated with InterlockedExchange(). I.e., the combination of InterlockedExchange() for stores and volatile for loads should be sufficient for preventing torn/fused/invented loads and stores, as well as any reordering we care about. I don't particularly mind being wrong about that, but I'd need something more concrete than (essentially) "volatile is not a full memory barrier".
AFAIK volatile accesses are only compiler ordered wrt other volatile accesses, so it's generally not safe to only use a volatile variable as a barrier to read some non-volatile data written by another thread. I think [1] describes that for instance.
Then, on x86(_64), volatile are also execution order safe, because x86 has a very strong memory model, where although out of order execution is possible, the load and store operation ordering is almost completely ordered wrt each other, including from another core perspective.
This is not the case on other architectures, where you always need to use atomic operations (through compiler builtin) to get load / store order guarantee across threads, or use synchronization primitives.
Then I don't know about this specific case and what exactly is shared, but even with a single variable I think it's possible that on some arch (not x86) you would have local load cache where even a volatile read could return stale value compared to what an atomic read would do.
To be fair, I think the real benefit of a strong memory model is that even if the current code is safe, you don't have to convince yourself it's safe—e.g. you don't have to reason about what ordering problems there could be, and ask yourself if it's safe on any of the given architectures we support.
Not that win32 actually has a strong memory model, or that we can depend on the C11 memory model being available, but we could always pull a Linux and roll our own. Or use Interlocked functions even for atomic read/write.
Just food for thought.
Hi,
Il 06/07/21 18:43, Henri Verbeet ha scritto:
C before C11 doesn't specify a whole lot about concurrency, so in that sense we're already in trouble simply for using threads; the C standard isn't going to help us much here.
Yes. This, to me, would be alone a good reason to move to a more recent standard. Given that we are not, I think it is sensible to have more recent standards as a guide anyway, because it is what compiler writers keep in mind. Then of course we cannot fully rely on the standard anyway, because Wine is intrinsically tied to a specific set of binary interfaces, it has to dynamically load code, etc. But I think there shouldn't be the need of specific reasons to write code trying to respect the standard as much as possible.
However, I think I made my point, and it doesn't seem useful to insist any more.
Giovanni.