When waiting asyncs still exist after one has been processed, the server should watch for the POLLIN event again.
Reselecting POLLIN would happen in async_destroy() if the contents of the async remained.
This worked on wine-6.7.
Signed-off-by: Dongwan Kim kdw6485@gmail.com
---
 server/async.c | 5 -----
 1 file changed, 5 deletions(-)
diff --git a/server/async.c b/server/async.c
index 1a564ff1a69..ec3396c930f 100644
--- a/server/async.c
+++ b/server/async.c
@@ -512,12 +512,7 @@ void async_set_result( struct object *obj, unsigned int status, apc_param_t tota
         async_reselect( async );
 
         if (async->queue)
-        {
-            async->fd = NULL;
-            list_remove( &async->queue_entry );
-            async->queue = NULL;
             release_object( async );
-        }
     }
 }
Calling try_recv could make messages arrive out of order.
Suppose a server program calls an overlapped WSARecv and its operation is left pending. Another program sends a message while wineserver is busy. The server program calls another overlapped WSARecv, and that call intercepts the message meant for the pending WSARecv.
The problem has already been discussed here: https://www.winehq.org/pipermail/wine-devel/2021-May/186612.html
To avoid this situation, the following kind of approach can be applied.
The server program needs to know whether there are pending asyncs before calling try_recv. Wineserver notifies it whenever it invokes APC_ASYNC_IO; the server program then records this and checks it before calling try_recv.
Signed-off-by: Dongwan Kim kdw6485@gmail.com
---
 dlls/ntdll/unix/server.c       |  8 +++----
 dlls/ntdll/unix/socket.c       | 42 +++++++++++++++++++++++++++++++++-
 include/wine/server_protocol.h |  1 +
 server/async.c                 |  8 +++++++
 4 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/dlls/ntdll/unix/server.c b/dlls/ntdll/unix/server.c
index 1dcd0792072..1144bbcfa11 100644
--- a/dlls/ntdll/unix/server.c
+++ b/dlls/ntdll/unix/server.c
@@ -374,17 +374,17 @@ static void invoke_system_apc( const apc_call_t *call, apc_result_t *result, BOO
     case APC_ASYNC_IO:
     {
         struct async_fileio *user = wine_server_get_ptr( call->async_io.user );
-        ULONG_PTR info = call->async_io.result;
+        ULONG_PTR info[2] = { call->async_io.result, call->async_io.async_wait };
         NTSTATUS status;
 
         result->type = call->type;
         status = call->async_io.status;
-        if (user->callback( user, &info, &status ))
+        if (user->callback( user, info, &status ))
         {
             result->async_io.status = status;
-            result->async_io.total = info;
+            result->async_io.total = *info;
             /* the server will pass us NULL if a call failed synchronously */
-            set_async_iosb( call->async_io.sb, result->async_io.status, info );
+            set_async_iosb( call->async_io.sb, result->async_io.status, *info );
         }
         else result->async_io.status = STATUS_PENDING; /* restart it */
         break;
diff --git a/dlls/ntdll/unix/socket.c b/dlls/ntdll/unix/socket.c
index 2e79b9baa0f..ca56a7707a6 100644
--- a/dlls/ntdll/unix/socket.c
+++ b/dlls/ntdll/unix/socket.c
@@ -575,13 +575,45 @@ static NTSTATUS try_recv( int fd, struct async_recv_ioctl *async, ULONG_PTR *siz
     return status;
 }
 
+struct async_queue{
+    struct list entry;
+    HANDLE sock;
+    char read_q;
+    char write_q;
+};
+static struct list async_queue_list = LIST_INIT(async_queue_list);
+
+
+static struct async_queue* alloc_async_queue(HANDLE handle){
+    struct async_queue *queue =
+        (struct async_queue*)malloc(sizeof(struct async_queue));
+    memset(queue, 0, sizeof(struct async_queue));
+
+    queue->sock = handle;
+    list_add_head(&async_queue_list, &queue->entry);
+    return queue;
+}
+
+static struct async_queue* find_async_queue(HANDLE handle){
+    struct async_queue *queue;
+
+    LIST_FOR_EACH_ENTRY(queue, &async_queue_list, struct async_queue, entry){
+        if(queue->sock == handle)
+            return queue;
+    }
+    return NULL;
+}
 static BOOL async_recv_proc( void *user, ULONG_PTR *info, NTSTATUS *status )
 {
     struct async_recv_ioctl *async = user;
+    struct async_queue *queue;
     int fd, needs_close;
 
     TRACE( "%#x\n", *status );
 
+    queue = find_async_queue(async->io.handle);
+    if(queue)
+        queue->read_q = info[1]; //store if waiting asyncs exist
     if (*status == STATUS_ALERTED)
     {
         if ((*status = server_get_unix_fd( async->io.handle, 0, &fd, &needs_close, NULL, NULL )))
@@ -603,6 +635,7 @@ static NTSTATUS sock_recv( HANDLE handle, HANDLE event, PIO_APC_ROUTINE apc, voi
                            struct WS_sockaddr *addr, int *addr_len, DWORD *ret_flags, int unix_flags, int force_async )
 {
     struct async_recv_ioctl *async;
+    struct async_queue *queue;
     ULONG_PTR information;
     HANDLE wait_handle;
     DWORD async_size;
@@ -641,7 +674,13 @@ static NTSTATUS sock_recv( HANDLE handle, HANDLE event, PIO_APC_ROUTINE apc, voi
     async->addr_len = addr_len;
     async->ret_flags = ret_flags;
 
-    status = try_recv( fd, async, &information );
+    queue = find_async_queue(handle);
+    if(!queue)
+        queue = alloc_async_queue(handle);
+    if(queue->read_q) // if there are waiting asyncs, avoid cutting in line.
+        status = STATUS_DEVICE_NOT_READY;
+    else
+        status = try_recv( fd, async, &information );
 
     if (status != STATUS_SUCCESS && status != STATUS_BUFFER_OVERFLOW && status != STATUS_DEVICE_NOT_READY)
     {
@@ -670,6 +709,7 @@ static NTSTATUS sock_recv( HANDLE handle, HANDLE event, PIO_APC_ROUTINE apc, voi
     SERVER_END_REQ;
 
     if (status != STATUS_PENDING) release_fileio( &async->io );
+    else queue->read_q = 1;
 
     if (wait_handle) status = wait_async( wait_handle, options & FILE_SYNCHRONOUS_IO_ALERT );
     return status;
diff --git a/include/wine/server_protocol.h b/include/wine/server_protocol.h
index b37a8e8e056..4dc2eb567db 100644
--- a/include/wine/server_protocol.h
+++ b/include/wine/server_protocol.h
@@ -492,6 +492,7 @@ typedef union
         client_ptr_t user;
         client_ptr_t sb;
         data_size_t  result;
+        unsigned int async_wait;
     } async_io;
     struct
     {
diff --git a/server/async.c b/server/async.c
index ec3396c930f..0220bf4e2f4 100644
--- a/server/async.c
+++ b/server/async.c
@@ -164,6 +164,7 @@ static void async_destroy( struct object *obj )
 void async_terminate( struct async *async, unsigned int status )
 {
     struct iosb *iosb = async->iosb;
+    struct async* async_waiting;
 
     if (async->terminated) return;
 
@@ -202,6 +203,13 @@ void async_terminate( struct async *async, unsigned int status )
     else data.async_io.status = status;
 
+    async_waiting = find_pending_async(async->queue);
+    if(async_waiting)
+    {
+        data.async_io.async_wait = 1;
+        release_object(async_waiting);
+    }
+
     thread_queue_apc( async->thread->process, async->thread, &async->obj, &data );
 }
Hi,
While running your changed tests, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check?
Full results can be found at: https://testbot.winehq.org/JobDetails.pl?Key=103504
Your paranoid android.
=== debiant2 (build log) ===
Error: Mount manager not running, most likely your WINEPREFIX wasn't created correctly.
Task: WineTest did not produce the win32 report
=== debiant2 (build log) ===
Error: Mount manager not running, most likely your WINEPREFIX wasn't created correctly.
Task: WineTest did not produce the wow32 report
On 12/6/21 19:30, Dongwan Kim wrote:
Calling try_recv could make messages arrive out of order.
Suppose a server program calls an overlapped WSARecv and its operation is left pending. Another program sends a message while wineserver is busy. The server program calls another overlapped WSARecv, and that call intercepts the message meant for the pending WSARecv.
The problem has already been discussed here: https://www.winehq.org/pipermail/wine-devel/2021-May/186612.html
To avoid this situation, the following kind of approach can be applied.
The server program needs to know whether there are pending asyncs before calling try_recv. Wineserver notifies it whenever it invokes APC_ASYNC_IO; the server program then records this and checks it before calling try_recv.
This is broken for a few reasons, but the main one is that socket handles are cross-process, which is the main reason that the server is involved in the first place.
As I asked the last time a patch like this was submitted [1], it would firstly be nice to know what application needs this, and second, it would be nice to have a test case (ideally in-tree, but if necessary out-of-tree) to validate that this guarantee holds.
[1] https://www.winehq.org/pipermail/wine-devel/2021-May/186848.html
Thanks for reviewing my patch again.
I need this kind of patch to run KakaoTalk, which is the dominant messenger application in Korea. All communication between its client and server is done this way, so there are currently problems with logging in and sending files.
I've analyzed the application and made prototype server/client programs to test the problem. The client sends messages, which are parts of a file, to the server. The server assembles the messages and writes them to a file, then compares the result file with the original one. If the two files differ, there is a problem with message ordering.
Actually, the other patch originated from this test.
I'll try to embed the test into winetest, but I don't know what file should be chosen for the test. Could you give me some advice? As you know, this test is not deterministic with a small file because there are few chances to cut in line. I use a 96MB file for testing, and it reproduces the problem reliably.
Also, to convince you quickly that there is a problem, could I send you the code of the test programs?
Last question, sorry: what is an in-tree/out-of-tree test?
Thanks.
On 12/7/21 18:20, Dongwan Kim wrote:
Thanks for reviewing my patch again.
I need this kind of patch to run KakaoTalk, which is the dominant messenger application in Korea. All communication between its client and server is done this way, so there are currently problems with logging in and sending files.
I've analyzed the application and made prototype server/client programs to test the problem. The client sends messages, which are parts of a file, to the server. The server assembles the messages and writes them to a file, then compares the result file with the original one. If the two files differ, there is a problem with message ordering.
Actually, the other patch originated from this test.
I'll try to embed the test into winetest, but I don't know what file should be chosen for the test. Could you give me some advice? As you know, this test is not deterministic with a small file because there are few chances to cut in line. I use a 96MB file for testing, and it reproduces the problem reliably.
You can synthesize data (note that you don't really need a file anyway). I might suggest something like this:
int buffer[200];
for (i = 0; i < 100; ++i)
{
    for (j = 0; j < 200; ++j)
        buffer[j] = i * 200 + j;
    send(client, (char *)buffer, sizeof(buffer), 0);
}
and then, when receiving:
total = 0;
while (total < 100 * 200)
{
    size = recv(server, (char *)buffer, sizeof(buffer), 0) / sizeof(int);
    for (i = 0; i < size; ++i)
        ok(buffer[i] == total + i, "got %d, expected %d\n", buffer[i], total + i);
    total += size;
}
Tweak as necessary (bigger buffers, or whatever) so that it consistently reproduces the problem on current Wine.
Also, to convince you quickly that there is a problem, could I send you the code of the test programs?
As long as you've tested, I can trust your word ;-)
But of course a test in Wine's tree would be great to have.
Last question, sorry: what is an in-tree/out-of-tree test?
"In-tree" means it'd be added to Wine's unit test suite, which is located in the git tree, under dlls/*/tests/. In this case you'd want to put a new test in dlls/ws2_32/tests/sock.c.
So, with my fears about the necessity of this feature confirmed, I think the right way to implement it is actually to unconditionally get rid of the initial try_recv call. Trying to do it conditionally is pretty much impossible without races.
This will, however, introduce an extra server call in some cases. Specifically, if there is already data available (and we are the first in line) we currently make two server calls (one to report STATUS_SUCCESS to the server, and one to select on the async handle). With this change we'll make three. (Note that it looks like two, because we're still doing a call to NtWaitForSingleObject, but being interrupted by a kernel APC means we restart the select call.)
In order to solve this, my suggestion would be to let the recv_socket request return an initial status of STATUS_ALERTED. This would be a signal to the Unix side to try recv() immediately, instead of calling select and letting the server interrupt us. We would need a new server request to get at async_set_result(), since the "usual" way of doing that requires an APC handle. As an optimization this could (and therefore probably should) be done independently, in a separate patch, and as such should probably be ordered before the removal of the initial try_recv() call, to avoid a temporary performance regression.
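To make this a bit more concrete, here is a very rough sketch of how the Unix-side handling in sock_recv() could look under this proposal. It is only an illustration of the idea: the notify_recv_result request name is a placeholder for the new server request mentioned above, and the surrounding details are assumptions, not a worked-out implementation.

SERVER_START_REQ( recv_socket )
{
    /* ... existing request setup ... */
    status = wine_server_call( req );
    wait_handle = wine_server_ptr_handle( reply->wait );
}
SERVER_END_REQ;

if (status == STATUS_ALERTED)
{
    /* the server says we are first in line: do the recv() here instead of
     * selecting and waiting for the server to interrupt us */
    status = try_recv( fd, async, &information );
    if (status == STATUS_DEVICE_NOT_READY) status = STATUS_PENDING;

    SERVER_START_REQ( notify_recv_result )  /* hypothetical new request */
    {
        req->handle = wine_server_obj_handle( handle );
        req->status = status;
        req->total  = information;
        wine_server_call( req );
    }
    SERVER_END_REQ;
}

The important part is just the ordering: the Unix side performs the read itself and then reports the result back, so that the server can run async_set_result() and wake the next async in the queue.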
On the other hand, the fact that most server socket calls take at least two requests already makes it a bit unclear whether adding a third will cause a performance problem. Socket I/O is, after all, generally asynchronous, and not exactly as low-latency as, say, NT synchronization primitives. Although I'm very hesitant to make existing code worse, and although the above solution isn't exactly complex, I also kind of wonder if it's worthwhile...
I finished embedding the test into ws2_32/tests.
Can I submit this patch together with the patch for ws2_32/tests? Without it, I cannot pass Marvin (the test bot).
So, with my fears about the necessity of this feature confirmed, I think the right way to implement it is actually to unconditionally get rid of the initial try_recv call. Trying to do it conditionally is pretty much impossible without races.
You mean the case where two processes share a socket? I agree. For instance, process A queues pending asyncs and process B calls WSARecv for the first time in its lifetime. At that moment a message arrives and wineserver is busy. Then process B might intercept the message, because it has never received an async APC. I have not tested this case on Windows. This situation might be really rare, given that other applications work well on current Wine.
This will, however, introduce an extra server call in some cases. Specifically, if there is already data available (and we are the first in line) we currently make two server calls (one to report STATUS_SUCCESS to the server, and one to select on the async handle). With this change we'll make three. (Note that it looks like two, because we're still doing a call to NtWaitForSingleObject, but being interrupted by a kernel APC means we restart the select call.)
Actually, I can't understand why an extra server call occurs; there may be something I'm missing. In this patch, I just added extra data about pending asyncs to the existing apc_call data (apc_call_t). In your example there are no pending asyncs, because we are the first in line (do I understand correctly?). queue->read_q (the field introduced in this patch) was cleared to 0 when the last APC_ASYNC_IO came, so it would behave exactly the same as current Wine in this situation. Please let me know what I missed.
Thanks.
On 12/9/21 21:01, Dongwan Kim wrote:
This will, however, introduce an extra server call in some cases. Specifically, if there is already data available (and we are the first in line) we currently make two server calls (one to report STATUS_SUCCESS to the server, and one to select on the async handle). With this change we'll make three. (Note that it looks like two, because we're still doing a call to NtWaitForSingleObject, but being interrupted by a kernel APC means we restart the select call.)
Actually, I can't understand why an extra server call occurs; there may be something I'm missing. In this patch, I just added extra data about pending asyncs to the existing apc_call data (apc_call_t). In your example there are no pending asyncs, because we are the first in line (do I understand correctly?). queue->read_q (the field introduced in this patch) was cleared to 0 when the last APC_ASYNC_IO came, so it would behave exactly the same as current Wine in this situation. Please let me know what I missed.
Well, I'm referring to my proposal, i.e. "get rid of the initial try_recv". Your patch doesn't add any extra server calls, but it is broken since it doesn't account for read requests queued by other processes.
On 12/6/21 19:30, Dongwan Kim wrote:
When waiting asyncs still exist after one has been processed, the server should watch for the POLLIN event again.
Reselecting POLLIN would happen in async_destroy() if the contents of the async remained.
This worked on wine-6.7.
Signed-off-by: Dongwan Kim kdw6485@gmail.com
 server/async.c | 5 -----
 1 file changed, 5 deletions(-)
diff --git a/server/async.c b/server/async.c
index 1a564ff1a69..ec3396c930f 100644
--- a/server/async.c
+++ b/server/async.c
@@ -512,12 +512,7 @@ void async_set_result( struct object *obj, unsigned int status, apc_param_t tota
         async_reselect( async );
 
         if (async->queue)
-        {
-            async->fd = NULL;
-            list_remove( &async->queue_entry );
-            async->queue = NULL;
             release_object( async );
-        }
     }
 }
This patch doesn't do what it says it's doing, and I wouldn't be surprised if this causes server crashes all over the place...
An async that gets to async_set_result() has already been terminated (we have an assert for it). That means that there's no more work to do for this async.
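For reference, the check I mean is the assertion at the top of async_set_result() in server/async.c (quoted from memory, so treat the exact form as approximate):

    assert( async->terminated );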
What situation exactly are you encountering? Can you please describe in detail?
If there are other asyncs queued in the given fd queue, we should in general be returning true from async_waiting(), which should in general cause POLLIN to be returned from a get_poll_events callback. If there are not, I don't see why we should be returning POLLIN.
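To illustrate what I mean, here is a simplified sketch, not the literal wineserver code, of how a socket's get_poll_events callback ends up asking for POLLIN only while async_waiting() reports that a queued read async is still waiting:

static int sock_get_poll_events( struct fd *fd )
{
    struct sock *sock = get_fd_user( fd );
    int ev = 0;

    /* only poll for input while the first async in the read queue is
     * still waiting to be satisfied */
    if (async_waiting( &sock->read_q )) ev |= POLLIN;

    return ev;
}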
The situation is exactly what you said in your last sentence.
The real situation, from the Wine client's point of view, is this: GetQueuedCompletionStatus waits for 16 overlapped WSARecv calls. After one I/O completion is processed, it does not wake up again.
async_reselect() should be called after list_remove() so that the POLLIN event is received again. async_waiting() only checks the first entry in the queue, so before list_remove() is called it returns false.
On wine-6.7 this was done in async_destroy(), so I left the contents of the async intact so that async_destroy() would handle it.
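For reference, the application pattern I am describing looks roughly like the sketch below. This is an illustration written for this mail, not the actual application code; the buffer sizes and the completion-port loop are simplified.

#define N_RECVS 16

static void queue_recvs( SOCKET sock )
{
    HANDLE port = CreateIoCompletionPort( (HANDLE)sock, NULL, 0, 0 );
    WSAOVERLAPPED ovl[N_RECVS];
    WSABUF bufs[N_RECVS];
    char data[N_RECVS][1024];
    OVERLAPPED *done;
    ULONG_PTR key;
    DWORD bytes, flags, i;

    memset( ovl, 0, sizeof(ovl) );
    for (i = 0; i < N_RECVS; ++i)
    {
        bufs[i].buf = data[i];
        bufs[i].len = sizeof(data[i]);
        flags = 0;
        WSARecv( sock, &bufs[i], 1, NULL, &flags, &ovl[i], NULL );
    }

    /* with the bug, only the first completion is delivered; later calls never
     * wake up even though more data keeps arriving on the socket */
    while (GetQueuedCompletionStatus( port, &bytes, &key, &done, INFINITE ))
    {
        /* process 'bytes' bytes from the slot that 'done' belongs to,
         * then queue another WSARecv for that slot */
    }
}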
Apparently, as you said, the test bot reports a problem with this patch.
What about calling async_reselect() again right after calling list_remove()?
if (async->queue)
{
    list_remove( &async->queue_entry );
    async_reselect( async );
    async->fd = NULL;
    async->queue = NULL;
    release_object( async );
}
It does not trigger the assertion about handle_count, and it makes GetQueuedCompletionStatus wake up after I/O completions.
On 12/7/21 21:23, Dongwan Kim wrote:
The situation is exactly what you said in your last sentence.
The real situation, from the Wine client's point of view, is this: GetQueuedCompletionStatus waits for 16 overlapped WSARecv calls. After one I/O completion is processed, it does not wake up again.
async_reselect() should be called after list_remove() so that the POLLIN event is received again. async_waiting() only checks the first entry in the queue, so before list_remove() is called it returns false.
On wine-6.7 this was done in async_destroy(), so I left the contents of the async intact so that async_destroy() would handle it.
Apparently, as you said, the test bot reports a problem with this patch.
What about calling async_reselect() again right after calling list_remove()?
if (async->queue)
{
    list_remove( &async->queue_entry );
    async_reselect( async );
    async->fd = NULL;
    async->queue = NULL;
    release_object( async );
}
It does not trigger the assertion about handle_count, and it makes GetQueuedCompletionStatus wake up after I/O completions.
Yes, I think that looks correct. I'm not happy about it, but only because my dislike for the current async code runs far deeper :-)
When you submit it, can you please also add a test case to Wine's test suite that demonstrates the bug you describe?