The read and pread syscalls are permitted to return fewer bytes than requested (unlike the fread libc call which will only perform a short read on error or EOF). This is most likely to occur when binaries are located on slower filesystems (e.g. NFS or 9pfs)
ffab9d9 is a relatively recent regression here, but a lot of the code appears to be much older (e.g. the read call in 341b7dc)
In this MR I've focused on ntdll, but I see a lot of similar mishandling of the pread return value in `server/`. I will raise a bug as I don't intend to create a follow up MR at this time.
-- v3: ntdll: Correctly handle pread short reads
From: Aidan Hobson Sayers aidanhs@cantab.net
The read and pread syscalls are permitted to return fewer bytes than requested (unlike the fread libc call which will only perform a short read on error or EOF). This is most likely to occur when binaries are located on slower filesystems (e.g. NFS or 9pfs)
ffab9d9 is a relatively recent regression here, but a lot of the code appears to be much older (e.g. the read call in 341b7dc) --- dlls/ntdll/unix/file.c | 2 +- dlls/ntdll/unix/process.c | 4 ++-- dlls/ntdll/unix/unix_private.h | 19 +++++++++++++++++++ dlls/ntdll/unix/virtual.c | 10 +++++++--- 4 files changed, 29 insertions(+), 6 deletions(-)
diff --git a/dlls/ntdll/unix/file.c b/dlls/ntdll/unix/file.c index 3a24d4ebbf2..e440045df2e 100644 --- a/dlls/ntdll/unix/file.c +++ b/dlls/ntdll/unix/file.c @@ -6520,7 +6520,7 @@ static inline BOOL is_device_placeholder( int fd ) static const char wine_placeholder[] = "Wine device placeholder"; char buffer[sizeof(wine_placeholder)-1];
- if (pread( fd, buffer, sizeof(wine_placeholder) - 1, 0 ) != sizeof(wine_placeholder) - 1) + if (pread_all( fd, buffer, sizeof(wine_placeholder) - 1, 0 ) != sizeof(wine_placeholder) - 1) return FALSE; return !memcmp( buffer, wine_placeholder, sizeof(wine_placeholder) - 1 ); } diff --git a/dlls/ntdll/unix/process.c b/dlls/ntdll/unix/process.c index 30bd6f083bd..8224679aba7 100644 --- a/dlls/ntdll/unix/process.c +++ b/dlls/ntdll/unix/process.c @@ -190,7 +190,7 @@ static BOOL get_so_file_info( int fd, pe_image_info_t *info )
off_t pos;
- if (pread( fd, &header, sizeof(header), 0 ) != sizeof(header)) return FALSE; + if (pread_all( fd, &header, sizeof(header), 0 ) != sizeof(header)) return FALSE;
if (!memcmp( header.elf.magic, "\177ELF", 4 )) { @@ -223,7 +223,7 @@ static BOOL get_so_file_info( int fd, pe_image_info_t *info ) } while (phnum--) { - if (pread( fd, &type, sizeof(type), pos ) != sizeof(type)) return FALSE; + if (pread_all( fd, &type, sizeof(type), pos ) != sizeof(type)) return FALSE; if (type == 3 /* PT_INTERP */) return FALSE; pos += (header.elf.class == 2) ? 56 : 32; } diff --git a/dlls/ntdll/unix/unix_private.h b/dlls/ntdll/unix/unix_private.h index 6862d74b863..1e3c725fa1a 100644 --- a/dlls/ntdll/unix/unix_private.h +++ b/dlls/ntdll/unix/unix_private.h @@ -23,6 +23,8 @@
#include <pthread.h> #include <signal.h> +#include <unistd.h> +#include <errno.h> #include "unixlib.h" #include "wine/unixlib.h" #include "wine/server.h" @@ -520,4 +522,21 @@ static inline NTSTATUS map_section( HANDLE mapping, void **ptr, SIZE_T *size, UL 0, NULL, size, ViewShare, 0, protect ); }
+static inline ssize_t pread_all( int fd, void *buf, size_t count, off_t offset ) +{ + ssize_t ret; + ssize_t total = 0; + while (count - total) { + ret = pread( fd, (char *)buf + total, count - total, offset + total ); + if (ret == -1) { + if (errno == EINTR) { continue; } + return -1; + } else if (ret == 0) { + return total; + } + total += ret; + } + return total; +} + #endif /* __NTDLL_UNIX_PRIVATE_H */ diff --git a/dlls/ntdll/unix/virtual.c b/dlls/ntdll/unix/virtual.c index 62fc1c9dd1f..f39fb78aed3 100644 --- a/dlls/ntdll/unix/virtual.c +++ b/dlls/ntdll/unix/virtual.c @@ -2106,7 +2106,9 @@ static NTSTATUS map_file_into_view( struct file_view *view, int fd, size_t start ptr = anon_mmap_fixed( (char *)view->base + start, size, PROT_READ | PROT_WRITE, 0 ); if (ptr == MAP_FAILED) return STATUS_NO_MEMORY; /* Now read in the file */ - pread( fd, ptr, size, offset ); + if (pread_all( fd, ptr, size, offset ) == -1) { + return STATUS_INVALID_PARAMETER; + } if (prot != (PROT_READ|PROT_WRITE)) mprotect( ptr, size, prot ); /* Set the right protection */ done: set_page_vprot( (char *)view->base + start, size, vprot ); @@ -2411,7 +2413,9 @@ static NTSTATUS map_pe_header( void *ptr, size_t size, int fd, BOOL *removable ) } *removable = TRUE; } - pread( fd, ptr, size, 0 ); + if (pread_all( fd, ptr, size, 0 ) == -1) { + return STATUS_INVALID_PARAMETER; + } return STATUS_SUCCESS; /* page protections will be updated later */ }
@@ -4925,7 +4929,7 @@ static NTSTATUS get_working_set_ex( HANDLE process, LPCVOID addr, (vprot & VPROT_COMMITTED)) { if (pagemap_fd == -1 || - pread( pagemap_fd, &pagemap, sizeof(pagemap), ((UINT_PTR)p->VirtualAddress >> page_shift) * sizeof(pagemap) ) != sizeof(pagemap)) + pread_all( pagemap_fd, &pagemap, sizeof(pagemap), ((UINT_PTR)p->VirtualAddress >> page_shift) * sizeof(pagemap) ) != sizeof(pagemap)) { /* If we don't have pagemap information, default to invalid. */ pagemap = 0;
Jinoh Kang (@iamahuman) commented about dlls/ntdll/unix/virtual.c:
(vprot & VPROT_COMMITTED)) { if (pagemap_fd == -1 ||
pread( pagemap_fd, &pagemap, sizeof(pagemap), ((UINT_PTR)p->VirtualAddress >> page_shift) * sizeof(pagemap) ) != sizeof(pagemap))
pread_all( pagemap_fd, &pagemap, sizeof(pagemap), ((UINT_PTR)p->VirtualAddress >> page_shift) * sizeof(pagemap) ) != sizeof(pagemap))
Partial reads on `/proc/<pid>/pagemap` should *not* be handled at all. Unaligned offset (not multiples of `sizeof(pagemap)`) results in `-EINVAL` anyway.
```suggestion:-0+0 pread( pagemap_fd, &pagemap, sizeof(pagemap), ((UINT_PTR)p->VirtualAddress >> page_shift) * sizeof(pagemap) ) != sizeof(pagemap)) ```
The read and pread syscalls are permitted to return fewer bytes than requested (unlike the fread libc call which will only perform a short read on error or EOF). This is most likely to occur when binaries are located on slower filesystems (e.g. NFS or 9pfs)
Are you sure that it actually the case for disk files? I think read (and pread is specced to be an equivalent for read with the only difference that file offset is provided) should never result in short read for disk files unless it is end of file or the read buffer is not fully accessible for write. I also think that virtually no Unix (or Windows for similar functions) program expects short reads when dealing with disk file I/O. So if that is the case with NFS Linux driver or with 9pfs that should probably be treated as a bug in those drivers.
I can imagine that maybe EINTR can happen at some places where we don't have signals blocked and should handle that, maybe that is the case WRT the problem?
I had a closer look at this based on your feedback, and I found out that my executable file is being opened with O_NONBLOCK (before it gets sent over a unix socket and eventually mmaped and pread into memory).
I can prepare a different MR to remove the nonblock if that's preferable?
If after investigation you will still think that there is some room for Wine bug here, it would probably be easier if you'd elaborate a bit on the actual issue, what the app is trying to do, which setup leads to it. As it doesn't look like Wine sets O_NONBLOCK for files which are read in a way your patch concerns.
It's pretty easy to elaborate - I have a minimal rust program (`fn main() { println!("hello") }`) compiled using mingw. The program fails to load, printing: ``` 0100:err:virtual:virtual_setup_exception stack overflow 2144 bytes in thread 0100 addr 0x17005a529 stack 0x207a0 (0x20000-0x21000-0x220000) ``` This problem does not occur when I copy the binary off the 9p mount to the local disk, i.e. it runs successfully and prints `hello`.
As it doesn't look like Wine sets O_NONBLOCK for files which are read in a way your patch concerns.
I do think wine is setting `O_NONBLOCK` in a way that affects the loader. After spending some time with gdb and winedbg it appears the sequence of events is approximately (bearing in mind this is my first real foray into wine):
__wine_main - start_main_thread - init_startup_info - (possibly build_initial_params ->) load_main_exe - open_main_image - open_dll_file - open_unix_file - server:DECL_HANDLER(create_file) - server:create_file -> `fd = open_fd(..., flags | O_NONBLOCK, ...)` - virtual_map_module - virtual_map_image - server_get_unix_fd - map_image_into_view - map_pe_header + map_file_into_view - pread
i.e. all file descriptors that wineserver creates in from `create_file` have `O_NONBLOCK` set on them - and one of these file descriptors is used for mapping in the PE header.
Happening this early in the loading process lines up with the process not even starting, and it matches up with my strace the problematic process (below), where
- the fd is returned from the server via `recvmsg` (`server_get_unix_fd`) - `fcntl` is called by `receive_fd` - the pattern of `mmap` then `pread` (which continues longer than the pasted log) looks like the per-section calls to `map_file_into_view` - you can see multiple short read returns from `pread`
(this trace is a single process snippet grepped out of a `strace -f` log where other processes had interleaved syscalls, hence the many `<unfinished...>` - there were no signals though)
``` [pid 3258] recvmsg(12, <unfinished ...> [pid 3258] <... recvmsg resumed>{msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="$\0\0\0", iov_len=4}], msg_iovlen=1, msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, cmsg_data=[5]}], m sg_controllen=24, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 4 [pid 3258] fcntl(5, F_SETFD, FD_CLOEXEC <unfinished ...> [pid 3258] <... fcntl resumed>) = 0 [pid 3258] rt_sigprocmask(SIG_SETMASK, [HUP INT USR1 USR2 ALRM CHLD IO], <unfinished ...> [pid 3258] <... rt_sigprocmask resumed>NULL, 8) = 0 [pid 3258] rt_sigprocmask(SIG_BLOCK, [HUP INT USR1 USR2 ALRM CHLD IO], <unfinished ...> [pid 3258] <... rt_sigprocmask resumed>[HUP INT USR1 USR2 ALRM CHLD IO], 8) = 0 [pid 3258] mmap(0x140000000, 5025792, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 3258] <... mmap resumed>) = 0x140000000 [pid 3258] mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 3258] <... mmap resumed>) = 0x7fe4b9d00000 [pid 3258] fstat(5, <unfinished ...> [pid 3258] <... fstat resumed>{st_mode=S_IFREG|0775, st_size=5422023, ...}) = 0 [pid 3258] mmap(0x140000000, 1536, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, 5, 0 <unfinished ...> [pid 3258] <... mmap resumed>) = 0x140000000 [pid 3258] mmap(0x140001000, 765952, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 3258] <... mmap resumed>) = 0x140001000 [pid 3258] pread64(5, <unfinished ...> [pid 3258] <... pread64 resumed>"\303ff.\17\37\204\0\0\0\0\0\17\37@\0H\203\354(H\213\5\365>\16\0001\311\307\0\1"..., 765952, 1536) = 126976 [pid 3258] mmap(0x1400bc000, 512, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 3258] <... mmap resumed>) = 0x1400bc000 [pid 3258] pread64(5, <unfinished ...> [pid 3258] <... pread64 resumed>"\n\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\202\3@\1\0\0\0"..., 512, 767488) = 512 [pid 3258] mmap(0x1400bd000, 163840, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 3258] <... mmap resumed>) = 0x1400bd000 [pid 3258] pread64(5, <unfinished ...> [pid 3258] <... pread64 resumed>"invalid args\0\0\0\0\0\320\v@\1\0\0\0\f\0\0\0\0\0\0\0"..., 163840, 768000) = 126976 [pid 3258] mmap(0x1400e5000, 25600, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 3258] <... mmap resumed>) = 0x1400e5000 [pid 3258] pread64(5, <unfinished ...> [pid 3258] <... pread64 resumed>"\0\20\0\0\1\20\0\0\0\300\16\0\20\20\0\0>\21\0\0\4\300\16\0@\21\0\0\211\21\0\0"..., 25600, 931840) = 25600 ```
I don't think removing the `O_NONBLOCK` from [here](https://gitlab.winehq.org/wine/wine/-/blob/b5e19a33c9360784961918a364175ab2a...) is viable - it's been there for a *very* long time (https://gitlab.winehq.org/wine/wine/-/commit/0562539d18638e5afdd81b4d894c880...) and there are surely far reaching implications.
I did consider disabling `O_NONBLOCK` on the fd when it's received from the server in `virtual_map_image`. Unfortunately, `man 7 unix` documents `SCM_RIGHTS` as being semantically equivalent to `dup` so it'll affect the flags of the fd held in wineserver, which defeats the point.
Open to other ideas - so far I'm still inclined towards the current general approach in this MR as the most self-contained approach to the problem.
Looks like 9p bug to me.
Update: Or, maybe, a way how its mount is configured.
https://github.com/torvalds/linux/blob/0b5547c51827e053cc754db47d3ec3e6c2c45... 9p has different behavior if you request `O_NONBLOCK` - the blocking behavior just calls the nonblocking function in a loop. This behaviour was explicitly added in https://lore.kernel.org/all/20200205003457.24340-2-l29ah@cock.li/T/#u with the stated reasoning being:
I want to interface with synthetic file systems that represent arbitrary data streams. The one where i've hit the problem is reading the log of a XMPP chat client that blocks if there's no new data available.
Given this, I don't think it's likely reverting this will be accepted. If this is a WONTFIX from the position of the wine project, where is best for me to document that 9p is an unsupported filesystem to run wine from?
For O_NONBLOCK, from 'man 2 open' [1]:
``` ... O_NONBLOCK or O_NDELAY ... Note that this flag has no effect for regular files and block devices; that is, I/O operations will (briefly) block when device activity is required, regardless of whether O_NONBLOCK is set. Since O_NONBLOCK semantics might eventually be implemented, applications should not depend upon blocking behavior when specifying this flag for regular files and block devices. ```
So this is a bit inconclusive, yeah. From the one side, the spec clearly says that O_NONBLOCK has no effect on regular files. No exceptions listed. So I still think that if 9p implements short reads with the disk files (even if only this flag is set), it violates the docs and might encounter issues with more apps. IMO such thing, if necessary for some special cases like referenced in the commit, would better go as a special mount option if the filesystem is supposed to be seamlessly used with existing apps.
On the other side, the doc explicitly discourages setting this flag and depending on the blocking behaviour for regular files (and that's probably what Wine actually does) suggesting potential future changes (I think it really be bad idea to change that semantics though, I hope that if that will be implemented and described there will be some other flag for that to maintain backwards compatibility; this is a way too basic place to change the behaviour easily).
So if removing O_NONBLOCK in wineserver really helps here and if there is a more or less nice way to avoid setting the flag on regular files in wineserver maybe that's a valid change?
So if removing O_NONBLOCK in wineserver really helps here and if there is a more or less nice way to avoid setting the flag on regular files in wineserver maybe that's a valid change?
Agreed, and I think a crucial point you've helped me see is that removing `O_NONBLOCK` will *only* affect 9p (based on my census of the `linux/fs/` tree)...which is currently broken anyway, so there shouldn't be any risk.
Tomorrow I'll close this MR and open a fresh one.
The risk would be in accidentally removing it from something else besides disk files (sockets, pipes, character devices, whatever). Also, there is MacOS besides Linux but hopefully there is nothing special about this flag there.
This merge request was closed by Aidan Hobson Sayers.