This fixes an issue when the path includes non-ASCII characters.
Signed-off-by: Jactry Zeng jzeng@codeweavers.com
-- v4: mshtml: Call UrlUnescapeW() with URL_UNESCAPE_AS_UTF8 in is_gecko_path(). shlwapi/tests: Test UrlUnescapeW() with URL_UNESCAPE_AS_UTF8. kernelbase: Implement URL_UNESCAPE_AS_UTF8 for UrlUnescapeW(). shlwapi/tests: Test UrlUnescapeW() with independent data.
From: Jactry Zeng jzeng@codeweavers.com
--- dlls/shlwapi/tests/url.c | 71 ++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 32 deletions(-)
diff --git a/dlls/shlwapi/tests/url.c b/dlls/shlwapi/tests/url.c index 6870de4ee5f..7161870918a 100644 --- a/dlls/shlwapi/tests/url.c +++ b/dlls/shlwapi/tests/url.c @@ -407,7 +407,19 @@ static struct { {"file://%24%25foobar", "file://$%foobar"} };
-/* ################ */ +static struct +{ + const WCHAR *url; + const WCHAR *expect; + DWORD flags; +} TEST_URL_UNESCAPEW[] = +{ + { L"file://foo/bar", L"file://foo/bar" }, + { L"file://fo%20o%5Ca/bar", L"file://fo o\a/bar" }, + { L"file://%24%25foobar", L"file://$%foobar" }, + { L"file:///C:/Program Files", L"file:///C:/Program Files" }, + { L"file:///C:/Program%20Files", L"file:///C:/Program Files" }, +};
static const struct { const char *path; @@ -1391,17 +1403,14 @@ static void test_UrlIs(void)
static void test_UrlUnescape(void) { + WCHAR urlW[INTERNET_MAX_URL_LENGTH], bufferW[INTERNET_MAX_URL_LENGTH]; CHAR szReturnUrl[INTERNET_MAX_URL_LENGTH]; - WCHAR ret_urlW[INTERNET_MAX_URL_LENGTH]; - WCHAR *urlW, *expected_urlW; - DWORD dwEscaped; - size_t i; + DWORD dwEscaped, unescaped; static char inplace[] = "file:///C:/Program%20Files"; static char another_inplace[] = "file:///C:/Program%20Files"; static const char expected[] = "file:///C:/Program Files"; - static WCHAR inplaceW[] = L"file:///C:/Program Files"; - static WCHAR another_inplaceW[] = L"file:///C:/Program%20Files"; HRESULT res; + int i;
for (i = 0; i < ARRAY_SIZE(TEST_URL_UNESCAPE); i++) { dwEscaped=INTERNET_MAX_URL_LENGTH; @@ -1418,21 +1427,30 @@ static void test_UrlUnescape(void) "UrlUnescapeA returned 0x%lx (expected E_INVALIDARG) for "%s"\n", res, TEST_URL_UNESCAPE[i].url); ok(strcmp(szReturnUrl,"")==0, "Expected empty string\n"); + }
- dwEscaped = INTERNET_MAX_URL_LENGTH; - urlW = GetWideString(TEST_URL_UNESCAPE[i].url); - expected_urlW = GetWideString(TEST_URL_UNESCAPE[i].expect); - res = UrlUnescapeW(urlW, ret_urlW, &dwEscaped, 0); - ok(res == S_OK, - "UrlUnescapeW returned 0x%lx (expected S_OK) for "%s"\n", - res, TEST_URL_UNESCAPE[i].url); - - WideCharToMultiByte(CP_ACP,0,ret_urlW,-1,szReturnUrl,INTERNET_MAX_URL_LENGTH,0,0); - ok(lstrcmpW(ret_urlW, expected_urlW)==0, - "Expected "%s", but got "%s" from "%s" flags %08lx\n", - TEST_URL_UNESCAPE[i].expect, szReturnUrl, TEST_URL_UNESCAPE[i].url, 0L); - FreeWideString(urlW); - FreeWideString(expected_urlW); + for (i = 0; i < ARRAYSIZE(TEST_URL_UNESCAPEW); i++) + { + lstrcpyW(urlW, TEST_URL_UNESCAPEW[i].url); + + memset(bufferW, 0xff, sizeof(bufferW)); + unescaped = INTERNET_MAX_URL_LENGTH; + res = UrlUnescapeW(urlW, bufferW, &unescaped, TEST_URL_UNESCAPEW[i].flags); + ok(res == S_OK, "[%d]: returned %#lx.\n", i, res); + ok(unescaped == lstrlenW(TEST_URL_UNESCAPEW[i].expect), "[%d]: got unescaped %ld.\n", i, unescaped); + ok(!lstrcmpW(bufferW, TEST_URL_UNESCAPEW[i].expect), "[%d]: got result "%s".\n", i, debugstr_w(bufferW)); + + /* Test with URL_UNESCAPE_INPLACE */ + unescaped = INTERNET_MAX_URL_LENGTH; + res = UrlUnescapeW(urlW, NULL, &unescaped, TEST_URL_UNESCAPEW[i].flags | URL_UNESCAPE_INPLACE); + ok(res == S_OK, "[%d]: returned %#lx.\n", i, res); + ok(unescaped == INTERNET_MAX_URL_LENGTH, "[%d]: got unescaped %ld.\n", i, unescaped); + ok(!lstrcmpW(urlW, TEST_URL_UNESCAPEW[i].expect), "[%d]: got result "%s".\n", i, debugstr_w(urlW)); + + lstrcpyW(urlW, TEST_URL_UNESCAPEW[i].url); + unescaped = lstrlenW(TEST_URL_UNESCAPEW[i].expect) - 1; + res = UrlUnescapeW(urlW, bufferW, &unescaped, TEST_URL_UNESCAPEW[i].flags); + ok(res == E_POINTER, "[%d]: returned %#lx.\n", i, res); }
dwEscaped = sizeof(inplace); @@ -1445,17 +1463,6 @@ static void test_UrlUnescape(void) res = UrlUnescapeA(another_inplace, NULL, NULL, URL_UNESCAPE_INPLACE); ok(res == S_OK, "UrlUnescapeA returned 0x%lx (expected S_OK)\n", res); ok(!strcmp(another_inplace, expected), "got %s expected %s\n", another_inplace, expected); - - dwEscaped = sizeof(inplaceW); - res = UrlUnescapeW(inplaceW, NULL, &dwEscaped, URL_UNESCAPE_INPLACE); - ok(res == S_OK, "UrlUnescapeW returned 0x%lx (expected S_OK)\n", res); - ok(dwEscaped == 50, "got %ld expected 50\n", dwEscaped); - - /* if we set the buffer pointer to NULL, the string apparently still gets converted (Google Lively does this) */ - res = UrlUnescapeW(another_inplaceW, NULL, NULL, URL_UNESCAPE_INPLACE); - ok(res == S_OK, "UrlUnescapeW returned 0x%lx (expected S_OK)\n", res); - - ok(lstrlenW(another_inplaceW) == 24, "got %d expected 24\n", lstrlenW(another_inplaceW)); }
static const struct parse_url_test_t {
From: Jactry Zeng jzeng@codeweavers.com
--- dlls/kernelbase/path.c | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-)
diff --git a/dlls/kernelbase/path.c b/dlls/kernelbase/path.c index 7eda9bd483c..daf157c83b6 100644 --- a/dlls/kernelbase/path.c +++ b/dlls/kernelbase/path.c @@ -2907,11 +2907,26 @@ HRESULT WINAPI UrlUnescapeA(char *url, char *unescaped, DWORD *unescaped_len, DW return hr; }
+static int get_utf8_len(unsigned char code) +{ + if (code < 0x80) + return 1; + else if ((code & 0xe0) == 0xc0) + return 2; + else if ((code & 0xf0) == 0xe0) + return 3; + else if ((code & 0xf8) == 0xf0) + return 4; + return 0; +} + HRESULT WINAPI UrlUnescapeW(WCHAR *url, WCHAR *unescaped, DWORD *unescaped_len, DWORD flags) { + WCHAR *dst, next, utf16_buf[5]; BOOL stop_unescaping = FALSE; + int utf8_len, utf16_len, i; const WCHAR *src; - WCHAR *dst, next; + char utf8_buf[5]; DWORD needed; HRESULT hr;
@@ -2930,6 +2945,7 @@ HRESULT WINAPI UrlUnescapeW(WCHAR *url, WCHAR *unescaped, DWORD *unescaped_len,
for (src = url, needed = 0; *src; src++, needed++) { + utf16_len = 0; if (flags & URL_DONT_UNESCAPE_EXTRA_INFO && (*src == '#' || *src == '?')) { stop_unescaping = TRUE; @@ -2939,17 +2955,59 @@ HRESULT WINAPI UrlUnescapeW(WCHAR *url, WCHAR *unescaped, DWORD *unescaped_len, { INT ih; WCHAR buf[5] = L"0x"; + memcpy(buf + 2, src + 1, 2*sizeof(WCHAR)); buf[4] = 0; StrToIntExW(buf, STIF_SUPPORT_HEX, &ih); - next = (WCHAR) ih; src += 2; /* Advance to end of escape */ + + if (flags & URL_UNESCAPE_AS_UTF8) + { + utf8_buf[0] = ih; + utf8_len = get_utf8_len(ih); + for (i = 1; i < utf8_len; i++) + { + memcpy(buf + 2, src + 2, 2 * sizeof(WCHAR)); + StrToIntExW(buf, STIF_SUPPORT_HEX, &ih); + /* Check if it is a valid continuation byte. */ + if (*(src + 1) == '%' && (ih & 0xc0) == 0x80) + { + utf8_buf[i] = ih; + src += 3; + } + else + { + utf8_len = i; + break; + } + } + + utf16_len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8_buf, utf8_len, NULL, 0); + if (!utf16_len) + continue; + else + { + MultiByteToWideChar(CP_UTF8, 0, utf8_buf, utf8_len, utf16_buf, utf16_len); + utf16_buf[utf16_len] = 0; + needed += utf16_len - 1; + } + } + else + next = (WCHAR) ih; } else next = *src;
if (flags & URL_UNESCAPE_INPLACE || needed < *unescaped_len) - *dst++ = next; + { + if (utf16_len) + { + wcscpy(dst, utf16_buf); + dst += utf16_len; + } + else + *dst++ = next; + } }
if (flags & URL_UNESCAPE_INPLACE || needed < *unescaped_len)
From: Jactry Zeng jzeng@codeweavers.com
--- dlls/shlwapi/tests/url.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+)
diff --git a/dlls/shlwapi/tests/url.c b/dlls/shlwapi/tests/url.c index 7161870918a..6b60775f7fe 100644 --- a/dlls/shlwapi/tests/url.c +++ b/dlls/shlwapi/tests/url.c @@ -28,6 +28,23 @@ #include "shlwapi.h" #include "wininet.h" #include "intshcut.h" +#include "winternl.h" + + +static BOOL check_win_version(int min_major, int min_minor) +{ + HMODULE hntdll = GetModuleHandleA("ntdll.dll"); + NTSTATUS (WINAPI *pRtlGetVersion)(RTL_OSVERSIONINFOEXW *); + RTL_OSVERSIONINFOEXW rtlver; + + rtlver.dwOSVersionInfoSize = sizeof(RTL_OSVERSIONINFOEXW); + pRtlGetVersion = (void *)GetProcAddress(hntdll, "RtlGetVersion"); + pRtlGetVersion(&rtlver); + return rtlver.dwMajorVersion > min_major || + (rtlver.dwMajorVersion == min_major && + rtlver.dwMinorVersion >= min_minor); +} +#define is_win8_plus() check_win_version(6, 2)
static const char* TEST_URL_1 = "http://www.winehq.org/tests?date=10/10/1923"; static const char* TEST_URL_2 = "http://localhost:8080/tests%2e.html?date=Mon%2010/10/1923"; @@ -418,7 +435,20 @@ static struct { L"file://fo%20o%5Ca/bar", L"file://fo o\a/bar" }, { L"file://%24%25foobar", L"file://$%foobar" }, { L"file:///C:/Program Files", L"file:///C:/Program Files" }, + { L"file:///C:/Program Files", L"file:///C:/Program Files", URL_UNESCAPE_AS_UTF8 }, { L"file:///C:/Program%20Files", L"file:///C:/Program Files" }, + { L"file:///C:/Program%20Files", L"file:///C:/Program Files", URL_UNESCAPE_AS_UTF8 }, + { L"file://foo/%E4%B8%AD%E6%96%87/bar", L"file://foo/\xe4\xb8\xad\xe6\x96\x87/bar" }, /* with 3 btyes utf-8 */ + { L"file://foo/%E4%B8%AD%E6%96%87/bar", L"file://foo/\x4e2d\x6587/bar", URL_UNESCAPE_AS_UTF8 }, + /* mix corrupt and good utf-8 */ + { L"file://foo/%E4%AD%E6%96%87/bar", L"file://foo/\xfffd\x6587/bar", URL_UNESCAPE_AS_UTF8 }, + { L"file://foo/%F0%9F%8D%B7/bar", L"file://foo/\xf0\x9f\x8d\xb7/bar" }, /* with 4 btyes utf-8 */ + { L"file://foo/%F0%9F%8D%B7/bar", L"file://foo/\xd83c\xdf77/bar", URL_UNESCAPE_AS_UTF8 }, + /* non-escaped chars between multi-byte escaped chars */ + { L"file://foo/%E4%B8%ADabc%E6%96%87/bar", L"file://foo/\x4e2d""abc""\x6587/bar", URL_UNESCAPE_AS_UTF8 }, + { L"file://foo/%E4B8%AD/bar", L"file://foo/\xfffd""B8\xfffd/bar", URL_UNESCAPE_AS_UTF8 }, + { L"file://foo/%E4%G8%AD/bar", L"file://foo/\xfffd""%G8\xfffd/bar", URL_UNESCAPE_AS_UTF8 }, + { L"file://foo/%G4%B8%AD/bar", L"file://foo/%G4\xfffd\xfffd/bar", URL_UNESCAPE_AS_UTF8 }, };
static const struct { @@ -1431,6 +1461,14 @@ static void test_UrlUnescape(void)
for (i = 0; i < ARRAYSIZE(TEST_URL_UNESCAPEW); i++) { + if (TEST_URL_UNESCAPEW[i].flags | URL_UNESCAPE_AS_UTF8 + && !!strcmp(winetest_platform, "wine") + && !is_win8_plus()) + { + win_skip("Skip URL_UNESCAPE_AS_UTF8 test for pre-win7 systems.\n"); + continue; + } + lstrcpyW(urlW, TEST_URL_UNESCAPEW[i].url);
memset(bufferW, 0xff, sizeof(bufferW));
From: Jactry Zeng jzeng@codeweavers.com
This fixes an issue when the path includes non-ASCII characters. --- dlls/mshtml/nsembed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dlls/mshtml/nsembed.c b/dlls/mshtml/nsembed.c index 95c07c90390..ea20ffbd202 100644 --- a/dlls/mshtml/nsembed.c +++ b/dlls/mshtml/nsembed.c @@ -1304,7 +1304,7 @@ BOOL is_gecko_path(const char *path) *ptr = '/'; }
- UrlUnescapeW(buf, NULL, NULL, URL_UNESCAPE_INPLACE); + UrlUnescapeW(buf, NULL, NULL, URL_UNESCAPE_INPLACE | URL_UNESCAPE_AS_UTF8); buf[gecko_path_len] = 0;
ret = !wcsicmp(buf, gecko_path);
On Thu Jul 20 10:33:29 2023 +0000, Jacek Caban wrote:
What happens if string terminates or one of those characters are not an xdigit? More tests would be interesting.
MultiByteToWideChar() can handle this. I added tests for this case in v4:
"file://foo/%E4%G8%AD/bar"
"file://foo/%G4%B8%AD/bar"
Jacek Caban (@jacek) commented about dlls/kernelbase/path.c:
memcpy(buf + 2, src + 2, 2 * sizeof(WCHAR));
StrToIntExW(buf, STIF_SUPPORT_HEX, &ih);
/* Check if it is a valid continuation byte. */
if (*(src + 1) == '%' && (ih & 0xc0) == 0x80)
{
utf8_buf[i] = ih;
src += 3;
}
else
{
utf8_len = i;
break;
}
}
utf16_len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8_buf, utf8_len, NULL, 0);
I think you could pass utf16_but here and avoid calling `MultiByteToWideChar` twice.
Jacek Caban (@jacek) commented about dlls/kernelbase/path.c:
/* Check if it is a valid continuation byte. */
if (*(src + 1) == '%' && (ih & 0xc0) == 0x80)
{
utf8_buf[i] = ih;
src += 3;
}
else
{
utf8_len = i;
break;
}
}
utf16_len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8_buf, utf8_len, NULL, 0);
if (!utf16_len)
continue;
Is continue right in this case? Should we use something like next = 0xffff instead? A test would be nice.
Jacek Caban (@jacek) commented about dlls/kernelbase/path.c:
src += 3;
}
else
{
utf8_len = i;
break;
}
}
utf16_len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8_buf, utf8_len, NULL, 0);
if (!utf16_len)
continue;
else
{
MultiByteToWideChar(CP_UTF8, 0, utf8_buf, utf8_len, utf16_buf, utf16_len);
utf16_buf[utf16_len] = 0;
There is no need for null-termination, you have utf16_len available, so you could use that when copying the result.
On Tue Aug 1 10:54:45 2023 +0000, Jacek Caban wrote:
Is continue right in this case? Should we use something like next = 0xffff instead? A test would be nice.
Is "file://foo/%E4%AD%E6%96%87/bar" the case: https://gitlab.winehq.org/wine/wine/-/merge_requests/585/diffs?commit_id=d48b42bbb0750497dbddc972e7287c30feaccd4d#39c88e69f9ddfbd37ae3565e18bb4723cebb11cb_422_444? MultiByteToWideChar() will convert the whole corrupt UTF-8 character to 0xfffd.
On Wed Aug 2 06:22:03 2023 +0000, Jactry Zeng wrote:
Is "file://foo/%E4%AD%E6%96%87/bar" the case: https://gitlab.winehq.org/wine/wine/-/merge_requests/585/diffs?commit_id=d48b42bbb0750497dbddc972e7287c30feaccd4d#39c88e69f9ddfbd37ae3565e18bb4723cebb11cb_422_444? MultiByteToWideChar() will convert the whole corrupt UTF-8 character to 0xfffd.
Ah, I think I see: "file://foo/%E4%AD%E6%96%87/bar" is the case, but lstrcmpW() didn't do a good job in this comparison, using wcscmp() instead will expose the issue.