This fixes an issue when the path includes non-ASCII characters.
Signed-off-by: Jactry Zeng jzeng@codeweavers.com
-- v2: mshtml: Call UrlUnescapeW() with URL_UNESCAPE_AS_UTF8 in is_gecko_path(). shlwapi/tests: Test UrlUnescapeW() with URL_UNESCAPE_AS_UTF8. kernelbase: Implement URL_UNESCAPE_AS_UTF8 for UrlUnescapeW(). shlwapi/tests: Test UrlUnescapeW() with independent data.
From: Jactry Zeng jzeng@codeweavers.com
Signed-off-by: Jactry Zeng jzeng@codeweavers.com --- dlls/shlwapi/tests/url.c | 71 ++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 31 deletions(-)
diff --git a/dlls/shlwapi/tests/url.c b/dlls/shlwapi/tests/url.c index 6870de4ee5f..4a788c5ff51 100644 --- a/dlls/shlwapi/tests/url.c +++ b/dlls/shlwapi/tests/url.c @@ -407,7 +407,19 @@ static struct { {"file://%24%25foobar", "file://$%foobar"} };
-/* ################ */ +static struct +{ + const WCHAR *url; + const WCHAR *expect; + DWORD flags; +} TEST_URL_UNESCAPEW[] = +{ + { L"file://foo/bar", L"file://foo/bar" }, + { L"file://fo%20o%5Ca/bar", L"file://fo o\a/bar" }, + { L"file://%24%25foobar", L"file://$%foobar" }, + { L"file:///C:/Program Files", L"file:///C:/Program Files" }, + { L"file:///C:/Program%20Files", L"file:///C:/Program Files" }, +};
static const struct { const char *path; @@ -1391,16 +1403,13 @@ static void test_UrlIs(void)
static void test_UrlUnescape(void) { + WCHAR urlW[INTERNET_MAX_URL_LENGTH], bufferW[INTERNET_MAX_URL_LENGTH]; CHAR szReturnUrl[INTERNET_MAX_URL_LENGTH]; - WCHAR ret_urlW[INTERNET_MAX_URL_LENGTH]; - WCHAR *urlW, *expected_urlW; - DWORD dwEscaped; + DWORD dwEscaped, unescaped; size_t i; static char inplace[] = "file:///C:/Program%20Files"; static char another_inplace[] = "file:///C:/Program%20Files"; static const char expected[] = "file:///C:/Program Files"; - static WCHAR inplaceW[] = L"file:///C:/Program Files"; - static WCHAR another_inplaceW[] = L"file:///C:/Program%20Files"; HRESULT res;
for (i = 0; i < ARRAY_SIZE(TEST_URL_UNESCAPE); i++) { @@ -1418,21 +1427,32 @@ static void test_UrlUnescape(void) "UrlUnescapeA returned 0x%lx (expected E_INVALIDARG) for "%s"\n", res, TEST_URL_UNESCAPE[i].url); ok(strcmp(szReturnUrl,"")==0, "Expected empty string\n"); + }
- dwEscaped = INTERNET_MAX_URL_LENGTH; - urlW = GetWideString(TEST_URL_UNESCAPE[i].url); - expected_urlW = GetWideString(TEST_URL_UNESCAPE[i].expect); - res = UrlUnescapeW(urlW, ret_urlW, &dwEscaped, 0); - ok(res == S_OK, - "UrlUnescapeW returned 0x%lx (expected S_OK) for "%s"\n", - res, TEST_URL_UNESCAPE[i].url); - - WideCharToMultiByte(CP_ACP,0,ret_urlW,-1,szReturnUrl,INTERNET_MAX_URL_LENGTH,0,0); - ok(lstrcmpW(ret_urlW, expected_urlW)==0, - "Expected "%s", but got "%s" from "%s" flags %08lx\n", - TEST_URL_UNESCAPE[i].expect, szReturnUrl, TEST_URL_UNESCAPE[i].url, 0L); - FreeWideString(urlW); - FreeWideString(expected_urlW); + for (i = 0; i < ARRAYSIZE(TEST_URL_UNESCAPEW); i++) + { + lstrcpyW(urlW, TEST_URL_UNESCAPEW[i].url); + + memset(bufferW, 0xff, sizeof(bufferW)); + unescaped = INTERNET_MAX_URL_LENGTH; + res = UrlUnescapeW(urlW, bufferW, &unescaped, TEST_URL_UNESCAPEW[i].flags); + ok(res == S_OK, "[%d]: returned %#lx.\n", i, res); + ok(unescaped == lstrlenW(TEST_URL_UNESCAPEW[i].expect), + "[%d]: got unescaped %ld.\n", i, unescaped); + ok(!lstrcmpW(bufferW, TEST_URL_UNESCAPEW[i].expect), + "[%d]: got result "%s".\n", i, debugstr_w(bufferW)); + + /* Test with URL_UNESCAPE_INPLACE */ + unescaped = INTERNET_MAX_URL_LENGTH; + res = UrlUnescapeW(urlW, NULL, &unescaped, TEST_URL_UNESCAPEW[i].flags | URL_UNESCAPE_INPLACE); + ok(res == S_OK, "[%d]: returned %#lx.\n", i, res); + ok(unescaped == INTERNET_MAX_URL_LENGTH, "[%d]: got unescaped %ld.\n", i, unescaped); + ok(!lstrcmpW(urlW, TEST_URL_UNESCAPEW[i].expect), + "[%d]: got result "%s".\n", i, debugstr_w(urlW)); + + unescaped = lstrlenW(TEST_URL_UNESCAPEW[i].expect) - 1; + res = UrlUnescapeW(urlW, bufferW, &unescaped, TEST_URL_UNESCAPEW[i].flags); + ok(res == E_POINTER, "[%d]: returned %#lx.\n", i, res); }
dwEscaped = sizeof(inplace); @@ -1445,17 +1465,6 @@ static void test_UrlUnescape(void) res = UrlUnescapeA(another_inplace, NULL, NULL, URL_UNESCAPE_INPLACE); ok(res == S_OK, "UrlUnescapeA returned 0x%lx (expected S_OK)\n", res); ok(!strcmp(another_inplace, expected), "got %s expected %s\n", another_inplace, expected); - - dwEscaped = sizeof(inplaceW); - res = UrlUnescapeW(inplaceW, NULL, &dwEscaped, URL_UNESCAPE_INPLACE); - ok(res == S_OK, "UrlUnescapeW returned 0x%lx (expected S_OK)\n", res); - ok(dwEscaped == 50, "got %ld expected 50\n", dwEscaped); - - /* if we set the buffer pointer to NULL, the string apparently still gets converted (Google Lively does this) */ - res = UrlUnescapeW(another_inplaceW, NULL, NULL, URL_UNESCAPE_INPLACE); - ok(res == S_OK, "UrlUnescapeW returned 0x%lx (expected S_OK)\n", res); - - ok(lstrlenW(another_inplaceW) == 24, "got %d expected 24\n", lstrlenW(another_inplaceW)); }
static const struct parse_url_test_t {
From: Jactry Zeng jzeng@codeweavers.com
Signed-off-by: Jactry Zeng jzeng@codeweavers.com --- dlls/kernelbase/path.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-)
diff --git a/dlls/kernelbase/path.c b/dlls/kernelbase/path.c index 7eda9bd483c..71502545ee3 100644 --- a/dlls/kernelbase/path.c +++ b/dlls/kernelbase/path.c @@ -2909,9 +2909,11 @@ HRESULT WINAPI UrlUnescapeA(char *url, char *unescaped, DWORD *unescaped_len, DW
HRESULT WINAPI UrlUnescapeW(WCHAR *url, WCHAR *unescaped, DWORD *unescaped_len, DWORD flags) { + WCHAR *dst, next, utf16_buf[5]; BOOL stop_unescaping = FALSE; + int utf8_len, utf16_len; const WCHAR *src; - WCHAR *dst, next; + char utf8_buf[5]; DWORD needed; HRESULT hr;
@@ -2928,8 +2930,10 @@ HRESULT WINAPI UrlUnescapeW(WCHAR *url, WCHAR *unescaped, DWORD *unescaped_len, dst = unescaped; }
+ utf8_len = 0; for (src = url, needed = 0; *src; src++, needed++) { + utf16_len = 0; if (flags & URL_DONT_UNESCAPE_EXTRA_INFO && (*src == '#' || *src == '?')) { stop_unescaping = TRUE; @@ -2939,17 +2943,42 @@ HRESULT WINAPI UrlUnescapeW(WCHAR *url, WCHAR *unescaped, DWORD *unescaped_len, { INT ih; WCHAR buf[5] = L"0x"; + memcpy(buf + 2, src + 1, 2*sizeof(WCHAR)); buf[4] = 0; StrToIntExW(buf, STIF_SUPPORT_HEX, &ih); - next = (WCHAR) ih; src += 2; /* Advance to end of escape */ + + if (flags & URL_UNESCAPE_AS_UTF8) + { + utf8_buf[utf8_len++] = ih; + utf16_len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8_buf, utf8_len, NULL, 0); + if (!utf16_len) + continue; + else + { + MultiByteToWideChar(CP_UTF8, 0, utf8_buf, utf8_len, utf16_buf, utf16_len); + utf16_buf[utf16_len] = 0; + needed += (utf16_len - utf8_len); + utf8_len = 0; + } + } + else + next = (WCHAR) ih; } else next = *src;
if (flags & URL_UNESCAPE_INPLACE || needed < *unescaped_len) - *dst++ = next; + { + if (utf16_len) + { + wcscpy(dst, utf16_buf); + dst += utf16_len; + } + else + *dst++ = next; + } }
if (flags & URL_UNESCAPE_INPLACE || needed < *unescaped_len)
From: Jactry Zeng jzeng@codeweavers.com
Signed-off-by: Jactry Zeng jzeng@codeweavers.com --- dlls/shlwapi/tests/url.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/dlls/shlwapi/tests/url.c b/dlls/shlwapi/tests/url.c index 4a788c5ff51..02d306077b4 100644 --- a/dlls/shlwapi/tests/url.c +++ b/dlls/shlwapi/tests/url.c @@ -412,13 +412,20 @@ static struct const WCHAR *url; const WCHAR *expect; DWORD flags; + const WCHAR *win7_expect; } TEST_URL_UNESCAPEW[] = { { L"file://foo/bar", L"file://foo/bar" }, { L"file://fo%20o%5Ca/bar", L"file://fo o\a/bar" }, { L"file://%24%25foobar", L"file://$%foobar" }, { L"file:///C:/Program Files", L"file:///C:/Program Files" }, + { L"file:///C:/Program Files", L"file:///C:/Program Files", URL_UNESCAPE_AS_UTF8 }, { L"file:///C:/Program%20Files", L"file:///C:/Program Files" }, + { L"file:///C:/Program%20Files", L"file:///C:/Program Files", URL_UNESCAPE_AS_UTF8 }, + { L"file://foo/%E4%B8%AD%E6%96%87/bar", L"file://foo/\xe4\xb8\xad\xe6\x96\x87/bar" }, /* with 3 btyes utf-8 */ + { L"file://foo/%E4%B8%AD%E6%96%87/bar", L"file://foo/\x4e2d\x6587/bar", URL_UNESCAPE_AS_UTF8, L"file://foo/\xe4\xb8\xad\xe6\x96\x87/bar" }, + { L"file://foo/%F0%9F%8D%B7/bar", L"file://foo/\xf0\x9f\x8d\xb7/bar" }, /* with 4 btyes utf-8 */ + { L"file://foo/%F0%9F%8D%B7/bar", L"file://foo/\xd83c\xdf77/bar", URL_UNESCAPE_AS_UTF8, L"file://foo/\xf0\x9f\x8d\xb7/bar" }, };
static const struct { @@ -1437,9 +1444,11 @@ static void test_UrlUnescape(void) unescaped = INTERNET_MAX_URL_LENGTH; res = UrlUnescapeW(urlW, bufferW, &unescaped, TEST_URL_UNESCAPEW[i].flags); ok(res == S_OK, "[%d]: returned %#lx.\n", i, res); - ok(unescaped == lstrlenW(TEST_URL_UNESCAPEW[i].expect), + ok(unescaped == lstrlenW(TEST_URL_UNESCAPEW[i].expect) + || broken(unescaped == lstrlenW(TEST_URL_UNESCAPEW[i].win7_expect)), "[%d]: got unescaped %ld.\n", i, unescaped); - ok(!lstrcmpW(bufferW, TEST_URL_UNESCAPEW[i].expect), + ok(!lstrcmpW(bufferW, TEST_URL_UNESCAPEW[i].expect) + || broken(!lstrcmpW(bufferW, TEST_URL_UNESCAPEW[i].win7_expect)), "[%d]: got result "%s".\n", i, debugstr_w(bufferW));
/* Test with URL_UNESCAPE_INPLACE */ @@ -1447,7 +1456,8 @@ static void test_UrlUnescape(void) res = UrlUnescapeW(urlW, NULL, &unescaped, TEST_URL_UNESCAPEW[i].flags | URL_UNESCAPE_INPLACE); ok(res == S_OK, "[%d]: returned %#lx.\n", i, res); ok(unescaped == INTERNET_MAX_URL_LENGTH, "[%d]: got unescaped %ld.\n", i, unescaped); - ok(!lstrcmpW(urlW, TEST_URL_UNESCAPEW[i].expect), + ok(!lstrcmpW(urlW, TEST_URL_UNESCAPEW[i].expect) + || broken(!lstrcmpW(urlW, TEST_URL_UNESCAPEW[i].win7_expect)), "[%d]: got result "%s".\n", i, debugstr_w(urlW));
unescaped = lstrlenW(TEST_URL_UNESCAPEW[i].expect) - 1;
From: Jactry Zeng jzeng@codeweavers.com
This fixes an issue when the path includes non-ASCII characters.
Signed-off-by: Jactry Zeng jzeng@codeweavers.com --- dlls/mshtml/nsembed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dlls/mshtml/nsembed.c b/dlls/mshtml/nsembed.c index c7683a6d06d..c61163d19e0 100644 --- a/dlls/mshtml/nsembed.c +++ b/dlls/mshtml/nsembed.c @@ -1297,7 +1297,7 @@ BOOL is_gecko_path(const char *path) *ptr = '/'; }
- UrlUnescapeW(buf, NULL, NULL, URL_UNESCAPE_INPLACE); + UrlUnescapeW(buf, NULL, NULL, URL_UNESCAPE_INPLACE | URL_UNESCAPE_AS_UTF8); buf[gecko_path_len] = 0;
ret = !wcsicmp(buf, gecko_path);
Jacek Caban (@jacek) commented about dlls/kernelbase/path.c:
{ INT ih; WCHAR buf[5] = L"0x";
memcpy(buf + 2, src + 1, 2*sizeof(WCHAR)); buf[4] = 0; StrToIntExW(buf, STIF_SUPPORT_HEX, &ih);
next = (WCHAR) ih; src += 2; /* Advance to end of escape */
if (flags & URL_UNESCAPE_AS_UTF8)
{
utf8_buf[utf8_len++] = ih;
utf16_len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8_buf, utf8_len, NULL, 0);
if (!utf16_len)
continue;
This doesn't seem reliable. For example, if there is a non-escaped character between escaped multi-byte values, you will end up combining the escaped bytes surrounding the non-escaped one. See JSGlobal_decodeURI for an example of how this can be handled.
Hi Jacek,
On 8/9/22 18:18, Jacek Caban (@jacek) wrote:
Jacek Caban (@jacek) commented about dlls/kernelbase/path.c:
{ INT ih; WCHAR buf[5] = L"0x";
memcpy(buf + 2, src + 1, 2*sizeof(WCHAR)); buf[4] = 0; StrToIntExW(buf, STIF_SUPPORT_HEX, &ih);
next = (WCHAR) ih; src += 2; /* Advance to end of escape */
if (flags & URL_UNESCAPE_AS_UTF8)
{
utf8_buf[utf8_len++] = ih;
utf16_len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8_buf, utf8_len, NULL, 0);
if (!utf16_len)
continue;
This doesn't seem reliable. For example, if there is a non-escaped character between escaped multi-byte values, you will end up combining the escaped bytes surrounding the non-escaped one. See JSGlobal_decodeURI for an example of how this can be handled.
Sorry for the long delay; it really has been a good while! Last time, I tried the approach used in JSGlobal_decodeURI(), but I found that it doesn't handle 4-byte UTF-8 sequences very well, so I put this on hold.
Anyway, this came to my attention again recently. In this attempt, I use get_utf8_len() and the first byte of each UTF-8 sequence to calculate the length of that sequence. Hopefully, this handles both the 'non-escaped characters between multi-byte escaped characters' case and 4-byte UTF-8 sequences. Both cases have been added to the tests.
Thanks