-- v2: mshtml: Try to guess the charset on streams with no BOM.
From: Gabriel Ivăncescu gabrielopcode@gmail.com
We have to override it ourselves as Gecko, unlike IE, "inherits" the encoding of the page for the resources linked from it.
Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com --- dlls/mshtml/navigate.c | 10 ++- dlls/mshtml/tests/dom.c | 105 ++++++++++++++++++++++++++++++++ dlls/mshtml/tests/rsrc.rc | 12 ++++ dlls/mshtml/tests/utf16.css | Bin 0 -> 68 bytes dlls/mshtml/tests/utf16_bom.css | Bin 0 -> 70 bytes dlls/mshtml/tests/utf8.css | 1 + dlls/mshtml/tests/utf8_bom.css | 1 + 7 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 dlls/mshtml/tests/utf16.css create mode 100644 dlls/mshtml/tests/utf16_bom.css create mode 100644 dlls/mshtml/tests/utf8.css create mode 100644 dlls/mshtml/tests/utf8_bom.css
diff --git a/dlls/mshtml/navigate.c b/dlls/mshtml/navigate.c index 90ca22ea8b1..084a750b61a 100644 --- a/dlls/mshtml/navigate.c +++ b/dlls/mshtml/navigate.c @@ -1144,8 +1144,14 @@ static HRESULT read_stream_data(nsChannelBSC *This, IStream *stream) break; case BOM_UTF16: This->nschannel->charset = strdup("utf-16"); - case BOM_NONE: - /* FIXME: Get charset from HTTP headers */; + break; + case BOM_NONE: { + /* FIXME: Get charset from HTTP headers first */ + INT flags = IS_TEXT_UNICODE_ASCII16 | IS_TEXT_UNICODE_CONTROLS | IS_TEXT_UNICODE_STATISTICS; + + IsTextUnicode(This->nsstream->buf, This->nsstream->buf_size, &flags); + This->nschannel->charset = strdup(flags ? "utf-16" : "utf-8"); + } }
if(!This->nschannel->content_type) { diff --git a/dlls/mshtml/tests/dom.c b/dlls/mshtml/tests/dom.c index af03e660fe5..74ee1b893cb 100644 --- a/dlls/mshtml/tests/dom.c +++ b/dlls/mshtml/tests/dom.c @@ -11902,6 +11902,110 @@ static void test_quirks_mode(void) } }
+static void test_default_content_charset(void) +{ +#define CHARSETS X(utf8) X(utf16) X(utf8_bom) X(utf16_bom) + enum { +#define X(c) c, + CHARSETS +#undef X + } doc_charset, css_charset; + static const char *charsets[] = { +#define X(c) #c, + CHARSETS +#undef X + }; + static const WCHAR *charsetsW[] = { +#define X(c) L"" #c, + CHARSETS +#undef X + }; +#undef CHARSETS + unsigned module_len, module_lenW; + IHTMLCSSStyleDeclaration *style; + char module_path[MAX_PATH * 3]; + WCHAR module_pathW[MAX_PATH]; + IPersistStreamInit *init; + IHTMLWindow7 *window7; + IHTMLWindow2 *window; + IHTMLDocument2 *doc; + IHTMLElement *body; + IHTMLDOMNode *node; + IStream *stream; + HRESULT hres; + VARIANT var; + HGLOBAL mem; + SIZE_T size; + void *buf; + MSG msg; + + module_lenW = GetModuleFileNameW(NULL, module_pathW, ARRAY_SIZE(module_pathW)); + module_len = WideCharToMultiByte(CP_UTF8, 0, module_pathW, -1, module_path, ARRAY_SIZE(module_path), NULL, NULL); + + for(doc_charset = utf8; doc_charset <= utf16; doc_charset++) { + for(css_charset = utf8; css_charset <= utf16_bom; css_charset++) { + notif_doc = doc = create_document(); + if(!doc) + return; + doc_complete = FALSE; + + buf = malloc((128 + (doc_charset == utf16 ? module_lenW : module_len)) * (doc_charset == utf16 ? sizeof(WCHAR) : 1)); + if(doc_charset == utf16) + size = wsprintfW(buf, L"<!DOCTYPE html>\n<html><head><link href="res://%s/%s.css" rel="stylesheet"></head><body></body></html>", module_pathW, charsetsW[css_charset]) * sizeof(WCHAR); + else + size = sprintf(buf, "<!DOCTYPE html>\n<html><head><link href="res://%s/%s.css" rel="stylesheet"></head><body></body></html>", module_path, charsets[css_charset]); + mem = GlobalAlloc(0, size); + memcpy(mem, buf, size); + free(buf); + + hres = CreateStreamOnHGlobal(mem, TRUE, &stream); + ok(hres == S_OK, "Failed to create stream: %08lx.\n", hres); + hres = IHTMLDocument2_QueryInterface(doc, &IID_IPersistStreamInit, (void**)&init); + ok(hres == S_OK, "Failed to get IPersistStreamInit: %08lx.\n", hres); + IPersistStreamInit_Load(init, stream); + IPersistStreamInit_Release(init); + IStream_Release(stream); + + set_client_site(doc, TRUE); + do_advise((IUnknown*)doc, &IID_IPropertyNotifySink, (IUnknown*)&PropertyNotifySink); + + while(!doc_complete && GetMessageW(&msg, NULL, 0, 0)) { + TranslateMessage(&msg); + DispatchMessageW(&msg); + } + + hres = IHTMLDocument2_get_parentWindow(doc, &window); + ok(hres == S_OK, "get_parentWindow failed: %08lx\n", hres); + + hres = IHTMLWindow2_QueryInterface(window, &IID_IHTMLWindow7, (void**)&window7); + ok(hres == S_OK, "Could not get IHTMLWindow7: %08lx\n", hres); + IHTMLWindow2_Release(window); + + hres = IHTMLDocument2_get_body(doc, &body); + ok(hres == S_OK, "Could not get body: %08lx\n", hres); + + hres = IHTMLElement_QueryInterface(body, &IID_IHTMLDOMNode, (void**)&node); + ok(hres == S_OK, "Could not get IHTMLDOMNode: %08lx\n", hres); + IHTMLElement_Release(body); + + hres = IHTMLWindow7_getComputedStyle(window7, node, NULL, &style); + ok(hres == S_OK, "getComputedStyle failed: %08lx\n", hres); + IHTMLWindow7_Release(window7); + IHTMLDOMNode_Release(node); + + hres = IHTMLCSSStyleDeclaration_get_backgroundColor(style, &var); + ok(hres == S_OK, "get_backgroundColor failed: %08lx\n", hres); + ok(V_VT(&var) == VT_BSTR, "backgroundColor VT = %d\n", V_VT(&var)); + ok(!wcscmp(V_BSTR(&var), L"rgb(222, 173, 184)"), "[%s:%s] backgroundColor = %s\n", charsets[doc_charset], charsets[css_charset], wine_dbgstr_w(V_BSTR(&var))); + IHTMLCSSStyleDeclaration_Release(style); + VariantClear(&var); + + set_client_site(doc, FALSE); + IHTMLDocument2_Release(doc); + } + } +} + static void test_document_mode_lock(void) { IHTMLOptionElementFactory *option, *option2; @@ -12279,6 +12383,7 @@ START_TEST(dom) }
test_quirks_mode(); + test_default_content_charset(); test_document_mode_lock(); test_document_mode_after_initnew(); test_threads(); diff --git a/dlls/mshtml/tests/rsrc.rc b/dlls/mshtml/tests/rsrc.rc index 61b53520781..85d68df658d 100644 --- a/dlls/mshtml/tests/rsrc.rc +++ b/dlls/mshtml/tests/rsrc.rc @@ -91,6 +91,18 @@ iframe.html HTML "iframe.html" /* @makedep: xhr_iframe.html */ xhr_iframe.html HTML "xhr_iframe.html"
+/* @makedep: utf8.css */ +utf8.css HTML "utf8.css" + +/* @makedep: utf16.css */ +utf16.css HTML "utf16.css" + +/* @makedep: utf8_bom.css */ +utf8_bom.css HTML "utf8_bom.css" + +/* @makedep: utf16_bom.css */ +utf16_bom.css HTML "utf16_bom.css" + /* For res: protocol test: */
/* @makedep: jstest.html */ diff --git a/dlls/mshtml/tests/utf16.css b/dlls/mshtml/tests/utf16.css new file mode 100644 index 0000000000000000000000000000000000000000..2d209e0015973fc27773e8b367259baa3a3cbd14 GIT binary patch literal 68 zcmYdd$Y)4lsAN!JsAfoFNMuN6$Yw}qC<2O>GUNeebb%uI3^_nt#9#$fr3@5H1*%H{ Os<vRTW~gQ0VgLZbp$r`W
literal 0 HcmV?d00001
diff --git a/dlls/mshtml/tests/utf16_bom.css b/dlls/mshtml/tests/utf16_bom.css new file mode 100644 index 0000000000000000000000000000000000000000..c1acec91611fd5061c4b75a83c519b40114e99e6 GIT binary patch literal 70 zcmezWFNq<aA%&rmL4l!~A&DW8A(<hYA)TQJC|b&p2b9qTisUoo0C5q66;PEjP%IUw QE(NICg29@hmVt`_08HWz8UO$Q
literal 0 HcmV?d00001
diff --git a/dlls/mshtml/tests/utf8.css b/dlls/mshtml/tests/utf8.css new file mode 100644 index 00000000000..b9c63754cc4 --- /dev/null +++ b/dlls/mshtml/tests/utf8.css @@ -0,0 +1 @@ +body {background-color: #deadb8;} diff --git a/dlls/mshtml/tests/utf8_bom.css b/dlls/mshtml/tests/utf8_bom.css new file mode 100644 index 00000000000..7f2a960e216 --- /dev/null +++ b/dlls/mshtml/tests/utf8_bom.css @@ -0,0 +1 @@ +body {background-color: #deadb8;}
Jacek Caban (@jacek) commented about dlls/mshtml/tests/rsrc.rc:
/* @makedep: xhr_iframe.html */ xhr_iframe.html HTML "xhr_iframe.html"
+/* @makedep: utf8.css */ +utf8.css HTML "utf8.css"
+/* @makedep: utf16.css */ +utf16.css HTML "utf16.css"
+/* @makedep: utf8_bom.css */ +utf8_bom.css HTML "utf8_bom.css"
+/* @makedep: utf16_bom.css */ +utf16_bom.css HTML "utf16_bom.css"
Would the file protocol work for these tests? UTF-16 files are considered binary by git, so while it's not a major issue, avoiding them (and keeping the total number of files lower) would be nice. I wonder if we could instead create temporary files within the test.
On Wed Feb 19 22:24:06 2025 +0000, Jacek Caban wrote:
Would the file protocol work for these tests? UTF-16 files are considered binary by git, so while it's not a major issue, avoiding them (and keeping the total number of files lower) would be nice. I wonder if we could instead create temporary files within the test.
Yeah, temporary file should work. I'm not too big of a fan of it though, it's more fragile and requires cleanup, but yeah the current approach isn't very pretty either, though it might make the test loop less declarative.
Unfortunately, I can't get it to work on wine, for some reason. The thing is, it's not this fix or patch itself, **ALL** tests fail, meaning the CSS is never parsed properly (in fact, it's not parsed at all). It aborts early for some reason, and I can't figure out why. I'm not sure if it's worth considering it's just tests? Maybe you have an idea?
I tried hardcoded path too, is it a gecko "security" limitation by any chance?
Here's the tests that pass on Windows: ```diff diff --git a/dlls/mshtml/tests/dom.c b/dlls/mshtml/tests/dom.c index af03e66..63c2413 100644 --- a/dlls/mshtml/tests/dom.c +++ b/dlls/mshtml/tests/dom.c @@ -11902,6 +11902,128 @@ static void test_quirks_mode(void) } }
+static void test_default_content_charset(void) +{ + static const char *charsets[] = { "utf8", "utf16", "utf8_bom", "utf16_bom" }; + static const char css_utf8[] = "\xef\xbb\xbf""body {background-color: #deadb8;}"; + static const WCHAR css_utf16[] = L"\xfeff""body {background-color: #deadb8;}"; + enum { utf8, utf16, utf8_bom, utf16_bom } doc_charset, css_charset; + WCHAR doc_stringW[128 + MAX_PATH], css_pathW[MAX_PATH]; + IHTMLCSSStyleDeclaration *style; + IPersistStreamInit *init; + IHTMLWindow7 *window7; + IHTMLWindow2 *window; + IHTMLDocument2 *doc; + IHTMLElement *body; + IHTMLDOMNode *node; + char *doc_string; + IStream *stream; + DWORD written; + HRESULT hres; + VARIANT var; + HGLOBAL mem; + HANDLE file; + SIZE_T size; + void *buf; + UINT ret; + MSG msg; + + ret = GetTempPathW(MAX_PATH, doc_stringW); + ok(ret != 0, "GetTempPath failed, error %ld\n", GetLastError()); + ret = GetTempFileNameW(doc_stringW, L"css", 0, css_pathW); + ok(ret != 0, "GetTempFileName failed, error %ld\n", GetLastError()); + + wsprintfW(doc_stringW, L"<!DOCTYPE html>\n<html><head><link href="file://%s" rel="stylesheet"></head><body></body></html>", css_pathW); + + ret = WideCharToMultiByte(CP_UTF8, 0, doc_stringW, -1, NULL, 0, NULL, NULL); + doc_string = malloc(ret); + WideCharToMultiByte(CP_UTF8, 0, doc_stringW, -1, doc_string, ret, NULL, NULL); + + file = CreateFileW(css_pathW, GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_TEMPORARY, 0); + ok(file != INVALID_HANDLE_VALUE, "CreateFile failed, error %ld\n", GetLastError()); + + for(doc_charset = utf8; doc_charset <= utf16; doc_charset++) { + for(css_charset = utf8; css_charset <= utf16_bom; css_charset++) { + notif_doc = doc = create_document(); + if(!doc) + return; + doc_complete = FALSE; + + switch(css_charset) { + case utf8: ret = WriteFile(file, css_utf8 + 3, sizeof(css_utf8) - sizeof(char) - 3, &written, NULL); break; + case utf8_bom: ret = WriteFile(file, css_utf8, sizeof(css_utf8) - sizeof(char), &written, NULL); break; + case utf16: ret = WriteFile(file, css_utf16 + 1, sizeof(css_utf16) - sizeof(WCHAR) - 2, &written, NULL); break; + case utf16_bom: ret = WriteFile(file, css_utf16, sizeof(css_utf16) - sizeof(WCHAR), &written, NULL); break; + } + ok(ret, "WriteFile failed, error %ld\n", GetLastError()); + + if(doc_charset == utf8) { + buf = doc_string; + size = strlen(doc_string); + }else { + buf = doc_stringW; + size = wcslen(doc_stringW) * sizeof(WCHAR); + } + mem = GlobalAlloc(0, size); + memcpy(mem, buf, size); + + hres = CreateStreamOnHGlobal(mem, TRUE, &stream); + ok(hres == S_OK, "Failed to create stream: %08lx.\n", hres); + hres = IHTMLDocument2_QueryInterface(doc, &IID_IPersistStreamInit, (void**)&init); + ok(hres == S_OK, "Failed to get IPersistStreamInit: %08lx.\n", hres); + IPersistStreamInit_Load(init, stream); + IPersistStreamInit_Release(init); + IStream_Release(stream); + + set_client_site(doc, TRUE); + do_advise((IUnknown*)doc, &IID_IPropertyNotifySink, (IUnknown*)&PropertyNotifySink); + + while(!doc_complete && GetMessageW(&msg, NULL, 0, 0)) { + TranslateMessage(&msg); + DispatchMessageW(&msg); + } + + hres = IHTMLDocument2_get_parentWindow(doc, &window); + ok(hres == S_OK, "get_parentWindow failed: %08lx\n", hres); + + hres = IHTMLWindow2_QueryInterface(window, &IID_IHTMLWindow7, (void**)&window7); + ok(hres == S_OK, "Could not get IHTMLWindow7: %08lx\n", hres); + IHTMLWindow2_Release(window); + + hres = IHTMLDocument2_get_body(doc, &body); + ok(hres == S_OK, "Could not get body: %08lx\n", hres); + + hres = IHTMLElement_QueryInterface(body, &IID_IHTMLDOMNode, (void**)&node); + ok(hres == S_OK, "Could not get IHTMLDOMNode: %08lx\n", hres); + IHTMLElement_Release(body); + + hres = IHTMLWindow7_getComputedStyle(window7, node, NULL, &style); + ok(hres == S_OK, "getComputedStyle failed: %08lx\n", hres); + IHTMLWindow7_Release(window7); + IHTMLDOMNode_Release(node); + + hres = IHTMLCSSStyleDeclaration_get_backgroundColor(style, &var); + ok(hres == S_OK, "get_backgroundColor failed: %08lx\n", hres); + ok(V_VT(&var) == VT_BSTR, "backgroundColor VT = %d\n", V_VT(&var)); + ok(!wcscmp(V_BSTR(&var), L"rgb(222, 173, 184)"), "[%s:%s] backgroundColor = %s\n", charsets[doc_charset], charsets[css_charset], wine_dbgstr_w(V_BSTR(&var))); + IHTMLCSSStyleDeclaration_Release(style); + VariantClear(&var); + + set_client_site(doc, FALSE); + IHTMLDocument2_Release(doc); + + ret = SetFilePointer(file, 0, NULL, FILE_BEGIN); + ok(ret == 0, "wrong file offset %u\n", ret); + ret = SetEndOfFile(file); + ok(ret, "SetEndOfFile failed, error %ld\n", GetLastError()); + } + } + + CloseHandle(file); + DeleteFileW(css_pathW); + free(doc_string); +} + static void test_document_mode_lock(void) { IHTMLOptionElementFactory *option, *option2; @@ -12279,6 +12401,7 @@ START_TEST(dom) }
test_quirks_mode(); + test_default_content_charset(); test_document_mode_lock(); test_document_mode_after_initnew(); test_threads(); ```
Any chance to revive this one? Is this blocked by problematic tests only?
Sorry for the delay. In general, setting the charset for all data here is questionable, it would also apply to binary data, which doesn't make much sense.
I checked the Gecko source, and it seems that the CSS parser does rely on the protocol handler to detect the charset. This means it likely wouldn’t work for the file protocol. Did you confirm this behavior in Firefox? It should be easy to convert this test to JS and run it in stock Firefox 47 to verify that it’s not an issue with our nsio replacement.
If it turns out that we do need this behavior, perhaps we should apply it only to selected MIME types?