-- v3: mshtml: Try to guess the charset on streams with no BOM.
From: Gabriel Ivăncescu gabrielopcode@gmail.com
We have to override it ourselves as Gecko, unlike IE, "inherits" the encoding of the page for the resources linked from it.
Signed-off-by: Gabriel Ivăncescu gabrielopcode@gmail.com --- dlls/mshtml/navigate.c | 10 ++- dlls/mshtml/tests/dom.c | 112 ++++++++++++++++++++++++++++++++ dlls/mshtml/tests/rsrc.rc | 12 ++++ dlls/mshtml/tests/utf16_bom_css | Bin 0 -> 84 bytes dlls/mshtml/tests/utf16_css | Bin 0 -> 82 bytes dlls/mshtml/tests/utf8_bom_css | 1 + dlls/mshtml/tests/utf8_css | 1 + 7 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 dlls/mshtml/tests/utf16_bom_css create mode 100644 dlls/mshtml/tests/utf16_css create mode 100644 dlls/mshtml/tests/utf8_bom_css create mode 100644 dlls/mshtml/tests/utf8_css
diff --git a/dlls/mshtml/navigate.c b/dlls/mshtml/navigate.c index f05bf0be9c6..afe5f5ea4bc 100644 --- a/dlls/mshtml/navigate.c +++ b/dlls/mshtml/navigate.c @@ -1144,8 +1144,14 @@ static HRESULT read_stream_data(nsChannelBSC *This, IStream *stream) break; case BOM_UTF16: This->nschannel->charset = strdup("utf-16"); - case BOM_NONE: - /* FIXME: Get charset from HTTP headers */; + break; + case BOM_NONE: { + /* FIXME: Get charset from HTTP headers first */ + INT flags = IS_TEXT_UNICODE_ASCII16 | IS_TEXT_UNICODE_CONTROLS | IS_TEXT_UNICODE_STATISTICS; + + IsTextUnicode(This->nsstream->buf, This->nsstream->buf_size, &flags); + This->nschannel->charset = strdup(flags ? "utf-16" : "utf-8"); + } }
if(!This->nschannel->content_type) { diff --git a/dlls/mshtml/tests/dom.c b/dlls/mshtml/tests/dom.c index b1625b619ca..30443ce9b9b 100644 --- a/dlls/mshtml/tests/dom.c +++ b/dlls/mshtml/tests/dom.c @@ -12037,6 +12037,117 @@ static void test_quirks_mode(void) } }
+static void test_default_content_charset(void) +{ +#define CHARSETS X(utf8) X(utf16) X(utf8_bom) X(utf16_bom) + enum { +#define X(c) c, + CHARSETS +#undef X + } doc_charset, css_charset; + static const char *charsets[] = { +#define X(c) #c, + CHARSETS +#undef X + }; + static const WCHAR *charsetsW[] = { +#define X(c) L"" #c, + CHARSETS +#undef X + }; +#undef CHARSETS + static char utf8_css[] = ".snd{} body {background-color: #deadb8;}"; + unsigned module_len, module_lenW; + IHTMLCSSStyleDeclaration *style; + char module_path[MAX_PATH * 3]; + WCHAR module_pathW[MAX_PATH]; + IPersistStreamInit *init; + IHTMLWindow7 *window7; + IHTMLWindow2 *window; + IHTMLDocument2 *doc; + IHTMLElement *body; + IHTMLDOMNode *node; + IStream *stream; + HRESULT hres; + VARIANT var; + HGLOBAL mem; + SIZE_T size; + WCHAR *mime; + void *buf; + MSG msg; + + module_lenW = GetModuleFileNameW(NULL, module_pathW, ARRAY_SIZE(module_pathW)); + module_len = WideCharToMultiByte(CP_UTF8, 0, module_pathW, -1, module_path, ARRAY_SIZE(module_path), NULL, NULL); + + hres = FindMimeFromData(NULL, NULL, utf8_css, sizeof(utf8_css) - 1, NULL, 0, &mime, 0); + ok(hres == S_OK, "FindMimeFromData failed: %08lx\n", hres); + ok(!wcscmp(mime, L"audio/basic"), "mime = %s\n", wine_dbgstr_w(mime)); + CoTaskMemFree(mime); + + for(doc_charset = utf8; doc_charset <= utf16; doc_charset++) { + for(css_charset = utf8; css_charset <= utf16_bom; css_charset++) { + notif_doc = doc = create_document(); + if(!doc) + return; + doc_complete = FALSE; + + buf = malloc((128 + (doc_charset == utf16 ? module_lenW : module_len)) * (doc_charset == utf16 ? sizeof(WCHAR) : 1)); + if(doc_charset == utf16) + size = wsprintfW(buf, L"<!DOCTYPE html>\n<html><head><link href="res://%s/%s_css" rel="stylesheet"></head><body></body></html>", module_pathW, charsetsW[css_charset]) * sizeof(WCHAR); + else + size = sprintf(buf, "<!DOCTYPE html>\n<html><head><link href="res://%s/%s_css" rel="stylesheet"></head><body></body></html>", module_path, charsets[css_charset]); + mem = GlobalAlloc(0, size); + memcpy(mem, buf, size); + free(buf); + + hres = CreateStreamOnHGlobal(mem, TRUE, &stream); + ok(hres == S_OK, "Failed to create stream: %08lx.\n", hres); + hres = IHTMLDocument2_QueryInterface(doc, &IID_IPersistStreamInit, (void**)&init); + ok(hres == S_OK, "Failed to get IPersistStreamInit: %08lx.\n", hres); + IPersistStreamInit_Load(init, stream); + IPersistStreamInit_Release(init); + IStream_Release(stream); + + set_client_site(doc, TRUE); + do_advise((IUnknown*)doc, &IID_IPropertyNotifySink, (IUnknown*)&PropertyNotifySink); + + while(!doc_complete && GetMessageW(&msg, NULL, 0, 0)) { + TranslateMessage(&msg); + DispatchMessageW(&msg); + } + + hres = IHTMLDocument2_get_parentWindow(doc, &window); + ok(hres == S_OK, "get_parentWindow failed: %08lx\n", hres); + + hres = IHTMLWindow2_QueryInterface(window, &IID_IHTMLWindow7, (void**)&window7); + ok(hres == S_OK, "Could not get IHTMLWindow7: %08lx\n", hres); + IHTMLWindow2_Release(window); + + hres = IHTMLDocument2_get_body(doc, &body); + ok(hres == S_OK, "Could not get body: %08lx\n", hres); + + hres = IHTMLElement_QueryInterface(body, &IID_IHTMLDOMNode, (void**)&node); + ok(hres == S_OK, "Could not get IHTMLDOMNode: %08lx\n", hres); + IHTMLElement_Release(body); + + hres = IHTMLWindow7_getComputedStyle(window7, node, NULL, &style); + ok(hres == S_OK, "getComputedStyle failed: %08lx\n", hres); + IHTMLWindow7_Release(window7); + IHTMLDOMNode_Release(node); + + hres = IHTMLCSSStyleDeclaration_get_backgroundColor(style, &var); + ok(hres == S_OK, "get_backgroundColor failed: %08lx\n", hres); + ok(V_VT(&var) == VT_BSTR, "backgroundColor VT = %d\n", V_VT(&var)); + ok(!wcscmp(V_BSTR(&var), L"rgb(222, 173, 184)"), "[%s:%s] backgroundColor = %s\n", charsets[doc_charset], charsets[css_charset], wine_dbgstr_w(V_BSTR(&var))); + IHTMLCSSStyleDeclaration_Release(style); + VariantClear(&var); + + set_client_site(doc, FALSE); + IHTMLDocument2_Release(doc); + } + } +} + static void test_document_mode_lock(void) { IHTMLOptionElementFactory *option, *option2; @@ -12414,6 +12525,7 @@ START_TEST(dom) }
test_quirks_mode(); + test_default_content_charset(); test_document_mode_lock(); test_document_mode_after_initnew(); test_threads(); diff --git a/dlls/mshtml/tests/rsrc.rc b/dlls/mshtml/tests/rsrc.rc index 61b53520781..77207086214 100644 --- a/dlls/mshtml/tests/rsrc.rc +++ b/dlls/mshtml/tests/rsrc.rc @@ -91,6 +91,18 @@ iframe.html HTML "iframe.html" /* @makedep: xhr_iframe.html */ xhr_iframe.html HTML "xhr_iframe.html"
+/* @makedep: utf8_css */ +utf8_css HTML "utf8_css" + +/* @makedep: utf16_css */ +utf16_css HTML "utf16_css" + +/* @makedep: utf8_bom_css */ +utf8_bom_css HTML "utf8_bom_css" + +/* @makedep: utf16_bom_css */ +utf16_bom_css HTML "utf16_bom_css" + /* For res: protocol test: */
/* @makedep: jstest.html */ diff --git a/dlls/mshtml/tests/utf16_bom_css b/dlls/mshtml/tests/utf16_bom_css new file mode 100644 index 0000000000000000000000000000000000000000..873d27b6544a5d3d373d8e03f090b067f4910636 GIT binary patch literal 84 zcmezWPmiIPA&()2p_-wVL4hHOA)g_Ip^`y?p_(CyA(0`OA)6tcp@<=$p%ko2mm!%U kpCN}KpP`7sia~)vnIVNCl_8NKg&~Q-g29@hmVt`_04ggGr2qf`
literal 0 HcmV?d00001
diff --git a/dlls/mshtml/tests/utf16_css b/dlls/mshtml/tests/utf16_css new file mode 100644 index 0000000000000000000000000000000000000000..fb315b0139f946b4010182b833ee8e229764a8bf GIT binary patch literal 82 zcmdO6C}zlGNMWdEsAW)KNMgumNMWdCP++KLNMcB2NM^`pNM|Tw$Y&@8tI}miX2@sA iVaR7FVz6RRU{GdAVMt|2WJqC1Vz6MaW~gQ0VgLYy6b`5W
literal 0 HcmV?d00001
diff --git a/dlls/mshtml/tests/utf8_bom_css b/dlls/mshtml/tests/utf8_bom_css new file mode 100644 index 00000000000..b44b804ba27 --- /dev/null +++ b/dlls/mshtml/tests/utf8_bom_css @@ -0,0 +1 @@ +.snd{} body {background-color: #deadb8;} diff --git a/dlls/mshtml/tests/utf8_css b/dlls/mshtml/tests/utf8_css new file mode 100644 index 00000000000..8dcd51fc75e --- /dev/null +++ b/dlls/mshtml/tests/utf8_css @@ -0,0 +1 @@ +.snd{} body {background-color: #deadb8;}
On Wed May 28 16:48:30 2025 +0000, Jacek Caban wrote:
Sorry for the delay. In general, setting the charset for all data here is questionable, it would also apply to binary data, which doesn't make much sense. I checked the Gecko source, and it seems that the CSS parser does rely on the protocol handler to detect the charset. This means it likely wouldn’t work for the file protocol. Did you confirm this behavior in Firefox? It should be easy to convert this test to JS and run it in stock Firefox 47 to verify that it’s not an issue with our nsio replacement. If it turns out that we do need this behavior, perhaps we should apply it only to selected MIME types?
I tested stock Firefox 47, and it has the same behavior (wrong compared to IE).
Interesting, I hadn't considered binary data, but does the charset even make a difference if the content type is not text? Note that setting it based on MIME can be wrong. I've added tests now to exemplify this; files beginning with ".snd" are detected as `audio/basic` (this is also on native), and that's completely valid CSS even if probably not in practice.
I also renamed the resource files to _css instead of .css to avoid detection based on file extension (if Windows even does that sort of thing), just to make sure.
That said, considering it's wrong in at least one case, and complicates the code, does it even matter if the charset is set on binary data?
Jacek Caban (@jacek) commented about dlls/mshtml/navigate.c:
break; case BOM_UTF16: This->nschannel->charset = strdup("utf-16");
case BOM_NONE:
/* FIXME: Get charset from HTTP headers */;
break;
case BOM_NONE: {
/* FIXME: Get charset from HTTP headers first */
The comment is outdated, we already look at HTTP headers, but with this change, we'd leak and override the charset found there. (It's already true for cases where BOM is present, which is likely wrong, but at least the value is more reliable there.)
Interesting, I hadn't considered binary data, but does the charset even make a difference if the content type is not text?
I'm not sure, likely not. I was hoping you'd check that in Gecko before signing off on the patch.
That aside, `IsTextUnicode` is a pretty questionable function. It relies on heuristics, and from what I’ve seen, `IS_TEXT_UNICODE_STATISTICS` can easily give false positives, especially with UTF-8 input. We may unfortunately need something like it to match IE behavior, but reducing our exposure to the problem would be an improvement, in my opinion.
One way to limit its use specifically to CSS would be to move the check into `SheetLoadData::OnDetermineCharset`.