This is a first basic implementation that can handle UTF-16/UTF-8 and should cover most text files.
It's currently not used by any known program, but I intend to use the function for find.exe to detect encodings.
Signed-off-by: Fabian Maurer dark.shadow4@web.de --- dlls/mlang/mlang.c | 35 +++++++++++- dlls/mlang/tests/mlang.c | 114 +++++++++++++++++++++++++++++++++++++++ include/mlang.idl | 12 ++++- 3 files changed, 158 insertions(+), 3 deletions(-)
diff --git a/dlls/mlang/mlang.c b/dlls/mlang/mlang.c index f12df298f1..e33c982c76 100644 --- a/dlls/mlang/mlang.c +++ b/dlls/mlang/mlang.c @@ -3097,8 +3097,39 @@ static HRESULT WINAPI fnIMultiLanguage3_DetectInputCodepage( DetectEncodingInfo* lpEncoding, INT* pnScores) { - FIXME("\n"); - return E_NOTIMPL; + INT test; + + FIXME("(%u %u, %p, %p, %p, %p - semi-stub!\n", dwFlag, dwPrefWinCodePage, pSrcStr, pcSrcSize, lpEncoding, pnScores); + + if (!pSrcStr || !lpEncoding || *pcSrcSize <= 0 || *pnScores <= 0) + return E_INVALIDARG; + + test = IS_TEXT_UNICODE_SIGNATURE | IS_TEXT_UNICODE_REVERSE_SIGNATURE; + IsTextUnicode(pSrcStr, *pcSrcSize, &test); + + if (test & IS_TEXT_UNICODE_SIGNATURE) + { + *pnScores = 1; + lpEncoding[0].nCodePage = 1200; + return S_OK; + } + + if (test & IS_TEXT_UNICODE_REVERSE_SIGNATURE) + { + *pnScores = 1; + lpEncoding[0].nCodePage = 1201; + return S_OK; + } + + /* Check for valid UTF-8 */ + if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pSrcStr, *pcSrcSize, NULL, 0) > 0) + { + *pnScores = 1; + lpEncoding[0].nCodePage = 65001; + return S_OK; + } + + return S_FALSE; }
static HRESULT WINAPI fnIMultiLanguage3_ValidateCodePage( diff --git a/dlls/mlang/tests/mlang.c b/dlls/mlang/tests/mlang.c index b5d6fc6114..f57a870d93 100644 --- a/dlls/mlang/tests/mlang.c +++ b/dlls/mlang/tests/mlang.c @@ -2695,6 +2695,119 @@ static void test_MapFont(IMLangFontLink *font_link, IMLangFontLink2 *font_link2) ReleaseDC(NULL, hdc); }
+static void test_DetectInputCodepage(IMultiLanguage2 *ml2) +{ + static char str_empty[] = {0}; + static char str_utf8_bom1[] = "\xef\xbb\xbf this is a test string with utf8 bom"; + static char str_utf8_bom2[] = "\xef\xbb\xbf this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom" + "this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom"; + static char str_shift_jis[] = {0x82, 0xB1, 0x82, 0xEA, 0x82, 0xCD, 0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA, 0x82, 0xCC, 0x83, 0x65, 0x83, 0x4C, 0x83, 0x58, 0x83, 0x67, 0x82, 0xC5, 0x82, 0xB7, 0x00 }; + static char str_utf16_be_with_bom[] = { + 0xFE, 0xFF, 0x00, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73, + 0x00, 0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E, + 0x00, 0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00 }; + char *str_utf16_be_without_bom = &str_utf16_be_with_bom[2]; + static char str_utf16_le_with_bom[] = { + 0xFF, 0xFE, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73, 0x00, + 0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E, 0x00, + 0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00, 0x00 }; + char *str_utf16_le_without_bom = &str_utf16_le_with_bom[2]; + static char str_utf8_hello_without_bom[] = { /* Hello in english, russian and japanese */ + 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, + 0xD1, 0x82, 0x20, 0xE4, 0xBB, 0x8A, 0xE6, 0x97, 0xA5, 0xE3, 0x81, 0xAF, 0x00 }; + + DetectEncodingInfo encoding_info[5]; + HRESULT result; + INT str_size; + INT encoding_count; + + /* Test error conditions */ + + str_size = sizeof(str_empty); + encoding_count = ARRAY_SIZE(encoding_info); + result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, 
NULL, &str_size, encoding_info, &encoding_count); + ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result); + ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count); + + str_size = sizeof(str_empty); + encoding_count = ARRAY_SIZE(encoding_info); + result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, NULL, &encoding_count); + ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result); + ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count); + + str_size = 0; + encoding_count = ARRAY_SIZE(encoding_info); + result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count); + ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result); + ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count); + + str_size = sizeof(str_empty); + encoding_count = 0; + result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count); + ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result); + ok(0 == encoding_count, "Expected encoding_count to be %d, got %d\n", 0, encoding_count); + + /* Test strings */ + +#define run_DetectInputCodepage(flags, codepage_default, str) \ + str_size = sizeof(str); \ + encoding_count = ARRAY_SIZE(encoding_info); \ + memset(&encoding_info, 0, sizeof(encoding_info)); \ + result = IMultiLanguage2_DetectInputCodepage(ml2, flags, codepage_default, str, &str_size, encoding_info, &encoding_count); + + + run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf8_bom1); + todo_wine + ok(result == S_FALSE, "Expected %#x, got %#x\n", S_FALSE, result); + ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count); + ok(encoding_info[0].nCodePage == 65001, "Expected code-page %d, got %d\n", 65001, encoding_info[0].nCodePage); + + 
run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf8_bom2); + ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result); + todo_wine + ok(2 == encoding_count, "Expected encoding_count to be %d, got %d\n", 2, encoding_count); + todo_wine + ok(encoding_info[0].nCodePage == 1252, "Expected code-page %d, got %d\n", 1252, encoding_info[0].nCodePage); + todo_wine + ok(encoding_info[1].nCodePage == 65001, "Expected code-page %d, got %d\n", 65001, encoding_info[1].nCodePage); + + run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_shift_jis); + todo_wine + ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result); + todo_wine + ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count); + todo_wine + ok(encoding_info[0].nCodePage == 932, "Expected code-page %d, got %d\n", 932, encoding_info[0].nCodePage); + + run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_be_with_bom); + ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result); + ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count); + ok(encoding_info[0].nCodePage == 1201, "Expected code-page %d, got %d\n", 1201, encoding_info[0].nCodePage); + + run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_le_with_bom); + ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result); + ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count); + ok(encoding_info[0].nCodePage == 1200, "Expected code-page %d, got %d\n", 1200, encoding_info[0].nCodePage); + + run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_be_without_bom); + ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result); + ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count); + todo_wine + ok(encoding_info[0].nCodePage == 20127, "Expected code-page %d, got %d\n", 20127, encoding_info[0].nCodePage); + + run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_le_without_bom); + ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, 
result); + ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count); + todo_wine + ok(encoding_info[0].nCodePage == 20127, "Expected code-page %d, got %d\n", 20127, encoding_info[0].nCodePage); + + run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf8_hello_without_bom); + todo_wine + ok(result == S_FALSE, "Expected %#x, got %#x\n", S_FALSE, result); + ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count); + ok(encoding_info[0].nCodePage == 65001, "Expected code-page %d, got %d\n", 65001, encoding_info[0].nCodePage); +} + START_TEST(mlang) { IMultiLanguage *iML = NULL; @@ -2761,6 +2874,7 @@ START_TEST(mlang) test_IMultiLanguage2_ConvertStringFromUnicode(iML2);
test_IsCodePageInstallable(iML2); + test_DetectInputCodepage(iML2);
IMultiLanguage2_Release(iML2);
diff --git a/include/mlang.idl b/include/mlang.idl index 5867648b04..a23fdb85f4 100644 --- a/include/mlang.idl +++ b/include/mlang.idl @@ -25,6 +25,16 @@ interface IStream; cpp_quote("#define CPIOD_PEEK 0x40000000") cpp_quote("#define CPIOD_FORCE_PROMPT 0x80000000")
+typedef enum tagMLDETECTCP +{ + MLDETECTCP_NONE = 0, + MLDETECTCP_7BIT = 1, + MLDETECTCP_8BIT = 2, + MLDETECTCP_DBCS = 4, + MLDETECTCP_HTML = 8, + MLDETECTCP_NUMBER = 16, +} MLDETECTCP; + [ object, uuid(359f3443-bd4a-11d0-b188-00aa0038c969), @@ -677,7 +687,7 @@ interface IMultiLanguage2 : IUnknown [in,out] INT *pnScores);
HRESULT DetectInputCodepage( - [in] DWORD dwFlag, + [in] MLDETECTCP dwFlag, [in] DWORD dwPrefWinCodePage, [in] CHAR *pSrcStr, [in,out] INT *pcSrcSize,
On 11/11/18 8:45 PM, Fabian Maurer wrote:
This is a first basic implementation that can handle UTF-16/UTF-8 and should cover most text files.
It's currently not used by any known program, but I intend to use the function for find.exe to detect encodings.
Signed-off-by: Fabian Maurer dark.shadow4@web.de
dlls/mlang/mlang.c | 35 +++++++++++- dlls/mlang/tests/mlang.c | 114 +++++++++++++++++++++++++++++++++++++++ include/mlang.idl | 12 ++++- 3 files changed, 158 insertions(+), 3 deletions(-)
diff --git a/dlls/mlang/mlang.c b/dlls/mlang/mlang.c index f12df298f1..e33c982c76 100644 --- a/dlls/mlang/mlang.c +++ b/dlls/mlang/mlang.c @@ -3097,8 +3097,39 @@ static HRESULT WINAPI fnIMultiLanguage3_DetectInputCodepage( DetectEncodingInfo* lpEncoding, INT* pnScores) {
- FIXME("\n");
- return E_NOTIMPL;
- INT test;
- FIXME("(%u %u, %p, %p, %p, %p - semi-stub!\n", dwFlag, dwPrefWinCodePage, pSrcStr, pcSrcSize, lpEncoding, pnScores);
- if (!pSrcStr || !lpEncoding || *pcSrcSize <= 0 || *pnScores <= 0)
return E_INVALIDARG;
- test = IS_TEXT_UNICODE_SIGNATURE | IS_TEXT_UNICODE_REVERSE_SIGNATURE;
- IsTextUnicode(pSrcStr, *pcSrcSize, &test);
- if (test & IS_TEXT_UNICODE_SIGNATURE)
- {
*pnScores = 1;
lpEncoding[0].nCodePage = 1200;
return S_OK;
- }
- if (test & IS_TEXT_UNICODE_REVERSE_SIGNATURE)
- {
*pnScores = 1;
lpEncoding[0].nCodePage = 1201;
return S_OK;
- }
- /* Check for valid UTF-8 */
- if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pSrcStr, *pcSrcSize, NULL, 0) > 0)
- {
*pnScores = 1;
lpEncoding[0].nCodePage = 65001;
return S_OK;
- }
- return S_FALSE; }
MLang is clearly more sophisticated than IsTextUnicode(), judging just by the returned arguments. I think it's possible that a dictionary lookup is involved.
Can you use IsTextUnicode() in find.exe instead? And more importantly, do we have find.exe tests that prove it has to be supported in the first place?
E.g. does it work if you have a UTF-16 encoded file and the string you're looking for is in the console CP, or does it instead compare byte sequences, accounting for UTF-8/UTF-16 line separators?
static HRESULT WINAPI fnIMultiLanguage3_ValidateCodePage( diff --git a/dlls/mlang/tests/mlang.c b/dlls/mlang/tests/mlang.c index b5d6fc6114..f57a870d93 100644 --- a/dlls/mlang/tests/mlang.c +++ b/dlls/mlang/tests/mlang.c @@ -2695,6 +2695,119 @@ static void test_MapFont(IMLangFontLink *font_link, IMLangFontLink2 *font_link2) ReleaseDC(NULL, hdc); }
+static void test_DetectInputCodepage(IMultiLanguage2 *ml2) +{
- static char str_empty[] = {0};
- static char str_utf8_bom1[] = "\xef\xbb\xbf this is a test string with utf8 bom";
- static char str_utf8_bom2[] = "\xef\xbb\xbf this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom"
"this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom";
- static char str_shift_jis[] = {0x82, 0xB1, 0x82, 0xEA, 0x82, 0xCD, 0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA, 0x82, 0xCC, 0x83, 0x65, 0x83, 0x4C, 0x83, 0x58, 0x83, 0x67, 0x82, 0xC5, 0x82, 0xB7, 0x00 };
- static char str_utf16_be_with_bom[] = {
0xFE, 0xFF, 0x00, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73,
0x00, 0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E,
0x00, 0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00 };
- char *str_utf16_be_without_bom = &str_utf16_be_with_bom[2];
- static char str_utf16_le_with_bom[] = {
0xFF, 0xFE, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73, 0x00,
0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E, 0x00,
0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00, 0x00 };
- char *str_utf16_le_without_bom = &str_utf16_le_with_bom[2];
- static char str_utf8_hello_without_bom[] = { /* Hello in english, russian and japanese */
0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5,
0xD1, 0x82, 0x20, 0xE4, 0xBB, 0x8A, 0xE6, 0x97, 0xA5, 0xE3, 0x81, 0xAF, 0x00 };
- DetectEncodingInfo encoding_info[5];
- HRESULT result;
- INT str_size;
- INT encoding_count;
- /* Test error conditions */
- str_size = sizeof(str_empty);
- encoding_count = ARRAY_SIZE(encoding_info);
- result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, NULL, &str_size, encoding_info, &encoding_count);
- ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
- ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
- str_size = sizeof(str_empty);
- encoding_count = ARRAY_SIZE(encoding_info);
- result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, NULL, &encoding_count);
- ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
- ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
- str_size = 0;
- encoding_count = ARRAY_SIZE(encoding_info);
- result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count);
- ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
- ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
- str_size = sizeof(str_empty);
- encoding_count = 0;
- result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count);
- ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
- ok(0 == encoding_count, "Expected encoding_count to be %d, got %d\n", 0, encoding_count);
- /* Test strings */
+#define run_DetectInputCodepage(flags, codepage_default, str) \
- str_size = sizeof(str); \
- encoding_count = ARRAY_SIZE(encoding_info); \
- memset(&encoding_info, 0, sizeof(encoding_info)); \
- result = IMultiLanguage2_DetectInputCodepage(ml2, flags, codepage_default, str, &str_size, encoding_info, &encoding_count);
Do you need a macro for that?
On Sonntag, 11. November 2018 19:21:09 CET Nikolay Sivov wrote:
Can you use IsTextUnicode() in find.exe instead? And more importantly, do we have find.exe tests that prove it has to be supported in the first place?
Not yet, but it does support different encodings - at least UTF8, UTF16 and ANSI.
Do you need a macro for that?
Not necessarily, but it makes things shorter.
Regards, Fabian Maurer