[PATCH] mlang: Add basic implementation for IMultiLanguage3_DetectInputCodepage - wine-devel

11 Nov 2018

This is a first basic implementation that can handle
UTF-16/UTF-8 and should cover most text files.
It's currently not used by any known program,
but I intend to use the function for find.exe to detect encodings.
Signed-off-by: Fabian Maurer dark.shadow4@web.de
---
 dlls/mlang/mlang.c       |  35 +++++++++++-
 dlls/mlang/tests/mlang.c | 114 +++++++++++++++++++++++++++++++++++++++
 include/mlang.idl        |  12 ++++-
 3 files changed, 158 insertions(+), 3 deletions(-)

diff --git a/dlls/mlang/mlang.c b/dlls/mlang/mlang.c
index f12df298f1..e33c982c76 100644
--- a/dlls/mlang/mlang.c
+++ b/dlls/mlang/mlang.c
@@ -3097,8 +3097,39 @@ static HRESULT WINAPI fnIMultiLanguage3_DetectInputCodepage(
     DetectEncodingInfo* lpEncoding,
     INT* pnScores)
 {
-    FIXME("\n");
-    return E_NOTIMPL;
+    INT test;
+
+    FIXME("(%u %u, %p, %p, %p, %p - semi-stub!\n", dwFlag, dwPrefWinCodePage, pSrcStr, pcSrcSize, lpEncoding, pnScores);
+
+    if (!pSrcStr || !lpEncoding || *pcSrcSize <= 0 || *pnScores <= 0)
+        return E_INVALIDARG;
+
+    test = IS_TEXT_UNICODE_SIGNATURE | IS_TEXT_UNICODE_REVERSE_SIGNATURE;
+    IsTextUnicode(pSrcStr, *pcSrcSize, &test);
+
+    if (test & IS_TEXT_UNICODE_SIGNATURE)
+    {
+        *pnScores = 1;
+        lpEncoding[0].nCodePage = 1200;
+        return S_OK;
+    }
+
+    if (test & IS_TEXT_UNICODE_REVERSE_SIGNATURE)
+    {
+        *pnScores = 1;
+        lpEncoding[0].nCodePage = 1201;
+        return S_OK;
+    }
+
+    /* Check for valid UTF-8 */
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pSrcStr, *pcSrcSize, NULL, 0) > 0)
+    {
+        *pnScores = 1;
+        lpEncoding[0].nCodePage = 65001;
+        return S_OK;
+    }
+
+    return S_FALSE;
 }
static HRESULT WINAPI fnIMultiLanguage3_ValidateCodePage(
diff --git a/dlls/mlang/tests/mlang.c b/dlls/mlang/tests/mlang.c
index b5d6fc6114..f57a870d93 100644
--- a/dlls/mlang/tests/mlang.c
+++ b/dlls/mlang/tests/mlang.c
@@ -2695,6 +2695,119 @@ static void test_MapFont(IMLangFontLink *font_link, IMLangFontLink2 *font_link2)
     ReleaseDC(NULL, hdc);
 }
+static void test_DetectInputCodepage(IMultiLanguage2 *ml2)
+{
+    static char str_empty[] = {0};
+    static char str_utf8_bom1[] = "\xef\xbb\xbf this is a test string with utf8 bom";
+    static char str_utf8_bom2[] = "\xef\xbb\xbf this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom"
+        "this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom";
+    static char str_shift_jis[] = {0x82, 0xB1, 0x82, 0xEA, 0x82, 0xCD, 0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA, 0x82, 0xCC, 0x83, 0x65, 0x83, 0x4C, 0x83, 0x58, 0x83, 0x67, 0x82, 0xC5, 0x82, 0xB7, 0x00 };
+    static char str_utf16_be_with_bom[] = {
+        0xFE, 0xFF, 0x00, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73,
+        0x00, 0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E,
+        0x00, 0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00 };
+    char *str_utf16_be_without_bom = &str_utf16_be_with_bom[2];
+    static char  str_utf16_le_with_bom[] = {
+        0xFF, 0xFE, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73, 0x00,
+        0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E, 0x00,
+        0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00, 0x00 };
+    char *str_utf16_le_without_bom = &str_utf16_le_with_bom[2];
+    static char str_utf8_hello_without_bom[] = { /* Hello in english, russian and japanese */
+      0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5,
+      0xD1, 0x82, 0x20, 0xE4, 0xBB, 0x8A, 0xE6, 0x97, 0xA5, 0xE3, 0x81, 0xAF, 0x00 };
+
+    DetectEncodingInfo encoding_info[5];
+    HRESULT result;
+    INT str_size;
+    INT encoding_count;
+
+    /* Test error conditions */
+
+    str_size = sizeof(str_empty);
+    encoding_count = ARRAY_SIZE(encoding_info);
+    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, NULL, &str_size, encoding_info, &encoding_count);
+    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
+    ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
+
+    str_size = sizeof(str_empty);
+    encoding_count = ARRAY_SIZE(encoding_info);
+    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, NULL, &encoding_count);
+    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
+    ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
+
+    str_size = 0;
+    encoding_count = ARRAY_SIZE(encoding_info);
+    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count);
+    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
+    ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
+
+    str_size = sizeof(str_empty);
+    encoding_count = 0;
+    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count);
+    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
+    ok(0 == encoding_count, "Expected encoding_count to be %d, got %d\n", 0, encoding_count);
+
+    /* Test strings */
+
+#define run_DetectInputCodepage(flags, codepage_default, str)                                                                   \
+    str_size = sizeof(str);                                                                                                     \
+    encoding_count = ARRAY_SIZE(encoding_info);                                                                                 \
+    memset(&encoding_info, 0, sizeof(encoding_info));                                                                           \
+    result = IMultiLanguage2_DetectInputCodepage(ml2, flags, codepage_default, str, &str_size, encoding_info, &encoding_count);
+
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf8_bom1);
+    todo_wine
+    ok(result == S_FALSE, "Expected %#x, got %#x\n", S_FALSE, result);
+    ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count);
+    ok(encoding_info[0].nCodePage == 65001, "Expected code-page %d, got %d\n", 65001, encoding_info[0].nCodePage);
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf8_bom2);
+    ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result);
+    todo_wine
+    ok(2 == encoding_count, "Expected encoding_count to be %d, got %d\n", 2, encoding_count);
+    todo_wine
+    ok(encoding_info[0].nCodePage == 1252, "Expected code-page %d, got %d\n", 1252, encoding_info[0].nCodePage);
+    todo_wine
+    ok(encoding_info[1].nCodePage == 65001, "Expected code-page %d, got %d\n", 65001, encoding_info[1].nCodePage);
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_shift_jis);
+    todo_wine
+    ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result);
+    todo_wine
+    ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count);
+    todo_wine
+    ok(encoding_info[0].nCodePage == 932, "Expected code-page %d, got %d\n", 932, encoding_info[0].nCodePage);
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_be_with_bom);
+    ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result);
+    ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count);
+    ok(encoding_info[0].nCodePage == 1201, "Expected code-page %d, got %d\n", 1201, encoding_info[0].nCodePage);
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_le_with_bom);
+    ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result);
+    ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count);
+    ok(encoding_info[0].nCodePage == 1200, "Expected code-page %d, got %d\n", 1200, encoding_info[0].nCodePage);
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_be_without_bom);
+    ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result);
+    ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count);
+    todo_wine
+    ok(encoding_info[0].nCodePage == 20127, "Expected code-page %d, got %d\n", 20127, encoding_info[0].nCodePage);
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf16_le_without_bom);
+    ok(result == S_OK, "Expected %#x, got %#x\n", S_OK, result);
+    ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count);
+    todo_wine
+    ok(encoding_info[0].nCodePage == 20127, "Expected code-page %d, got %d\n", 20127, encoding_info[0].nCodePage);
+
+    run_DetectInputCodepage(MLDETECTCP_NONE, 0, str_utf8_hello_without_bom);
+    todo_wine
+    ok(result == S_FALSE, "Expected %#x, got %#x\n", S_FALSE, result);
+    ok(1 == encoding_count, "Expected encoding_count to be %d, got %d\n", 1, encoding_count);
+    ok(encoding_info[0].nCodePage == 65001, "Expected code-page %d, got %d\n", 65001, encoding_info[0].nCodePage);
+}
+
 START_TEST(mlang)
 {
     IMultiLanguage  *iML = NULL;
@@ -2761,6 +2874,7 @@ START_TEST(mlang)
     test_IMultiLanguage2_ConvertStringFromUnicode(iML2);
test_IsCodePageInstallable(iML2);
+    test_DetectInputCodepage(iML2);
IMultiLanguage2_Release(iML2);
diff --git a/include/mlang.idl b/include/mlang.idl
index 5867648b04..a23fdb85f4 100644
--- a/include/mlang.idl
+++ b/include/mlang.idl
@@ -25,6 +25,16 @@ interface IStream;
 cpp_quote("#define CPIOD_PEEK          0x40000000")
 cpp_quote("#define CPIOD_FORCE_PROMPT  0x80000000")
+typedef enum tagMLDETECTCP
+{
+    MLDETECTCP_NONE   = 0,
+    MLDETECTCP_7BIT   = 1,
+    MLDETECTCP_8BIT   = 2,
+    MLDETECTCP_DBCS   = 4,
+    MLDETECTCP_HTML   = 8,
+    MLDETECTCP_NUMBER = 16,
+} MLDETECTCP;
+
 [
   object,
   uuid(359f3443-bd4a-11d0-b188-00aa0038c969),
@@ -677,7 +687,7 @@ interface IMultiLanguage2 : IUnknown
         [in,out] INT *pnScores);
HRESULT DetectInputCodepage(      
-        [in] DWORD dwFlag,
+        [in] MLDETECTCP dwFlag,
         [in] DWORD dwPrefWinCodePage,
         [in] CHAR *pSrcStr,
         [in,out] INT *pcSrcSize,
-- 
2.19.1