Alexandre Julliard : wrc: Directly implement UTF-8 conversions.

4 Feb 2020

Module: wine
Branch: master
Commit: 88abd7cb984f80b4f8788582361236ce331346f0
URL:    https://source.winehq.org/git/wine.git/?a=commit;h=88abd7cb984f80b4f8788582361236ce331346f0
Author: Alexandre Julliard <julliard@winehq.org>
Date:   Wed Jan 29 10:39:42 2020 +0100
wrc: Directly implement UTF-8 conversions.
Signed-off-by: Alexandre Julliard <julliard@winehq.org>
---
tools/wrc/utils.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 156 insertions(+), 27 deletions(-)

diff --git a/tools/wrc/utils.c b/tools/wrc/utils.c
index a1e07b85b2..15bbde9d74 100644
--- a/tools/wrc/utils.c
+++ b/tools/wrc/utils.c
@@ -296,12 +296,12 @@ int is_valid_codepage(int id)
     return IsValidCodePage( id );
 }
-int wrc_mbstowcs( int codepage, int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
+static int wrc_mbstowcs( int codepage, int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
 {
     return MultiByteToWideChar( codepage, flags, src, srclen, dst, dstlen );
 }
-int wrc_wcstombs( int codepage, int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
+static int wrc_wcstombs( int codepage, int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
 {
     return WideCharToMultiByte( codepage, flags, src, srclen, dst, dstlen, NULL, NULL );
 }
@@ -315,25 +315,145 @@ int is_valid_codepage(int cp)
     return cp == CP_UTF8 || wine_cp_get_table(cp);
 }
-int wrc_mbstowcs( int codepage, int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
+static int wrc_mbstowcs( int codepage, int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
 {
-    if (codepage == CP_UTF8) return wine_utf8_mbstowcs( flags, src, srclen, dst, dstlen );
     return wine_cp_mbstowcs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen );
 }
-int wrc_wcstombs( int codepage, int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
+static int wrc_wcstombs( int codepage, int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
 {
-    if (codepage == CP_UTF8) return wine_utf8_wcstombs( flags, src, srclen, dst, dstlen );
     return wine_cp_wcstombs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen, NULL, NULL );
 }
#endif  /* _WIN32 */
+static WCHAR *utf8_to_unicode( const char *src, int srclen, int *dstlen )
+{
+    static const char utf8_length[128] =
+    {
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
+        0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
+        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
+        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
+        3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0  /* 0xf0-0xff */
+    };
+    static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
+
+    const char *srcend = src + srclen;
+    int len, res;
+    WCHAR *ret, *dst;
+
+    dst = ret = xmalloc( (srclen + 1) * sizeof(WCHAR) );
+    while (src < srcend)
+    {
+        unsigned char ch = *src++;
+        if (ch < 0x80)  /* special fast case for 7-bit ASCII */
+        {
+            *dst++ = ch;
+            continue;
+        }
+        len = utf8_length[ch - 0x80];
+        if (len && src + len <= srcend)
+        {
+            res = ch & utf8_mask[len];
+            switch (len)
+            {
+            case 3:
+                if ((ch = *src ^ 0x80) >= 0x40) break;
+                res = (res << 6) | ch;
+                src++;
+                if (res < 0x10) break;
+            case 2:
+                if ((ch = *src ^ 0x80) >= 0x40) break;
+                res = (res << 6) | ch;
+                if (res >= 0x110000 >> 6) break;
+                src++;
+                if (res < 0x20) break;
+                if (res >= 0xd800 >> 6 && res <= 0xdfff >> 6) break;
+            case 1:
+                if ((ch = *src ^ 0x80) >= 0x40) break;
+                res = (res << 6) | ch;
+                src++;
+                if (res < 0x80) break;
+                if (res <= 0xffff) *dst++ = res;
+                else
+                {
+                    res -= 0x10000;
+                    *dst++ = 0xd800 | (res >> 10);
+                    *dst++ = 0xdc00 | (res & 0x3ff);
+                }
+                continue;
+            }
+        }
+        *dst++ = 0xfffd;
+    }
+    *dst = 0;
+    *dstlen = dst - ret;
+    return ret;
+}
+
+static char *unicode_to_utf8( const WCHAR *src, int srclen, int *dstlen )
+{
+    char *ret, *dst;
+
+    dst = ret = xmalloc( srclen * 3 + 1 );
+    for ( ; srclen; srclen--, src++)
+    {
+        unsigned int ch = *src;
+
+        if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
+        {
+            *dst++ = ch;
+            continue;
+        }
+        if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
+        {
+            dst[1] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[0] = 0xc0 | ch;
+            dst += 2;
+            continue;
+        }
+        if (ch >= 0xd800 && ch <= 0xdbff && srclen > 1 && src[1] >= 0xdc00 && src[1] <= 0xdfff)
+        {
+            /* 0x10000-0x10ffff: 4 bytes */
+            ch = 0x10000 + ((ch & 0x3ff) << 10) + (src[1] & 0x3ff);
+            dst[3] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[2] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[1] = 0x80 | (ch & 0x3f);
+            ch >>= 6;
+            dst[0] = 0xf0 | ch;
+            dst += 4;
+            src++;
+            srclen--;
+            continue;
+        }
+        if (ch >= 0xd800 && ch <= 0xdfff) ch = 0xfffd;  /* invalid surrogate pair */
+
+        /* 0x800-0xffff: 3 bytes */
+        dst[2] = 0x80 | (ch & 0x3f);
+        ch >>= 6;
+        dst[1] = 0x80 | (ch & 0x3f);
+        ch >>= 6;
+        dst[0] = 0xe0 | ch;
+        dst += 3;
+    }
+    *dst = 0;
+    *dstlen = dst - ret;
+    return ret;
+}
+
 string_t *convert_string(const string_t *str, enum str_e type, int codepage)
 {
     string_t *ret = xmalloc(sizeof(*ret));
     int res;
+    ret->type = type;
     ret->loc = str->loc;
if (!codepage && str->type != type)
@@ -341,27 +461,33 @@ string_t *convert_string(const string_t *str, enum str_e type, int codepage)
if((str->type == str_char) && (type == str_unicode))
     {
-        ret->type = str_unicode;
-        ret->size = wrc_mbstowcs( codepage, 0, str->str.cstr, str->size, NULL, 0 );
-        ret->str.wstr = xmalloc( (ret->size+1) * sizeof(WCHAR) );
-        res = wrc_mbstowcs( codepage, MB_ERR_INVALID_CHARS, str->str.cstr, str->size,
-                            ret->str.wstr, ret->size );
-        if (res == -2)
-            parser_error( "Invalid character in string '%.*s' for codepage %u",
-                   str->size, str->str.cstr, codepage );
-        ret->str.wstr[ret->size] = 0;
+        if (codepage == CP_UTF8)
+            ret->str.wstr = utf8_to_unicode( str->str.cstr, str->size, &ret->size );
+        else
+        {
+            ret->str.wstr = xmalloc( (str->size + 1) * sizeof(WCHAR) );
+            res = wrc_mbstowcs( codepage, MB_ERR_INVALID_CHARS, str->str.cstr, str->size,
+                                ret->str.wstr, str->size );
+            if (res == -2)
+                parser_error( "Invalid character in string '%.*s' for codepage %u",
+                              str->size, str->str.cstr, codepage );
+            ret->size = res;
+            ret->str.wstr[ret->size] = 0;
+        }
     }
     else if((str->type == str_unicode) && (type == str_char))
     {
-        ret->type = str_char;
-        ret->size = wrc_wcstombs( codepage, 0, str->str.wstr, str->size, NULL, 0 );
-        ret->str.cstr = xmalloc( ret->size + 1 );
-        wrc_wcstombs( codepage, 0, str->str.wstr, str->size, ret->str.cstr, ret->size );
-        ret->str.cstr[ret->size] = 0;
+        if (codepage == CP_UTF8)
+            ret->str.cstr = unicode_to_utf8( str->str.wstr, str->size, &ret->size );
+        else
+        {
+            ret->str.cstr = xmalloc( str->size * 2 + 1 );
+            ret->size = wrc_wcstombs( codepage, 0, str->str.wstr, str->size, ret->str.cstr, str->size * 2 );
+            ret->str.cstr[ret->size] = 0;
+        }
     }
     else if(str->type == str_unicode)
     {
-        ret->type     = str_unicode;
         ret->size     = str->size;
         ret->str.wstr = xmalloc(sizeof(WCHAR)*(ret->size+1));
         memcpy( ret->str.wstr, str->str.wstr, ret->size * sizeof(WCHAR) );
@@ -369,7 +495,6 @@ string_t *convert_string(const string_t *str, enum str_e type, int codepage)
     }
     else /* str->type == str_char */
     {
-        ret->type     = str_char;
         ret->size     = str->size;
         ret->str.cstr = xmalloc( ret->size + 1 );
         memcpy( ret->str.cstr, str->str.cstr, ret->size );
@@ -389,22 +514,26 @@ void free_string(string_t *str)
 /* check if the string is valid utf8 despite a different codepage being in use */
 int check_valid_utf8( const string_t *str, int codepage )
 {
-    unsigned int i;
+    int i, count;
+    WCHAR *wstr;
if (!check_utf8) return 0;
     if (!codepage) return 0;
     if (codepage == CP_UTF8) return 0;
     if (!is_valid_codepage( codepage )) return 0;
-    for (i = 0; i < str->size; i++)
+    for (i = count = 0; i < str->size; i++)
     {
         if ((unsigned char)str->str.cstr[i] >= 0xf5) goto done;
-        if ((unsigned char)str->str.cstr[i] >= 0xc2) break;
+        if ((unsigned char)str->str.cstr[i] >= 0xc2) { count++; continue; }
         if ((unsigned char)str->str.cstr[i] >= 0x80) goto done;
     }
-    if (i == str->size) return 0;  /* no 8-bit chars at all */
+    if (!count) return 0;  /* no 8-bit chars at all */
-    if (wrc_mbstowcs( CP_UTF8, MB_ERR_INVALID_CHARS, str->str.cstr, str->size, NULL, 0 ) >= 0) return 1;
+    wstr = utf8_to_unicode( str->str.cstr, str->size, &count );
+    for (i = 0; i < count; i++) if (wstr[i] == 0xfffd) break;
+    free( wstr );
+    return (i == count);
done:
     check_utf8 = 0;  /* at least one 8-bit non-utf8 string found, stop checking */

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

2002

2001

Alexandre Julliard : wrc: Directly implement UTF-8 conversions.