New subject: [PATCH v4 2/3] tools/make_unicode: Implement canonical composition for use in normalization.

4 Apr 2018

Signed-off-by: Sergio Gómez Del Real <sdelreal@codeweavers.com>
---
 libs/port/mbtowc.c |  10 +-
 tools/make_unicode | 427 +++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 372 insertions(+), 65 deletions(-)

diff --git a/libs/port/mbtowc.c b/libs/port/mbtowc.c
index 4977c82d8b..d78bfe81e6 100644
--- a/libs/port/mbtowc.c
+++ b/libs/port/mbtowc.c
@@ -22,7 +22,7 @@
#include "wine/unicode.h"
-extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ) DECLSPEC_HIDDEN;
+extern unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src, int srclen, WCHAR *dst, int dstlen );
/* check the code whether it is in Unicode Private Use Area (PUA). */
 /* MB_ERR_INVALID_CHARS raises an error converting from 1-byte character to PUA. */
@@ -107,13 +107,13 @@ static int mbstowcs_sbcs_decompose( const struct sbcs_table *table, int flags,
     {
         WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
         for (len = 0; srclen; srclen--, src++)
-            len += wine_decompose( cp2uni[*src], dummy, 4 );
+            len += wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dummy, 4 );
         return len;
     }
for (len = dstlen; srclen && len; srclen--, src++)
     {
-        unsigned int res = wine_decompose( cp2uni[*src], dst, len );
+        unsigned int res = wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dst, len );
         if (!res) break;
         len -= res;
         dst += res;
@@ -219,7 +219,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
                 ch = cp2uni[(off << 8) + *src];
             }
             else ch = cp2uni[*src];
-            len += wine_decompose( ch, dummy, 4 );
+            len += wine_unicode_decompose_string( 0, &ch, 1, dummy, 4 );
         }
         return len;
     }
@@ -234,7 +234,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
             ch = cp2uni[(off << 8) + *src];
         }
         else ch = cp2uni[*src];
-        if (!(res = wine_decompose( ch, dst, len ))) break;
+        if (!(res = wine_unicode_decompose_string( 0, &ch, 1, dst, len ))) break;
         dst += res;
         len -= res;
     }
diff --git a/tools/make_unicode b/tools/make_unicode
index 92b0b64a94..65ae7ab2a0 100755
--- a/tools/make_unicode
+++ b/tools/make_unicode
@@ -471,6 +471,26 @@ sub READ_DEFAULTS($)
next if $decomp eq "";  # no decomposition, skip it
+        # store decomposition table
+        if ($decomp =~ /^<([a-zA-Z]+)>(\s+[0-9a-fA-F]+)+$/)
+        {
+            my @seq = ();
+            for my $ch (split /\s+/, (split /\s+/, $decomp, 2)[1])
+            {
+                push @seq, (hex $ch);
+            }
+            $decomp_table[$src] = [1, @seq];
+        }
+        elsif ($decomp =~ /^([0-9a-fA-F]+)(\s+([0-9a-fA-F]+))*$/)
+        {
+            my @seq = ();
+            for my $ch (split /\s+/, $decomp)
+            {
+                push @seq, (hex $ch);
+            }
+            $decomp_table[$src] = [0, @seq];
+        }
+
         if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
         {
             # decomposition of the form "<foo> 1234" -> use char if type is known
@@ -508,7 +528,6 @@ sub READ_DEFAULTS($)
             # store decomposition if it contains two chars
             if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
             {
-                $decomp_table[$src] = [ hex $1, hex $2 ];
                 push @compose_table, [ hex $1, hex $2, $src ];
             }
             elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ &&
@@ -2258,6 +2277,51 @@ EOF
     save_file($filename);
 }
+# Recursively expand a code point into its full decomposition.
+#   $char      - code point to expand
+#   $table_ref - reference to the decomposition table; every defined entry is
+#                a flat array ref [ $is_compat, @code_points ]
+#                (pass \@decomp_table, not the flattened array)
+#   $compat    - true to follow compatibility mappings as well, false to
+#                follow canonical mappings only
+# Returns the list of resulting code points; a char without an applicable
+# mapping is returned unchanged.
+sub recursive_decomp
+{
+    my ($char, $table_ref, $compat) = @_;
+
+    my $entry = $table_ref->[$char];
+    return ($char) unless defined $entry;
+
+    # the entry is flat: the flag comes first and the sequence follows it,
+    # so the sequence must not be dereferenced from element 1
+    my ($is_compat, @seq) = @{$entry};
+    return ($char) if $is_compat && !$compat;
+
+    return map { recursive_decomp( $_, $table_ref, $compat ) } @seq;
+}
+
+# Convert a list of code points into UTF-16 code units.
+# Code points up to 0xFFFF are copied as they are; supplementary code points
+# (0x10000 .. 0x10FFFF) become a high/low surrogate pair.
+# Dies on anything above 0x10FFFF, which has no UTF-16 representation.
+sub expand_pairs
+{
+    my @data = @_;
+    my @result = ();
+
+    for my $ch (@data)
+    {
+        if ($ch <= 0xFFFF)
+        {
+            push @result, $ch;
+        }
+        elsif ($ch > 0x10FFFF)
+        {
+            # values up to 2**21 used to slip through and spill the high
+            # word into the low surrogate range
+            die sprintf "Invalid Unicode character %04x\n", $ch;
+        }
+        else
+        {
+            my $offset = $ch - 0x10000;  # 20 bits, split 10/10
+            push @result, 0xD800 | ($offset >> 10);
+            push @result, 0xDC00 | ($offset & 0x3FF);
+        }
+    }
+    return @result;
+}
+
 ################################################################
 # dump the char decomposition table
 sub dump_decompose_table($)
@@ -2266,98 +2330,341 @@ sub dump_decompose_table($)
open OUTPUT,">$filename.new" or die "Cannot create $filename";
     print "Building $filename\n";
-    print OUTPUT "/* Unicode char composition */\n";
+    print OUTPUT "/* Unicode char decomposition */\n";
     print OUTPUT "/* generated from $UNIDATA/UnicodeData.txt */\n";
     print OUTPUT "/* DO NOT EDIT!! */\n\n";
     print OUTPUT "#include "wine/unicode.h"\n\n";
-    # first determine all the 16-char subsets that contain something
+    my $utflim = 2097152;
+    my %nfd_lookup = ();
+    my %nfkd_lookup = ();
+    my %decomp_lookup = ();
+    my @decomp_data = (0);
+    my $pos = 1;
+    my $lastchar_decomp;
-    my @filled = (0) x 4096;
-    my $pos = 16*2;  # for the null subset
-    for (my $i = 0; $i < 65536; $i++)
+    for (my $i = 0; $i < $utflim; $i++)
     {
         next unless defined $decomp_table[$i];
-        $filled[$i >> 4] = $pos;
-        $pos += 16*2;
-        $i |= 15;
+
+        if (defined $decomp_table[$i])
+        {
+            $lastchar_decomp = $i;
+            # fully expand input and mappings
+
+            my @char = expand_pairs( ($i) );
+            push @char, 0;
+            my $char = pack "n*", @char;
+
+            my @nfd = recursive_decomp( $i, @decomp_table, 0 );
+            @nfd = expand_pairs( @nfd );
+            push @nfd, 0;
+            my $nfd = pack "n*", @nfd;
+
+            my @nfkd = recursive_decomp( $i, @decomp_table, 1 );
+            @nfkd = expand_pairs( @nfkd );
+            push @nfkd, 0;
+            my $nfkd = pack "n*", @nfkd;
+
+            # lookup or add mappings
+
+            if ($nfd eq $char)
+            {
+                $nfd = undef;
+            }
+            elsif (exists $decomp_lookup{$nfd})
+            {
+                $nfd_lookup{$i} = $decomp_lookup{$nfd};
+            }
+            else
+            {
+                push @decomp_data, @nfd;
+                $decomp_lookup{$nfd} = $pos;
+                $nfd_lookup{$i} = $pos;
+                $pos += @nfd;
+            }
+
+            if ($nfkd eq $char)
+            {
+                $nfkd = undef;
+            }
+            elsif (exists $decomp_lookup{$nfkd})
+            {
+                $nfkd_lookup{$i} = $decomp_lookup{$nfkd};
+            }
+            else
+            {
+                push @decomp_data, @nfkd;
+                $decomp_lookup{$nfkd} = $pos;
+                $nfkd_lookup{$i} = $pos;
+                $pos += @nfkd;
+            }
+        }
     }
-    my $total = $pos;
-    # now count the 256-char subsets that contain something
+    printf OUTPUT "static const UINT last_decomposable = 0x%x;\n\n", $lastchar_decomp;
+
+    # dump decomposition data
+
+    printf OUTPUT "static const WCHAR data_decomp[%d] =\n", $pos;
+    print OUTPUT "{\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @decomp_data );
+    print OUTPUT "\n};\n\n";
-    my @filled_idx = (256) x 256;
-    $pos = 256 + 16;
-    for (my $i = 0; $i < 4096; $i++)
+    # find 256-char subsets that contain something
+
+    my $filled_pos = 1;
+    my $filled_lim = ($lastchar_decomp >> 8) + 1;
+    my @filled = (0) x $filled_lim;
+    for (my $i = 0; $i < $utflim; $i++)
     {
-        next unless $filled[$i];
-        $filled_idx[$i >> 4] = $pos;
-        $pos += 16;
-        $i |= 15;
+        last if $i > $lastchar_decomp;
+        next unless exists $nfd_lookup{$i} || exists $nfkd_lookup{$i};
+        $filled[$i >> 8] = $filled_pos++;
+        $i |= 255;
     }
-    my $null_offset = $pos;  # null mapping
-    $total += $pos;
-    # add the index offsets to the subsets positions
+    # dump index of 256-char subsets
+
+    printf OUTPUT "static const BYTE idx1_decomp[%d] =\n", $filled_lim;
+    print OUTPUT "{\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @filled );
+    print OUTPUT "\n};\n\n";
+
+    # for 256-char subsets, find non-empty 16-char subsets
-    for (my $i = 0; $i < 4096; $i++)
+    my $sub_filled_pos = 1;
+    my %sub_filled = ();
+    for (my $i = 0; $i < $filled_lim; $i++)
     {
         next unless $filled[$i];
-        $filled[$i] += $null_offset;
+        for (my $j = 0; $j < 256; $j++)
+        {
+            my $idx = ($i << 8) | $j;
+            next unless exists $nfd_lookup{$idx} || exists $nfkd_lookup{$idx};
+            $sub_filled{$idx >> 4} = $sub_filled_pos++;
+            $j |= 15;
+        }
     }
-    # dump the main index
+    # dump index of 16-char subsets
-    printf OUTPUT "static const WCHAR table[%d] =\n", $total;
-    printf OUTPUT "{\n    /* index */\n";
-    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
-    printf OUTPUT ",\n    /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );
-
-    # dump the second-level indexes
-
-    for (my $i = 0; $i < 256; $i++)
+    printf OUTPUT "static const USHORT idx2_decomp[%d] =\n", $filled_pos * 16;
+    print OUTPUT "{\n";
+    my @null_idx = (0) x 16;
+    print OUTPUT "    /* null sub-index */\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx );
+    for (my $i = 0; $i < $filled_lim; $i++)
     {
-        next unless ($filled_idx[$i] > 256);
-        my @table = @filled[($i<<4)..($i<<4)+15];
-        for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
-        printf OUTPUT ",\n    /* sub-index %02x */\n", $i;
-        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
+        next unless $filled[$i];
+        printf OUTPUT ",\n    /* sub-index 0x%02x */\n", $filled[$i];
+
+        my @sub_idx;
+        for (my $j = 0; $j < 16; $j++)
+        {
+            my $idx = ($i << 4) | $j;
+            $sub_idx[$j] = $sub_filled{$idx} || 0;
+        }
+        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx );
     }
+    print OUTPUT "\n};\n\n";
# dump the 16-char subsets
-    printf OUTPUT ",\n    /* null mapping */\n";
-    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );
-
-    for (my $i = 0; $i < 4096; $i++)
+    printf OUTPUT "static const USHORT offsets_decomp[%d] =\n", 32 * $sub_filled_pos;
+    print OUTPUT "{\n";
+    print OUTPUT "    /* (nfd, nfkd) x 16 */\n";
+    my @null_table = (0) x 32;
+    print OUTPUT "    /* no decomposition */\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table );
+    for my $key (sort {$a <=> $b} keys %sub_filled)
     {
-        next unless $filled[$i];
-        my @table = (0) x 32;
+        printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $key, $key;
+        my @sub_table;
         for (my $j = 0; $j < 16; $j++)
         {
-            if (defined $decomp_table[($i<<4) + $j])
-            {
-                $table[2 * $j] = ${$decomp_table[($i << 4) + $j]}[0];
-                $table[2 * $j + 1] = ${$decomp_table[($i << 4) + $j]}[1];
-            }
+            my $idx = ($key << 4) | $j;
+            $sub_table[2 * $j] = $nfd_lookup{$idx} || 0;
+            $sub_table[2 * $j + 1] = $nfkd_lookup{$idx} || 0;
         }
-        printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
-        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
+        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table );
     }
+    print OUTPUT "\n};\n\n";
-    printf OUTPUT "\n};\n\n";
     print OUTPUT <<"EOF";
-unsigned int DECLSPEC_HIDDEN wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen )
+/* Resolve code point cp through the chained index tables idx1 -> idx2 -> offsets
+ * and return a pointer to the matching position in data. The scale arguments are
+ * the shift counts applied at each level; a non-zero compat selects the slot
+ * right after the default one. */
+static const WCHAR *unicode_table_lookup( UINT cp, int compat, const BYTE *idx1, UINT scale_idx1,
+                                          const USHORT *idx2, UINT scale_idx2, const USHORT *offsets,
+                                          UINT scale_off, const WCHAR *data, UINT scale_data )
 {
-    const WCHAR *ptr = table + table[table[ch >> 8] + ((ch >> 4) & 0x0f)] + 2 * (ch & 0xf);
-    unsigned int res;
-    *dst = ch;
-    if (!*ptr) return 1;
-    if (dstlen <= 1) return 0;
-    /* apply the decomposition recursively to the first char */
-    if ((res = wine_decompose( *ptr, dst, dstlen-1 ))) dst[res++] = ptr[1];
-    return res;
+    USHORT block = idx1[cp >> scale_idx1];
+    USHORT sub = idx2[(block << scale_idx2) + ((cp >> scale_idx2) & 0xf)];
+    USHORT slot = (sub << scale_off) + ((cp & 0xf) << scale_data);
+
+    if (compat) ++slot;
+    return data + offsets[slot];
+}
+
+/* Split a precomposed Hangul syllable into its conjoining jamo.
+ *   ch     - UTF-16 code unit to examine
+ *   dum    - receives the zero-terminated result: leading, vowel and optional
+ *            trailing jamo for a syllable, or ch itself for anything else
+ *   dstlen - unused
+ * Always returns 0. */
+static inline int decompose_hangul( WCHAR ch, WCHAR dum[4], int dstlen )
+{
+    static const WCHAR sbase = 0xac00, lbase = 0x1100, vbase = 0x1161, tbase = 0x11a7;
+    static const WCHAR /*lcount = 19, vcount = 21,*/ tcount = 28, ncount = 588, scount = 11172;
+    WCHAR sindex, lindex, vindex, tindex;
+
+    /* only sbase .. sbase + scount - 1 (0xac00 .. 0xd7a3) are syllables; the code
+     * units from 0xd7a4 to 0xd7af must pass through untouched */
+    if (ch >= sbase && ch - sbase < scount)
+    {
+        sindex = ch - sbase;
+        lindex = sindex / ncount;
+        vindex = (sindex % ncount) / tcount;
+        tindex = sindex % tcount;
+        dum[0] = lbase + lindex;
+        dum[1] = vbase + vindex;
+        dum[2] = (tindex > 0) ? (tbase + tindex) : 0;
+        dum[3] = 0;
+    }
+    else
+    {
+        dum[0] = ch;
+        dum[1] = 0;
+    }
+
+    return 0;
+}
+
+/* Encode code point cp as a UTF-16 surrogate pair packed into one UINT:
+ * the high surrogate sits in the low 16 bits, the low surrogate in the
+ * high 16 bits. */
+static inline UINT utf16_codepoint_to_surrogates( UINT cp )
+{
+    const WCHAR low16 = (WCHAR)cp;
+    const WCHAR plane = (WCHAR)((cp >> 16) & 0x1f) - 1;
+    const UINT hi = 0xD800 | (plane << 6) | (low16 >> 10);
+    const UINT lo = 0xDC00 | (low16 & 0x3ff);
+
+    return hi | (lo << 16);
+}
+
+/* Combine a high and a low surrogate into the code point they encode. */
+static inline UINT utf16_surrogates_to_codepoint( WCHAR hi, WCHAR lo )
+{
+    /* low 16 bits: 6 bits from the high surrogate, 10 from the low one */
+    const UINT low16 = ((hi & 0x3f) << 10) | (lo & 0x3ff);
+    /* the plane number is stored minus one in the high surrogate */
+    const UINT plane = ((hi >> 6) & 0x1f) + 1;
+
+    return (plane << 16) | low16;
+}
+
+/* Write the decomposition of code point ch to dst.
+ *   compat - forwarded to the table lookup; non-zero selects the alternate mapping
+ *   ch     - code point, may be above 0xffff
+ *   dst    - output buffer; code units that do not fit are dropped
+ *   dstlen - room left in dst, in WCHARs; nothing is written when it is 0
+ * Returns the number of WCHARs the decomposition takes, which can exceed dstlen. */
+static int decompose_char_recursive( int compat, UINT ch, WCHAR *dst, int dstlen )
+{
+    const WCHAR *map = NULL;
+    int total_decomp = 0;
+    int size_decomp;
+
+    if (ch < 0xa0) /* fast path */
+    {
+        if (dstlen > 0) *dst = (WCHAR)ch;
+        return 1;
+    }
+    else if (ch >= 0xac00 && ch <= 0xd7a3) /* hangul syllables end at 0xd7a3 */
+    {
+        WCHAR dum[4];
+        int i = 0;
+
+        decompose_hangul( ch, dum, dstlen );
+        while (dum[i])
+        {
+            /* compare explicitly: dstlen-i is also non-zero once i has passed dstlen */
+            if (i < dstlen) dst[i] = dum[i];
+            i++;
+        }
+        return i;
+    }
+    else if (ch > last_decomposable ||
+             !*(map = unicode_table_lookup( ch, compat, idx1_decomp, 8,
+                idx2_decomp, 4, offsets_decomp, 5, data_decomp, 1 )))
+    {
+        /* no mapping: emit the char itself, as a surrogate pair if needed */
+        if (ch > 0xffff)
+        {
+            ch = utf16_codepoint_to_surrogates( ch );
+            if (dstlen > 0) dst[0] = (WCHAR)ch;
+            /* dstlen-1 would be non-zero for an empty buffer, so test the size */
+            if (dstlen > 1) dst[1] = (WCHAR)(ch>>16);
+            return 2;
+        }
+        else
+        {
+            if (dstlen > 0) *dst = (WCHAR)ch;
+            return 1;
+        }
+    }
+    else {
+        while (*map)
+        {
+            size_decomp = decompose_char_recursive( compat, *map, dst, dstlen );
+            dstlen -= size_decomp;
+            if (dstlen < 0) dstlen = 0; /* buffer exhausted: keep counting only */
+            dst += size_decomp;
+            map++;
+            total_decomp += size_decomp;
+        }
+        return total_decomp;
+    }
+}
+
+/* Decompose a UTF-16 string.
+ *   compat - forwarded to the per-char decomposition; non-zero selects the
+ *            alternate (second) mapping of each char
+ *   src    - input string of srclen WCHARs
+ *   dst    - output buffer of dstlen WCHARs; with dstlen <= 0 the result is
+ *            only counted
+ * Returns the number of WCHARs stored in dst, or required when only counting.
+ * Output that does not fit is cut off at dstlen. An unpaired surrogate stops the
+ * conversion and the current source index is returned instead. */
+unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src,
+                                            int srclen, WCHAR *dst, int dstlen )
+{
+    UINT ch;
+    int srcpos = 0, dstpos = 0;
+    int num_decomp;
+    int count_only;
+
+    if (dstlen < 0) dstlen = 0;
+    count_only = !dstlen;
+
+    while (srcpos < srclen)
+    {
+        ch = src[srcpos];
+
+        if (ch >= 0xd800 && ch <= 0xdbff) /* high surrogate */
+        {
+            WCHAR hi, lo;
+            if (srcpos+1 == srclen) return srcpos;
+            hi = (WCHAR)ch;
+            lo = src[++srcpos];
+            if (lo < 0xdc00 || lo > 0xdfff) return srcpos;
+            ch = utf16_surrogates_to_codepoint( hi, lo );
+        }
+        else if (ch >= 0xdc00 && ch <= 0xdfff) /* low surrogate */
+        {
+            return srcpos;
+        }
+
+        /* a real buffer that is exactly full must end the conversion; falling
+         * through with dstlen == 0 would silently switch to counting */
+        if (!count_only && !dstlen) break;
+
+        num_decomp = decompose_char_recursive( compat, ch, dst+dstpos, dstlen );
+        dstpos += num_decomp;
+
+        if (!count_only)
+        {
+            dstlen -= num_decomp;
+            if (dstlen < 0)
+            {
+                dstpos += dstlen; /* forget the code units that did not fit */
+                break;
+            }
+        }
+
+        ++srcpos;
+    }
+
+    return dstpos;
 }
 EOF
     close OUTPUT;
-- 
2.14.1



    

[PATCH v4 1/3] tools/make_unicode: Implement full Unicode character decomposition.