Signed-off-by: Sergio Gómez Del Real <sdelreal(a)codeweavers.com>
---
libs/port/mbtowc.c | 10 +-
tools/make_unicode | 427 +++++++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 372 insertions(+), 65 deletions(-)
diff --git a/libs/port/mbtowc.c b/libs/port/mbtowc.c
index 4977c82d8b..d78bfe81e6 100644
--- a/libs/port/mbtowc.c
+++ b/libs/port/mbtowc.c
@@ -22,7 +22,7 @@
#include "wine/unicode.h"
-extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ) DECLSPEC_HIDDEN;
+extern unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src, int srclen, WCHAR *dst, int dstlen );
/* check the code whether it is in Unicode Private Use Area (PUA). */
/* MB_ERR_INVALID_CHARS raises an error converting from 1-byte character to PUA. */
@@ -107,13 +107,13 @@ static int mbstowcs_sbcs_decompose( const struct sbcs_table *table, int flags,
{
WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
for (len = 0; srclen; srclen--, src++)
- len += wine_decompose( cp2uni[*src], dummy, 4 );
+ len += wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dummy, 4 );
return len;
}
for (len = dstlen; srclen && len; srclen--, src++)
{
- unsigned int res = wine_decompose( cp2uni[*src], dst, len );
+ unsigned int res = wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dst, len );
if (!res) break;
len -= res;
dst += res;
@@ -219,7 +219,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
ch = cp2uni[(off << 8) + *src];
}
else ch = cp2uni[*src];
- len += wine_decompose( ch, dummy, 4 );
+ len += wine_unicode_decompose_string( 0, &ch, 1, dummy, 4 );
}
return len;
}
@@ -234,7 +234,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
ch = cp2uni[(off << 8) + *src];
}
else ch = cp2uni[*src];
- if (!(res = wine_decompose( ch, dst, len ))) break;
+ if (!(res = wine_unicode_decompose_string( 0, &ch, 1, dst, len ))) break;
dst += res;
len -= res;
}
diff --git a/tools/make_unicode b/tools/make_unicode
index 92b0b64a94..65ae7ab2a0 100755
--- a/tools/make_unicode
+++ b/tools/make_unicode
@@ -471,6 +471,26 @@ sub READ_DEFAULTS($)
next if $decomp eq ""; # no decomposition, skip it
+ # store decomposition table
+ if ($decomp =~ /^<([a-zA-Z]+)>(\s+[0-9a-fA-F]+)+$/)
+ {
+ my @seq = ();
+ for my $ch (split /\s+/, (split /\s+/, $decomp, 2)[1])
+ {
+ push @seq, (hex $ch);
+ }
+ $decomp_table[$src] = [1, \@seq];
+ }
+ elsif ($decomp =~ /^([0-9a-fA-F]+)(\s+([0-9a-fA-F]+))*$/)
+ {
+ my @seq = ();
+ for my $ch (split /\s+/, $decomp)
+ {
+ push @seq, (hex $ch);
+ }
+ $decomp_table[$src] = [0, \@seq];
+ }
+
if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
{
# decomposition of the form "<foo> 1234" -> use char if type is known
@@ -508,7 +528,6 @@ sub READ_DEFAULTS($)
# store decomposition if it contains two chars
if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
{
- $decomp_table[$src] = [ hex $1, hex $2 ];
push @compose_table, [ hex $1, hex $2, $src ];
}
elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ &&
@@ -2258,6 +2277,51 @@ EOF
save_file($filename);
}
+# Expand a code point into its full decomposition sequence.
+#   $char      - code point to expand
+#   $table_ref - reference to the decomposition table; each defined entry is
+#                [ is_compat_flag, \@sequence ]
+#   $compat    - when false, entries flagged as compatibility mappings are
+#                left unexpanded
+# Returns the list of code points; a code point with no applicable entry
+# expands to itself.
+sub recursive_decomp
+{
+    my ($char, $table_ref, $compat) = @_;
+
+    my $entry = $table_ref->[$char];
+    return ($char) unless defined $entry;
+
+    my ($is_compat, $seq) = @$entry;
+    return ($char) if $is_compat && !$compat;
+
+    # every element of the mapping may itself be decomposable
+    return map { recursive_decomp( $_, $table_ref, $compat ) } @$seq;
+}
+
+# Convert a list of code points into UTF-16 code units.
+# Values up to 0xFFFF are passed through unchanged; values in
+# 0x10000..0x10FFFF become a high/low surrogate pair.
+# Dies on anything above 0x10FFFF, which cannot be encoded in UTF-16.
+sub expand_pairs
+{
+    my @result = ();
+
+    for my $ch (@_)
+    {
+        if ($ch <= 0xFFFF)
+        {
+            push @result, $ch;
+            next;
+        }
+
+        # the previous 2**21 limit let 0x110000..0x1FFFFF through, producing
+        # a "high surrogate" above 0xDBFF
+        die sprintf "Invalid Unicode character %04x\n", $ch if $ch > 0x10FFFF;
+
+        # 20 significant bits: top 10 go to the high surrogate, low 10 to the low one
+        my $offset = $ch - 0x10000;
+        push @result, 0xD800 | ($offset >> 10), 0xDC00 | ($offset & 0x3FF);
+    }
+    return @result;
+}
+
################################################################
# dump the char decomposition table
sub dump_decompose_table($)
@@ -2266,98 +2330,341 @@ sub dump_decompose_table($)
open OUTPUT,">$filename.new" or die "Cannot create $filename";
print "Building $filename\n";
- print OUTPUT "/* Unicode char composition */\n";
+ print OUTPUT "/* Unicode char decomposition */\n";
print OUTPUT "/* generated from $UNIDATA/UnicodeData.txt */\n";
print OUTPUT "/* DO NOT EDIT!! */\n\n";
print OUTPUT "#include \"wine/unicode.h\"\n\n";
- # first determine all the 16-char subsets that contain something
+ my $utflim = 2097152;
+ my %nfd_lookup = ();
+ my %nfkd_lookup = ();
+ my %decomp_lookup = ();
+ my @decomp_data = (0);
+ my $pos = 1;
+ my $lastchar_decomp;
- my @filled = (0) x 4096;
- my $pos = 16*2; # for the null subset
- for (my $i = 0; $i < 65536; $i++)
+ for (my $i = 0; $i < $utflim; $i++)
{
next unless defined $decomp_table[$i];
- $filled[$i >> 4] = $pos;
- $pos += 16*2;
- $i |= 15;
+
+ if (defined $decomp_table[$i])
+ {
+ $lastchar_decomp = $i;
+ # fully expand input and mappings
+
+ my @char = expand_pairs( ($i) );
+ push @char, 0;
+ my $char = pack "n*", @char;
+
+ my @nfd = recursive_decomp( $i, \@decomp_table, 0 );
+ @nfd = expand_pairs( @nfd );
+ push @nfd, 0;
+ my $nfd = pack "n*", @nfd;
+
+ my @nfkd = recursive_decomp( $i, \@decomp_table, 1 );
+ @nfkd = expand_pairs( @nfkd );
+ push @nfkd, 0;
+ my $nfkd = pack "n*", @nfkd;
+
+ # lookup or add mappings
+
+ if ($nfd eq $char)
+ {
+ $nfd = undef;
+ }
+ elsif (exists $decomp_lookup{$nfd})
+ {
+ $nfd_lookup{$i} = $decomp_lookup{$nfd};
+ }
+ else
+ {
+ push @decomp_data, @nfd;
+ $decomp_lookup{$nfd} = $pos;
+ $nfd_lookup{$i} = $pos;
+ $pos += @nfd;
+ }
+
+ if ($nfkd eq $char)
+ {
+ $nfkd = undef;
+ }
+ elsif (exists $decomp_lookup{$nfkd})
+ {
+ $nfkd_lookup{$i} = $decomp_lookup{$nfkd};
+ }
+ else
+ {
+ push @decomp_data, @nfkd;
+ $decomp_lookup{$nfkd} = $pos;
+ $nfkd_lookup{$i} = $pos;
+ $pos += @nfkd;
+ }
+ }
}
- my $total = $pos;
- # now count the 256-char subsets that contain something
+ printf OUTPUT "static const UINT last_decomposable = 0x%x;\n\n", $lastchar_decomp;
+
+ # dump decomposition data
+
+ printf OUTPUT "static const WCHAR data_decomp[%d] =\n", $pos;
+ print OUTPUT "{\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @decomp_data );
+ print OUTPUT "\n};\n\n";
- my @filled_idx = (256) x 256;
- $pos = 256 + 16;
- for (my $i = 0; $i < 4096; $i++)
+ # find 256-char subsets that contain something
+
+ my $filled_pos = 1;
+ my $filled_lim = ($lastchar_decomp >> 8) + 1;
+ my @filled = (0) x $filled_lim;
+ for (my $i = 0; $i < $utflim; $i++)
{
- next unless $filled[$i];
- $filled_idx[$i >> 4] = $pos;
- $pos += 16;
- $i |= 15;
+ last if $i > $lastchar_decomp;
+ next unless exists $nfd_lookup{$i} || exists $nfkd_lookup{$i};
+ $filled[$i >> 8] = $filled_pos++;
+ $i |= 255;
}
- my $null_offset = $pos; # null mapping
- $total += $pos;
- # add the index offsets to the subsets positions
+ # dump index of 256-char subsets
+
+ printf OUTPUT "static const BYTE idx1_decomp[%d] =\n", $filled_lim;
+ print OUTPUT "{\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @filled );
+ print OUTPUT "\n};\n\n";
+
+ # for 256-char subsets, find non-empty 16-char subsets
- for (my $i = 0; $i < 4096; $i++)
+ my $sub_filled_pos = 1;
+ my %sub_filled = ();
+ for (my $i = 0; $i < $filled_lim; $i++)
{
next unless $filled[$i];
- $filled[$i] += $null_offset;
+ for (my $j = 0; $j < 256; $j++)
+ {
+ my $idx = ($i << 8) | $j;
+ next unless exists $nfd_lookup{$idx} || exists $nfkd_lookup{$idx};
+ $sub_filled{$idx >> 4} = $sub_filled_pos++;
+ $j |= 15;
+ }
}
- # dump the main index
+ # dump index of 16-char subsets
- printf OUTPUT "static const WCHAR table[%d] =\n", $total;
- printf OUTPUT "{\n /* index */\n";
- printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
- printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );
-
- # dump the second-level indexes
-
- for (my $i = 0; $i < 256; $i++)
+ printf OUTPUT "static const USHORT idx2_decomp[%d] =\n", $filled_pos * 16;
+ print OUTPUT "{\n";
+ my @null_idx = (0) x 16;
+ print OUTPUT " /* null sub-index */\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx );
+ for (my $i = 0; $i < $filled_lim; $i++)
{
- next unless ($filled_idx[$i] > 256);
- my @table = @filled[($i<<4)..($i<<4)+15];
- for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
- printf OUTPUT ",\n /* sub-index %02x */\n", $i;
- printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
+ next unless $filled[$i];
+ printf OUTPUT ",\n /* sub-index 0x%02x */\n", $filled[$i];
+
+ my @sub_idx;
+ for (my $j = 0; $j < 16; $j++)
+ {
+ my $idx = ($i << 4) | $j;
+ $sub_idx[$j] = $sub_filled{$idx} || 0;
+ }
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx );
}
+ print OUTPUT "\n};\n\n";
# dump the 16-char subsets
- printf OUTPUT ",\n /* null mapping */\n";
- printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );
-
- for (my $i = 0; $i < 4096; $i++)
+ printf OUTPUT "static const USHORT offsets_decomp[%d] =\n", 32 * $sub_filled_pos;
+ print OUTPUT "{\n";
+ print OUTPUT " /* (nfd, nfkd) x 16 */\n";
+ my @null_table = (0) x 32;
+ print OUTPUT " /* no decomposition */\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table );
+ for my $key (sort {$a <=> $b} keys %sub_filled)
{
- next unless $filled[$i];
- my @table = (0) x 32;
+ printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $key, $key;
+ my @sub_table;
for (my $j = 0; $j < 16; $j++)
{
- if (defined $decomp_table[($i<<4) + $j])
- {
- $table[2 * $j] = ${$decomp_table[($i << 4) + $j]}[0];
- $table[2 * $j + 1] = ${$decomp_table[($i << 4) + $j]}[1];
- }
+ my $idx = ($key << 4) | $j;
+ $sub_table[2 * $j] = $nfd_lookup{$idx} || 0;
+ $sub_table[2 * $j + 1] = $nfkd_lookup{$idx} || 0;
}
- printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
- printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table );
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table );
}
+ print OUTPUT "\n};\n\n";
- printf OUTPUT "\n};\n\n";
print OUTPUT <<"EOF";
-unsigned int DECLSPEC_HIDDEN wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen )
+/* Three-level table walk: idx1 (indexed by cp >> scale_idx1) selects a row of
+ * idx2, idx2 selects a row of offsets, and offsets yields an index into data.
+ * A non-zero compat picks the slot immediately after the default one.
+ * Returns a pointer into data; the caller in this file treats a pointed-to 0
+ * as "no mapping".
+ * NOTE(review): the 0xf masks are hard-coded, so the scale arguments only make
+ * sense for 16-entry rows; c and d are USHORT, so the computed index is
+ * truncated to 16 bits - confirm the generated tables stay below that. */
+static const WCHAR *unicode_table_lookup( UINT cp, int compat, const BYTE *idx1, UINT scale_idx1,
+ const USHORT *idx2, UINT scale_idx2, const USHORT *offsets,
+ UINT scale_off, const WCHAR *data, UINT scale_data )
{
- const WCHAR *ptr = table + table[table[ch >> 8] + ((ch >> 4) & 0x0f)] + 2 * (ch & 0xf);
- unsigned int res;
+ USHORT a, b, c, d;
- *dst = ch;
- if (!*ptr) return 1;
- if (dstlen <= 1) return 0;
- /* apply the decomposition recursively to the first char */
- if ((res = wine_decompose( *ptr, dst, dstlen-1 ))) dst[res++] = ptr[1];
- return res;
+ /* a: row number in idx2; scale_idx2 is used both to scale a and to shift cp */
+ a = idx1[cp >> scale_idx1];
+ b = idx2[(a << scale_idx2) + ((cp >> scale_idx2) & 0xf)];
+ /* c: slot in offsets for the low nibble of cp within row b */
+ c = (b << scale_off) + ((cp & 0xf) << scale_data);
+ if (compat) ++c;
+ d = offsets[c];
+
+ return &data[d];
+}
+
+/* Arithmetic decomposition of a precomposed Hangul syllable.
+ * Writes a 0-terminated sequence into dum: leading consonant, vowel and,
+ * when present, trailing consonant. Any other character is copied through
+ * as a one-element sequence.
+ * dstlen is accepted for interface compatibility but not used.
+ * Always returns 0. */
+static inline int decompose_hangul( WCHAR ch, WCHAR dum[4], int dstlen )
+{
+    static const WCHAR sbase = 0xac00, lbase = 0x1100, vbase = 0x1161, tbase = 0x11a7;
+    static const WCHAR /*lcount = 19, vcount = 21,*/ tcount = 28, ncount = 588, scount = 11172;
+    WCHAR sindex, lindex, vindex, tindex;
+
+    /* syllables occupy sbase .. sbase+scount-1 (U+AC00..U+D7A3); the former
+     * upper bound of 0xd7af also decomposed the unassigned U+D7A4..U+D7AF */
+    if (ch >= sbase && ch < sbase + scount)
+    {
+        sindex = ch - sbase;
+        lindex = sindex / ncount;
+        vindex = (sindex % ncount) / tcount;
+        tindex = sindex % tcount;
+        dum[0] = lbase + lindex;
+        dum[1] = vbase + vindex;
+        /* tindex 0 means "no trailing consonant": terminate after two jamo */
+        dum[2] = (tindex > 0) ? (tbase + tindex) : 0;
+        dum[3] = 0;
+    }
+    else
+    {
+        dum[0] = ch;
+        dum[1] = 0;
+    }
+
+    return 0;
+}
+
+/* Split a code point into a UTF-16 surrogate pair packed into one UINT:
+ * the high surrogate is in the low 16 bits, the low surrogate in the high
+ * 16 bits. Meant for code points above 0xffff; the caller in this file only
+ * passes such values. */
+static inline UINT utf16_codepoint_to_surrogates( UINT cp )
+{
+    WCHAR low16 = (WCHAR)cp;
+    /* bits 16..20 hold the plane number; the encoding stores plane - 1 */
+    WCHAR plane_m1 = (WCHAR)(((cp >> 16) & ((1 << 5) - 1)) - 1);
+    UINT hi = 0xD800 | (plane_m1 << 6) | (low16 >> 10);
+    UINT lo = 0xDC00 | (low16 & ((1 << 10) - 1));
+
+    /* shift as unsigned: 0xDCxx << 16 does not fit in a signed int */
+    return hi | (lo << 16);
+}
+
+/* Combine a high and a low UTF-16 surrogate into the code point they encode.
+ * No validation is done here; the caller checks the surrogate ranges. */
+static inline UINT utf16_surrogates_to_codepoint( WCHAR hi, WCHAR lo )
+{
+    const UINT low_bits = lo & ((1 << 10) - 1);             /* 10 bits from the low surrogate */
+    const UINT mid_bits = (hi & ((1 << 6) - 1)) << 10;      /* bottom 6 bits of the high surrogate */
+    const UINT plane    = ((hi >> 6) & ((1 << 5) - 1)) + 1; /* stored as plane - 1 */
+
+    return (plane << 16) | mid_bits | low_bits;
+}
+
+/* Decompose one code point into UTF-16 units.
+ *   compat - forwarded to the table lookup to select the alternate mapping
+ *   ch     - code point (may be above 0xffff)
+ *   dst    - output buffer
+ *   dstlen - number of WCHARs that may be written to dst
+ * Returns the number of units the decomposition needs, whether or not they
+ * fit; only the units that fit in dstlen are actually stored. */
+static int decompose_char_recursive( int compat, UINT ch, WCHAR *dst, int dstlen )
+{
+    const WCHAR *map = NULL;
+    int total_decomp = 0;
+    int size_decomp;
+
+    if (ch < 0xa0) /* fast path */
+    {
+        if (dstlen > 0) *dst = (WCHAR)ch;
+        return 1;
+    }
+    else if (ch >= 0xac00 && ch <= 0xd7a3) /* hangul syllables end at U+D7A3 */
+    {
+        WCHAR dum[4];
+        int len = 0;
+
+        decompose_hangul( ch, dum, dstlen );
+        while (dum[len])
+        {
+            /* "dstlen - len" is non-zero (negative) once len passes dstlen,
+             * so it must not be used as the bounds test */
+            if (len < dstlen) dst[len] = dum[len];
+            len++;
+        }
+        return len;
+    }
+    else if (ch > last_decomposable ||
+             !*(map = unicode_table_lookup( ch, compat, idx1_decomp, 8,
+                                            idx2_decomp, 4, offsets_decomp, 5, data_decomp, 1 )))
+    {
+        /* no mapping: emit the character itself */
+        if (ch > 0xffff)
+        {
+            ch = utf16_codepoint_to_surrogates( ch );
+            if (dstlen > 0) dst[0] = (WCHAR)ch;
+            /* "dstlen - 1" is -1 for an empty buffer and wrote dst[1] anyway */
+            if (dstlen > 1) dst[1] = (WCHAR)(ch >> 16);
+            return 2;
+        }
+        if (dstlen > 0) *dst = (WCHAR)ch;
+        return 1;
+    }
+
+    /* walk the 0-terminated mapping, decomposing each unit in turn */
+    while (*map)
+    {
+        size_decomp = decompose_char_recursive( compat, *map, dst, dstlen );
+        dstlen -= size_decomp;
+        if (dstlen < 0) dstlen = 0;   /* keep counting once the buffer is full */
+        dst += size_decomp;
+        map++;
+        total_decomp += size_decomp;
+    }
+    return total_decomp;
+}
+
+/* Decompose srclen UTF-16 units from src into dst.
+ * compat is forwarded unchanged to decompose_char_recursive, and dstlen is
+ * the remaining capacity handed down with each call (negative values are
+ * treated as 0). Surrogate pairs are combined into a single code point
+ * before being decomposed.
+ * Returns dstpos, the running output count, except on a malformed surrogate
+ * sequence, where the current source index is returned instead.
+ * NOTE(review): that early return reports a source position through the
+ * same value callers read as an output length - confirm this is intended. */
+unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src,
+ int srclen, WCHAR *dst, int dstlen )
+{
+ UINT ch;
+ int srcpos = 0, dstpos = 0;
+ int num_decomp;
+
+ if (dstlen < 0) dstlen = 0;
+
+ while (srcpos < srclen)
+ {
+ ch = src[srcpos];
+
+ if (ch >= 0xd800 && ch <= 0xdbff) /* high surrogate */
+ {
+ WCHAR hi, lo;
+ /* a high surrogate at the very end of the input has no partner */
+ if (srcpos+1 == srclen) return srcpos;
+ hi = (WCHAR)ch;
+ lo = src[++srcpos];
+ if (lo < 0xdc00 || lo > 0xdfff) return srcpos;
+ ch = utf16_surrogates_to_codepoint( hi, lo );
+ }
+ else if (ch >= 0xdc00 && ch <= 0xdfff) /* low surrogate */
+ {
+ /* low surrogate without a preceding high surrogate */
+ return srcpos;
+ }
+
+ num_decomp = decompose_char_recursive( compat, ch, dst+dstpos, dstlen );
+ dstpos += num_decomp;
+
+ /* remaining space is only tracked while some is left.
+ * NOTE(review): once dstlen reaches exactly 0 this block is skipped for
+ * all later characters, so dstpos keeps growing past the capacity -
+ * confirm callers expect that. */
+ if (dstlen > 0)
+ {
+ dstlen -= num_decomp;
+ if (dstlen < 0)
+ {
+ /* wind dstpos back by the amount the last character overshot
+ * the remaining space, then stop */
+ while (dstlen < 0)
+ {
+ dstpos--;
+ dstlen++;
+ }
+ break;
+ }
+ }
+
+ ++srcpos;
+ }
+
+ return dstpos;
}
EOF
close OUTPUT;
--
2.14.1