Signed-off-by: Sergio Gómez Del Real <sdelreal@codeweavers.com> --- libs/port/mbtowc.c | 10 +- tools/make_unicode | 427 +++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 372 insertions(+), 65 deletions(-)
diff --git a/libs/port/mbtowc.c b/libs/port/mbtowc.c index 4977c82d8b..d78bfe81e6 100644 --- a/libs/port/mbtowc.c +++ b/libs/port/mbtowc.c @@ -22,7 +22,7 @@
#include "wine/unicode.h"
-extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ) DECLSPEC_HIDDEN; +extern unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src, int srclen, WCHAR *dst, int dstlen );
/* check the code whether it is in Unicode Private Use Area (PUA). */ /* MB_ERR_INVALID_CHARS raises an error converting from 1-byte character to PUA. */ @@ -107,13 +107,13 @@ static int mbstowcs_sbcs_decompose( const struct sbcs_table *table, int flags, { WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ for (len = 0; srclen; srclen--, src++) - len += wine_decompose( cp2uni[*src], dummy, 4 ); + len += wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dummy, 4 ); return len; }
for (len = dstlen; srclen && len; srclen--, src++) { - unsigned int res = wine_decompose( cp2uni[*src], dst, len ); + unsigned int res = wine_unicode_decompose_string( 0, &cp2uni[*src], 1, dst, len ); if (!res) break; len -= res; dst += res; @@ -219,7 +219,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table, ch = cp2uni[(off << 8) + *src]; } else ch = cp2uni[*src]; - len += wine_decompose( ch, dummy, 4 ); + len += wine_unicode_decompose_string( 0, &ch, 1, dummy, 4 ); } return len; } @@ -234,7 +234,7 @@ static int mbstowcs_dbcs_decompose( const struct dbcs_table *table, ch = cp2uni[(off << 8) + *src]; } else ch = cp2uni[*src]; - if (!(res = wine_decompose( ch, dst, len ))) break; + if (!(res = wine_unicode_decompose_string( 0, &ch, 1, dst, len ))) break; dst += res; len -= res; } diff --git a/tools/make_unicode b/tools/make_unicode index 92b0b64a94..65ae7ab2a0 100755 --- a/tools/make_unicode +++ b/tools/make_unicode @@ -471,6 +471,26 @@ sub READ_DEFAULTS($)
next if $decomp eq ""; # no decomposition, skip it
+ # store decomposition table + if ($decomp =~ /^<([a-zA-Z]+)>(\s+[0-9a-fA-F]+)+$/) + { + my @seq = (); + for my $ch (split /\s+/, (split /\s+/, $decomp, 2)[1]) + { + push @seq, (hex $ch); + } + $decomp_table[$src] = [1, \@seq]; + } + elsif ($decomp =~ /^([0-9a-fA-F]+)(\s+([0-9a-fA-F]+))*$/) + { + my @seq = (); + for my $ch (split /\s+/, $decomp) + { + push @seq, (hex $ch); + } + $decomp_table[$src] = [0, \@seq]; + } + if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/) { # decomposition of the form "<foo> 1234" -> use char if type is known @@ -508,7 +528,6 @@ sub READ_DEFAULTS($) # store decomposition if it contains two chars if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/) { - $decomp_table[$src] = [ hex $1, hex $2 ]; push @compose_table, [ hex $1, hex $2, $src ]; } elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ && @@ -2258,6 +2277,51 @@ EOF save_file($filename); }
+# Fully expand $char through the decomposition table.
+# With $compat set, compatibility (<tag>) mappings are followed as well.
+sub recursive_decomp
+{
+    my ($char, $table_ref, $compat) = @_;
+    my $entry = $table_ref->[$char];
+
+    # characters without a table entry map to themselves
+    return ($char) unless defined $entry;
+    # element 0 flags a compatibility mapping, element 1 references the sequence
+    return ($char) if $entry->[0] && !$compat;
+    return map { recursive_decomp( $_, $table_ref, $compat ) } @{$entry->[1]};
+}
+
+# Turn code points into UTF-16 code units, splitting non-BMP ones into pairs.
+sub expand_pairs
+{
+    my @units = ();
+
+    for my $cp (@_)
+    {
+        if ($cp >= 2097152) # 2**21
+        {
+            die sprintf "Invalid Unicode character %04x\n", $cp;
+        }
+        elsif ($cp > 0xFFFF)
+        {
+            # the high unit carries (plane - 1) and the top 6 of the low 16 bits,
+            # the low unit carries the bottom 10 bits
+            my $plane = ($cp >> 16) & ((1 << 5) - 1);
+            my $low16 = $cp & 0xFFFF;
+            my $hi = 0xD800 | ((($plane - 1) & 0xFFFF) << 6) | ($low16 >> 10);
+            my $lo = (0xDC00 | ($low16 & ((1 << 10) - 1))) & 0xFFFF;
+            push @units, $hi;
+            push @units, $lo;
+        }
+        else
+        {
+            push @units, $cp;
+        }
+    }
+
+    return @units;
+}
+
 ################################################################
 # dump the char decomposition table
 sub dump_decompose_table($)
@@ -2266,98 +2330,341 @@ sub dump_decompose_table($)
open OUTPUT,">$filename.new" or die "Cannot create $filename"; print "Building $filename\n"; - print OUTPUT "/* Unicode char composition */\n"; + print OUTPUT "/* Unicode char decomposition */\n"; print OUTPUT "/* generated from $UNIDATA/UnicodeData.txt */\n"; print OUTPUT "/* DO NOT EDIT!! */\n\n"; print OUTPUT "#include \"wine/unicode.h\"\n\n";
- # first determine all the 16-char subsets that contain something + my $utflim = 2097152; + my %nfd_lookup = (); + my %nfkd_lookup = (); + my %decomp_lookup = (); + my @decomp_data = (0); + my $pos = 1; + my $lastchar_decomp;
- my @filled = (0) x 4096; - my $pos = 16*2; # for the null subset - for (my $i = 0; $i < 65536; $i++) + for (my $i = 0; $i < $utflim; $i++) { next unless defined $decomp_table[$i]; - $filled[$i >> 4] = $pos; - $pos += 16*2; - $i |= 15; + + if (defined $decomp_table[$i]) + { + $lastchar_decomp = $i; + # fully expand input and mappings + + my @char = expand_pairs( ($i) ); + push @char, 0; + my $char = pack "n*", @char; + + my @nfd = recursive_decomp( $i, \@decomp_table, 0 ); + @nfd = expand_pairs( @nfd ); + push @nfd, 0; + my $nfd = pack "n*", @nfd; + + my @nfkd = recursive_decomp( $i, \@decomp_table, 1 ); + @nfkd = expand_pairs( @nfkd ); + push @nfkd, 0; + my $nfkd = pack "n*", @nfkd; + + # lookup or add mappings + + if ($nfd eq $char) + { + $nfd = undef; + } + elsif (exists $decomp_lookup{$nfd}) + { + $nfd_lookup{$i} = $decomp_lookup{$nfd}; + } + else + { + push @decomp_data, @nfd; + $decomp_lookup{$nfd} = $pos; + $nfd_lookup{$i} = $pos; + $pos += @nfd; + } + + if ($nfkd eq $char) + { + $nfkd = undef; + } + elsif (exists $decomp_lookup{$nfkd}) + { + $nfkd_lookup{$i} = $decomp_lookup{$nfkd}; + } + else + { + push @decomp_data, @nfkd; + $decomp_lookup{$nfkd} = $pos; + $nfkd_lookup{$i} = $pos; + $pos += @nfkd; + } + } } - my $total = $pos;
- # now count the 256-char subsets that contain something + printf OUTPUT "static const UINT last_decomposable = 0x%x;\n\n", $lastchar_decomp; + + # dump decomposition data + + printf OUTPUT "static const WCHAR data_decomp[%d] =\n", $pos; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @decomp_data ); + print OUTPUT "\n};\n\n";
- my @filled_idx = (256) x 256; - $pos = 256 + 16; - for (my $i = 0; $i < 4096; $i++) + # find 256-char subsets that contain something + + my $filled_pos = 1; + my $filled_lim = ($lastchar_decomp >> 8) + 1; + my @filled = (0) x $filled_lim; + for (my $i = 0; $i < $utflim; $i++) { - next unless $filled[$i]; - $filled_idx[$i >> 4] = $pos; - $pos += 16; - $i |= 15; + last if $i > $lastchar_decomp; + next unless exists $nfd_lookup{$i} || exists $nfkd_lookup{$i}; + $filled[$i >> 8] = $filled_pos++; + $i |= 255; } - my $null_offset = $pos; # null mapping - $total += $pos;
- # add the index offsets to the subsets positions + # dump index of 256-char subsets + + printf OUTPUT "static const BYTE idx1_decomp[%d] =\n", $filled_lim; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @filled ); + print OUTPUT "\n};\n\n"; + + # for 256-char subsets, find non-empty 16-char subsets
- for (my $i = 0; $i < 4096; $i++) + my $sub_filled_pos = 1; + my %sub_filled = (); + for (my $i = 0; $i < $filled_lim; $i++) { next unless $filled[$i]; - $filled[$i] += $null_offset; + for (my $j = 0; $j < 256; $j++) + { + my $idx = ($i << 8) | $j; + next unless exists $nfd_lookup{$idx} || exists $nfkd_lookup{$idx}; + $sub_filled{$idx >> 4} = $sub_filled_pos++; + $j |= 15; + } }
- # dump the main index + # dump index of 16-char subsets
- printf OUTPUT "static const WCHAR table[%d] =\n", $total; - printf OUTPUT "{\n /* index */\n"; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx ); - printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 ); - - # dump the second-level indexes - - for (my $i = 0; $i < 256; $i++) + printf OUTPUT "static const USHORT idx2_decomp[%d] =\n", $filled_pos * 16; + print OUTPUT "{\n"; + my @null_idx = (0) x 16; + print OUTPUT " /* null sub-index */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx ); + for (my $i = 0; $i < $filled_lim; $i++) { - next unless ($filled_idx[$i] > 256); - my @table = @filled[($i<<4)..($i<<4)+15]; - for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; } - printf OUTPUT ",\n /* sub-index %02x */\n", $i; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table ); + next unless $filled[$i]; + printf OUTPUT ",\n /* sub-index 0x%02x */\n", $filled[$i]; + + my @sub_idx; + for (my $j = 0; $j < 16; $j++) + { + my $idx = ($i << 4) | $j; + $sub_idx[$j] = $sub_filled{$idx} || 0; + } + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx ); } + print OUTPUT "\n};\n\n";
# dump the 16-char subsets
- printf OUTPUT ",\n /* null mapping */\n"; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 ); - - for (my $i = 0; $i < 4096; $i++) + printf OUTPUT "static const USHORT offsets_decomp[%d] =\n", 32 * $sub_filled_pos; + print OUTPUT "{\n"; + print OUTPUT " /* (nfd, nfkd) x 16 */\n"; + my @null_table = (0) x 32; + print OUTPUT " /* no decomposition */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table ); + for my $key (sort {$a <=> $b} keys %sub_filled) { - next unless $filled[$i]; - my @table = (0) x 32; + printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $key, $key; + my @sub_table; for (my $j = 0; $j < 16; $j++) { - if (defined $decomp_table[($i<<4) + $j]) - { - $table[2 * $j] = ${$decomp_table[($i << 4) + $j]}[0]; - $table[2 * $j + 1] = ${$decomp_table[($i << 4) + $j]}[1]; - } + my $idx = ($key << 4) | $j; + $sub_table[2 * $j] = $nfd_lookup{$idx} || 0; + $sub_table[2 * $j + 1] = $nfkd_lookup{$idx} || 0; } - printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $i, $i; - printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table ); + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table ); } + print OUTPUT "\n};\n\n";
-    printf OUTPUT "\n};\n\n";
     print OUTPUT <<"EOF";
-unsigned int DECLSPEC_HIDDEN wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen )
+static const WCHAR *unicode_table_lookup( UINT cp, int compat, const BYTE *idx1, UINT scale_idx1,
+                                          const USHORT *idx2, UINT scale_idx2, const USHORT *offsets,
+                                          UINT scale_off, const WCHAR *data, UINT scale_data )
 {
-    const WCHAR *ptr = table + table[table[ch >> 8] + ((ch >> 4) & 0x0f)] + 2 * (ch & 0xf);
-    unsigned int res;
+    UINT a, b, c, d; /* UINT so that the shifted table indices cannot wrap at 16 bits */

-    *dst = ch;
-    if (!*ptr) return 1;
-    if (dstlen <= 1) return 0;
-    /* apply the decomposition recursively to the first char */
-    if ((res = wine_decompose( *ptr, dst, dstlen-1 ))) dst[res++] = ptr[1];
-    return res;
+    a = idx1[cp >> scale_idx1];                               /* 256-char block */
+    b = idx2[(a << scale_idx2) + ((cp >> scale_idx2) & 0xf)]; /* 16-char block */
+    c = (b << scale_off) + ((cp & 0xf) << scale_data);        /* slot of cp in that block */
+    if (compat) ++c; /* paired tables keep the compatibility entry in the odd slot */
+    d = offsets[c];
+
+    return &data[d]; /* offset 0 is the shared empty entry */
+}
+/* split a precomposed hangul syllable into jamos; dum is 0-terminated, dstlen is unused */
+static inline int decompose_hangul( WCHAR ch, WCHAR dum[4], int dstlen )
+{
+    static const WCHAR sbase = 0xac00, lbase = 0x1100, vbase = 0x1161, tbase = 0x11a7;
+    static const WCHAR /*lcount = 19, vcount = 21,*/ tcount = 28, ncount = 588/*, scount = 11172*/;
+    WCHAR sindex, lindex, vindex, tindex;
+
+    if (ch >= 0xac00 && ch <= 0xd7a3) /* sbase + scount - 1 is the last syllable */
+    {
+        sindex = ch - sbase;
+        lindex = sindex / ncount;
+        vindex = (sindex % ncount) / tcount;
+        tindex = sindex % tcount;
+        dum[0] = lbase + lindex;
+        dum[1] = vbase + vindex;
+        dum[2] = (tindex > 0) ? (tbase + tindex) : 0;
+        dum[3] = 0;
+    }
+    else
+    {
+        dum[0] = ch;
+        dum[1] = 0;
+    }
+
+    return 0;
+}
+/* pack the surrogate pair of cp into a UINT: high unit in bits 0-15, low unit in bits 16-31 */
+static inline UINT utf16_codepoint_to_surrogates( UINT cp )
+{
+    UINT ch = cp;
+    WCHAR hx, hw, lx;
+    UINT hu;
+
+    hx = (WCHAR)cp;
+    hu = (cp >> 16) & ((1 << 5) - 1);
+    hw = (WCHAR)hu - 1;
+    lx = (WCHAR)cp;
+    ch = 0xD800 | (hw << 6) | (hx >> 10);
+    ch |= (0xDC00 | (lx & ((1 << 10) - 1)))<<16;
+
+    return ch;
+}
+/* rebuild the code point encoded by the surrogate pair hi, lo */
+static inline UINT utf16_surrogates_to_codepoint( WCHAR hi, WCHAR lo )
+{
+    UINT x, w, u, c;
+
+    x = ((hi & ((1 << 6) - 1)) << 10) | (lo & ((1 << 10) - 1));
+    w = (hi >> 6) & ((1 << 5) - 1);
+    u = w + 1;
+    c = (u << 16) | x;
+
+    return c;
+}
+/* decompose one code point; returns the WCHARs needed and stores only those that fit in dstlen */
+static int decompose_char_recursive( int compat, UINT ch, WCHAR *dst, int dstlen )
+{
+    const WCHAR *map = NULL;
+    int total_decomp = 0;
+    int size_decomp;
+
+    if (ch < 0xa0) /* fast path */
+    {
+        if (dstlen) *dst = (WCHAR)ch;
+        return 1;
+    }
+    else if (ch >= 0xac00 && ch <= 0xd7a3) /* hangul */
+    {
+        WCHAR dum[4];
+        int len = 0, i = 0;
+
+        decompose_hangul( ch, dum, dstlen );
+        while (dum[i])
+        {
+            if (i < dstlen) dst[i] = dum[i]; /* dstlen - i is also "true" when negative */
+            i++;
+            len++;
+        }
+        return len;
+    }
+    else if (ch > last_decomposable ||
+             !*(map = unicode_table_lookup( ch, compat, idx1_decomp, 8,
+                                            idx2_decomp, 4, offsets_decomp, 5, data_decomp, 1 )))
+    {
+        if (ch > 0xffff)
+        {
+            ch = utf16_codepoint_to_surrogates( ch );
+            if (dstlen) *dst = (WCHAR)ch;
+            if (dstlen > 1) *(dst+1) = (WCHAR)(ch>>16); /* never store when dstlen is 0 */
+            return 2;
+        }
+        else
+        {
+            if (dstlen) *dst = (WCHAR)ch;
+            return 1;
+        }
+    }
+    else {
+        while (*map)
+        {
+            size_decomp = decompose_char_recursive( compat, *map, dst, dstlen );
+            dstlen -= size_decomp;
+            if (dstlen < 0) dstlen = 0;
+            dst += size_decomp;
+            map++;
+            total_decomp += size_decomp;
+        }
+        return total_decomp;
+    }
+}
+/* decompose srclen WCHARs of src into dst; returns the WCHARs stored, or needed when dstlen is 0 */
+unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src,
+                                            int srclen, WCHAR *dst, int dstlen )
+{
+    UINT ch;
+    int srcpos = 0, dstpos = 0;
+    int num_decomp;
+
+    if (dstlen < 0) dstlen = 0;
+
+    while (srcpos < srclen)
+    {
+        ch = src[srcpos];
+
+        if (ch >= 0xd800 && ch <= 0xdbff) /* high surrogate */
+        {
+            WCHAR hi, lo;
+            if (srcpos+1 == srclen) return srcpos; /* unpaired: the source index is returned */
+            hi = (WCHAR)ch;
+            lo = src[++srcpos];
+            if (lo < 0xdc00 || lo > 0xdfff) return srcpos;
+            ch = utf16_surrogates_to_codepoint( hi, lo );
+        }
+        else if (ch >= 0xdc00 && ch <= 0xdfff) /* low surrogate */
+        {
+            return srcpos;
+        }
+
+        num_decomp = decompose_char_recursive( compat, ch, dst+dstpos, dstlen );
+        dstpos += num_decomp;
+
+        if (dstlen > 0)
+        {
+            dstlen -= num_decomp;
+            if (dstlen < 0)
+            {
+                while (dstlen < 0) /* back off to the last WCHAR that was really stored */
+                {
+                    dstpos--;
+                    dstlen++;
+                }
+                break;
+            }
+        }
+
+        ++srcpos;
+    }
+
+    return dstpos;
 }
 EOF
     close OUTPUT;
Signed-off-by: Sergio Gómez Del Real <sdelreal@codeweavers.com> --- tools/make_unicode | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 301 insertions(+), 1 deletion(-)
diff --git a/tools/make_unicode b/tools/make_unicode index 65ae7ab2a0..1ad090219b 100755 --- a/tools/make_unicode +++ b/tools/make_unicode @@ -359,6 +359,8 @@ my @joining_table = (); my @direction_table = (); my @decomp_table = (); my @compose_table = (); +my @comb_class_table = (); +my @full_comp_table = (); my $default_char; my $default_wchar;
@@ -469,6 +471,11 @@ sub READ_DEFAULTS($) } }
+ if ($comb != 0) + { + $comb_class_table[$src] = (hex $comb); + } + next if $decomp eq ""; # no decomposition, skip it
# store decomposition table @@ -561,6 +568,25 @@ sub READ_DEFAULTS($) my $flag = $ctype{$cat}; foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; } } + + my $UNICODE_DERIVED = open_data_file( $UNIDATA, "DerivedNormalizationProps.txt" ); + while (<$UNICODE_DERIVED>) + { + next unless (/^([0-9a-fA-F.]+)\s+;\s+Full_Composition_Exclusion/); + my ($first, $last) = split /../,$1; + $first = hex $first; + if (defined $last) + { + $last = hex $last; + while ($last gt $first) + { + $full_comp_table[$last] = 1; + $last--; + } + } + $full_comp_table[$first] = 1; + } + close $UNICODE_DERIVED; }
@@ -2249,6 +2275,8 @@ sub dump_compose_table($)
     }
     print OUTPUT "\n};\n\n";
     print OUTPUT <<"EOF";
+#include "decompose.c"
+
 static inline int binary_search( WCHAR ch, int low, int high )
 {
     while (low <= high)
@@ -2272,6 +2300,59 @@ WCHAR DECLSPEC_HIDDEN wine_compose( const WCHAR *str )
         count = table[2 * pos + 3];
     }
 }
+/* non-zero if a character between the starter ptr1 and ptr2 blocks their composition */
+static inline int is_blocked(WCHAR *ptr1, WCHAR *ptr2)
+{
+    if (ptr1 >= ptr2) return -1;
+
+    while (++ptr1 < ptr2)
+    {
+        const WCHAR *map1, *map2;
+        map1 = unicode_table_lookup( *ptr1, 0, idx1_comb, 8, idx2_comb, 4,
+                                     offsets_comb, 4, data_comb, 0 );
+        map2 = unicode_table_lookup( *ptr2, 0, idx1_comb, 8, idx2_comb, 4,
+                                     offsets_comb, 4, data_comb, 0 );
+        if (*map1 == 0 || *map2 <= *map1) return 1;
+    }
+    return 0;
+}
+/* non-zero if ch is listed in the full composition exclusion table */
+static inline int is_fullexcl(WCHAR ch)
+{
+    const WCHAR *map;
+    map = unicode_table_lookup( ch, 0, idx1_fullcomp, 8, idx2_fullcomp, 4,
+                                offsets_fullcomp, 4, data_fullcomp, 0 );
+    return (int)*map;
+}
+/* compose str in place (strlen 0 means null-terminated); returns the new length */
+UINT unicode_canonical_composition( WCHAR *str, UINT strlen )
+{
+    int i, j;
+    WCHAR dum[3] = {0};
+
+    if (strlen == 0) strlen = strlenW( str );
+
+    for (i = 1; i < (int)strlen; i++) /* str[strlen] is outside the string */
+    {
+        WCHAR *scratch = str+i, comp;
+        if (*scratch == 0) break;
+        for (scratch = str+i-1; scratch > str; --scratch) /* last starter before str[i] */
+        {
+            if (is_starter( *scratch )) break;
+        }
+        if (!is_starter( *scratch ) || is_blocked( scratch, str+i )) continue;
+        dum[0] = *scratch;
+        dum[1] = str[i];
+        comp = wine_compose( dum );
+        if (comp == 0 || is_fullexcl( comp )) continue;
+        *scratch = comp;
+        for (j = i; j < strlen-1; j++) str[j] = str[j+1];
+        strlen--;
+        i--; /* the next character has moved into slot i */
+    }
+
+    return strlen;
+}
 EOF
     close OUTPUT;
     save_file($filename);
@@ -2339,13 +2420,21 @@ sub dump_decompose_table($)
     my %nfd_lookup = ();
     my %nfkd_lookup = ();
     my %decomp_lookup = ();
+    my %comb_lookup = ();
+    my %fullcomp_lookup = ();
     my @decomp_data = (0);
+    my @comb_data = (0);
+    my @full_comp_data = (0);
     my $pos = 1;
+    my $pos_comb = 1;
+    my $pos_fullcomp = 1;
     my $lastchar_decomp;
+    my $lastchar_comb;
+    my $lastchar_fullcomp;
for (my $i = 0; $i < $utflim; $i++) { - next unless defined $decomp_table[$i]; + next unless defined $decomp_table[$i] || defined $comb_class_table[$i] || defined $full_comp_table[$i];
if (defined $decomp_table[$i]) { @@ -2400,6 +2489,20 @@ sub dump_decompose_table($) $pos += @nfkd; } } + if (defined $comb_class_table[$i]) + { + push @comb_data, $comb_class_table[$i]; + $lastchar_comb = $i; + $comb_lookup{$i} = $pos_comb; + $pos_comb++; + } + if (defined $full_comp_table[$i]) + { + push @full_comp_data, $full_comp_table[$i]; + $lastchar_fullcomp = $i; + $fullcomp_lookup{$i} = $pos_fullcomp; + $pos_fullcomp++; + } }
printf OUTPUT "static const UINT last_decomposable = 0x%x;\n\n", $lastchar_decomp; @@ -2491,6 +2594,154 @@ sub dump_decompose_table($) } print OUTPUT "\n};\n\n";
+ # now for Compatibility Class + + printf OUTPUT "static const WCHAR data_comb[%d] =\n", $pos_comb; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @comb_data ); + print OUTPUT "\n};\n\n"; + + my $comb_pos = 1; + my $comb_lim = ($lastchar_comb >> 8) + 1; + my @comb_filled = (0) x $comb_lim; + for (my $i = 0; $i < $utflim; $i++) + { + last if $i > $lastchar_comb; + next unless defined $comb_class_table[$i]; + $comb_filled[$i >> 8] = $comb_pos++; + $i |= 255; + } + printf OUTPUT "static const BYTE idx1_comb[%d] =\n", $comb_lim; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @comb_filled ); + print OUTPUT "\n};\n\n"; + + my $sub_comb_filled_pos = 1; + my %sub_comb_filled = (); + for (my $i = 0; $i < $comb_lim; $i++) + { + next unless $comb_filled[$i]; + for (my $j = 0; $j < 256; $j++) + { + my $idx = ($i << 8) | $j; + next unless defined $comb_class_table[$idx]; + $sub_comb_filled{$idx >> 4} = $sub_comb_filled_pos++; + $j |= 15; + } + } + + printf OUTPUT "static const USHORT idx2_comb[%d] =\n", $comb_pos * 16; + print OUTPUT "{\n"; + @null_idx = (0) x 16; + print OUTPUT " /* all-zero 256-char blocks get mapped to here */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx ); + for (my $i = 0; $i < $comb_lim; $i++) + { + next unless $comb_filled[$i]; + printf OUTPUT ",\n /* sub-index 0x%02x */\n", $comb_filled[$i]; + + my @sub_idx; + for (my $j = 0; $j < 16; $j++) + { + my $idx = ($i << 4) | $j; + $sub_idx[$j] = $sub_comb_filled{$idx} || 0; + } + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx ); + } + print OUTPUT "\n};\n\n"; + + printf OUTPUT "static const USHORT offsets_comb[%d] =\n", 16 * $sub_comb_filled_pos; + print OUTPUT "{\n"; + @null_table = (0) x 16; + print OUTPUT " /* all-zero 16-char blocks get mapped to here */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table ); + for my $key (sort {$a <=> $b} keys %sub_comb_filled) + { + printf OUTPUT ",\n /* 0x%03x0 .. 
0x%03xf */\n", $key, $key; + my @sub_table; + for (my $j = 0; $j < 16; $j++) + { + my $idx = ($key << 4) | $j; + $sub_table[$j] = $comb_lookup{$idx} || 0; + } + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table ); + } + print OUTPUT "\n};\n\n"; + + # now for Full Composition Exclusion + + printf OUTPUT "const WCHAR data_fullcomp[%d] =\n", $pos_fullcomp; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @full_comp_data ); + print OUTPUT "\n};\n\n"; + + my $fullcomp_pos = 1; + my $fullcomp_lim = ($lastchar_fullcomp >> 8) + 1; + my @fullcomp_filled = (0) x $fullcomp_lim; + for (my $i = 0; $i < $utflim; $i++) + { + last if $i > $lastchar_fullcomp; + next unless defined $full_comp_table[$i]; + $fullcomp_filled[$i >> 8] = $fullcomp_pos++; + $i |= 255; + } + printf OUTPUT "const BYTE idx1_fullcomp[%d] =\n", $fullcomp_lim; + print OUTPUT "{\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @fullcomp_filled ); + print OUTPUT "\n};\n\n"; + + my $sub_fullcomp_filled_pos = 1; + my %sub_fullcomp_filled = (); + for (my $i = 0; $i < $fullcomp_lim; $i++) + { + next unless $fullcomp_filled[$i]; + for (my $j = 0; $j < 256; $j++) + { + my $idx = ($i << 8) | $j; + next unless defined $full_comp_table[$idx]; + $sub_fullcomp_filled{$idx >> 4} = $sub_fullcomp_filled_pos++; + $j |= 15; + } + } + + printf OUTPUT "const USHORT idx2_fullcomp[%d] =\n", $fullcomp_pos * 16; + print OUTPUT "{\n"; + @null_idx = (0) x 16; + print OUTPUT " /* all-zero 256-char blocks get mapped to here */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx ); + for (my $i = 0; $i < $fullcomp_lim; $i++) + { + next unless $fullcomp_filled[$i]; + printf OUTPUT ",\n /* sub-index 0x%02x */\n", $fullcomp_filled[$i]; + + my @sub_idx; + for (my $j = 0; $j < 16; $j++) + { + my $idx = ($i << 4) | $j; + $sub_idx[$j] = $sub_fullcomp_filled{$idx} || 0; + } + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx ); + } + print OUTPUT "\n};\n\n"; + + printf OUTPUT "const USHORT 
offsets_fullcomp[%d] =\n", 16 * $sub_fullcomp_filled_pos; + print OUTPUT "{\n"; + @null_table = (0) x 16; + print OUTPUT " /* all-zero 16-char blocks get mapped to here */\n"; + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table ); + for my $key (sort {$a <=> $b} keys %sub_fullcomp_filled) + { + printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $key, $key; + my @sub_table; + for (my $j = 0; $j < 16; $j++) + { + my $idx = ($key << 4) | $j; + $sub_table[$j] = $fullcomp_lookup{$idx} || 0; + } + printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table ); + } + print OUTPUT "\n};\n\n"; + print OUTPUT <<"EOF"; static const WCHAR *unicode_table_lookup( UINT cp, int compat, const BYTE *idx1, UINT scale_idx1, const USHORT *idx2, UINT scale_idx2, const USHORT *offsets, @@ -2533,6 +2784,20 @@ static inline int decompose_hangul( WCHAR ch, WCHAR dum[4], int dstlen ) return 0; }
+/* non-zero when ch1, ch2 are out of canonical order: class(ch1) > class(ch2) > 0 */
+static inline int reorderable_pair( WCHAR ch1, WCHAR ch2 )
+{
+    const WCHAR *cc1, *cc2;
+
+    if (ch1 == 0 || ch2 == 0) return 0;
+    cc1 = unicode_table_lookup( ch1, 0, idx1_comb, 8, idx2_comb, 4,
+                                offsets_comb, 4, data_comb, 0 );
+    cc2 = unicode_table_lookup( ch2, 0, idx1_comb, 8, idx2_comb, 4,
+                                offsets_comb, 4, data_comb, 0 );
+    /* a starter (class 0) is never moved in front of another character */
+    return (*cc2 != 0 && *cc2 < *cc1) ? 1 : 0;
+}
+
 static inline UINT utf16_codepoint_to_surrogates( UINT cp )
 {
     UINT ch = cp;
@@ -2666,6 +2931,34 @@ unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src,

     return dstpos;
 }
+
+/* non-zero if ch has no entry in the combining class table, i.e. class 0 */
+int is_starter( WCHAR ch )
+{
+    const WCHAR *map = unicode_table_lookup( ch, 0, idx1_comb, 8, idx2_comb, 4,
+                                             offsets_comb, 4, data_comb, 0 );
+    return (*map == 0) ? 1 : 0;
+}
+
+/* put the combining marks of str (strlen WCHARs) into canonical order, in place */
+void unicode_canon_order( WCHAR *str, int strlen )
+{
+    int i, j;
+
+    /* Insertion sort built from adjacent exchanges only: it is stable, as the
+     * canonical ordering algorithm requires, and since reorderable_pair()
+     * never moves a starter, every run of combining marks is sorted on its
+     * own, including a run that ends the string. */
+    for (i = 1; i < strlen; i++)
+    {
+        for (j = i; j > 0 && reorderable_pair( str[j-1], str[j] ); j--)
+        {
+            WCHAR swp = str[j-1];
+            str[j-1] = str[j];
+            str[j] = swp;
+        }
+    }
+}
 EOF
     close OUTPUT;
     save_file($filename);
Signed-off-by: Sergio Gómez Del Real <sdelreal@codeweavers.com> --- dlls/kernel32/locale.c | 66 ++++++++++++++++++++++++++++++--- dlls/kernel32/tests/locale.c | 88 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 5 deletions(-)
diff --git a/dlls/kernel32/locale.c b/dlls/kernel32/locale.c
index c5eeabfbbe..4b0a7902f8 100644
--- a/dlls/kernel32/locale.c
+++ b/dlls/kernel32/locale.c
@@ -5355,13 +5355,95 @@ INT WINAPI GetUserDefaultLocaleName(LPWSTR localename, int buffersize)

 /******************************************************************************
  *           NormalizeString (KERNEL32.@)
+ *
+ * Normalizes a string according to a Unicode Normalization Form.
+ *
+ * PARAMS
+ *  norm   [I] Normalization form (NormalizationC, D, KC or KD)
+ *  src    [I] Source string to normalize
+ *  srclen [I] Length of source string (if -1, source string is null-terminated)
+ *  dst    [O] Buffer to write normalized source string (can be NULL)
+ *  dstlen [I] Length of dst string (can be 0)
+ *
+ * RETURNS
+ *  Success: If dstlen is 0, return size needed, else return size of normalized string.
+ *  Failure: ret <= 0. Use GetLastError to determine error. With
+ *           ERROR_INSUFFICIENT_BUFFER, ret is the negated size needed.
  */
-INT WINAPI NormalizeString(NORM_FORM NormForm, LPCWSTR lpSrcString, INT cwSrcLength,
-                           LPWSTR lpDstString, INT cwDstLength)
+INT WINAPI NormalizeString(NORM_FORM norm, LPCWSTR src, INT srclen,
+                           LPWSTR dst, INT dstlen)
 {
-    FIXME("%x %p %d %p %d\n", NormForm, lpSrcString, cwSrcLength, lpDstString, cwDstLength);
-    SetLastError(ERROR_CALL_NOT_IMPLEMENTED);
-    return 0;
+    extern unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src,
+                                                       int srclen, WCHAR *dst, int dstlen );
+    extern unsigned int unicode_canonical_composition( WCHAR *str, UINT strlen );
+    extern void unicode_canon_order( WCHAR *str, int strlen );
+
+    WCHAR *buf;
+    INT compat, compose, ret;
+    UINT len, i;
+
+    switch (norm)
+    {
+    case NormalizationC:  compat = 0; compose = 1; break;
+    case NormalizationD:  compat = 0; compose = 0; break;
+    case NormalizationKC: compat = 1; compose = 1; break;
+    case NormalizationKD: compat = 1; compose = 0; break;
+    default:
+        SetLastError(ERROR_INVALID_PARAMETER);
+        return 0;
+    }
+    if (!src || srclen < -1 || (dstlen > 0 && !dst))
+    {
+        SetLastError(ERROR_INVALID_PARAMETER);
+        return 0;
+    }
+    if (srclen == -1) srclen = strlenW( src ) + 1;
+
+    /* canonical reordering never changes the length, so this is final for D and KD */
+    len = wine_unicode_decompose_string( compat, src, srclen, NULL, 0 );
+
+    if (!compose)
+    {
+        if (dstlen <= 0) return len;
+        if ((UINT)dstlen < len)
+        {
+            SetLastError(ERROR_INSUFFICIENT_BUFFER);
+            return -(INT)len;
+        }
+        ret = wine_unicode_decompose_string( compat, src, srclen, dst, len );
+        unicode_canon_order( dst, len );
+        return ret;
+    }
+
+    /* composition may shrink the string, so build the result in a scratch buffer;
+     * the extra zeroed WCHAR keeps the buffer terminated for the helpers */
+    if (!(buf = HeapAlloc( GetProcessHeap(), 0, (len + 1) * sizeof(WCHAR) )))
+    {
+        SetLastError(ERROR_NOT_ENOUGH_MEMORY);
+        return 0;
+    }
+    wine_unicode_decompose_string( compat, src, srclen, buf, len );
+    buf[len] = 0;
+    unicode_canon_order( buf, len );
+    len = unicode_canonical_composition( buf, len );
+    ret = len;
+
+    if (dstlen > 0)
+    {
+        if ((UINT)dstlen < len)
+        {
+            SetLastError(ERROR_INSUFFICIENT_BUFFER);
+            ret = -(INT)len;
+        }
+        else
+        {
+            /* copy by length: lstrcpynW stops at an embedded null and always
+             * replaces the last character with a terminator */
+            for (i = 0; i < len; i++) dst[i] = buf[i];
+        }
+    }
+    HeapFree( GetProcessHeap(), 0, buf );
+    return ret;
 }
/****************************************************************************** diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c index d8f51d32fa..531752cd43 100644 --- a/dlls/kernel32/tests/locale.c +++ b/dlls/kernel32/tests/locale.c @@ -36,6 +36,8 @@ #include "winerror.h" #include "winnls.h"
+#include "normalization_tests.h" + static const WCHAR upper_case[] = {'\t','J','U','S','T','!',' ','A',',',' ','T','E','S','T',';',' ','S','T','R','I','N','G',' ','1','/','*','+','-','.','\r','\n',0}; static const WCHAR lower_case[] = {'\t','j','u','s','t','!',' ','a',',',' ','t','e','s','t',';',' ','s','t','r','i','n','g',' ','1','/','*','+','-','.','\r','\n',0}; static const WCHAR title_case[] = {'\t','J','u','s','t','!',' ','A',',',' ','T','e','s','t',';',' ','S','t','r','i','n','g',' ','1','/','*','+','-','.','\r','\n',0}; @@ -104,6 +106,7 @@ static BOOL (WINAPI *pGetUserPreferredUILanguages)(DWORD, ULONG*, WCHAR*, ULONG* static WCHAR (WINAPI *pRtlUpcaseUnicodeChar)(WCHAR); static INT (WINAPI *pGetNumberFormatEx)(LPCWSTR, DWORD, LPCWSTR, const NUMBERFMTW *, LPWSTR, int); static INT (WINAPI *pFindNLSStringEx)(LPCWSTR, DWORD, LPCWSTR, INT, LPCWSTR, INT, LPINT, LPNLSVERSIONINFO, LPVOID, LPARAM); +static INT (WINAPI *pNormalizeString)(NORM_FORM, LPCWSTR, INT, LPWSTR, INT);
static void InitFunctionPointers(void) { @@ -137,6 +140,7 @@ static void InitFunctionPointers(void) X(GetUserPreferredUILanguages); X(GetNumberFormatEx); X(FindNLSStringEx); + X(NormalizeString);
mod = GetModuleHandleA("ntdll"); X(RtlUpcaseUnicodeChar); @@ -5444,6 +5448,89 @@ static void test_FindNLSStringEx(void) } }
+static void test_NormalizeString(void) +{ + struct test_data_normal test_arr[] = + { + { part0_str1, part0_nfc1, part0_nfd1, part0_nfkc1, part0_nfkd1 }, + { part0_str2, part0_nfc2, part0_nfd2, part0_nfkc2, part0_nfkd2 }, + { part0_str3, part0_nfc3, part0_nfd3, part0_nfkc3, part0_nfkd3 }, + { part0_str4, part0_nfc4, part0_nfd4, part0_nfkc4, part0_nfkd4 }, + { part0_str5, part0_nfc5, part0_nfd5, part0_nfkc5, part0_nfkd5 }, + { part0_str6, part0_nfc6, part0_nfd6, part0_nfkc6, part0_nfkd6 }, + { part0_str8, part0_nfc8, part0_nfd8, part0_nfkc8, part0_nfkd8 }, + { part0_str9, part0_nfc9, part0_nfd9, part0_nfkc9, part0_nfkd9 }, + { part0_str10, part0_nfc10, part0_nfd10, part0_nfkc10, part0_nfkd10 }, + { part0_str11, part0_nfc11, part0_nfd11, part0_nfkc11, part0_nfkd11 }, + { part0_str12, part0_nfc12, part0_nfd12, part0_nfkc12, part0_nfkd12 }, + { part1_str1, part1_nfc1, part1_nfd1, part1_nfkc1, part1_nfkd1 }, + { part1_str2, part1_nfc2, part1_nfd2, part1_nfkc2, part1_nfkd2 }, + { part1_str3, part1_nfc3, part1_nfd3, part1_nfkc3, part1_nfkd3 }, + { part1_str4, part1_nfc4, part1_nfd4, part1_nfkc4, part1_nfkd4 }, + { part1_str5, part1_nfc5, part1_nfd5, part1_nfkc5, part1_nfkd5 }, + { part1_str6, part1_nfc6, part1_nfd6, part1_nfkc6, part1_nfkd6 }, + { part1_str7, part1_nfc7, part1_nfd7, part1_nfkc7, part1_nfkd7 }, + { part1_str8, part1_nfc8, part1_nfd8, part1_nfkc8, part1_nfkd8 }, + { part1_str9, part1_nfc9, part1_nfd9, part1_nfkc9, part1_nfkd9 }, + { part1_str10, part1_nfc10, part1_nfd10, part1_nfkc10, part1_nfkd10 }, + { part1_str11, part1_nfc11, part1_nfd11, part1_nfkc11, part1_nfkd11 }, + { 0 } + }; + + struct test_data_normal *ptest = test_arr; + + if (!pFindNLSStringEx) + { + win_skip("NormalizeString is not available.\n"); + return; + } + + while (ptest->str != 0) + { + WCHAR *dst; + int str_cmp; + int dstlen; + + dstlen = pNormalizeString( NormalizationD, ptest->str, -1, NULL, 0 ); + dst = HeapAlloc(GetProcessHeap(), 0, dstlen * sizeof(WCHAR) + 1); + dstlen = 
pNormalizeString( NormalizationD, ptest->str, -1, dst, dstlen ); + ok(dstlen == strlenW(ptest->nfd)+1, "Copied length differed: was %d, should be %d\n", + dstlen, strlenW(ptest->nfd)+1); + str_cmp = strncmpW(ptest->nfd, dst, dstlen + 1); + ok(str_cmp == 0, "NFD test failed: returned value was %d\n", str_cmp); + HeapFree(GetProcessHeap(), 0, dst); + + dstlen = pNormalizeString( NormalizationC, ptest->str, -1, NULL, 0 ); + dst = HeapAlloc(GetProcessHeap(), 0, dstlen * sizeof(WCHAR) + 1); + dstlen = pNormalizeString( NormalizationC, ptest->str, -1, dst, dstlen ); + ok(dstlen == strlenW(ptest->nfc)+1, "Copied length differed: was %d, should be %d\n", + dstlen, strlenW(ptest->nfc)+1); + str_cmp = strncmpW(ptest->nfc, dst, dstlen + 1); + ok(str_cmp == 0, "NFC test failed: returned value was %d\n", str_cmp); + HeapFree(GetProcessHeap(), 0, dst); + + dstlen = pNormalizeString( NormalizationKD, ptest->str, -1, NULL, 0 ); + dst = HeapAlloc(GetProcessHeap(), 0, dstlen * sizeof(WCHAR) + 1); + dstlen = pNormalizeString( NormalizationKD, ptest->str, -1, dst, dstlen ); + ok(dstlen == strlenW(ptest->nfkd)+1, "Copied length differed: was %d, should be %d\n", + dstlen, strlenW(ptest->nfkd)+1); + str_cmp = strncmpW(ptest->nfkd, dst, dstlen + 1); + ok(str_cmp == 0, "NFKD test failed: returned value was %d\n", str_cmp); + HeapFree(GetProcessHeap(), 0, dst); + + dstlen = pNormalizeString( NormalizationKC, ptest->str, -1, NULL, 0 ); + dst = HeapAlloc(GetProcessHeap(), 0, dstlen * sizeof(WCHAR) + 1); + dstlen = pNormalizeString( NormalizationKC, ptest->str, -1, dst, dstlen ); + ok(dstlen == strlenW(ptest->nfkc)+1, "Copied length differed: was %d, should be %d\n", + dstlen, strlenW(ptest->nfkc)+1); + str_cmp = strncmpW(ptest->nfkc, dst, dstlen + 1); + ok(str_cmp == 0, "NFKC test failed: returned value was %d\n", str_cmp); + HeapFree(GetProcessHeap(), 0, dst); + + ptest++; + } +} + START_TEST(locale) { InitFunctionPointers(); @@ -5491,6 +5578,7 @@ START_TEST(locale) 
test_GetThreadPreferredUILanguages(); test_GetUserPreferredUILanguages(); test_FindNLSStringEx(); + test_NormalizeString(); /* this requires collation table patch to make it MS compatible */ if (0) test_sorting(); }
Hi,
While running your changed tests on Windows, I think I found new failures. Being a bot and all I'm not very good at pattern recognition, so I might be wrong, but could you please double-check? Full results can be found at https://testbot.winehq.org/JobDetails.pl?Key=37453
Your paranoid android.
=== build (build) === Patch failed to apply