Signed-off-by: Fabian Maurer <dark.shadow4@web.de>
---
 dlls/kernel32/tests/locale.c | 110 ++++++++
 dlls/kernelbase/locale.c     | 477 ++++++++++++++++++++++++++---------
 2 files changed, 464 insertions(+), 123 deletions(-)
diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c index 4c1e1b4d73..13839bb10a 100644 --- a/dlls/kernel32/tests/locale.c +++ b/dlls/kernel32/tests/locale.c @@ -2681,6 +2681,13 @@ static void test_lcmapstring_unicode(lcmapstring_wrapper func_ptr, const char *f lstrlenW(symbols_stripped) + 1, ret); ok(!lstrcmpW(buf, symbols_stripped), "%s string comparison mismatch\n", func_name);
+ /* test small buffer */ + lstrcpyW(buf, fooW); + ret = func_ptr(LCMAP_SORTKEY, lower_case, -1, buf, 2); + ok(ret == 0, "Expected a failure\n"); + ok(GetLastError() == ERROR_INSUFFICIENT_BUFFER, + "%s unexpected error code %d\n", func_name, GetLastError());; + /* test srclen = 0 */ SetLastError(0xdeadbeef); ret = func_ptr(0, upper_case, 0, buf, ARRAY_SIZE(buf)); @@ -3108,6 +3115,108 @@ static void test_sorting(void) } }
+struct sorting_test_entry { + const WCHAR* locale; + DWORD flags; + const WCHAR* first; + const WCHAR* second; + int result_sortkey; + int result_compare; + BOOL broken_on_old_win; +}; + +static const struct sorting_test_entry unicode_sorting_tests[] = +{ + /* 0 */ { L"en-US", 0, L"\ue6e3\u0a02", L"\ue6e3\u20dc", CSTR_LESS_THAN, 0, TRUE }, /* Test default character, when there is main weight extra there must be no diacritic weight */ + /* 1 */ { L"en-US", 0, L"\u276a", L"\u2768", CSTR_GREATER_THAN }, /* Test symbols, must add diacritic weight */ + /* 2 */ { L"en-US", 0, L"\u204d", L"\uff02", CSTR_LESS_THAN }, /* Test symbols, must add case weight */ + /* 3 */ { L"en-US", 0, L"a \u2060 b", L"a b", CSTR_EQUAL }, /* Test unsortable characters */ + /* 4 */ { L"en-US", 0, L"a \xfff0 b", L"a b", CSTR_EQUAL }, /* Test invalid characters */ + /* 5 */ { L"en-US", 0, L"\x00fc", L"\x016d", CSTR_LESS_THAN }, + /* 6 */ { L"en-US", 0, L"\x3fcb\x7fd5", L"\x0006\x3032", CSTR_GREATER_THAN }, + /* 7 */ { L"en-US", 0, L"\x00fc\x30fd", L"\x00fa\x1833", CSTR_LESS_THAN }, + /* 8 */ { L"en-US", 0, L"\x0037", L"\x277c", CSTR_LESS_THAN, 0, TRUE }, /* Normal character */ + /* 9 */ { L"en-US", 0, L"\x1eca", L"\x1ecb", CSTR_GREATER_THAN }, /* Normal character */ + /* 10 */ { L"en-US", 0, L"\x1d05", L"\x1d48", CSTR_GREATER_THAN }, /* Normal character */ + /* 11 */ { L"en-US", 0, L"\x19d7", L"\x096d", CSTR_GREATER_THAN }, /* Normal character diacritics */ + /* 12 */ { L"en-US", 0, L"\x00f5", L"\x1ecf", CSTR_LESS_THAN }, /* Normal character diacritics */ + /* 13 */ { L"en-US", 0, L"\x2793", L"\x0d70", CSTR_LESS_THAN, 0, TRUE }, /* Normal character diacritics */ + /* 14 */ { L"en-US", 0, L"A", L"a", CSTR_GREATER_THAN }, /* Normal character case weights */ + /* 15 */ { L"en-US", 0, L"z", L"Z", CSTR_LESS_THAN }, /* Normal character case weights */ + /* 16 */ { L"en-US", 0, L"\xe5a6", L"\xe5a5\x0333", CSTR_GREATER_THAN, 0, TRUE }, /* CJK with extra value */ + /* 17 */ { L"en-US", 0, L"\xe5d7", 
L"\xe5d6\x0330", CSTR_GREATER_THAN, 0, TRUE }, /* CJK with extra value */ + /* 18 */ { L"en-US", 0, L"\x1B56\x0330", L"\x1096", CSTR_GREATER_THAN }, /* Diacritic is added */ + /* 19 */ { L"en-US", 0, L"\x1817\x0333", L"\x19d7", CSTR_GREATER_THAN }, /* Diacritic is added */ + /* 20 */ { L"en-US", 0, L"\x04de\x05ac", L"\x0499", CSTR_GREATER_THAN }, /* Diacritic is added */ + /* 21 */ { L"en-US", 0, L"\x01ba\x0654", L"\x01b8", CSTR_LESS_THAN }, /* Diacritic can overflow */ + /* 22 */ { L"en-US", 0, L"\x06b7\x06eb", L"\x06b6", CSTR_LESS_THAN }, /* Diacritic can overflow */ + /* 23 */ { L"en-US", 0, L"\x1420\x0333", L"\x141f", CSTR_LESS_THAN }, /* Diacritic can overflow */ + /* 24 */ { L"en-US", 0, L"\x11bc", L"\x110b", CSTR_GREATER_THAN }, /* Jamo case weight */ + /* 25 */ { L"en-US", 0, L"\x11c1", L"\x1111", CSTR_GREATER_THAN }, /* Jamo case weight */ + /* 26 */ { L"en-US", 0, L"\x11af", L"\x1105", CSTR_GREATER_THAN }, /* Jamo case weight */ + /* 27 */ { L"en-US", 0, L"\x11c2", L"\x11f5", CSTR_LESS_THAN }, /* Jamo main weight */ + /* 28 */ { L"en-US", 0, L"\x1108", L"\x1121", CSTR_LESS_THAN }, /* Jamo main weight */ + /* 29 */ { L"en-US", 0, L"\x1116", L"\x11c7", CSTR_LESS_THAN }, /* Jamo main weight */ + /* 30 */ { L"en-US", 0, L"\x11b1", L"\x11d1", CSTR_LESS_THAN }, /* Jamo main weight */ + /* 31 */ { L"en-US", 0, L"\x4550\x73d2", L"\x3211\x23ad", CSTR_GREATER_THAN }, /* Script 5 main weight 1 */ + /* 32 */ { L"en-US", 0, L"\x3265", L"\x4079", CSTR_LESS_THAN }, /* Script 5 main weight 1 */ + /* 33 */ { L"en-US", 0, L"\x4c19\x68d0\x52d0", L"\x316d", CSTR_GREATER_THAN }, /* Script 5 main weight 1 */ + /* 34 */ { L"en-US", 0, L"\x72dd", L"\x6b8a", CSTR_GREATER_THAN }, /* Script 5 main weight 2 */ + /* 35 */ { L"en-US", 0, L"\x6785\x3bff\x6f83", L"\x7550\x34c9\x71a7", CSTR_LESS_THAN }, /* Script 5 main weight 2 */ + /* 36 */ { L"en-US", 0, L"\x5d61", L"\x3aef", CSTR_LESS_THAN }, /* Script 5 main weight 2 */ + /* 37 */ { L"en-US", 0, L"\x207a", L"\xfe62", 
CSTR_GREATER_THAN }, /* Symbols case weights */ + /* 38 */ { L"en-US", 0, L"\xfe65", L"\xff1e", CSTR_GREATER_THAN }, /* Symbols case weights */ + /* 39 */ { L"en-US", 0, L"\x2502", L"\xffe8", CSTR_GREATER_THAN }, /* Symbols case weights */ + /* 40 */ { L"en-US", 0, L"\x21da", L"\x21dc", CSTR_LESS_THAN }, /* Symbols diacritic weights */ + /* 41 */ { L"en-US", 0, L"\x29fb", L"\x2295", CSTR_LESS_THAN }, /* Symbols diacritic weights */ + /* 42 */ { L"en-US", 0, L"\x0092", L"\x009c", CSTR_LESS_THAN }, /* Symbols diacritic weights */ + /* 43 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x21da", L"\x21dc", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */ + /* 44 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x29fb", L"\x2295", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */ + /* 45 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x0092", L"\x009c", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */ + /* 46 */ { L"en-US", 0, L"\x3099", L"\x309a", CSTR_EQUAL }, /* MIN_WEIGHT */ + /* 47 */ { L"en-US", 0, L"\x309b", L"\x05a2", CSTR_EQUAL }, /* MIN_WEIGHT */ + /* 48 */ { L"en-US", 0, L"\xff9e", L"\x0e47", CSTR_EQUAL }, /* MIN_WEIGHT */ +}; + +static void test_unicode_sorting(void) +{ + int i; + if (!pLCMapStringEx) + { + + win_skip("LCMapStringEx not available\n"); + return; + } + for (i = 0; i < ARRAY_SIZE(unicode_sorting_tests); i++) + { + int pos; + BYTE buff1[1000]; + BYTE buff2[1000]; + int len1, len2; + int result = CSTR_EQUAL; + const struct sorting_test_entry* entry = &unicode_sorting_tests[i]; + + len1 = pLCMapStringEx(entry->locale, LCMAP_SORTKEY | entry->flags, entry->first, -1, (WCHAR*)buff1, ARRAY_SIZE(buff1), NULL, NULL, 0); + len2 = pLCMapStringEx(entry->locale, LCMAP_SORTKEY | entry->flags, entry->second, -1, (WCHAR*)buff2, ARRAY_SIZE(buff2), NULL, NULL, 0); + + for (pos = 0; pos < len1 && pos < len2; pos++) + { + if (buff1[pos] > buff2[pos]) + { + result = CSTR_GREATER_THAN; + break; + } + else if (buff1[pos] < buff2[pos]) + { + result = CSTR_LESS_THAN; + break; + } + } + + ok (result == entry->result_sortkey || 
broken(entry->broken_on_old_win), "Test %d - Expected %d, got %d\n", i, entry->result_sortkey, result); + } +} + static void test_FoldStringA(void) { int ret, i, j; @@ -6897,4 +7006,5 @@ START_TEST(locale) test_NLSVersion(); /* this requires collation table patch to make it MS compatible */ if (0) test_sorting(); + test_unicode_sorting(); } diff --git a/dlls/kernelbase/locale.c b/dlls/kernelbase/locale.c index 53e4e42da3..74177371d9 100644 --- a/dlls/kernelbase/locale.c +++ b/dlls/kernelbase/locale.c @@ -2126,127 +2126,6 @@ static int wcstombs_codepage( UINT codepage, DWORD flags, const WCHAR *src, int return wcstombs_sbcs( info, src, srclen, dst, dstlen ); }
- -static int get_sortkey( DWORD flags, const WCHAR *src, int srclen, char *dst, int dstlen ) -{ - WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ - int key_len[4]; - char *key_ptr[4]; - const WCHAR *src_save = src; - int srclen_save = srclen; - - key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0; - for (; srclen; srclen--, src++) - { - unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ - dummy[0] = *src; - if (decomposed_len) - { - for (i = 0; i < decomposed_len; i++) - { - WCHAR wch = dummy[i]; - unsigned int ce; - - if ((flags & NORM_IGNORESYMBOLS) && - (get_char_type( CT_CTYPE1, wch ) & (C1_PUNCT | C1_SPACE))) - continue; - - if (flags & NORM_IGNORECASE) wch = casemap( nls_info.LowerCaseTable, wch ); - - ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)]; - if (ce != (unsigned int)-1) - { - if (ce >> 16) key_len[0] += 2; - if ((ce >> 8) & 0xff) key_len[1]++; - if ((ce >> 4) & 0x0f) key_len[2]++; - if (ce & 1) - { - if (wch >> 8) key_len[3]++; - key_len[3]++; - } - } - else - { - key_len[0] += 2; - if (wch >> 8) key_len[0]++; - if (wch & 0xff) key_len[0]++; - } - } - } - } - - if (!dstlen) /* compute length */ - /* 4 * '\1' + key length */ - return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4; - - if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1) - return 0; /* overflow */ - - src = src_save; - srclen = srclen_save; - - key_ptr[0] = dst; - key_ptr[1] = key_ptr[0] + key_len[0] + 1; - key_ptr[2] = key_ptr[1] + key_len[1] + 1; - key_ptr[3] = key_ptr[2] + key_len[2] + 1; - - for (; srclen; srclen--, src++) - { - unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ - dummy[0] = *src; - if (decomposed_len) - { - for (i = 0; i < decomposed_len; i++) - { - WCHAR wch = dummy[i]; - unsigned int ce; - - if ((flags & NORM_IGNORESYMBOLS) && - (get_char_type( CT_CTYPE1, wch ) & (C1_PUNCT | C1_SPACE))) - continue; - - if (flags & 
NORM_IGNORECASE) wch = casemap( nls_info.LowerCaseTable, wch ); - - ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)]; - if (ce != (unsigned int)-1) - { - WCHAR key; - if ((key = ce >> 16)) - { - *key_ptr[0]++ = key >> 8; - *key_ptr[0]++ = key & 0xff; - } - /* make key 1 start from 2 */ - if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1; - /* make key 2 start from 2 */ - if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1; - /* key 3 is always a character code */ - if (ce & 1) - { - if (wch >> 8) *key_ptr[3]++ = wch >> 8; - if (wch & 0xff) *key_ptr[3]++ = wch & 0xff; - } - } - else - { - *key_ptr[0]++ = 0xff; - *key_ptr[0]++ = 0xfe; - if (wch >> 8) *key_ptr[0]++ = wch >> 8; - if (wch & 0xff) *key_ptr[0]++ = wch & 0xff; - } - } - } - } - - *key_ptr[0] = 1; - *key_ptr[1] = 1; - *key_ptr[2] = 1; - *key_ptr[3]++ = 1; - *key_ptr[3] = 0; - return key_ptr[3] - dst; -} - - /* compose a full-width katakana. return consumed source characters. */ static int compose_katakana( const WCHAR *src, int srclen, WCHAR *dst ) { @@ -2574,6 +2453,358 @@ static int compare_weights(int flags, const WCHAR *str1, int len1, return len1 - len2; }
+/* Start sortkey handler code. */ + +/* Defines */ + +#define JAPANESE 3 +#define MIN_WEIGHT 2 +#define LIST_STACK_BUFFER 1000 + +/* Internal structures */ + +typedef struct _character_info +{ + BYTE weight_primary; + BYTE script_member; + BYTE weight_diacritic; + BYTE weight_case; +} character_info; + +typedef struct _weight_main_info +{ + BYTE script_member; + BYTE weight_primary; + BYTE extra; +} weight_main_info; + +typedef struct _list +{ + int extra_len; + int len; + BYTE buffer[LIST_STACK_BUFFER]; + int buffer_count; + BYTE* extra; + int element_size; +} list; + +typedef struct _sortkey_data +{ + int flags; + list key; + list weights_main; + list weights_diacritic; + list weights_case; +} sortkey_data; + +/* List functions */ + +static void LIST_INIT(list* name, int type_size) +{ + name->extra_len = 0; + name->len = 0; + name->extra = 0; + name->buffer_count = LIST_STACK_BUFFER / type_size; + name->element_size = type_size; +} + +static void LIST_DESTROY(list* name) +{ + RtlFreeHeap(GetProcessHeap(), 0, name->extra); +} + +static void* LIST_GET(list* name, int index) +{ + if ((index + 1) * name->element_size <= LIST_STACK_BUFFER) + return &name->buffer[index * name->element_size]; + else + return &name->extra[index * name->element_size - name->buffer_count]; +} + +/* Add entry to list, resizing as needed */ +static void LIST_ADD(list* name, const void *value) +{ + void* entry; + if ((name->len + 1) * name->element_size > name->extra_len + LIST_STACK_BUFFER) + { + if (!name->extra) /* First allocation */ + { + name->extra_len = LIST_STACK_BUFFER; + name->extra = RtlAllocateHeap(GetProcessHeap(), 0, name->extra_len); + } + else + { + name->extra_len *= 2; + name->extra = RtlReAllocateHeap(GetProcessHeap(), 0,name->extra, name->extra_len); + } + } + entry = LIST_GET(name, name->len); + memcpy(entry, value, name->element_size); + name->len++; +} + +/* Append a weight list to the sortkey */ +#define APPEND_LIST_TO_SORTKEY(data, weights, type, 
statement_get_value, statement_is_ignored) \ + do { \ + int z; \ + int end = data->weights.len - 1; \ + while (end >= 0) \ + { \ + const type* element = LIST_GET(&data->weights, end); \ + (void)element; \ + if (!(statement_is_ignored)) break; \ + end--; \ + } \ + for (z = 0; z <= end; z++) \ + { \ + const type* element = LIST_GET(&data->weights, z); \ + LIST_ADD(&data->key, statement_get_value); \ + } \ + } while (0); + +/* Helper functions */ + +static BOOL get_char(sortkey_data* data, character_info* info, WCHAR ch) +{ + DWORD value = sort.keys[ch]; + + info->weight_case = value >> 24; + info->weight_diacritic = (value >> 16) & 0xff; + info->script_member = (value >> 8) & 0xff; + info->weight_primary = value & 0xff; + return info->script_member != 0; +} + +static void sortkey_data_init(sortkey_data* data, int flags, const WCHAR* locale, BOOL is_compare_string) +{ + data->flags = flags; + LIST_INIT(&data->key, sizeof(BYTE)); + LIST_INIT(&data->weights_main, sizeof(BYTE)); + LIST_INIT(&data->weights_diacritic, sizeof(BYTE)); + LIST_INIT(&data->weights_case, sizeof(BYTE)); +} + +static void sortkey_data_destroy(sortkey_data* data) +{ + LIST_DESTROY(&data->key); + LIST_DESTROY(&data->weights_main); + LIST_DESTROY(&data->weights_diacritic); + LIST_DESTROY(&data->weights_case); +} + +static weight_main_info create_weight_main(BYTE script_member, BYTE weight_primary) +{ + weight_main_info ret = { 0 }; + ret.script_member = script_member; + ret.weight_primary = weight_primary; + return ret; +} + +static void case_weights_add(sortkey_data* data, BYTE value) +{ + int flags = data->flags; + if (NORM_IGNORECASE & flags) + value = value & ~(16 + 8); + if (NORM_IGNOREWIDTH & flags) + value = value & ~(1); + if (NORM_IGNOREKANATYPE & flags) + value = value & ~(32); + + LIST_ADD(&data->weights_case, &value); +} + +static void main_weights_add(sortkey_data *data, weight_main_info* value) +{ + LIST_ADD(&data->weights_main, &value->script_member); + LIST_ADD(&data->weights_main, 
&value->weight_primary); + if (value->extra > 0) + LIST_ADD(&data->weights_main, &value->extra); +} + +static void diacritic_weights_add(sortkey_data* data, const character_info* info, BYTE value) +{ + LIST_ADD(&data->weights_diacritic, &value); +} + +/* Main sortkey logic */ + +static void sortkey_handle_default_character(sortkey_data* data, WCHAR c) +{ + weight_main_info weightmain; + character_info info; + + if (!get_char(data, &info, c)) + { + return; + } + + weightmain = create_weight_main(info.script_member, info.weight_primary); + if (info.script_member >= 0xa9 && info.script_member <= 0xaf) /* Some CJK have extra value */ + weightmain.extra = info.weight_diacritic; + else + diacritic_weights_add(data, &info, info.weight_diacritic); + + main_weights_add(data, &weightmain); + + case_weights_add(data, info.weight_case); +} + +static BOOL sortkey_handle_character(sortkey_data* data, WCHAR c, const WCHAR* str, int i) +{ + weight_main_info weightmain; + character_info info; + int flags = data->flags; + + if (!get_char(data, &info, c)) + { + return FALSE; + } + + switch (info.script_member) + { + case 0: /* Not sorted */ + break; + + case 1: + if (data->weights_diacritic.len > 0) + { + BYTE* entry = LIST_GET(&data->weights_diacritic, data->weights_diacritic.len - 1); + *entry += info.weight_diacritic; /* Overflow can happen, that's okay */ + } + else + diacritic_weights_add(data, &info, info.weight_diacritic); + break; + + case JAPANESE: + /* TODO */ + break; + + case 4: /* Jamo */ + weightmain = create_weight_main(info.weight_primary, info.weight_diacritic); + main_weights_add(data, &weightmain); + + diacritic_weights_add(data, &info, MIN_WEIGHT); + + case_weights_add(data, info.weight_case); + break; + + case 5: + weightmain = create_weight_main(253, 255); + main_weights_add(data, &weightmain); + + weightmain = create_weight_main(info.weight_primary, info.weight_diacritic); + main_weights_add(data, &weightmain); + + diacritic_weights_add(data, &info, 
MIN_WEIGHT); + + case_weights_add(data, MIN_WEIGHT); + break; + + case 6: /* Punctuation */ + /* TODO */ + break; + + case 7: /* Symbols */ + case 8: /* Symbols */ + case 9: /* Symbols */ + case 10: /* Symbols */ + case 11: /* Symbols */ + case 12: /* Symbols */ + if (flags & NORM_IGNORESYMBOLS) + break; + + weightmain = create_weight_main(info.script_member, info.weight_primary); + main_weights_add(data, &weightmain); + + diacritic_weights_add(data, &info, info.weight_diacritic); + + case_weights_add(data, info.weight_case); + break; + + default: + sortkey_handle_default_character(data, c); + break; + } + return TRUE; +} + +static void sortkey_write_result(sortkey_data* data) +{ + int flags = data->flags; + + const BYTE SORTKEY_SEPARATOR = 1; + const BYTE SORTKEY_TERMINATOR = 0; + + /* Main weights */ + + APPEND_LIST_TO_SORTKEY(data, weights_main, BYTE, element, FALSE); + + LIST_ADD(&data->key, &SORTKEY_SEPARATOR); + + /* Diacritic weights */ + + if ((flags & NORM_IGNORENONSPACE) == 0) + { + APPEND_LIST_TO_SORTKEY(data, weights_diacritic, BYTE, element, *element <= MIN_WEIGHT); + } + + LIST_ADD(&data->key, &SORTKEY_SEPARATOR); + + /* Case weights */ + if ((NORM_IGNORECASE & flags) == 0 || (NORM_IGNOREWIDTH & flags) == 0) + { + APPEND_LIST_TO_SORTKEY(data, weights_case, BYTE, element, FALSE); + } + + LIST_ADD(&data->key, &SORTKEY_SEPARATOR); + + /* Extra weights */ + /* TODO */ + + LIST_ADD(&data->key, &SORTKEY_SEPARATOR); + + /* Special weights */ + /* TODO */ + + LIST_ADD(&data->key, &SORTKEY_TERMINATOR); +} + +static int sortkey_generate(int flags, const WCHAR* locale, const WCHAR* str, int str_len, BYTE* buffer, int buffer_len) +{ + int i; + sortkey_data data; + int ret = 0; + + sortkey_data_init(&data, flags, locale, FALSE); + + if (str_len == -1) + str_len = wcslen(str); + + for (i = 0; i < str_len; i++) + { + sortkey_handle_character(&data, str[i], str, i); + } + + sortkey_write_result(&data); + + if (data.key.len <= buffer_len) + { + for (i = 0; i < 
data.key.len; i++) + { + BYTE* value = LIST_GET(&data.key, i); + buffer[i] = *value; + } + ret = data.key.len; + } + else if (!buffer) + { + ret = data.key.len; + } + sortkey_data_destroy(&data); + return ret; +} + +/* End sortkey handler code */
static const struct geoinfo *get_geoinfo_ptr( GEOID geoid ) { @@ -4964,8 +5195,8 @@ INT WINAPI DECLSPEC_HOTPATCH LCMapStringEx( const WCHAR *locale, DWORD flags, co TRACE( "(%s,0x%08x,%s,%d,%p,%d)\n", debugstr_w(locale), flags, debugstr_wn(src, srclen), srclen, dst, dstlen );
- if ((ret = get_sortkey( flags, src, srclen, (char *)dst, dstlen ))) ret++; - else SetLastError( ERROR_INSUFFICIENT_BUFFER ); + if (!(ret = sortkey_generate(flags, L"", src, srclen, (BYTE *)dst, dstlen ))) + SetLastError( ERROR_INSUFFICIENT_BUFFER ); return ret; }
-- 2.26.2
Signed-off-by: Fabian Maurer <dark.shadow4@web.de>
---
 dlls/kernel32/tests/locale.c | 17 ++++++++++++++++
 dlls/kernelbase/locale.c     | 39 ++++++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)
diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c index 13839bb10a..e3b1710581 100644 --- a/dlls/kernel32/tests/locale.c +++ b/dlls/kernel32/tests/locale.c @@ -3176,6 +3176,23 @@ static const struct sorting_test_entry unicode_sorting_tests[] = /* 46 */ { L"en-US", 0, L"\x3099", L"\x309a", CSTR_EQUAL }, /* MIN_WEIGHT */ /* 47 */ { L"en-US", 0, L"\x309b", L"\x05a2", CSTR_EQUAL }, /* MIN_WEIGHT */ /* 48 */ { L"en-US", 0, L"\xff9e", L"\x0e47", CSTR_EQUAL }, /* MIN_WEIGHT */ + /* 49 */ { L"en-US", 0, L"\x001b", L"\x001c", CSTR_LESS_THAN }, /* Punctuation primary weight */ + /* 50 */ { L"en-US", 0, L"\x0005", L"\x0006", CSTR_LESS_THAN }, /* Punctuation primary weight */ + /* 51 */ { L"en-US", 0, L"\x0027", L"\xff07", CSTR_LESS_THAN, 0, TRUE }, /* Punctuation diacritic/case weight */ + /* 52 */ { L"en-US", 0, L"\x07f4", L"\x07f5", CSTR_LESS_THAN, 0, TRUE }, /* Punctuation diacritic/case weight */ + /* 53 */ { L"en-US", 0, L"\x207b", L"\x0008", CSTR_GREATER_THAN }, /* Punctuation diacritic/case weight */ + /* 54 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x207b", L"\x0008", CSTR_EQUAL }, /* Punctuation NORM_IGNORESYMBOLS */ + /* 55 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x0004", L"\x0011", CSTR_EQUAL }, /* Punctuation NORM_IGNORESYMBOLS */ + /* 56 */ { L"en-US", NORM_IGNORESYMBOLS | SORT_STRINGSORT, L"\x207b", L"\x0008", CSTR_EQUAL }, /* Punctuation NORM_IGNORESYMBOLS SORT_STRINGSORT */ + /* 57 */ { L"en-US", NORM_IGNORESYMBOLS | SORT_STRINGSORT, L"\x0004", L"\x0011", CSTR_EQUAL }, /* Punctuation NORM_IGNORESYMBOLS SORT_STRINGSORT */ + /* 58 */ { L"en-US", SORT_STRINGSORT, L"\x001a", L"\x001b", CSTR_LESS_THAN }, /* Punctuation SORT_STRINGSORT main weight */ + /* 59 */ { L"en-US", SORT_STRINGSORT, L"\x2027", L"\x2011", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT main weight */ + /* 60 */ { L"en-US", SORT_STRINGSORT, L"\x3030", L"\x301c", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT main weight */ + /* 61 */ { L"en-US", SORT_STRINGSORT, 
L"\x058a", L"\x2010", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT diacritic weight */ + /* 62 */ { L"en-US", SORT_STRINGSORT, L"\x07F5", L"\x07F4", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT diacritic weight */ + /* 63 */ { L"en-US", SORT_STRINGSORT, L"\xfe32", L"\x2013", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT case weight */ + /* 64 */ { L"en-US", SORT_STRINGSORT, L"\xfe31", L"\xfe58", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT case weight */ + /* 65 */ { L"en-US", SORT_STRINGSORT, L"\xff07", L"\x0027", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT case weight */ };
static void test_unicode_sorting(void) diff --git a/dlls/kernelbase/locale.c b/dlls/kernelbase/locale.c index 74177371d9..f72061adb7 100644 --- a/dlls/kernelbase/locale.c +++ b/dlls/kernelbase/locale.c @@ -2478,6 +2478,12 @@ typedef struct _weight_main_info BYTE extra; } weight_main_info;
+typedef struct _weight_special_info +{ + BYTE script_member; + BYTE weight_primary; +} weight_special_info; + typedef struct _list { int extra_len; @@ -2495,6 +2501,7 @@ typedef struct _sortkey_data list weights_main; list weights_diacritic; list weights_case; + list weights_special; } sortkey_data;
/* List functions */ @@ -2582,6 +2589,7 @@ static void sortkey_data_init(sortkey_data* data, int flags, const WCHAR* locale LIST_INIT(&data->weights_main, sizeof(BYTE)); LIST_INIT(&data->weights_diacritic, sizeof(BYTE)); LIST_INIT(&data->weights_case, sizeof(BYTE)); + LIST_INIT(&data->weights_special, sizeof(BYTE)); }
static void sortkey_data_destroy(sortkey_data* data) @@ -2590,6 +2598,7 @@ static void sortkey_data_destroy(sortkey_data* data) LIST_DESTROY(&data->weights_main); LIST_DESTROY(&data->weights_diacritic); LIST_DESTROY(&data->weights_case); + LIST_DESTROY(&data->weights_special); }
static weight_main_info create_weight_main(BYTE script_member, BYTE weight_primary) @@ -2621,6 +2630,12 @@ static void main_weights_add(sortkey_data *data, weight_main_info* value) LIST_ADD(&data->weights_main, &value->extra); }
+static void special_weights_add(sortkey_data* data, weight_special_info* value) +{ + LIST_ADD(&data->weights_special, &value->script_member); + LIST_ADD(&data->weights_special, &value->weight_primary); +} + static void diacritic_weights_add(sortkey_data* data, const character_info* info, BYTE value) { LIST_ADD(&data->weights_diacritic, &value); @@ -2701,7 +2716,26 @@ static BOOL sortkey_handle_character(sortkey_data* data, WCHAR c, const WCHAR* s break;
case 6: /* Punctuation */ - /* TODO */ + if (flags & NORM_IGNORESYMBOLS) + break; + + if (flags & SORT_STRINGSORT) + { + weightmain = create_weight_main(info.script_member, info.weight_primary); + main_weights_add(data, &weightmain); + + diacritic_weights_add(data, &info, info.weight_diacritic); + + case_weights_add(data, info.weight_case); + } + else + { + weight_special_info special; + + special.script_member = info.weight_primary; + special.weight_primary = (BYTE)(info.weight_diacritic * 8 + info.weight_case); /* Logic found through testing, seems to work reliably */ + special_weights_add(data, &special); + } break;
case 7: /* Symbols */ @@ -2764,7 +2798,8 @@ static void sortkey_write_result(sortkey_data* data) LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
/* Special weights */ - /* TODO */ + + APPEND_LIST_TO_SORTKEY(data, weights_special, BYTE, element, FALSE);
LIST_ADD(&data->key, &SORTKEY_TERMINATOR); } -- 2.26.2
Signed-off-by: Fabian Maurer <dark.shadow4@web.de>
---
 dlls/kernel32/tests/locale.c | 26 +++++++++++++++
 dlls/kernelbase/locale.c     | 65 ++++++++++++++++++++++++++++++++++--
 2 files changed, 89 insertions(+), 2 deletions(-)
diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c index e3b1710581..5796fdbf9a 100644 --- a/dlls/kernel32/tests/locale.c +++ b/dlls/kernel32/tests/locale.c @@ -3193,6 +3193,32 @@ static const struct sorting_test_entry unicode_sorting_tests[] = /* 63 */ { L"en-US", SORT_STRINGSORT, L"\xfe32", L"\x2013", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT case weight */ /* 64 */ { L"en-US", SORT_STRINGSORT, L"\xfe31", L"\xfe58", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT case weight */ /* 65 */ { L"en-US", SORT_STRINGSORT, L"\xff07", L"\x0027", CSTR_GREATER_THAN }, /* Punctuation SORT_STRINGSORT case weight */ + /* 66 */ { L"en-US", 0, L"\x04b0", L"\x32db", CSTR_LESS_THAN }, /* Japanese main weight */ + /* 67 */ { L"en-US", 0, L"\x3093", L"\x1e62\x013f", CSTR_GREATER_THAN }, /* japanese main weight */ + /* 68 */ { L"en-US", 0, L"\x30d3", L"\x30d4", CSTR_LESS_THAN }, /* japanese diacritic weight */ + /* 69 */ { L"en-US", 0, L"\x307b", L"\x307c", CSTR_LESS_THAN }, /* japanese diacritic weight */ + /* 70 */ { L"en-US", 0, L"\x30ea", L"\x32f7", CSTR_LESS_THAN }, /* japanese diacritic weight */ + /* 71 */ { L"en-US", 0, L"\x31fb", L"\x30e9", CSTR_LESS_THAN }, /* japanese case weight small */ + /* 72 */ { L"en-US", 0, L"\x30db", L"\x31f9", CSTR_GREATER_THAN }, /* japanese case weight small */ + /* 73 */ { L"en-US", 0, L"\xff6d", L"\xff95", CSTR_LESS_THAN }, /* japanese case weight small */ + /* 74 */ { L"en-US", NORM_IGNORENONSPACE, L"\x31fb", L"\x30e9", CSTR_EQUAL }, /* japanese case weight small */ + /* 75 */ { L"en-US", NORM_IGNORENONSPACE, L"\x30db", L"\x31f9", CSTR_EQUAL }, /* japanese case weight small */ + /* 76 */ { L"en-US", NORM_IGNORENONSPACE, L"\xff6d", L"\xff95", CSTR_EQUAL }, /* japanese case weight small */ + /* 77 */ { L"en-US", 0, L"\x30d5", L"\x3075", CSTR_LESS_THAN }, /* japanese case weight kana */ + /* 78 */ { L"en-US", 0, L"\x306a", L"\x30ca", CSTR_GREATER_THAN }, /* japanese case weight kana */ + /* 79 */ { L"en-US", 
0, L"\x305a", L"\x30ba", CSTR_GREATER_THAN }, /* japanese case weight kana */ + /* 80 */ { L"en-US", NORM_IGNOREKANATYPE, L"\x30d5", L"\x3075", CSTR_EQUAL }, /* japanese case weight kana */ + /* 81 */ { L"en-US", NORM_IGNOREKANATYPE, L"\x306a", L"\x30ca", CSTR_EQUAL }, /* japanese case weight kana */ + /* 82 */ { L"en-US", NORM_IGNOREKANATYPE, L"\x305a", L"\x30ba", CSTR_EQUAL }, /* japanese case weight kana */ + /* 83 */ { L"en-US", 0, L"\x30bf", L"\xff80", CSTR_GREATER_THAN }, /* japanese case weight width */ + /* 84 */ { L"en-US", 0, L"\x30ab", L"\xff76", CSTR_GREATER_THAN }, /* japanese case weight width */ + /* 85 */ { L"en-US", 0, L"\x30a2", L"\xff71", CSTR_GREATER_THAN }, /* japanese case weight width */ + /* 86 */ { L"en-US", NORM_IGNOREWIDTH, L"\x30bf", L"\xff80", CSTR_EQUAL }, /* japanese case weight width */ + /* 87 */ { L"en-US", NORM_IGNOREWIDTH, L"\x30ab", L"\xff76", CSTR_EQUAL }, /* japanese case weight width */ + /* 88 */ { L"en-US", NORM_IGNOREWIDTH, L"\x30a2", L"\xff71", CSTR_EQUAL }, /* japanese case weight width */ + /* 89 */ { L"en-US", NORM_IGNORENONSPACE, L"\x31a2", L"\x3110", CSTR_EQUAL }, /* NORM_IGNORENONSPACE */ + /* 90 */ { L"en-US", NORM_IGNORENONSPACE, L"\x1342", L"\x133a", CSTR_EQUAL }, /* NORM_IGNORENONSPACE */ + /* 91 */ { L"en-US", NORM_IGNORENONSPACE, L"\x16a4", L"\x16a5", CSTR_EQUAL }, /* NORM_IGNORENONSPACE */ };
static void test_unicode_sorting(void) diff --git a/dlls/kernelbase/locale.c b/dlls/kernelbase/locale.c index f72061adb7..40b5f521e0 100644 --- a/dlls/kernelbase/locale.c +++ b/dlls/kernelbase/locale.c @@ -2484,6 +2484,13 @@ typedef struct _weight_special_info BYTE weight_primary; } weight_special_info;
+typedef struct _weight_extra_info +{ + BYTE flag_small; + BYTE flag_kana; + BYTE flag_width; +} weight_extra_info; + typedef struct _list { int extra_len; @@ -2502,6 +2509,7 @@ typedef struct _sortkey_data list weights_diacritic; list weights_case; list weights_special; + list weights_extra; } sortkey_data;
/* List functions */ @@ -2590,6 +2598,7 @@ static void sortkey_data_init(sortkey_data* data, int flags, const WCHAR* locale LIST_INIT(&data->weights_diacritic, sizeof(BYTE)); LIST_INIT(&data->weights_case, sizeof(BYTE)); LIST_INIT(&data->weights_special, sizeof(BYTE)); + LIST_INIT(&data->weights_extra, sizeof(weight_extra_info)); }
static void sortkey_data_destroy(sortkey_data* data) @@ -2599,6 +2608,7 @@ static void sortkey_data_destroy(sortkey_data* data) LIST_DESTROY(&data->weights_diacritic); LIST_DESTROY(&data->weights_case); LIST_DESTROY(&data->weights_special); + LIST_DESTROY(&data->weights_extra); }
static weight_main_info create_weight_main(BYTE script_member, BYTE weight_primary) @@ -2664,6 +2674,36 @@ static void sortkey_handle_default_character(sortkey_data* data, WCHAR c) case_weights_add(data, info.weight_case); }
+static void sortkey_handle_japanese_character(sortkey_data* data, weight_main_info* weightmain, const character_info* info, const character_info* info_other) +{ + const BYTE BASELINE_EXTRA = 0xc4; + const BYTE ISOLATE_KANA = 0x20 | BASELINE_EXTRA; /* if bit is set then hiragana, else katakana */ + const BYTE ISOLATE_SMALL = 0x2 | BASELINE_EXTRA; /* if bit is set then normal kana, else small kana */ + const BYTE ISOLATE_WIDTH = 0x1 | BASELINE_EXTRA; /* if bit is set then full width, else half width */ + int weight_case; + weight_extra_info extra; + + weightmain->script_member = 34; + weightmain->weight_primary = info_other->weight_primary; + + main_weights_add(data, weightmain); + + weight_case = info_other->weight_case | BASELINE_EXTRA; + + extra.flag_small = (BYTE)(weight_case & ISOLATE_SMALL); + extra.flag_kana = (BYTE)(weight_case & ISOLATE_KANA); + extra.flag_width = (BYTE)(weight_case & ISOLATE_WIDTH); + + if (data->flags & NORM_IGNOREKANATYPE) + extra.flag_kana = BASELINE_EXTRA; + if (data->flags & NORM_IGNOREWIDTH) + extra.flag_width = BASELINE_EXTRA; + LIST_ADD(&data->weights_extra, &extra); + + diacritic_weights_add(data, info, info->weight_diacritic); + case_weights_add(data, MIN_WEIGHT); +} + static BOOL sortkey_handle_character(sortkey_data* data, WCHAR c, const WCHAR* str, int i) { weight_main_info weightmain; @@ -2691,7 +2731,16 @@ static BOOL sortkey_handle_character(sortkey_data* data, WCHAR c, const WCHAR* s break;
case JAPANESE: - /* TODO */ + weightmain = create_weight_main(info.script_member, info.weight_primary); + + if (weightmain.weight_primary <= 1) + { + /* TODO */ + } + else + { + sortkey_handle_japanese_character(data, &weightmain, &info, &info); + } break;
case 4: /* Jamo */ @@ -2793,7 +2842,19 @@ static void sortkey_write_result(sortkey_data* data) LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
/* Extra weights */ - /* TODO */ + if (data->weights_extra.len > 0) + { + const BYTE EXTRA_SEPARATOR = 0xff; + if ((NORM_IGNORENONSPACE & flags) == 0) + { + APPEND_LIST_TO_SORTKEY(data, weights_extra, weight_extra_info, &element->flag_small, element->flag_small > 196); + } + LIST_ADD(&data->key, &EXTRA_SEPARATOR); + APPEND_LIST_TO_SORTKEY(data, weights_extra, weight_extra_info, &element->flag_kana, element->flag_kana > 196); + LIST_ADD(&data->key, &EXTRA_SEPARATOR); + APPEND_LIST_TO_SORTKEY(data, weights_extra, weight_extra_info, &element->flag_width, element->flag_width > 196); + LIST_ADD(&data->key, &EXTRA_SEPARATOR); + }
LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
-- 2.26.2
Signed-off-by: Fabian Maurer dark.shadow4@web.de --- dlls/kernel32/tests/locale.c | 3 +++ dlls/kernelbase/locale.c | 45 +++++++++++++++++++++++++++++++++--- 2 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c index 5796fdbf9a..8c976e70fc 100644 --- a/dlls/kernel32/tests/locale.c +++ b/dlls/kernel32/tests/locale.c @@ -3219,6 +3219,9 @@ static const struct sorting_test_entry unicode_sorting_tests[] = /* 89 */ { L"en-US", NORM_IGNORENONSPACE, L"\x31a2", L"\x3110", CSTR_EQUAL }, /* NORM_IGNORENONSPACE */ /* 90 */ { L"en-US", NORM_IGNORENONSPACE, L"\x1342", L"\x133a", CSTR_EQUAL }, /* NORM_IGNORENONSPACE */ /* 91 */ { L"en-US", NORM_IGNORENONSPACE, L"\x16a4", L"\x16a5", CSTR_EQUAL }, /* NORM_IGNORENONSPACE */ + /* 92 */ { L"en-US", 0, L"\x00c6", L"\x0041\x0045", CSTR_EQUAL }, /* Expansion */ + /* 93 */ { L"en-US", 0, L"\x0f5c", L"\x0f5b\x0fb7", CSTR_EQUAL }, /* Expansion */ + /* 94 */ { L"en-US", 0, L"\x05f0", L"\x05d5\x05d5", CSTR_EQUAL }, /* Expansion */ };
static void test_unicode_sorting(void) diff --git a/dlls/kernelbase/locale.c b/dlls/kernelbase/locale.c index 40b5f521e0..a12b8bdb30 100644 --- a/dlls/kernelbase/locale.c +++ b/dlls/kernelbase/locale.c @@ -2471,6 +2471,13 @@ typedef struct _character_info BYTE weight_case; } character_info;
+typedef struct _character_info_expansion +{ + int character; + int character_result1; + int character_result2; +} character_info_expansion; + typedef struct _weight_main_info { BYTE script_member; @@ -2582,6 +2589,8 @@ static void LIST_ADD(list* name, const void *value) static BOOL get_char(sortkey_data* data, character_info* info, WCHAR ch) { DWORD value = sort.keys[ch]; + if ((WORD)value == 0x200) /* Expansion */ + return FALSE;
info->weight_case = value >> 24; info->weight_diacritic = (value >> 16) & 0xff; @@ -2590,6 +2599,28 @@ static BOOL get_char(sortkey_data* data, character_info* info, WCHAR ch) return info->script_member != 0; }
+static BOOL get_expansion(character_info_expansion* info, WCHAR ch) +{ + DWORD pos_info = sort.keys[ch]; + int count = (WORD)pos_info; + int pos = pos_info >> 16; + const DWORD* ptr; + const WCHAR* p; + int count_expansion; + if (count != 0x200) + return FALSE; + ptr = (const DWORD *)(sort.guids + sort.guid_count); + count_expansion = *ptr++; + if (pos >= count_expansion) + return FALSE; + p = (const WCHAR *)(ptr + pos); + info->character = ch; + info->character_result1 = p[0]; + info->character_result2 = p[1]; + return TRUE; +} + + static void sortkey_data_init(sortkey_data* data, int flags, const WCHAR* locale, BOOL is_compare_string) { data->flags = flags; @@ -2653,14 +2684,21 @@ static void diacritic_weights_add(sortkey_data* data, const character_info* info
/* Main sortkey logic */
-static void sortkey_handle_default_character(sortkey_data* data, WCHAR c) +static BOOL sortkey_handle_default_character(sortkey_data* data, WCHAR c) { weight_main_info weightmain; character_info info; + character_info_expansion expansion;
if (!get_char(data, &info, c)) { - return; + if (get_expansion(&expansion, c)) + { + sortkey_handle_default_character(data, (WCHAR)expansion.character_result1); + sortkey_handle_default_character(data, (WCHAR)expansion.character_result2); + return TRUE; + } + return FALSE; }
weightmain = create_weight_main(info.script_member, info.weight_primary); @@ -2672,6 +2710,7 @@ static void sortkey_handle_default_character(sortkey_data* data, WCHAR c) main_weights_add(data, &weightmain);
case_weights_add(data, info.weight_case); + return TRUE; }
static void sortkey_handle_japanese_character(sortkey_data* data, weight_main_info* weightmain, const character_info* info, const character_info* info_other) @@ -2712,7 +2751,7 @@ static BOOL sortkey_handle_character(sortkey_data* data, WCHAR c, const WCHAR* s
if (!get_char(data, &info, c)) { - return FALSE; + return sortkey_handle_default_character(data, c); }
switch (info.script_member) -- 2.26.2
Signed-off-by: Fabian Maurer dark.shadow4@web.de --- dlls/kernel32/tests/locale.c | 20 ++++++++++++++++++++ dlls/kernelbase/locale.c | 26 ++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c index 8c976e70fc..e5102bbbf3 100644 --- a/dlls/kernel32/tests/locale.c +++ b/dlls/kernel32/tests/locale.c @@ -3222,6 +3222,26 @@ static const struct sorting_test_entry unicode_sorting_tests[] = /* 92 */ { L"en-US", 0, L"\x00c6", L"\x0041\x0045", CSTR_EQUAL }, /* Expansion */ /* 93 */ { L"en-US", 0, L"\x0f5c", L"\x0f5b\x0fb7", CSTR_EQUAL }, /* Expansion */ /* 94 */ { L"en-US", 0, L"\x05f0", L"\x05d5\x05d5", CSTR_EQUAL }, /* Expansion */ + /* 95 */ { L"ja-JP", 0, L"\x6df8", L"\x654b\x29e9", CSTR_LESS_THAN }, /* japanese locale */ + /* 96 */ { L"ja-JP", 0, L"\x685d\x1239\x1b61", L"\x59b6\x6542\x2a62\x04a7", CSTR_LESS_THAN }, /* japanese locale */ + /* 97 */ { L"ja-JP", 0, L"\x62f3\x43e9", L"\x5760", CSTR_LESS_THAN }, /* japanese locale */ + /* 98 */ { L"ja-JP", 0, L"\x634c", L"\x2f0d\x5f1c\x7124", CSTR_LESS_THAN }, /* japanese locale */ + /* 99 */ { L"ja-JP", 0, L"\x69e7\x0502", L"\x57cc" , CSTR_LESS_THAN }, /* japanese locale */ + /* 100 */ { L"ja-JP", 0, L"\x7589", L"\x67c5" , CSTR_LESS_THAN }, /* japanese locale */ + /* 101 */ { L"ja-JP", 0, L"\x5ede\x765c", L"\x7324" , CSTR_GREATER_THAN }, /* japanese locale */ + /* 102 */ { L"ja-JP", 0, L"\x5c7f\x5961", L"\x7cbe" , CSTR_GREATER_THAN }, /* japanese locale */ + /* 103 */ { L"ja-JP", 0, L"\x3162", L"\x6a84\x1549\x0b60" , CSTR_GREATER_THAN }, /* japanese locale */ + /* 104 */ { L"ja-JP", 0, L"\x769e\x448e", L"\x4e6e" , CSTR_LESS_THAN }, /* japanese locale */ + /* 105 */ { L"ja-JP", 0, L"\x59a4", L"\x5faa\x607c", CSTR_GREATER_THAN }, /* japanese locale */ + /* 106 */ { L"ja-JP", 0, L"\x529b", L"\x733f", CSTR_GREATER_THAN }, /* japanese locale */ + /* 107 */ { L"ja-JP", 0, L"\x6ff8\x2a0a", L"\x7953\x6712" , CSTR_GREATER_THAN }, /* japanese locale */ + /* 108 */ { L"ja-JP", 0, L"\x6dfb", L"\x6793", CSTR_LESS_THAN }, /* japanese locale */ + /* 109 */ { L"ja-JP", 0, L"\x67ed", L"\x6aa2", CSTR_GREATER_THAN }, /* japanese locale */ + /* 110 */ { 
L"ja-JP", 0, L"\x4e61", L"\x6350\x6b08", CSTR_GREATER_THAN }, /* japanese locale */ + /* 111 */ { L"ja-JP", 0, L"\x5118", L"\x53b3\x75b4", CSTR_GREATER_THAN }, /* japanese locale */ + /* 112 */ { L"ja-JP", 0, L"\x6bbf", L"\x65a3" , CSTR_LESS_THAN }, /* japanese locale */ + /* 113 */ { L"ja-JP", 0, L"\x5690", L"\x5fa8", CSTR_GREATER_THAN }, /* japanese locale */ + /* 114 */ { L"ja-JP", 0, L"\x61e2", L"\x76e5" , CSTR_GREATER_THAN }, /* japanese locale */ };
static void test_unicode_sorting(void) diff --git a/dlls/kernelbase/locale.c b/dlls/kernelbase/locale.c index a12b8bdb30..48284741b0 100644 --- a/dlls/kernelbase/locale.c +++ b/dlls/kernelbase/locale.c @@ -2517,6 +2517,7 @@ typedef struct _sortkey_data list weights_case; list weights_special; list weights_extra; + const struct sortguid* locale; } sortkey_data;
/* List functions */ @@ -2586,9 +2587,28 @@ static void LIST_ADD(list* name, const void *value)
/* Helper functions */
+static DWORD get_exception(sortkey_data* data, WCHAR ch) +{ + if (data->locale && data->locale->except) + { + DWORD* table = sort.keys + data->locale->except; + DWORD hi = ch >> 8; + DWORD lo = ch & 0xff; + if (table[hi] == hi * 0x100) + return 0; + if (sort.keys[table[hi] + lo] == sort.keys[hi * 0x100 + lo]) + return 0; + return sort.keys[table[hi] + lo]; + } + return 0; +} + static BOOL get_char(sortkey_data* data, character_info* info, WCHAR ch) { - DWORD value = sort.keys[ch]; + DWORD value = get_exception(data, ch); + if (!value) + value = sort.keys[ch]; + if ((WORD)value == 0x200) /* Expansion */ return FALSE;
@@ -2596,6 +2616,7 @@ static BOOL get_char(sortkey_data* data, character_info* info, WCHAR ch) info->weight_diacritic = (value >> 16) & 0xff; info->script_member = (value >> 8) & 0xff; info->weight_primary = value & 0xff; + return info->script_member != 0; }
@@ -2630,6 +2651,7 @@ static void sortkey_data_init(sortkey_data* data, int flags, const WCHAR* locale LIST_INIT(&data->weights_case, sizeof(BYTE)); LIST_INIT(&data->weights_special, sizeof(BYTE)); LIST_INIT(&data->weights_extra, sizeof(weight_extra_info)); + data->locale = get_language_sort(locale); }
static void sortkey_data_destroy(sortkey_data* data) @@ -5330,7 +5352,7 @@ INT WINAPI DECLSPEC_HOTPATCH LCMapStringEx( const WCHAR *locale, DWORD flags, co TRACE( "(%s,0x%08x,%s,%d,%p,%d)\n", debugstr_w(locale), flags, debugstr_wn(src, srclen), srclen, dst, dstlen );
- if (!(ret = sortkey_generate(flags, L"", src, srclen, (BYTE *)dst, dstlen ))) + if (!(ret = sortkey_generate(flags, locale, src, srclen, (BYTE *)dst, dstlen ))) SetLastError( ERROR_INSUFFICIENT_BUFFER ); return ret; } -- 2.26.2
On 4/28/20 1:17 PM, Fabian Maurer wrote:
Signed-off-by: Fabian Maurer <dark.shadow4@web.de>
dlls/kernel32/tests/locale.c | 110 ++++++++ dlls/kernelbase/locale.c | 477 ++++++++++++++++++++++++++--------- 2 files changed, 464 insertions(+), 123 deletions(-)
So as far as I understand, the sort key algorithm writes the level 0 weights (script and alphabetic weight) for the whole string to the sort key, then the level 1 weights (diacritic), and so on, right?
In that case, what seems potentially simpler to me is to calculate those weights one level at a time, rather than one character at a time. That is, you'd end up doing something like
static int get_sortkey( DWORD flags, const WCHAR *src, int srclen, char *dst, int dstlen )
{
    int used = 0;
    for (i = 0; i < srclen; ++i)
    {
        used += get_main_weights(src[i], dst + used, dstlen - used);
        if (!(flags & NORM_IGNORENONSPACE))
            used += get_diacritic_weights(src[i], dst + used, dstlen - used);
        ...
    }
}
This avoids the need to store temporary buffers.
As that example shows, I also think it's probably simpler to just pass the buffer directly to whatever functions are writing sortkey bytes into it.
diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c index 4c1e1b4d73..13839bb10a 100644 --- a/dlls/kernel32/tests/locale.c +++ b/dlls/kernel32/tests/locale.c @@ -2681,6 +2681,13 @@ static void test_lcmapstring_unicode(lcmapstring_wrapper func_ptr, const char *f lstrlenW(symbols_stripped) + 1, ret); ok(!lstrcmpW(buf, symbols_stripped), "%s string comparison mismatch\n", func_name);
- /* test small buffer */
- lstrcpyW(buf, fooW);
- ret = func_ptr(LCMAP_SORTKEY, lower_case, -1, buf, 2);
- ok(ret == 0, "Expected a failure\n");
- ok(GetLastError() == ERROR_INSUFFICIENT_BUFFER,
"%s unexpected error code %d\n", func_name, GetLastError());;
/* test srclen = 0 */ SetLastError(0xdeadbeef); ret = func_ptr(0, upper_case, 0, buf, ARRAY_SIZE(buf));
@@ -3108,6 +3115,108 @@ static void test_sorting(void) } }
+struct sorting_test_entry {
- const WCHAR* locale;
- DWORD flags;
- const WCHAR* first;
- const WCHAR* second;
- int result_sortkey;
- int result_compare;
- BOOL broken_on_old_win;
+};
+static const struct sorting_test_entry unicode_sorting_tests[] = +{
- /* 0 */ { L"en-US", 0, L"\ue6e3\u0a02", L"\ue6e3\u20dc", CSTR_LESS_THAN, 0, TRUE }, /* Test default character, when there is main weight extra there must be no diacritic weight */
- /* 1 */ { L"en-US", 0, L"\u276a", L"\u2768", CSTR_GREATER_THAN }, /* Test symbols, must add diacritic weight */
- /* 2 */ { L"en-US", 0, L"\u204d", L"\uff02", CSTR_LESS_THAN }, /* Test symbols, must add case weight */
- /* 3 */ { L"en-US", 0, L"a \u2060 b", L"a b", CSTR_EQUAL }, /* Test unsortable characters */
- /* 4 */ { L"en-US", 0, L"a \xfff0 b", L"a b", CSTR_EQUAL }, /* Test invalid characters */
- /* 5 */ { L"en-US", 0, L"\x00fc", L"\x016d", CSTR_LESS_THAN },
- /* 6 */ { L"en-US", 0, L"\x3fcb\x7fd5", L"\x0006\x3032", CSTR_GREATER_THAN },
- /* 7 */ { L"en-US", 0, L"\x00fc\x30fd", L"\x00fa\x1833", CSTR_LESS_THAN },
- /* 8 */ { L"en-US", 0, L"\x0037", L"\x277c", CSTR_LESS_THAN, 0, TRUE }, /* Normal character */
- /* 9 */ { L"en-US", 0, L"\x1eca", L"\x1ecb", CSTR_GREATER_THAN }, /* Normal character */
- /* 10 */ { L"en-US", 0, L"\x1d05", L"\x1d48", CSTR_GREATER_THAN }, /* Normal character */
- /* 11 */ { L"en-US", 0, L"\x19d7", L"\x096d", CSTR_GREATER_THAN }, /* Normal character diacritics */
- /* 12 */ { L"en-US", 0, L"\x00f5", L"\x1ecf", CSTR_LESS_THAN }, /* Normal character diacritics */
- /* 13 */ { L"en-US", 0, L"\x2793", L"\x0d70", CSTR_LESS_THAN, 0, TRUE }, /* Normal character diacritics */
- /* 14 */ { L"en-US", 0, L"A", L"a", CSTR_GREATER_THAN }, /* Normal character case weights */
- /* 15 */ { L"en-US", 0, L"z", L"Z", CSTR_LESS_THAN }, /* Normal character case weights */
- /* 16 */ { L"en-US", 0, L"\xe5a6", L"\xe5a5\x0333", CSTR_GREATER_THAN, 0, TRUE }, /* CJK with extra value */
- /* 17 */ { L"en-US", 0, L"\xe5d7", L"\xe5d6\x0330", CSTR_GREATER_THAN, 0, TRUE }, /* CJK with extra value */
- /* 18 */ { L"en-US", 0, L"\x1B56\x0330", L"\x1096", CSTR_GREATER_THAN }, /* Diacritic is added */
- /* 19 */ { L"en-US", 0, L"\x1817\x0333", L"\x19d7", CSTR_GREATER_THAN }, /* Diacritic is added */
- /* 20 */ { L"en-US", 0, L"\x04de\x05ac", L"\x0499", CSTR_GREATER_THAN }, /* Diacritic is added */
- /* 21 */ { L"en-US", 0, L"\x01ba\x0654", L"\x01b8", CSTR_LESS_THAN }, /* Diacritic can overflow */
- /* 22 */ { L"en-US", 0, L"\x06b7\x06eb", L"\x06b6", CSTR_LESS_THAN }, /* Diacritic can overflow */
- /* 23 */ { L"en-US", 0, L"\x1420\x0333", L"\x141f", CSTR_LESS_THAN }, /* Diacritic can overflow */
- /* 24 */ { L"en-US", 0, L"\x11bc", L"\x110b", CSTR_GREATER_THAN }, /* Jamo case weight */
- /* 25 */ { L"en-US", 0, L"\x11c1", L"\x1111", CSTR_GREATER_THAN }, /* Jamo case weight */
- /* 26 */ { L"en-US", 0, L"\x11af", L"\x1105", CSTR_GREATER_THAN }, /* Jamo case weight */
- /* 27 */ { L"en-US", 0, L"\x11c2", L"\x11f5", CSTR_LESS_THAN }, /* Jamo main weight */
- /* 28 */ { L"en-US", 0, L"\x1108", L"\x1121", CSTR_LESS_THAN }, /* Jamo main weight */
- /* 29 */ { L"en-US", 0, L"\x1116", L"\x11c7", CSTR_LESS_THAN }, /* Jamo main weight */
- /* 30 */ { L"en-US", 0, L"\x11b1", L"\x11d1", CSTR_LESS_THAN }, /* Jamo main weight */
- /* 31 */ { L"en-US", 0, L"\x4550\x73d2", L"\x3211\x23ad", CSTR_GREATER_THAN }, /* Script 5 main weight 1 */
- /* 32 */ { L"en-US", 0, L"\x3265", L"\x4079", CSTR_LESS_THAN }, /* Script 5 main weight 1 */
- /* 33 */ { L"en-US", 0, L"\x4c19\x68d0\x52d0", L"\x316d", CSTR_GREATER_THAN }, /* Script 5 main weight 1 */
- /* 34 */ { L"en-US", 0, L"\x72dd", L"\x6b8a", CSTR_GREATER_THAN }, /* Script 5 main weight 2 */
- /* 35 */ { L"en-US", 0, L"\x6785\x3bff\x6f83", L"\x7550\x34c9\x71a7", CSTR_LESS_THAN }, /* Script 5 main weight 2 */
- /* 36 */ { L"en-US", 0, L"\x5d61", L"\x3aef", CSTR_LESS_THAN }, /* Script 5 main weight 2 */
- /* 37 */ { L"en-US", 0, L"\x207a", L"\xfe62", CSTR_GREATER_THAN }, /* Symbols case weights */
- /* 38 */ { L"en-US", 0, L"\xfe65", L"\xff1e", CSTR_GREATER_THAN }, /* Symbols case weights */
- /* 39 */ { L"en-US", 0, L"\x2502", L"\xffe8", CSTR_GREATER_THAN }, /* Symbols case weights */
- /* 40 */ { L"en-US", 0, L"\x21da", L"\x21dc", CSTR_LESS_THAN }, /* Symbols diacritic weights */
- /* 41 */ { L"en-US", 0, L"\x29fb", L"\x2295", CSTR_LESS_THAN }, /* Symbols diacritic weights */
- /* 42 */ { L"en-US", 0, L"\x0092", L"\x009c", CSTR_LESS_THAN }, /* Symbols diacritic weights */
- /* 43 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x21da", L"\x21dc", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */
- /* 44 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x29fb", L"\x2295", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */
- /* 45 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x0092", L"\x009c", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */
- /* 46 */ { L"en-US", 0, L"\x3099", L"\x309a", CSTR_EQUAL }, /* MIN_WEIGHT */
- /* 47 */ { L"en-US", 0, L"\x309b", L"\x05a2", CSTR_EQUAL }, /* MIN_WEIGHT */
- /* 48 */ { L"en-US", 0, L"\xff9e", L"\x0e47", CSTR_EQUAL }, /* MIN_WEIGHT */
+};
+static void test_unicode_sorting(void) +{
- int i;
- if (!pLCMapStringEx)
- {
win_skip("LCMapStringEx not available\n");
return;
- }
- for (i = 0; i < ARRAY_SIZE(unicode_sorting_tests); i++)
- {
int pos;
BYTE buff1[1000];
BYTE buff2[1000];
int len1, len2;
int result = CSTR_EQUAL;
const struct sorting_test_entry* entry = &unicode_sorting_tests[i];
len1 = pLCMapStringEx(entry->locale, LCMAP_SORTKEY | entry->flags, entry->first, -1, (WCHAR*)buff1, ARRAY_SIZE(buff1), NULL, NULL, 0);
len2 = pLCMapStringEx(entry->locale, LCMAP_SORTKEY | entry->flags, entry->second, -1, (WCHAR*)buff2, ARRAY_SIZE(buff2), NULL, NULL, 0);
Is there a reason to use LCMapStringEx() here rather than LCMapString()?
for (pos = 0; pos < len1 && pos < len2; pos++)
{
if (buff1[pos] > buff2[pos])
{
result = CSTR_GREATER_THAN;
break;
}
else if (buff1[pos] < buff2[pos])
{
result = CSTR_LESS_THAN;
break;
}
}
ok (result == entry->result_sortkey || broken(entry->broken_on_old_win), "Test %d - Expected %d, got %d\n", i, entry->result_sortkey, result);
- }
+}
- static void test_FoldStringA(void) { int ret, i, j;
@@ -6897,4 +7006,5 @@ START_TEST(locale) test_NLSVersion(); /* this requires collation table patch to make it MS compatible */ if (0) test_sorting();
The fact that this test is disabled (guarded by `if (0)`) never struck me as great. I'm pretty sure that, with todo_wine added where appropriate, it could pass. A first patch in this series could do that.
- test_unicode_sorting(); }
diff --git a/dlls/kernelbase/locale.c b/dlls/kernelbase/locale.c index 53e4e42da3..74177371d9 100644 --- a/dlls/kernelbase/locale.c +++ b/dlls/kernelbase/locale.c @@ -2126,127 +2126,6 @@ static int wcstombs_codepage( UINT codepage, DWORD flags, const WCHAR *src, int return wcstombs_sbcs( info, src, srclen, dst, dstlen ); }
-static int get_sortkey( DWORD flags, const WCHAR *src, int srclen, char *dst, int dstlen ) -{
- WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
- int key_len[4];
- char *key_ptr[4];
- const WCHAR *src_save = src;
- int srclen_save = srclen;
- key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
- for (; srclen; srclen--, src++)
- {
unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
dummy[0] = *src;
if (decomposed_len)
{
for (i = 0; i < decomposed_len; i++)
{
WCHAR wch = dummy[i];
unsigned int ce;
if ((flags & NORM_IGNORESYMBOLS) &&
(get_char_type( CT_CTYPE1, wch ) & (C1_PUNCT | C1_SPACE)))
continue;
if (flags & NORM_IGNORECASE) wch = casemap( nls_info.LowerCaseTable, wch );
ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)];
if (ce != (unsigned int)-1)
{
if (ce >> 16) key_len[0] += 2;
if ((ce >> 8) & 0xff) key_len[1]++;
if ((ce >> 4) & 0x0f) key_len[2]++;
if (ce & 1)
{
if (wch >> 8) key_len[3]++;
key_len[3]++;
}
}
else
{
key_len[0] += 2;
if (wch >> 8) key_len[0]++;
if (wch & 0xff) key_len[0]++;
}
}
}
- }
- if (!dstlen) /* compute length */
/* 4 * '\1' + key length */
return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4;
- if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
return 0; /* overflow */
- src = src_save;
- srclen = srclen_save;
- key_ptr[0] = dst;
- key_ptr[1] = key_ptr[0] + key_len[0] + 1;
- key_ptr[2] = key_ptr[1] + key_len[1] + 1;
- key_ptr[3] = key_ptr[2] + key_len[2] + 1;
- for (; srclen; srclen--, src++)
- {
unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
dummy[0] = *src;
if (decomposed_len)
{
for (i = 0; i < decomposed_len; i++)
{
WCHAR wch = dummy[i];
unsigned int ce;
if ((flags & NORM_IGNORESYMBOLS) &&
(get_char_type( CT_CTYPE1, wch ) & (C1_PUNCT | C1_SPACE)))
continue;
if (flags & NORM_IGNORECASE) wch = casemap( nls_info.LowerCaseTable, wch );
ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)];
if (ce != (unsigned int)-1)
{
WCHAR key;
if ((key = ce >> 16))
{
*key_ptr[0]++ = key >> 8;
*key_ptr[0]++ = key & 0xff;
}
/* make key 1 start from 2 */
if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
/* make key 2 start from 2 */
if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
/* key 3 is always a character code */
if (ce & 1)
{
if (wch >> 8) *key_ptr[3]++ = wch >> 8;
if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
}
}
else
{
*key_ptr[0]++ = 0xff;
*key_ptr[0]++ = 0xfe;
if (wch >> 8) *key_ptr[0]++ = wch >> 8;
if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
}
}
}
- }
- *key_ptr[0] = 1;
- *key_ptr[1] = 1;
- *key_ptr[2] = 1;
- *key_ptr[3]++ = 1;
- *key_ptr[3] = 0;
- return key_ptr[3] - dst;
-}
- /* compose a full-width katakana. return consumed source characters. */ static int compose_katakana( const WCHAR *src, int srclen, WCHAR *dst ) {
@@ -2574,6 +2453,358 @@ static int compare_weights(int flags, const WCHAR *str1, int len1, return len1 - len2; }
+/* Start sortkey handler code. */
+/* Defines */
+#define JAPANESE 3 +#define MIN_WEIGHT 2 +#define LIST_STACK_BUFFER 1000
+/* Internal structures */
Are these comments useful?
+typedef struct _character_info +{
- BYTE weight_primary;
- BYTE script_member;
- BYTE weight_diacritic;
- BYTE weight_case;
+} character_info;
I get the impression that typedefs have largely fallen out of favour.
+typedef struct _weight_main_info +{
- BYTE script_member;
- BYTE weight_primary;
- BYTE extra;
+} weight_main_info;
+typedef struct _list +{
- int extra_len;
- int len;
- BYTE buffer[LIST_STACK_BUFFER];
- int buffer_count;
- BYTE* extra;
- int element_size;
+} list;
+typedef struct _sortkey_data +{
- int flags;
- list key;
- list weights_main;
- list weights_diacritic;
- list weights_case;
+} sortkey_data;
+/* List functions */
+static void LIST_INIT(list* name, int type_size) +{
- name->extra_len = 0;
- name->len = 0;
- name->extra = 0;
- name->buffer_count = LIST_STACK_BUFFER / type_size;
- name->element_size = type_size;
+}
+static void LIST_DESTROY(list* name) +{
- RtlFreeHeap(GetProcessHeap(), 0, name->extra);
+}
+static void* LIST_GET(list* name, int index) +{
- if ((index + 1) * name->element_size <= LIST_STACK_BUFFER)
return &name->buffer[index * name->element_size];
- else
return &name->extra[index * name->element_size - name->buffer_count];
+}
+/* Add entry to list, resizing as needed */ +static void LIST_ADD(list* name, const void *value) +{
- void* entry;
- if ((name->len + 1) * name->element_size > name->extra_len + LIST_STACK_BUFFER)
- {
if (!name->extra) /* First allocation */
{
name->extra_len = LIST_STACK_BUFFER;
name->extra = RtlAllocateHeap(GetProcessHeap(), 0, name->extra_len);
}
else
{
name->extra_len *= 2;
name->extra = RtlReAllocateHeap(GetProcessHeap(), 0,name->extra, name->extra_len);
}
- }
- entry = LIST_GET(name, name->len);
- memcpy(entry, value, name->element_size);
- name->len++;
+}
+/* Append a weight list to the sortkey */ +#define APPEND_LIST_TO_SORTKEY(data, weights, type, statement_get_value, statement_is_ignored) \
- do { \
int z; \
int end = data->weights.len - 1; \
while (end >= 0) \
{ \
const type* element = LIST_GET(&data->weights, end); \
(void)element; \
if (!(statement_is_ignored)) break; \
end--; \
} \
for (z = 0; z <= end; z++) \
{ \
const type* element = LIST_GET(&data->weights, z); \
LIST_ADD(&data->key, statement_get_value); \
} \
- } while (0);
+/* Helper functions */
+static BOOL get_char(sortkey_data* data, character_info* info, WCHAR ch) +{
- DWORD value = sort.keys[ch];
- info->weight_case = value >> 24;
- info->weight_diacritic = (value >> 16) & 0xff;
- info->script_member = (value >> 8) & 0xff;
- info->weight_primary = value & 0xff;
- return info->script_member != 0;
+}
+static void sortkey_data_init(sortkey_data* data, int flags, const WCHAR* locale, BOOL is_compare_string) +{
- data->flags = flags;
- LIST_INIT(&data->key, sizeof(BYTE));
- LIST_INIT(&data->weights_main, sizeof(BYTE));
- LIST_INIT(&data->weights_diacritic, sizeof(BYTE));
- LIST_INIT(&data->weights_case, sizeof(BYTE));
+}
+static void sortkey_data_destroy(sortkey_data* data) +{
- LIST_DESTROY(&data->key);
- LIST_DESTROY(&data->weights_main);
- LIST_DESTROY(&data->weights_diacritic);
- LIST_DESTROY(&data->weights_case);
+}
+static weight_main_info create_weight_main(BYTE script_member, BYTE weight_primary) +{
- weight_main_info ret = { 0 };
- ret.script_member = script_member;
- ret.weight_primary = weight_primary;
- return ret;
+}
+static void case_weights_add(sortkey_data* data, BYTE value) +{
- int flags = data->flags;
- if (NORM_IGNORECASE & flags)
value = value & ~(16 + 8);
- if (NORM_IGNOREWIDTH & flags)
value = value & ~(1);
- if (NORM_IGNOREKANATYPE & flags)
value = value & ~(32);
- LIST_ADD(&data->weights_case, &value);
+}
+static void main_weights_add(sortkey_data *data, weight_main_info* value) +{
- LIST_ADD(&data->weights_main, &value->script_member);
- LIST_ADD(&data->weights_main, &value->weight_primary);
- if (value->extra > 0)
LIST_ADD(&data->weights_main, &value->extra);
+}
+static void diacritic_weights_add(sortkey_data* data, const character_info* info, BYTE value) +{
- LIST_ADD(&data->weights_diacritic, &value);
+}
+/* Main sortkey logic */
+static void sortkey_handle_default_character(sortkey_data* data, WCHAR c) +{
- weight_main_info weightmain;
- character_info info;
- if (!get_char(data, &info, c))
- {
return;
- }
- weightmain = create_weight_main(info.script_member, info.weight_primary);
- if (info.script_member >= 0xa9 && info.script_member <= 0xaf) /* Some CJK have extra value */
weightmain.extra = info.weight_diacritic;
- else
diacritic_weights_add(data, &info, info.weight_diacritic);
- main_weights_add(data, &weightmain);
- case_weights_add(data, info.weight_case);
+}
+static BOOL sortkey_handle_character(sortkey_data* data, WCHAR c, const WCHAR* str, int i) +{
- weight_main_info weightmain;
- character_info info;
- int flags = data->flags;
- if (!get_char(data, &info, c))
- {
return FALSE;
- }
- switch (info.script_member)
- {
- case 0: /* Not sorted */
break;
- case 1:
if (data->weights_diacritic.len > 0)
{
BYTE* entry = LIST_GET(&data->weights_diacritic, data->weights_diacritic.len - 1);
*entry += info.weight_diacritic; /* Overflow can happen, that's okay */
}
else
diacritic_weights_add(data, &info, info.weight_diacritic);
break;
- case JAPANESE:
/* TODO */
break;
- case 4: /* Jamo */
weightmain = create_weight_main(info.weight_primary, info.weight_diacritic);
main_weights_add(data, &weightmain);
diacritic_weights_add(data, &info, MIN_WEIGHT);
case_weights_add(data, info.weight_case);
break;
- case 5:
weightmain = create_weight_main(253, 255);
main_weights_add(data, &weightmain);
weightmain = create_weight_main(info.weight_primary, info.weight_diacritic);
main_weights_add(data, &weightmain);
diacritic_weights_add(data, &info, MIN_WEIGHT);
case_weights_add(data, MIN_WEIGHT);
break;
- case 6: /* Punctuation */
/* TODO */
break;
- case 7: /* Symbols */
- case 8: /* Symbols */
- case 9: /* Symbols */
- case 10: /* Symbols */
- case 11: /* Symbols */
- case 12: /* Symbols */
if (flags & NORM_IGNORESYMBOLS)
break;
weightmain = create_weight_main(info.script_member, info.weight_primary);
main_weights_add(data, &weightmain);
diacritic_weights_add(data, &info, info.weight_diacritic);
case_weights_add(data, info.weight_case);
break;
- default:
sortkey_handle_default_character(data, c);
break;
The fact that exactly one of these integer cases has a symbolic constant attached seems less than ideal.
- }
- return TRUE;
+}
+static void sortkey_write_result(sortkey_data* data) +{
- int flags = data->flags;
- const BYTE SORTKEY_SEPARATOR = 1;
- const BYTE SORTKEY_TERMINATOR = 0;
- /* Main weights */
- APPEND_LIST_TO_SORTKEY(data, weights_main, BYTE, element, FALSE);
- LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
- /* Diacritic weights */
- if ((flags & NORM_IGNORENONSPACE) == 0)
- {
APPEND_LIST_TO_SORTKEY(data, weights_diacritic, BYTE, element, *element <= MIN_WEIGHT);
- }
- LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
- /* Case weights */
- if ((NORM_IGNORECASE & flags) == 0 || (NORM_IGNOREWIDTH & flags) == 0)
- {
APPEND_LIST_TO_SORTKEY(data, weights_case, BYTE, element, FALSE);
- }
- LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
- /* Extra weights */
- /* TODO */
- LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
- /* Special weights */
- /* TODO */
- LIST_ADD(&data->key, &SORTKEY_TERMINATOR);
+}
+static int sortkey_generate(int flags, const WCHAR* locale, const WCHAR* str, int str_len, BYTE* buffer, int buffer_len) +{
- int i;
- sortkey_data data;
- int ret = 0;
- sortkey_data_init(&data, flags, locale, FALSE);
- if (str_len == -1)
str_len = wcslen(str);
- for (i = 0; i < str_len; i++)
- {
sortkey_handle_character(&data, str[i], str, i);
- }
- sortkey_write_result(&data);
- if (data.key.len <= buffer_len)
- {
for (i = 0; i < data.key.len; i++)
{
BYTE* value = LIST_GET(&data.key, i);
buffer[i] = *value;
}
ret = data.key.len;
- }
- else if (!buffer)
- {
ret = data.key.len;
- }
- sortkey_data_destroy(&data);
- return ret;
+}
+/* End sortkey handler code */
static const struct geoinfo *get_geoinfo_ptr( GEOID geoid ) { @@ -4964,8 +5195,8 @@ INT WINAPI DECLSPEC_HOTPATCH LCMapStringEx( const WCHAR *locale, DWORD flags, co TRACE( "(%s,0x%08x,%s,%d,%p,%d)\n", debugstr_w(locale), flags, debugstr_wn(src, srclen), srclen, dst, dstlen );
if ((ret = get_sortkey( flags, src, srclen, (char *)dst, dstlen ))) ret++;
else SetLastError( ERROR_INSUFFICIENT_BUFFER );
if (!(ret = sortkey_generate(flags, L"", src, srclen, (BYTE *)dst, dstlen )))
SetLastError( ERROR_INSUFFICIENT_BUFFER ); return ret; }
-- 2.26.2
On Thu, 30 Apr 2020 at 05:22, Zebediah Figura z.figura12@gmail.com wrote:
+typedef struct _character_info +{
- BYTE weight_primary;
- BYTE script_member;
- BYTE weight_diacritic;
- BYTE weight_case;
+} character_info;
I get the impression that typedefs have largely fallen out of favour.
This particular kind of typedef never was in favour in C, but it gets you behaviour similar to C++ where struct/enum/union/class declarations (generally) behave as if they introduce this kind of typedef implicitly. Some people (typically with a Windows/C++ background) prefer that, although it's arguably needless obfuscation and idiosyncratic C.
Hello Zebediah,
thanks for your reply!
So as far as I understand, the sort key algorithm writes the level 0 weights (script and alphabetic weight) for the whole string to the sort key, then the level 1 weights (diacritic), and so on, right?
Yes.
In that case, what seems potentially simpler to me is to calculate those weights one level at a time, rather than one character at a time.
But that would mean that we would iterate over the string multiple times, and in the function have a branch as to which weight to write. To me, my current approach seemed easier. I've thought quite a while about abandoning my lists, but the results ended up being a lot more complex.
That is, you'd end up doing something like
static int get_sortkey( DWORD flags, const WCHAR *src, int srclen, char *dst, int dstlen ) { int used = 0; for (i = 0; i < srclen; ++i) { used += get_main_weights(src[i], dst + used, dstlen - used); if (!(flags & NORM_IGNORENONSPACE)) used += get_diacritic_weights(src[i], dst + used, dstlen - used); ... } }
This won't work, since we first need all main weights, then the others. Also keep in mind that we need some temporary buffer anyway, since very small weights only get added when a bigger one follows them. Either that, or we need to backtrack and remove weights again. Due to those issues, I opted for the current approach.
As that example shows, I also think it's probably simpler to just pass the buffer directly to whatever functions are writing sortkey bytes into it.
As noted, it's not that easy. Especially when you need to deal with buffers that are too short - we'd need to mix in a stop condition but still return the needed length. Sometimes, we just need to continue iterating despite knowing we can't copy it into the sortkey.
Is there a reason to use LCMapStringEx() here rather than LCMapString()?
LCMapStringA/W completely ignore the locale and use an LCID. We'd need to fix those functions to properly convert the locale. To start simple, I decided to use the more low-level function and, once that works, build the rest on top.
The fact that this test is commented out never struck me as great. I'm pretty sure that with todo_wine added as appropriate, it could pass. A first patch in this series could be to do that.
Probably. I decided to leave it alone for now, and comment it in once it works, but sure, I can change that.
Are these comments useful?
Eh, I used them for a quick visual separation. I can remove them if they're too much noise though.
I get the impression that typedefs have largely fallen out of favour.
I always use them when I can, since they make the code shorter and I don't think having to add the struct keyword adds much. I've seen both in wine code, but given a policy I'll stick to that.
The fact that exactly one of these integer cases has a symbolic constant attached seems less than ideal.
True, I'll add constants for the others as well.
Regards, Fabian Maurer
On 4/30/20 12:50 PM, Fabian Maurer wrote:
Hello Zebediah,
thanks for your reply!
So as far as I understand, the sort key algorithm writes the level 0 weights (script and alphabetic weight) for the whole string to the sort key, then the level 1 weights (diacritic), and so on, right?
Yes.
In that case, what seems potentially simpler to me is to calculate those weights one level at a time, rather than one character at a time.
But that would mean that we would iterate over the string multiple times, and in the function have a branch as to which weight to write. To me, my current approach seemed easier. I've thought quite a while about abandoning my lists, but the results ended up being a lot more complex.
Yes, you'd have to iterate over the string multiple times. However, that's a fixed, small number. I wouldn't be concerned about performance until we find evidence that performance is a concern.
If you're concerned about readability, I would submit that from an outside perspective, the current implementation is not very readable.
I don't think you need to write it such that you switch over which weight to write, either. It's probably easier just to make them separate functions, like in my example below.
That is, you'd end up doing something like
static int get_sortkey( DWORD flags, const WCHAR *src, int srclen, char *dst, int dstlen ) { int used = 0; for (i = 0; i < srclen; ++i) { used += get_main_weights(src[i], dst + used, dstlen - used); if (!(flags & NORM_IGNORENONSPACE)) used += get_diacritic_weights(src[i], dst + used, dstlen - used); ... } }
This won't work, since we first need all main weights, then the others.
Er, yes. I misthought. What I meant to write was
for (i = 0; i < srclen; ++i) used += get_main_weights(src[i], dst + used, dstlen - used);
if (!(flags & NORM_IGNORENONSPACE)) { for (i = 0; i < srclen; ++i) used += get_diacritic_weights(src[i], dst + used, dstlen - used); }
...
Also keep in mind that we need some temporary buffer anyway, since very small weights only get added when a bigger one follows them. Either that, or we need to backtrack and remove weights again. Due to those issues, I opted for the current approach.
Can you please give an example of this? I'm not sure I see it in any of your patches.
As that example shows, I also think it's probably simpler to just pass the buffer directly to whatever functions are writing sortkey bytes into it.
As noted, it's not that easy. Especially when you need to deal with buffers that are too short - we'd need to mix in a stop condition but still return the needed length. Sometimes, we just need to continue iterating despite knowing we can't copy it into the sortkey.
Generally, you can continue to calculate size but only write when there's space in the buffer. When I've written functions like this I've found it to be relatively simple, though maybe there's some details here that I'm missing...
Is there a reason to use LCMapStringEx() here rather than LCMapString()?
LCMapStringA/W completely ignore the locale and use an LCID. We'd need to fix those functions to properly convert the locale. To start simple, I decided to use the more low-level function and, once that works, build the rest on top.
It seems a simple enough fix; just call LCIDToLocaleName().
It's not terribly important, though.
The fact that this test is commented out never struck me as great. I'm pretty sure that with todo_wine added as appropriate, it could pass. A first patch in this series could be to do that.
Probably. I decided to leave it alone for now, and comment it in once it works, but sure, I can change that.
Are these comments useful?
Eh, I used them for a quick visual separation. I can remove them if they're too much noise though.
I get the impression that typedefs have largely fallen out of favour.
I always use them when I can, since they make the code shorter and I don't think having to add the struct keyword adds much. I've seen both in wine code, but given a policy I'll stick to that.
Generally I believe that explicitly specifying "struct" is preferred as it makes clear what kind of object you're dealing with.
The fact that exactly one of these integer cases has a symbolic constant attached seems less than ideal.
True, I'll add constants for the others as well.
Regards, Fabian Maurer
Hello Zebediah,
Yes, you'd have to iterate over the string multiple times. However, that's a fixed, small number. I wouldn't be concerned about performance until we find evidence that performance is a concern.
If you say so. I'll see what I can come up with.
I don't think you need to write it such that you switch over which weight to write, either. It's probably easier just to make them separate functions, like in my example below.
That would lead to quite a bunch of code duplication, especially when we come to the more difficult to handle characters. I'd like to avoid that. A switch for which weight to write should be a lot simpler.
Also keep in mind that we need some temporary buffer anyway, since very small weights only get added when a bigger one follows them. Either that, or we need to backtrack and remove weights again. Due to those issues, I opted for the current approach.
Can you please give an example of this? I'm not sure I see it in any of your patches.
It's in patch 1, inside APPEND_LIST_TO_SORTKEY, see 'statement_is_ignored'. And AFAIK not only diacritics are affected. Not sure how I would best handle that when writing directly to the sortkey.
Generally, you can continue to calculate size but only write when there's space in the buffer. When I've written functions like this I've found it to be relatively simple, though maybe there's some details here that I'm missing...
Well, there are multiple places where we'd need to check. But maybe it'll work out; it just seemed more complex to me.
Is there a reason to use LCMapStringEx() here rather than LCMapString()?
LCMapStringA/W completely ignore the locale and use an LCID. We'd need to fix those functions to properly convert the locale. To start simple, I decided to use the more low-level function and, once that works, build the rest on top.
It seems a simple enough fix; just call LCIDToLocaleName().
Yeah, probably. But I'd rather get the low-level stuff working first. Then I can write tests for the higher level functions and then fix them.
Generally I believe that explicitly specifying "struct" is preferred as it makes clear what kind of object you're dealing with.
Okay then.
Regards, Fabian Maurer
On 4/30/20 1:42 PM, Fabian Maurer wrote:
Hello Zebediah,
Yes, you'd have to iterate over the string multiple times. However, that's a fixed, small number. I wouldn't be concerned about performance until we find evidence that performance is a concern.
If you say so. I'll see what I can come up with.
I don't think you need to write it such that you switch over which weight to write, either. It's probably easier just to make them separate functions, like in my example below.
That would lead to quite a bunch of code duplication, especially when we come to the more difficult to handle characters. I'd like to avoid that. A switch for which weight to write should be a lot simpler.
Could be, though I'd warn that attempts to reduce duplication (e.g. by introducing complicated macros) can often make code much less readable.
Also keep in mind that we need some temporary buffer anyway, since very small weights only get added when a bigger one follows them. Either that, or we need to backtrack and remove weights again. Due to those issues, I opted for the current approach.
Can you please give an example of this? I'm not sure I see it in any of your patches.
It's in patch 1, inside APPEND_LIST_TO_SORTKEY, see 'statement_is_ignored'. And AFAIK not only diacritics are affected. Not sure how I would best handle that when writing directly to the sortkey.
Okay, I'm not sure I'm reading this macro right, but this is all internal to diacritic weights, right? If I understand, final weights which are 1 or 0 get stripped from the end of the sortkey, right? I think you could easily handle that by stripping weights after adding all of the diacritics, e.g. to use my previous example code,
if (!(flags & NORM_IGNORENONSPACE)) { for (i = 0; i < srclen; ++i) used += get_diacritic_weights(src[i], dst + used, dstlen - used);
while (dst[used - 1] < 2) used--; }
Generally, you can continue to calculate size but only write when there's space in the buffer. When I've written functions like this I've found it to be relatively simple, though maybe there's some details here that I'm missing...
Well, there are multiple places where we'd need to check. But maybe it'll work out; it just seemed more complex to me.
Is there a reason to use LCMapStringEx() here rather than LCMapString()?
LCMapStringA/W completely ignore the locale and use an LCID. We'd need to fix those functions to properly convert the locale. To start simple, I decided to use the more low-level function and, once that works, build the rest on top.
It seems a simple enough fix; just call LCIDToLocaleName().
Yeah, probably. But I'd rather get the low-level stuff working first. Then I can write tests for the higher level functions and then fix them.
Generally I believe that explicitly specifying "struct" is preferred as it makes clear what kind of object you're dealing with.
Okay then.
Regards, Fabian Maurer
Hello Zebediah,
Could be, though I'd warn that attempts to reduce duplication (e.g. by introducing complicated macros) can often make code much less readable.
Sure, but in this case it should be very well worth it.
Okay, I'm not sure I'm reading this macro right, but this is all internal to diacritic weights, right? If I understand, final weights which are 1 or 0 get stripped from the end of the sortkey, right? I think you could easily handle that by stripping weights after adding all of the diacritics, e.g. to use my previous example code,
if (!(flags & NORM_IGNORENONSPACE)) { for (i = 0; i < srclen; ++i) used += get_diacritic_weights(src[i], dst + used, dstlen - used);
while (dst[used - 1] < 2) used--;
}
Generally, yes. But it not only affects diacritics. And one needs to keep in mind that the separator is also a 0 and must not be removed. But maybe here the duplication is worth it?
Regards, Fabian Maurer
On 4/30/20 2:15 PM, Fabian Maurer wrote:
Hello Zebediah,
Could be, though I'd warn that attempts to reduce duplication (e.g. by introducing complicated macros) can often make code much less readable.
Sure, but in this case it should be very well worth it.
Okay, I'm not sure I'm reading this macro right, but this is all internal to diacritic weights, right? If I understand, final weights which are 1 or 0 get stripped from the end of the sortkey, right? I think you could easily handle that by stripping weights after adding all of the diacritics, e.g. to use my previous example code,
if (!(flags & NORM_IGNORENONSPACE)) { for (i = 0; i < srclen; ++i) used += get_diacritic_weights(src[i], dst + used, dstlen - used);
while (dst[used - 1] < 2) used--;
}
Generally, yes. But it not only affects diacritics. And one needs to keep in mind that the separator is also a 0 and must not be removed. But maybe here the duplication is worth it?
Sure, you'd need to guard a lower bound for "used" there in any case.
Regards, Fabian Maurer