[PATCH v4 0/1] MR10704: Display invalid Indic syllables

newer
[PATCH v5 0/1] MR10704: Display...

older
[PATCH v15 0/6] MR10775: vbscript:...

Aric Stewart (＠aricstewart)

April 29, 2026

12:21 p.m.

Use the U+25CC DOTTED CIRCLE as the base glyph for invalid Indic syllables https://bugs.winehq.org/show_bug.cgi?id=27637 -- v4: Display invalid Indic syllables https://gitlab.winehq.org/wine/wine/-/merge_requests/10704

Show replies by date

Aric Stewart

April 2026

12:21 p.m.

New subject: [PATCH v4 1/1] Display invalid Indic syllables

From: Aric Stewart <aric@codeweavers.com> Use the U+25CC DOTTED CIRCLE as the base glyph for invalid Indic syllables Handle the resulting pwLogClust changes correctly. https://bugs.winehq.org/show_bug.cgi?id=27637 --- dlls/gdi32/uniscribe/indic.c | 38 +++++++++------- dlls/gdi32/uniscribe/shape.c | 62 +++++++++++++++++++++++++++ dlls/gdi32/uniscribe/usp10_internal.h | 1 + 3 files changed, 85 insertions(+), 16 deletions(-) diff --git a/dlls/gdi32/uniscribe/indic.c b/dlls/gdi32/uniscribe/indic.c index 2d527ddbd1a..bae638ced63 100644 --- a/dlls/gdi32/uniscribe/indic.c +++ b/dlls/gdi32/uniscribe/indic.c @@ -326,6 +326,7 @@ void Indic_ParseSyllables(HDC hdc, SCRIPT_ANALYSIS *psa, ScriptCache *psc, const unsigned int center = 0; int index = 0; int next = 0; + BOOL valid; *syllable_count = 0; @@ -344,24 +345,29 @@ void Indic_ParseSyllables(HDC hdc, SCRIPT_ANALYSIS *psa, ScriptCache *psc, const if (next >= cChar) break; next = Indic_process_next_syllable(input, cChar, 0, &center, index, lex); - if (next != -1) - { - *syllables = realloc(*syllables, sizeof(IndicSyllable)*(*syllable_count+1)); - (*syllables)[*syllable_count].start = index; - (*syllables)[*syllable_count].base = center; - (*syllables)[*syllable_count].ralf = -1; - (*syllables)[*syllable_count].blwf = -1; - (*syllables)[*syllable_count].pref = -1; - (*syllables)[*syllable_count].end = next-1; - FindBaseConsonant(hdc, psa, psc, input, &(*syllables)[*syllable_count], lex, modern); - index = next; - *syllable_count = (*syllable_count)+1; - } - else if (index < cChar) - { + valid = (next != -1); + if (index < cChar && !valid) { TRACE("Processing failed at %i\n",index); - next = ++index; + center = index; + next = index + 1; } + *syllables = realloc(*syllables, sizeof(IndicSyllable)*(*syllable_count+1)); + if (!*syllables) { + ERR("Allocation failure of syllables\n"); + *syllable_count = 0; + return; + } + (*syllables)[*syllable_count].valid = valid; + (*syllables)[*syllable_count].start = index; + 
(*syllables)[*syllable_count].base = center; + (*syllables)[*syllable_count].ralf = -1; + (*syllables)[*syllable_count].blwf = -1; + (*syllables)[*syllable_count].pref = -1; + (*syllables)[*syllable_count].end = next-1; + if (valid) + FindBaseConsonant(hdc, psa, psc, input, &(*syllables)[*syllable_count], lex, modern); + index = next; + *syllable_count = (*syllable_count)+1; } TRACE("Processed %i of %i characters into %i syllables\n",index,cChar,*syllable_count); } diff --git a/dlls/gdi32/uniscribe/shape.c b/dlls/gdi32/uniscribe/shape.c index b0bd0c5a473..4cd7b1d99f4 100644 --- a/dlls/gdi32/uniscribe/shape.c +++ b/dlls/gdi32/uniscribe/shape.c @@ -2209,6 +2209,57 @@ static void ShapeIndicSyllables(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS *psa, } } +static void mark_invalid_syllables(HDC hdc, const WCHAR* pwcChars, INT cChars, WORD *pwGlyphs, INT *pcGlyphs, INT cMaxGlyphs, WORD *pwLogClust, IndicSyllable *syllables, int syllable_count, lexical_function lexical) +{ + int i; + WCHAR invalid = 0x25cc; + WORD invalid_glyph; + int offset = 0; + + if (!hdc || !pwcChars || !pwGlyphs || !pcGlyphs || !pwLogClust || !syllables || syllable_count <= 0) { + ERR("Invalid parameters in mark_invalid_syllables\n"); + return; + } + if (cChars <= 0 || cMaxGlyphs <= 0) { + ERR("Invalid size parameters\n"); + return; + } + if (NtGdiGetGlyphIndicesW(hdc, &invalid, 1, &invalid_glyph, 0) == GDI_ERROR || invalid_glyph == 0x0000) { + TRACE("Invalid glyph 0x25cc not found in font, using placeholder\n"); + invalid_glyph = 0x0020; // Use space as fallback + } + + for (i = 0; i < syllable_count; i++) + if (!syllables[i].valid) break; + + if (i >= syllable_count) { + /* Everything valid */ + return; + } + + /* Mark invalid combinations */ + for (i = 0; i < syllable_count; i++) + { + if (!syllables[i].valid) { + if (*pcGlyphs + 1 > cMaxGlyphs) { + ERR("Number of glyphs exceed buffer(%i, %i)\n", *pcGlyphs, cMaxGlyphs); + pwGlyphs[syllables[i].start] = invalid_glyph; + } else { + int dir = 
(lexical(pwcChars[syllables[i].start]) == lex_Matra_pre)?1:0; + int index = syllables[i].start+dir+offset; + for (int j = *pcGlyphs; j>=index; j--) + pwGlyphs[j+1] = pwGlyphs[j]; + pwGlyphs[index] = invalid_glyph; + *pcGlyphs = *pcGlyphs+1; + + for (int j = cChars; j>syllables[i].start; j--) + pwLogClust[j] = pwLogClust[j] + 1; + offset++; + } + } + } +} + static inline int unicode_lex(WCHAR c) { int type; @@ -2325,6 +2376,7 @@ static void ContextualShape_Sinhala(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS * ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, sinhala_lex, NULL, TRUE); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, sinhala_lex); free(syllables); } @@ -2383,6 +2435,7 @@ static void ContextualShape_Devanagari(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSI ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, devanagari_lex, NULL, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, devanagari_lex); free(syllables); } @@ -2453,6 +2506,7 @@ static void ContextualShape_Bengali(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS * ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, bengali_lex, NULL, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, bengali_lex); free(syllables); } @@ -2503,6 +2557,7 @@ static void ContextualShape_Gurmukhi(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, gurmukhi_lex, NULL, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, 
syllables, syllable_count, gurmukhi_lex); free(syllables); } @@ -2543,6 +2598,7 @@ static void ContextualShape_Gujarati(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, gujarati_lex, NULL, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, gujarati_lex); free(syllables); } @@ -2599,6 +2655,7 @@ static void ContextualShape_Oriya(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS *ps ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, oriya_lex, NULL, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, oriya_lex); free(syllables); } @@ -2649,6 +2706,7 @@ static void ContextualShape_Tamil(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS *ps ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, tamil_lex, SecondReorder_Like_Tamil, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, tamil_lex); free(syllables); } @@ -2698,6 +2756,7 @@ static void ContextualShape_Telugu(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS *p ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, telugu_lex, SecondReorder_Like_Telugu, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, telugu_lex); free(syllables); } @@ -2750,6 +2809,7 @@ static void ContextualShape_Kannada(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS * ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, kannada_lex, SecondReorder_Like_Telugu, 
modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, kannada_lex); free(syllables); } @@ -2795,6 +2855,7 @@ static void ContextualShape_Malayalam(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, malayalam_lex, SecondReorder_Like_Tamil, modern); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, malayalam_lex); free(syllables); } @@ -2829,6 +2890,7 @@ static void ContextualShape_Khmer(HDC hdc, ScriptCache *psc, SCRIPT_ANALYSIS *ps ShapeIndicSyllables(hdc, psc, psa, input, cChars, syllables, syllable_count, pwOutGlyphs, pcGlyphs, pwLogClust, khmer_lex, NULL, FALSE); free(input); + mark_invalid_syllables(hdc, pwcChars, cChars, pwOutGlyphs, pcGlyphs, cMaxGlyphs, pwLogClust, syllables, syllable_count, khmer_lex); free(syllables); } diff --git a/dlls/gdi32/uniscribe/usp10_internal.h b/dlls/gdi32/uniscribe/usp10_internal.h index b8ae1fb1a57..aa24df22308 100644 --- a/dlls/gdi32/uniscribe/usp10_internal.h +++ b/dlls/gdi32/uniscribe/usp10_internal.h @@ -218,6 +218,7 @@ typedef struct _scriptData } scriptData; typedef struct { + BOOL valid; INT start; INT base; INT ralf; -- GitLab https://gitlab.winehq.org/wine/wine/-/merge_requests/10704

Aric Stewart (＠aricstewart)

12:22 p.m.

On Wed Apr 29 12:22:16 2026 +0000, समीरसिंह Sameer Singh wrote:

...

AFAICT, this is because the dotted circle is inserted in `pwGlyphs` but not in `pwcChars`, so it has no entry in `pwLogClust`. `UpdateClusters` does not know this, it pushes every glyph in `pwLogClust` forward at or beyond the insertion point. This pushes the matra's cluster entry forward to point at the dotted circle's glyph slot. Thanks for the pointer. I have corrected the `pwLogClust` issues. Using wine notepad the cursor and selection aspects seem to be behaving correctly now.

-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10704#note_138187

समीरसिंह Sameer Singh (＠ss141309)

4:39 p.m.

On Wed Apr 29 12:22:16 2026 +0000, Aric Stewart wrote:

...

Thanks for the pointer. I have corrected the `pwLogClust` issues. Using wine notepad the cursor and selection aspects seem to be behaving correctly now. Great! This appears to be fixed now.

Now, AFAIK, the only case that still needs to be fixed is the one where the dotted circle appears between the parts of the glyph: `ৌ`. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10704#note_138222

Aric Stewart (＠aricstewart)

5:16 p.m.

On Wed Apr 29 16:39:12 2026 +0000, समीरसिंह Sameer Singh wrote:

...

Great! This appears to be fixed now. Now AFAIK, only the case where the dotted circle appears between the glyph needs to be fixed `ৌ` . Ok I loaded a string on my windows machines and see that if I repeat the above string twice I get 'ৌৌ' -> 5 glyphs, 2 leading, 1 invalid glyph mark and 2 trailing. That shaping may be far more complicated than is addressed in this bug but I will investigate.

-- https://gitlab.winehq.org/wine/wine/-/merge_requests/10704#note_138397

Aric Stewart (＠aricstewart)

May 2026

4:36 p.m.

On Thu Apr 30 17:16:16 2026 +0000, Aric Stewart wrote:

...

Ok I loaded a string on my windows machines and see that if I repeat the above string twice I get 'ৌৌ' -> 5 glyphs, 2 leading, 1 invalid glyph mark and 2 trailing. That shaping may be far more complicated than is addressed in this bug but I will investigate. Ok that case is very interesting. It is the `BENGALI VOWEL SIGN AU ৌ U+09CC`. It decomposes to ` ৌ <U+09C7, U+09D7>`. There is nice information here https://util.unicode.org/UnicodeJsps/character.jsp?a=09CC

Looking at uniscribe, we are properly decomposing: ``` 01a0:trace:uniscribe:ScriptGetCMap (0000000009010074,00000000000389C0,L"\09cc\09cc\09cc\09cc\09cc",5,0x0,000000000082F750)` ... 01a0:trace:uniscribe:ContextualShape_Bengali New composed string L"\09c7\09d7\09c7\09d7\09c7\09d7\09c7\09d7\09c7\09d7" (10) 01a0:trace:uniscribe:debug_output_string MmMpMmMpMmMpMmMpMmMp ``` That is a string of pre-Matras and post-Matras. What we are doing now is identifying each Mm and Mp as an incomplete syllable, and my new code is putting in the invalid character mark for each one, resulting in the string attached as an image. MacOS's textpad does the same thing, actually. However, native uniscribe and Chrome both group all the Mm together at the beginning with a single invalid character mark and then all the Mp at the end. So there is quite a bit of re-ordering happening there that is not happening now. I think this is out of the scope of this bug and should be included in a new bug. I would also be curious about the behavior of U+09cc in other situations. Is it being properly shaped in strings where it is being used correctly? I do not know Bengali so I am not sure how to find it. This string `মৌমাছি` appears to be `u+09ae u+09cc u+09ae u+09be u+099b u+09bf` and a quick visual inspection seems to show it being shaped correctly. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10704#note_138548

समीरसिंह Sameer Singh (＠ss141309)

5:24 p.m.

On Fri May 1 16:36:51 2026 +0000, Aric Stewart wrote:

...

Ok that case is very interesting. It is the `BENGALI VOWEL SIGN AU ৌ U+09CC`. It decomposes to ` ৌ <U+09C7, U+09D7>`. There is nice information here https://util.unicode.org/UnicodeJsps/character.jsp?a=09CC Looking at uniscribe, we are properly decomposing: ``` 01a0:trace:uniscribe:ScriptGetCMap (0000000009010074,00000000000389C0,L"\09cc\09cc\09cc\09cc\09cc",5,0x0,000000000082F750)` ... 01a0:trace:uniscribe:ContextualShape_Bengali New composed string L"\09c7\09d7\09c7\09d7\09c7\09d7\09c7\09d7\09c7\09d7" (10) 01a0:trace:uniscribe:debug_output_string MmMpMmMpMmMpMmMpMmMp ``` That is a string of pre-Matras and post-Matras. What we are doing now is identifying each Mm and Mp as an incomplete syllable, and my new code is putting in the invalid character mark for each one, resulting in the string attached as an image. MacOS's textpad does the same thing, actually. However, native uniscribe and Chrome both group all the Mm together at the beginning with a single invalid character mark and then all the Mp at the end. So there is quite a bit of re-ordering happening there that is not happening now. I think this is out of the scope of this bug and should be included in a new bug. I would also be curious about the behavior of U+09cc in other situations. Is it being properly shaped in strings where it is being used correctly? I do not know Bengali so I am not sure how to find it. This string `মৌমাছি` appears to be `u+09ae u+09cc u+09ae u+09be u+099b u+09bf` and a quick visual inspection seems to show it being shaped correctly. Hmm, it looks like harfbuzz also groups all Mm at the beginning and all Mp at the end with the dotted circle in the middle.

``` hb-view /usr/share/fonts/TiroIndigo-otf/TiroBangla-Regular.otf "ৌৌৌ" --output-format=png --output-file=test.png ``` ![test.png](/uploads/2957adf971b6bb6da63eceaf9600d613/test.png){width=716 height=325} This can also be viewed using `hb-shape`: ```

...

hb-shape /usr/share/fonts/TiroIndigo-otf/TiroBangla-Regular.otf "ৌৌৌ"

[bSignE.init=0+396|bSignE=0+405|bSignE=0+405|BASE=0+724|bAuMark=0+247|bAuMark=0+247|bAuMark.fina=0+247] ``` Here bSignE = Mm and bAuMark = Mp, so this does not seem like a bug. The bug I was talking about is that when U+09CC is preceded by a space, a dotted circle is inserted at the start of the glyph. Are you aware of this? ![image.png](/uploads/cb2d0378c2c1b92ed23a5fde25630f7a/image.png){width=486 height=90} Looking closely at the second line, I can see that the glyph is decomposed here, as is evident from the fact that Mm has a top line.

...

I would also be curious about the behavior of U+09cc in other situations. Is it being properly shaped in string where it is being used correctly? I do not know Bengali so I am not sure how to find it.

This string `মৌমাছি` appears to be `u+09ae u+09cc u+09ae u+09be u+099b u+09bf` and a quick visual inspection seems to show it being shaped correctly.

Yes I can see that it is being shaped properly. -- https://gitlab.winehq.org/wine/wine/-/merge_requests/10704#note_138556

Age (days ago)

Last active (days ago)

List overview

6 comments

3 participants

participants (3)

Aric Stewart
Aric Stewart (＠aricstewart)
समीरसिंह Sameer Singh (＠ss141309)