diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h
index 74ea6c84c3c..df29873f0f1 100644
--- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h
+++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h
@@ -21,10 +21,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
             [System::Runtime::InteropServices::Out] bool% isIndic,
             [System::Runtime::InteropServices::Out] bool% isDigit,
             [System::Runtime::InteropServices::Out] bool% isLatin,
-            [System::Runtime::InteropServices::Out] bool% isStrong
+            [System::Runtime::InteropServices::Out] bool% isStrong,
+            [System::Runtime::InteropServices::Out] bool% isScriptAgnosticCombining
             );
+
+        ///
+        /// Check whether two Unicode scalar values belong to the same script.
+        /// This is used to determine if combining marks should stay with their base character
+        /// for font fallback purposes. (See PR #6857 / Issue #6801)
+        ///
+        bool IsSameScript(int unicodeScalar1, int unicodeScalar2);
     };
 
 }}}}//MS::Internal::Text::TextInterface
 
-#endif //__ICLASSIFICATION_H
\ No newline at end of file
+#endif //__ICLASSIFICATION_H
diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp
index b7186f36dfd..9a2b88e9b8c 100644
--- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp
+++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp
@@ -154,24 +154,31 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
         bool isLatin;
         bool isStrong;
         bool isExtended;
+        bool isScriptAgnosticCombining;
 
+        WCHAR ch = text[0];
         classificationUtility->GetCharAttribute(
-            text[0],
+            ch,
             isCombining,
             needsCaretInfo,
             isIndic,
             isDigit,
             isLatin,
-            isStrong
-            );
+            isStrong,
+            isScriptAgnosticCombining
+            );
 
-        isExtended = ItemizerHelper::IsExtendedCharacter(text[0]);
+        isExtended = ItemizerHelper::IsExtendedCharacter(ch);
 
         UINT32 isDigitRangeStart = 0;
         UINT32 isDigitRangeEnd = 0;
         bool previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit;
         bool currentIsDigitValue;
 
+        // Track base character for combining mark script comparison (PR #6857 / Issue #6801)
+        // A combining mark should only stay with its base character if they have the same script.
+        int baseChar = isCombining ? -1 : ch;
+
         // pCharAttribute is assumed to have the same length as text. This is enforced by Itemize().
         pCharAttribute[0] = (CharAttributeType)
                             (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
@@ -183,18 +190,40 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
 
         for (UINT32 i = 1; i < length; ++i)
         {
+            ch = text[i];
             classificationUtility->GetCharAttribute(
-                text[i],
-                isCombining,
-                needsCaretInfo,
-                isIndic,
-                isDigit,
-                isLatin,
-                isStrong
+                ch,
+                isCombining,
+                needsCaretInfo,
+                isIndic,
+                isDigit,
+                isLatin,
+                isStrong,
+                isScriptAgnosticCombining
                 );
 
-            isExtended = ItemizerHelper::IsExtendedCharacter(text[i]);
-
+            isExtended = ItemizerHelper::IsExtendedCharacter(ch);
+
+            // For combining marks, check if they have the same script as the base character.
+            // If not, they should not be treated as combining with the base (PR #6857 / Issue #6801).
+            // However, script-agnostic combining marks (variation selectors, combining enclosing marks,
+            // emoji modifiers, etc.) are designed to work with any base character regardless of script,
+            // so skip the check for them to allow emoji sequences to stay together.
+            bool isCombiningWithBase = isCombining;
+            if (isCombining && baseChar >= 0 && !isScriptAgnosticCombining)
+            {
+                if (!classificationUtility->IsSameScript(baseChar, ch))
+                {
+                    // Different script - this combining mark should not stay with the base character
+                    isCombiningWithBase = false;
+                }
+            }
+
+            // Update base character tracking
+            if (!isCombining)
+            {
+                baseChar = ch;
+            }
 
             pCharAttribute[i] = (CharAttributeType)
-                                (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
+                                (((isCombiningWithBase) ? CharAttribute::IsCombining : CharAttribute::None)
diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs
index 6ea88c585e2..a0ad0e5224b 100644
--- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs
+++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs
@@ -108,7 +108,8 @@ public void GetCharAttribute(
             out bool isIndic,
             out bool isDigit,
             out bool isLatin,
-            out bool isStrong
+            out bool isStrong,
+            out bool isScriptAgnosticCombining
             )
         {
             CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar));
@@ -119,7 +120,7 @@ out bool isStrong
                            || Classification.IsIVS(unicodeScalar));
 
             isStrong = (itemClass == (byte)ItemClass.StrongClass);
-
+
             int script = charAttribute.Script;
             needsCaretInfo = ScriptCaretInfo[script];
 
@@ -134,6 +135,16 @@ out bool isStrong
             {
                 isIndic = IsScriptIndic(scriptId);
             }
+
+            isScriptAgnosticCombining = Classification.IsScriptAgnosticCombining(unicodeScalar);
+        }
+
+        ///
+        /// Check whether two Unicode scalar values belong to the same script.
+        ///
+        public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
+        {
+            return Classification.IsSameScript(unicodeScalar1, unicodeScalar2);
         }
 
         ///
@@ -159,6 +170,7 @@ private static bool IsScriptIndic(ScriptID scriptId)
             }
         }
     }
+
     ///
     /// Hold the classification table pointers.
     ///
@@ -253,16 +265,76 @@ public static short GetUnicodeClass(int unicodeScalar)
 
 
         ///
-        /// Lookup script ID for a Unicode scalar value
+        /// Check whether two Unicode scalar values belong to the same script
         ///
-        public static ScriptID GetScript(int unicodeScalar)
+        public static bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
         {
             unsafe
             {
-                return (ScriptID)Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].Script;
+                short unicodeClass1 = GetUnicodeClass(unicodeScalar1);
+                short unicodeClass2 = GetUnicodeClass(unicodeScalar2);
+                if (unicodeClass1 != unicodeClass2)
+                {
+                    CharacterAttribute a1 = Classification.CharAttributeTable[unicodeClass1];
+                    CharacterAttribute a2 = Classification.CharAttributeTable[unicodeClass2];
+                    if (a1.Script != a2.Script)
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
             }
         }
 
+        ///
+        /// Check whether the character is a script-agnostic combining mark (font extender) that should
+        /// stay with its base character regardless of script differences.
+        ///
+        ///
+        /// Corresponds to a subset of DWriteCore's is_font_extender predicate, covering characters
+        /// that require special handling to prevent run-splitting when script comparisons would
+        /// otherwise split them. These are combining marks whose Unicode script is not the same
+        /// as the base character's script, so that emoji sequences like "1️⃣" (digit + VS16 +
+        /// U+20E3 combining enclosing keycap) stay together.
+        ///
+        /// Note: ZWJ (U+200D) is NOT listed here because it is a JoinerClass character.
+        /// IsCombining() returns false for it, so this function would never be reached for ZWJ.
+        /// ZWJ is handled upstream by IsJoiner() and the prevWasJoiner logic in MapCharacters.
+        ///
+        ///
+        public static bool IsScriptAgnosticCombining(int unicodeScalar)
+        {
+            // Variation Selectors VS1-VS16 (U+FE00-U+FE0F)
+            if (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F)
+                return true;
+
+            // Ideographic Variation Selectors VS17-VS256 (U+E0100-U+E01EF)
+            if (IsIVS(unicodeScalar))
+                return true;
+
+            // Combining Diacritical Marks Extended (U+1AB0-U+1AFF)
+            if (unicodeScalar >= 0x1AB0 && unicodeScalar <= 0x1AFF)
+                return true;
+
+            // Combining Diacritical Marks Supplement (U+1DC0-U+1DFF)
+            if (unicodeScalar >= 0x1DC0 && unicodeScalar <= 0x1DFF)
+                return true;
+
+            // Combining Diacritical Marks for Symbols (U+20D0-U+20FF) - includes U+20E3 keycap
+            if (unicodeScalar >= 0x20D0 && unicodeScalar <= 0x20FF)
+                return true;
+
+            // Combining Half Marks (U+FE20-U+FE2F)
+            if (unicodeScalar >= 0xFE20 && unicodeScalar <= 0xFE2F)
+                return true;
+
+            // Emoji Modifiers / Skin tones (U+1F3FB-U+1F3FF)
+            if (unicodeScalar >= 0x1F3FB && unicodeScalar <= 0x1F3FF)
+                return true;
+
+            return false;
+        }
 
         ///
         /// Compute Unicode scalar value from unicode codepoint stream
diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs
index dbc88663016..1a4597e4cfe 100644
--- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs
+++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs
@@ -1,4 +1,4 @@
-// Licensed to the .NET Foundation under one or more agreements.
+// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
 //
@@ -304,7 +304,11 @@ out sizeofChar
                         {
                             // continue to advance for combining mark with base char (can be precomposed by shaping engine)
                             // except if it is a different script (#6801)
-                            if (Classification.GetScript(baseChar) == Classification.GetScript(originalChar))
+                            // However, script-agnostic combining marks (variation selectors, combining enclosing marks)
+                            // should stay with their base character regardless of script, to allow emoji sequences
+                            // like "1️⃣" (digit + VS16 + combining enclosing keycap) to stay together.
+                            if (Classification.IsScriptAgnosticCombining(originalChar)
+                                || Classification.IsSameScript(baseChar, originalChar))
                             {
                                 continue;
                             }
@@ -336,6 +340,13 @@ out sizeofChar
                 // UnicodeScalar won't return a sizeofChar that exceeds the string length.
                 Debug.Assert(advance + sizeofChar <= unicodeString.Length);
 
+                // Track whether the previous character was a joiner. DWriteCore's font fallback
+                // algorithm extends the unmapped run to include the character immediately following
+                // a joiner (is_joiner(previous_char) in try_map_font). This keeps ZWJ emoji
+                // sequences like "👨‍👩‍👧" together in the unmapped run so they are sent to
+                // fallback as a unit.
+                bool prevWasJoiner = false;
+
                 for (nextValid = advance + sizeofChar; nextValid < unicodeString.Length; nextValid += sizeofChar)
                 {
                     // Get the character.
@@ -347,6 +358,12 @@ out sizeofChar
                     // Apply digit substitution, if any.
                     int ch = digitMap[originalChar];
 
+                    if (Classification.IsJoiner(originalChar))
+                    {
+                        prevWasJoiner = true;
+                        continue;
+                    }
+
                     //
                     // Combining mark should always be shaped by the same font as the base char.
                     // If the physical font is invalid for the base char, it should also be invalid for the
@@ -357,12 +374,25 @@ out sizeofChar
                     // as the base char such that they will eventually be resolved to the same physical font.
                     // That means FamilyMap for the combining mark is not used when it follows a base char.
                     //
-                    // The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base
-                    // char in front.
-                    if (Classification.IsJoiner(ch)
-                       || (baseChar != NOBASE && Classification.IsCombining(ch) && Classification.GetScript(ch) == Classification.GetScript(baseChar))
+                    // Script-agnostic combining marks (variation selectors, combining enclosing marks) should
+                    // also stay with the base character regardless of script differences.
+                    //
+                    // If the previous character was a joiner, pull this character into the unmapped run
+                    // regardless of whether it is a combining mark (mirrors DWriteCore is_joiner(previous_char)).
+                    if (prevWasJoiner
+                       || (baseChar != NOBASE && Classification.IsCombining(ch)
+                           && (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch)))
                        )
-                        continue;
+                    {
+                        // Update baseChar for any non-combining char pulled into the unmapped run by a joiner so
+                        // that combining marks that follow it are associated with the correct base.
+                        if (prevWasJoiner && !Classification.IsCombining(ch))
+                            baseChar = ch;
+                        prevWasJoiner = false;
+                        continue;
+                    }
+
+                    prevWasJoiner = false;
 
                     // If we have a glyph it's valid.
                     if (font.HasCharacter(checked((uint)ch)))