dotnet · etvorun · Jan 26, 2026 · Jan 27, 2026 · Feb 19, 2026 · Feb 19, 2026
@@ -21,10 +21,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
             [System::Runtime::InteropServices::Out] bool% isIndic,
             [System::Runtime::InteropServices::Out] bool% isDigit,
             [System::Runtime::InteropServices::Out] bool% isLatin,
-            [System::Runtime::InteropServices::Out] bool% isStrong
+            [System::Runtime::InteropServices::Out] bool% isStrong,
+            [System::Runtime::InteropServices::Out] bool% isScriptAgnosticCombining
             );
+
+        /// <summary>
+        /// Check whether two Unicode scalar values belong to the same script.
+        /// This is used to determine if combining marks should stay with their base character
+        /// for font fallback purposes. (See PR #6857 / Issue #6801)
+        /// </summary>
+        bool IsSameScript(int unicodeScalar1, int unicodeScalar2);
     };
 
 }}}}//MS::Internal::Text::TextInterface
 
-#endif //__ICLASSIFICATION_H
+#endif //__ICLASSIFICATION_H
@@ -154,24 +154,31 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
         bool isLatin;
         bool isStrong;
         bool isExtended;
+        bool isScriptAgnosticCombining;
 
+        WCHAR ch = text[0];
         classificationUtility->GetCharAttribute(
-            text[0],
+            ch,
             isCombining,
             needsCaretInfo,
             isIndic,
             isDigit,
             isLatin,
-            isStrong
-            );
+            isStrong,
+            isScriptAgnosticCombining
+        );
 
-        isExtended = ItemizerHelper::IsExtendedCharacter(text[0]);
+        isExtended = ItemizerHelper::IsExtendedCharacter(ch);
 
         UINT32 isDigitRangeStart = 0;
         UINT32 isDigitRangeEnd = 0;
         bool   previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit;
         bool   currentIsDigitValue;
 
+        // Track base character for combining mark script comparison (PR #6857 / Issue #6801).
+        // A combining mark should only stay with its base character if they have the same script.
+        // baseChar stays -1 until a non-combining character has been seen.
+        int baseChar = isCombining ? -1 : ch;
+
         // pCharAttribute is assumed to have the same length as text. This is enforced by Itemize().
         pCharAttribute[0] = (CharAttributeType)
                             (((isCombining)    ? CharAttribute::IsCombining    : CharAttribute::None)
@@ -183,18 +190,40 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
 
         for (UINT32 i = 1; i < length; ++i)
         {
+            ch = text[i];
             classificationUtility->GetCharAttribute(
-            text[i],
-            isCombining,
-            needsCaretInfo,
-            isIndic,
-            isDigit,
-            isLatin,
-            isStrong
+                ch,
+                isCombining,
+                needsCaretInfo,
+                isIndic,
+                isDigit,
+                isLatin,
+                isStrong,
+                isScriptAgnosticCombining
             );
 
-            isExtended = ItemizerHelper::IsExtendedCharacter(text[i]);
-
+            isExtended = ItemizerHelper::IsExtendedCharacter(ch);
+
+            // For combining marks, check if they have the same script as the base character.
+            // If not, they should not be treated as combining with the base (PR #6857 / Issue #6801).
+            // However, script-agnostic combining marks (variation selectors, combining enclosing marks,
+            // emoji modifiers, etc.) are designed to work with any base character regardless of script,
+            // so skip the check for them to allow emoji sequences to stay together.
+            bool isCombiningWithBase = isCombining;
+            if (isCombining && baseChar >= 0 && !isScriptAgnosticCombining)
+            {
+                if (!classificationUtility->IsSameScript(baseChar, ch))
+                {
+                    // Different script - this combining mark should not stay with the base character
+                    isCombiningWithBase = false;
+                }
+            }
+
+            // Update base character tracking
+            if (!isCombining)
+            {
+                baseChar = ch;
+            }
 
             pCharAttribute[i] = (CharAttributeType)
                                 (((isCombining)    ? CharAttribute::IsCombining    : CharAttribute::None)

@@ -108,7 +108,8 @@ public void GetCharAttribute(
                                     out bool isIndic,
                                     out bool isDigit,
                                     out bool isLatin,
-                                    out bool isStrong
+                                    out bool isStrong,
+                                    out bool isScriptAgnosticCombining
                                     )
         {
             CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar));
@@ -119,7 +120,7 @@ out bool isStrong
                         || Classification.IsIVS(unicodeScalar));
 
             isStrong = (itemClass == (byte)ItemClass.StrongClass);
-            
+
             int script = charAttribute.Script;
             needsCaretInfo = ScriptCaretInfo[script];
 
@@ -134,6 +135,16 @@ out bool isStrong
             {
                 isIndic = IsScriptIndic(scriptId);
             }
+
+            isScriptAgnosticCombining = Classification.IsScriptAgnosticCombining(unicodeScalar);
+        }
+
+        /// <summary>
+        /// Check whether two Unicode scalar values belong to the same script (forwards to Classification.IsSameScript).
+        /// </summary>
+        public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
+        {
+            return Classification.IsSameScript(unicodeScalar1, unicodeScalar2);
         }
 
         /// <summary>
@@ -159,6 +170,7 @@ private static bool IsScriptIndic(ScriptID scriptId)
             }
         }
     }
+
     /// <summary>
     /// Hold the classification table pointers. 
     /// </summary>    
@@ -253,16 +265,76 @@ public static short GetUnicodeClass(int unicodeScalar)
 
 
         /// <summary>
-        /// Lookup script ID for a Unicode scalar value
+        /// Check whether two Unicode scalar values belong to the same script: true when they share a Unicode class or when their classes' CharAttributeTable entries have equal Script values.
         /// </summary>
-        public static ScriptID GetScript(int unicodeScalar)
+        public static bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
         {
             unsafe
             {
-                return (ScriptID)Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].Script;
+                short unicodeClass1 = GetUnicodeClass(unicodeScalar1);
+                short unicodeClass2 = GetUnicodeClass(unicodeScalar2);
+                if (unicodeClass1 != unicodeClass2)
+                {
+                    CharacterAttribute a1 = Classification.CharAttributeTable[unicodeClass1];
+                    CharacterAttribute a2 = Classification.CharAttributeTable[unicodeClass2];
+                    if (a1.Script != a2.Script)
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
             }
         }
 
+        /// <summary>
+        /// Determine whether a character is a script-agnostic combining mark (font extender), i.e. one
+        /// that should remain attached to its base character even when the two differ in script.
+        /// </summary>
+        /// <remarks>
+        /// Covers a subset of DWriteCore's is_font_extender predicate: the characters that need special
+        /// handling so that a script comparison does not split them from their base. Their Unicode
+        /// script is not the same as the base character's script, yet emoji sequences such as "1️⃣"
+        /// (digit + VS16 + U+20E3 combining enclosing keycap) have to stay together.
+        /// <para>
+        /// ZWJ (U+200D) is deliberately NOT listed. It is a JoinerClass character for which
+        /// IsCombining() returns false, so call sites that test IsCombining() first never consult
+        /// this predicate for it; ZWJ is handled by IsJoiner() and the prevWasJoiner logic in
+        /// MapCharacters instead.
+        /// </para>
+        /// </remarks>
+        /// <param name="unicodeScalar">Unicode scalar value to classify.</param>
+        /// <returns>true if the scalar falls in one of the ranges below or is an IVS; otherwise false.</returns>
+        public static bool IsScriptAgnosticCombining(int unicodeScalar)
+        {
+            // The alternatives are evaluated left to right with short-circuiting, in the same
+            // order as the block list: the first matching range decides the result.
+            return
+                // Variation Selectors VS1-VS16 (U+FE00-U+FE0F)
+                (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F)
+                // Ideographic Variation Selectors VS17-VS256 (U+E0100-U+E01EF)
+                || IsIVS(unicodeScalar)
+                // Combining Diacritical Marks Extended (U+1AB0-U+1AFF)
+                || (unicodeScalar >= 0x1AB0 && unicodeScalar <= 0x1AFF)
+                // Combining Diacritical Marks Supplement (U+1DC0-U+1DFF)
+                || (unicodeScalar >= 0x1DC0 && unicodeScalar <= 0x1DFF)
+                // Combining Diacritical Marks for Symbols (U+20D0-U+20FF) - includes U+20E3 keycap
+                || (unicodeScalar >= 0x20D0 && unicodeScalar <= 0x20FF)
+                // Combining Half Marks (U+FE20-U+FE2F)
+                || (unicodeScalar >= 0xFE20 && unicodeScalar <= 0xFE2F)
+                // Emoji Modifiers / Skin tones (U+1F3FB-U+1F3FF)
+                || (unicodeScalar >= 0x1F3FB && unicodeScalar <= 0x1F3FF);
+        }
 
         /// <summary>
         /// Compute Unicode scalar value from unicode codepoint stream

@@ -1,4 +1,4 @@
-// Licensed to the .NET Foundation under one or more agreements.
+// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
 //
@@ -304,7 +304,11 @@ out sizeofChar
                 {
                     // continue to advance for combining mark with base char (can be precomposed by shaping engine)
                     // except if it is a different script (#6801)
-                    if (Classification.GetScript(baseChar) == Classification.GetScript(originalChar))
+                    // However, script-agnostic combining marks (variation selectors, combining enclosing marks)
+                    // should stay with their base character regardless of script, to allow emoji sequences
+                    // like "1️⃣" (digit + VS16 + combining enclosing keycap) to stay together.
+                    if (Classification.IsScriptAgnosticCombining(originalChar)
+                        || Classification.IsSameScript(baseChar, originalChar))
                     {
                         continue;
                     }
@@ -336,6 +340,13 @@ out sizeofChar
                 // UnicodeScalar won't return a sizeofChar that exceeds the string length.
                 Debug.Assert(advance + sizeofChar <= unicodeString.Length);
 
+                // Track whether the previous character was a joiner. DWriteCore's font fallback
+                // algorithm extends the unmapped run to include the character immediately following
+                // a joiner (is_joiner(previous_char) in try_map_font). This keeps ZWJ emoji
+                // sequences like "👨‍👩‍👧" together in the unmapped run so they are sent to
+                // fallback as a unit.
+                bool prevWasJoiner = false;
+
                 for (nextValid = advance + sizeofChar; nextValid < unicodeString.Length; nextValid += sizeofChar)
                 {
                     // Get the character.
@@ -347,6 +358,12 @@ out sizeofChar
                     // Apply digit substitution, if any.
                     int ch = digitMap[originalChar];
 
+                    if (Classification.IsJoiner(originalChar))
+                    {
+                        prevWasJoiner = true;
+                        continue;
+                    }
+
                     //
                     // Combining mark should always be shaped by the same font as the base char.
                     // If the physical font is invalid for the base char, it should also be invalid for the
@@ -357,12 +374,25 @@ out sizeofChar
                     //   as the base char such that they will eventually be resolved to the same physical font.
                     //   That means FamilyMap for the combining mark is not used when it follows a base char.
                     //
-                    // The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base
-                    // char in front.
-                    if (Classification.IsJoiner(ch)
-                       || (baseChar != NOBASE && Classification.IsCombining(ch) && Classification.GetScript(ch) == Classification.GetScript(baseChar))
+                    // Script-agnostic combining marks (variation selectors, combining enclosing marks) should
+                    // also stay with the base character regardless of script differences.
+                    //
+                    // If the previous character was a joiner, pull this character into the unmapped run
+                    // regardless of whether it is a combining mark (mirrors DWriteCore is_joiner(previous_char)).
+                    if (prevWasJoiner
+                       || (baseChar != NOBASE && Classification.IsCombining(ch)
+                           && (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch)))
                        )
-                       continue;
+                    {
+                        // Update baseChar for any non-combining char pulled into the unmapped run by a
+                        // joiner so that combining marks that follow it are associated with the correct base.
+                        if (prevWasJoiner && !Classification.IsCombining(ch))
+                            baseChar = ch;
+                        prevWasJoiner = false;
+                        continue;
+                    }
+
+                    prevWasJoiner = false;
 
                     // If we have a glyph it's valid.
                     if (font.HasCharacter(checked((uint)ch)))