diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h
index 74ea6c84c3c..df29873f0f1 100644
--- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h
+++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/IClassification.h
@@ -21,10 +21,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
[System::Runtime::InteropServices::Out] bool% isIndic,
[System::Runtime::InteropServices::Out] bool% isDigit,
[System::Runtime::InteropServices::Out] bool% isLatin,
- [System::Runtime::InteropServices::Out] bool% isStrong
+ [System::Runtime::InteropServices::Out] bool% isStrong,
+ [System::Runtime::InteropServices::Out] bool% isScriptAgnosticCombining
);
+
+ /// <summary>
+ /// Check whether two Unicode scalar values belong to the same script.
+ /// This is used to determine if combining marks should stay with their base character
+ /// for font fallback purposes. (See PR #6857 / Issue #6801)
+ /// </summary>
+ bool IsSameScript(int unicodeScalar1, int unicodeScalar2);
};
}}}}//MS::Internal::Text::TextInterface
-#endif //__ICLASSIFICATION_H
\ No newline at end of file
+#endif //__ICLASSIFICATION_H
diff --git a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp
index b7186f36dfd..9a2b88e9b8c 100644
--- a/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp
+++ b/src/Microsoft.DotNet.Wpf/src/DirectWriteForwarder/CPP/DWriteWrapper/TextAnalyzer.cpp
@@ -154,24 +154,31 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
bool isLatin;
bool isStrong;
bool isExtended;
+ bool isScriptAgnosticCombining;
+ WCHAR ch = text[0];
classificationUtility->GetCharAttribute(
- text[0],
+ ch,
isCombining,
needsCaretInfo,
isIndic,
isDigit,
isLatin,
- isStrong
- );
+ isStrong,
+ isScriptAgnosticCombining
+ );
- isExtended = ItemizerHelper::IsExtendedCharacter(text[0]);
+ isExtended = ItemizerHelper::IsExtendedCharacter(ch);
UINT32 isDigitRangeStart = 0;
UINT32 isDigitRangeEnd = 0;
bool previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit;
bool currentIsDigitValue;
+ // Track base character for combining mark script comparison (PR #6857 / Issue #6801)
+ // A combining mark should only stay with its base character if they have the same script.
+ int baseChar = isCombining ? -1 : ch;
+
// pCharAttribute is assumed to have the same length as text. This is enforced by Itemize().
pCharAttribute[0] = (CharAttributeType)
(((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
@@ -183,18 +190,40 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
for (UINT32 i = 1; i < length; ++i)
{
+ ch = text[i];
classificationUtility->GetCharAttribute(
- text[i],
- isCombining,
- needsCaretInfo,
- isIndic,
- isDigit,
- isLatin,
- isStrong
+ ch,
+ isCombining,
+ needsCaretInfo,
+ isIndic,
+ isDigit,
+ isLatin,
+ isStrong,
+ isScriptAgnosticCombining
);
- isExtended = ItemizerHelper::IsExtendedCharacter(text[i]);
-
+ isExtended = ItemizerHelper::IsExtendedCharacter(ch);
+
+ // For combining marks, check if they have the same script as the base character.
+ // If not, they should not be treated as combining with the base (PR #6857 / Issue #6801).
+ // However, script-agnostic combining marks (variation selectors, enclosing marks, emoji modifiers)
+ // are designed to work with any base character regardless of script, so skip the check
+ // for them to allow emoji sequences to stay together.
+ bool isCombiningWithBase = isCombining;
+ if (isCombining && baseChar >= 0 && !isScriptAgnosticCombining)
+ {
+ if (!classificationUtility->IsSameScript(baseChar, ch))
+ {
+ // Different script - clear the flag that feeds CharAttribute::IsCombining below
+ isCombiningWithBase = false;
+ }
+ }
+
+ // Update base character tracking
+ if (!isCombining)
+ {
+ baseChar = ch;
+ }
pCharAttribute[i] = (CharAttributeType)
- (((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
+ (((isCombiningWithBase) ? CharAttribute::IsCombining : CharAttribute::None)
diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs
index 6ea88c585e2..a0ad0e5224b 100644
--- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs
+++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/Classification.cs
@@ -108,7 +108,8 @@ public void GetCharAttribute(
out bool isIndic,
out bool isDigit,
out bool isLatin,
- out bool isStrong
+ out bool isStrong,
+ out bool isScriptAgnosticCombining
)
{
CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar));
@@ -119,7 +120,7 @@ out bool isStrong
|| Classification.IsIVS(unicodeScalar));
isStrong = (itemClass == (byte)ItemClass.StrongClass);
-
+
int script = charAttribute.Script;
needsCaretInfo = ScriptCaretInfo[script];
@@ -134,6 +135,16 @@ out bool isStrong
{
isIndic = IsScriptIndic(scriptId);
}
+
+ isScriptAgnosticCombining = Classification.IsScriptAgnosticCombining(unicodeScalar);
+ }
+
+ /// <summary>
+ /// Check whether two Unicode scalar values belong to the same script (forwards to the static Classification.IsSameScript).
+ /// </summary>
+ public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
+ {
+ return Classification.IsSameScript(unicodeScalar1, unicodeScalar2);
}
///
@@ -159,6 +170,7 @@ private static bool IsScriptIndic(ScriptID scriptId)
}
}
}
+
///
/// Hold the classification table pointers.
///
@@ -253,16 +265,76 @@ public static short GetUnicodeClass(int unicodeScalar)
/// <summary>
- /// Lookup script ID for a Unicode scalar value
+ /// Check whether two Unicode scalar values belong to the same script
/// </summary>
- public static ScriptID GetScript(int unicodeScalar)
+ public static bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
{
unsafe
{
- return (ScriptID)Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].Script;
+ // Equal Unicode classes index the same CharAttributeTable entry, so the scripts
+ // can only differ when the classes differ.
+ short firstClass = GetUnicodeClass(unicodeScalar1);
+ short secondClass = GetUnicodeClass(unicodeScalar2);
+ if (firstClass == secondClass)
+ {
+ return true;
+ }
+
+ CharacterAttribute firstAttribute = Classification.CharAttributeTable[firstClass];
+ CharacterAttribute secondAttribute = Classification.CharAttributeTable[secondClass];
+
+ return firstAttribute.Script == secondAttribute.Script;
}
}
+ /// <summary>
+ /// Check whether the character is a script-agnostic combining mark (font extender) that should
+ /// stay with its base character regardless of script differences.
+ /// </summary>
+ /// <remarks>
+ /// Corresponds to a subset of DWriteCore's is_font_extender predicate, covering characters
+ /// that require special handling to prevent run-splitting when script comparisons would
+ /// otherwise split them. These are combining marks whose Unicode script is not the same
+ /// as the base character's script, so that emoji sequences like "1️⃣" (digit + VS16 +
+ /// U+20E3 combining enclosing keycap) stay together.
+ ///
+ /// Note: ZWJ (U+200D) is NOT listed here because it is a JoinerClass character.
+ /// IsCombining() returns false for it, so this function would never be reached for ZWJ.
+ /// ZWJ is handled upstream by IsJoiner() and the prevWasJoiner logic in MapCharacters.
+ ///
+ /// Membership is decided purely by the code point: inclusive range tests plus IsIVS,
+ /// evaluated in the order they appear in the body.
+ /// </remarks>
+ /// <param name="unicodeScalar">Unicode scalar value to test.</param>
+ /// <returns>true when the value is in one of the listed ranges; otherwise false.</returns>
+ public static bool IsScriptAgnosticCombining(int unicodeScalar)
+ {
+ // One short-circuit expression: a clause is only evaluated when every clause before it
+ // was false, which matches a chain of early returns.
+ bool isScriptAgnostic =
+ // Variation Selectors VS1-VS16 (U+FE00-U+FE0F)
+ (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F)
+
+ // Ideographic Variation Selectors VS17-VS256 (U+E0100-U+E01EF)
+ || IsIVS(unicodeScalar)
+
+ // Combining Diacritical Marks Extended (U+1AB0-U+1AFF)
+ || (unicodeScalar >= 0x1AB0 && unicodeScalar <= 0x1AFF)
+
+ // Combining Diacritical Marks Supplement (U+1DC0-U+1DFF)
+ || (unicodeScalar >= 0x1DC0 && unicodeScalar <= 0x1DFF)
+
+ // Combining Diacritical Marks for Symbols (U+20D0-U+20FF) - includes U+20E3 keycap
+ || (unicodeScalar >= 0x20D0 && unicodeScalar <= 0x20FF)
+
+ // Combining Half Marks (U+FE20-U+FE2F)
+ || (unicodeScalar >= 0xFE20 && unicodeScalar <= 0xFE2F)
+
+ // Emoji Modifiers / Skin tones (U+1F3FB-U+1F3FF)
+ || (unicodeScalar >= 0x1F3FB && unicodeScalar <= 0x1F3FF);
+
+ return isScriptAgnostic;
+ }
///
/// Compute Unicode scalar value from unicode codepoint stream
diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs
index dbc88663016..1a4597e4cfe 100644
--- a/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs
+++ b/src/Microsoft.DotNet.Wpf/src/PresentationCore/MS/internal/FontFace/PhysicalFontFamily.cs
@@ -1,4 +1,4 @@
-// Licensed to the .NET Foundation under one or more agreements.
+// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
//
@@ -304,7 +304,11 @@ out sizeofChar
{
// continue to advance for combining mark with base char (can be precomposed by shaping engine)
// except if it is a different script (#6801)
- if (Classification.GetScript(baseChar) == Classification.GetScript(originalChar))
+ // However, script-agnostic combining marks (variation selectors, combining enclosing marks)
+ // should stay with their base character regardless of script, to allow emoji sequences
+ // like "1️⃣" (digit + VS16 + combining enclosing keycap) to stay together.
+ if (Classification.IsScriptAgnosticCombining(originalChar)
+ || Classification.IsSameScript(baseChar, originalChar))
{
continue;
}
@@ -336,6 +340,13 @@ out sizeofChar
// UnicodeScalar won't return a sizeofChar that exceeds the string length.
Debug.Assert(advance + sizeofChar <= unicodeString.Length);
+ // Track whether the previous character was a joiner. DWriteCore's font fallback
+ // algorithm extends the unmapped run to include the character immediately following
+ // a joiner (is_joiner(previous_char) in try_map_font). This keeps ZWJ emoji
+ // sequences like "👨👩👧" together in the unmapped run so they are sent to
+ // fallback as a unit.
+ bool prevWasJoiner = false;
+
for (nextValid = advance + sizeofChar; nextValid < unicodeString.Length; nextValid += sizeofChar)
{
// Get the character.
@@ -347,6 +358,12 @@ out sizeofChar
// Apply digit substitution, if any.
int ch = digitMap[originalChar];
+ if (Classification.IsJoiner(originalChar))
+ {
+ prevWasJoiner = true;
+ continue;
+ }
+
//
// Combining mark should always be shaped by the same font as the base char.
// If the physical font is invalid for the base char, it should also be invalid for the
@@ -357,12 +374,25 @@ out sizeofChar
// as the base char such that they will eventually be resolved to the same physical font.
// That means FamilyMap for the combining mark is not used when it follows a base char.
//
- // The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base
- // char in front.
- if (Classification.IsJoiner(ch)
- || (baseChar != NOBASE && Classification.IsCombining(ch) && Classification.GetScript(ch) == Classification.GetScript(baseChar))
+ // Script-agnostic combining marks (variation selectors, combining enclosing marks) should
+ // also stay with the base character regardless of script differences.
+ //
+ // If the previous character was a joiner, pull this character into the unmapped run
+ // regardless of whether it is a combining mark (mirrors DWriteCore is_joiner(previous_char)).
+ if (prevWasJoiner
+ || (baseChar != NOBASE && Classification.IsCombining(ch)
+ && (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch)))
)
- continue;
+ {
+ // Update baseChar for any non-combining char pulled into the unmapped run by a joiner so
+ // that combining marks that follow it are associated with the correct base.
+ if (prevWasJoiner && !Classification.IsCombining(ch))
+ baseChar = ch;
+ prevWasJoiner = false;
+ continue;
+ }
+
+ prevWasJoiner = false;
// If we have a glyph it's valid.
if (font.HasCharacter(checked((uint)ch)))