Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,18 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
[System::Runtime::InteropServices::Out] bool% isIndic,
[System::Runtime::InteropServices::Out] bool% isDigit,
[System::Runtime::InteropServices::Out] bool% isLatin,
[System::Runtime::InteropServices::Out] bool% isStrong
[System::Runtime::InteropServices::Out] bool% isStrong,
[System::Runtime::InteropServices::Out] bool% isScriptAgnosticCombining
);

/// <summary>
/// Check whether two Unicode scalar values belong to the same script.
/// This is used to determine if combining marks should stay with their base character
/// for font fallback purposes. (See PR #6857 / Issue #6801)
/// </summary>
/// <param name="unicodeScalar1">First Unicode scalar value to compare (callers pass the tracked base character).</param>
/// <param name="unicodeScalar2">Second Unicode scalar value to compare (callers pass the combining mark being examined).</param>
/// <returns>true when both scalar values are classified under the same script; false otherwise.</returns>
bool IsSameScript(int unicodeScalar1, int unicodeScalar2);
};

}}}}//MS::Internal::Text::TextInterface

#endif //__ICLASSIFICATION_H
#endif //__ICLASSIFICATION_H
Original file line number Diff line number Diff line change
Expand Up @@ -154,24 +154,31 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface
bool isLatin;
bool isStrong;
bool isExtended;
bool isScriptAgnosticCombining;

WCHAR ch = text[0];
classificationUtility->GetCharAttribute(
text[0],
ch,
isCombining,
needsCaretInfo,
Comment thread
etvorun marked this conversation as resolved.
isIndic,
isDigit,
isLatin,
isStrong
);
isStrong,
isScriptAgnosticCombining
);

isExtended = ItemizerHelper::IsExtendedCharacter(text[0]);
isExtended = ItemizerHelper::IsExtendedCharacter(ch);

UINT32 isDigitRangeStart = 0;
UINT32 isDigitRangeEnd = 0;
bool previousIsDigitValue = (numberCulture == nullptr) ? false : isDigit;
bool currentIsDigitValue;

// Track base character for combining mark script comparison (PR #6857 / Issue #6801)
// A combining mark should only stay with its base character if they have the same script.
int baseChar = isCombining ? -1 : ch;

// pCharAttribute is assumed to have the same length as text. This is enforced by Itemize().
pCharAttribute[0] = (CharAttributeType)
(((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
Expand All @@ -183,18 +190,40 @@ namespace MS { namespace Internal { namespace Text { namespace TextInterface

for (UINT32 i = 1; i < length; ++i)
{
ch = text[i];
classificationUtility->GetCharAttribute(
text[i],
isCombining,
needsCaretInfo,
isIndic,
isDigit,
isLatin,
isStrong
ch,
isCombining,
needsCaretInfo,
isIndic,
isDigit,
isLatin,
isStrong,
isScriptAgnosticCombining
);

isExtended = ItemizerHelper::IsExtendedCharacter(text[i]);

isExtended = ItemizerHelper::IsExtendedCharacter(ch);

// For combining marks, check if they have the same script as the base character.
// If not, they should not be treated as combining with the base (PR #6857 / Issue #6801).
// However, script-agnostic combining marks (variation selectors, ZWJ, emoji modifiers, etc.)
// are designed to work with any base character regardless of script, so skip the check
// for them to allow emoji sequences to stay together.
bool isCombiningWithBase = isCombining;
if (isCombining && baseChar >= 0 && !isScriptAgnosticCombining)
{
if (!classificationUtility->IsSameScript(baseChar, ch))
{
// Different script - this combining mark should not stay with the base character
isCombiningWithBase = false;
}
}

// Update base character tracking
if (!isCombining)
{
baseChar = ch;
}

pCharAttribute[i] = (CharAttributeType)
(((isCombining) ? CharAttribute::IsCombining : CharAttribute::None)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ public void GetCharAttribute(
out bool isIndic,
out bool isDigit,
out bool isLatin,
out bool isStrong
out bool isStrong,
out bool isScriptAgnosticCombining
)
{
CharacterAttribute charAttribute = Classification.CharAttributeOf((int)Classification.GetUnicodeClass(unicodeScalar));
Expand All @@ -119,7 +120,7 @@ out bool isStrong
|| Classification.IsIVS(unicodeScalar));

isStrong = (itemClass == (byte)ItemClass.StrongClass);

int script = charAttribute.Script;
needsCaretInfo = ScriptCaretInfo[script];

Expand All @@ -134,6 +135,16 @@ out bool isStrong
{
isIndic = IsScriptIndic(scriptId);
}

isScriptAgnosticCombining = Classification.IsScriptAgnosticCombining(unicodeScalar);
}

/// <summary>
/// Reports whether the two supplied Unicode scalar values are classified under the same script.
/// This instance method simply forwards to the static <c>Classification.IsSameScript</c> lookup.
/// </summary>
public bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
    => Classification.IsSameScript(unicodeScalar1, unicodeScalar2);

/// <summary>
Expand All @@ -159,6 +170,7 @@ private static bool IsScriptIndic(ScriptID scriptId)
}
}
}

/// <summary>
/// Hold the classification table pointers.
/// </summary>
Expand Down Expand Up @@ -253,16 +265,76 @@ public static short GetUnicodeClass(int unicodeScalar)


/// <summary>
/// Lookup script ID for a Unicode scalar value
/// Check whether two Unicode scalar values belong to the same script
/// </summary>
public static ScriptID GetScript(int unicodeScalar)
public static bool IsSameScript(int unicodeScalar1, int unicodeScalar2)
{
unsafe
{
return (ScriptID)Classification.CharAttributeTable[GetUnicodeClass(unicodeScalar)].Script;
short unicodeClass1 = GetUnicodeClass(unicodeScalar1);
short unicodeClass2 = GetUnicodeClass(unicodeScalar2);
if (unicodeClass1 != unicodeClass2)
{
CharacterAttribute a1 = Classification.CharAttributeTable[unicodeClass1];
CharacterAttribute a2 = Classification.CharAttributeTable[unicodeClass2];
if (a1.Script != a2.Script)
{
return false;
}
}

return true;
}
}

/// <summary>
/// Determines whether a character is a script-agnostic combining mark (font extender), i.e. a mark
/// that must remain attached to its base character even when the two differ in script.
/// </summary>
/// <remarks>
/// Mirrors a subset of DWriteCore's is_font_extender predicate: the combining marks whose Unicode
/// script differs from that of the base they attach to, and which would therefore be split into a
/// separate run by a plain script comparison. Keeping them with the base lets emoji sequences such
/// as "1️⃣" (digit + VS16 + U+20E3 combining enclosing keycap) stay in one run.
/// <para>
/// ZWJ (U+200D) is intentionally absent. It is a JoinerClass character, so IsCombining() returns
/// false for it and this predicate is never consulted; ZWJ is handled upstream by IsJoiner() and
/// the prevWasJoiner logic in MapCharacters.
/// </para>
/// </remarks>
public static bool IsScriptAgnosticCombining(int unicodeScalar)
{
    // Evaluated left to right with short-circuiting; the first matching range wins.
    return
        // Variation Selectors VS1-VS16 (U+FE00-U+FE0F)
        (unicodeScalar >= 0xFE00 && unicodeScalar <= 0xFE0F)
        // Ideographic Variation Selectors VS17-VS256 (U+E0100-U+E01EF)
        || IsIVS(unicodeScalar)
        // Combining Diacritical Marks Extended (U+1AB0-U+1AFF)
        || (unicodeScalar >= 0x1AB0 && unicodeScalar <= 0x1AFF)
        // Combining Diacritical Marks Supplement (U+1DC0-U+1DFF)
        || (unicodeScalar >= 0x1DC0 && unicodeScalar <= 0x1DFF)
        // Combining Diacritical Marks for Symbols (U+20D0-U+20FF) - includes U+20E3 keycap
        || (unicodeScalar >= 0x20D0 && unicodeScalar <= 0x20FF)
        // Combining Half Marks (U+FE20-U+FE2F)
        || (unicodeScalar >= 0xFE20 && unicodeScalar <= 0xFE2F)
        // Emoji Modifiers / Skin tones (U+1F3FB-U+1F3FF)
        || (unicodeScalar >= 0x1F3FB && unicodeScalar <= 0x1F3FF);
}

/// <summary>
/// Compute Unicode scalar value from unicode codepoint stream
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

//
Expand Down Expand Up @@ -304,7 +304,11 @@ out sizeofChar
{
// continue to advance for combining mark with base char (can be precomposed by shaping engine)
// except if it is a different script (#6801)
if (Classification.GetScript(baseChar) == Classification.GetScript(originalChar))
// However, script-agnostic combining marks (variation selectors, combining enclosing marks)
// should stay with their base character regardless of script, to allow emoji sequences
// like "1️⃣" (digit + VS16 + combining enclosing keycap) to stay together.
if (Classification.IsScriptAgnosticCombining(originalChar)
|| Classification.IsSameScript(baseChar, originalChar))
{
continue;
}
Expand Down Expand Up @@ -336,6 +340,13 @@ out sizeofChar
// UnicodeScalar won't return a sizeofChar that exceeds the string length.
Debug.Assert(advance + sizeofChar <= unicodeString.Length);

// Track whether the previous character was a joiner. DWriteCore's font fallback
// algorithm extends the unmapped run to include the character immediately following
// a joiner (is_joiner(previous_char) in try_map_font). This keeps ZWJ emoji
// sequences like "👨‍👩‍👧" together in the unmapped run so they are sent to
// fallback as a unit.
bool prevWasJoiner = false;

for (nextValid = advance + sizeofChar; nextValid < unicodeString.Length; nextValid += sizeofChar)
{
// Get the character.
Expand All @@ -347,6 +358,12 @@ out sizeofChar
// Apply digit substitution, if any.
int ch = digitMap[originalChar];

if (Classification.IsJoiner(originalChar))
{
prevWasJoiner = true;
continue;
}

//
// Combining mark should always be shaped by the same font as the base char.
// If the physical font is invalid for the base char, it should also be invalid for the
Expand All @@ -357,12 +374,25 @@ out sizeofChar
// as the base char such that they will eventually be resolved to the same physical font.
// That means FamilyMap for the combining mark is not used when it follows a base char.
//
// The same goes for joiner. Note that "hasBaseChar" here indicates if there is an invalid base
// char in front.
if (Classification.IsJoiner(ch)
|| (baseChar != NOBASE && Classification.IsCombining(ch) && Classification.GetScript(ch) == Classification.GetScript(baseChar))
// Script-agnostic combining marks (variation selectors, combining enclosing marks) should
// also stay with the base character regardless of script differences.
//
// If the previous character was a joiner, pull this character into the unmapped run
// regardless of whether it is a combining mark (mirrors DWriteCore is_joiner(previous_char)).
if (prevWasJoiner
|| (baseChar != NOBASE && Classification.IsCombining(ch)
&& (Classification.IsScriptAgnosticCombining(ch) || Classification.IsSameScript(baseChar, ch)))
)
continue;
{
// Update baseChar for any strong char pulled into the unmapped run by a joiner so
// that combining marks that follow it are associated with the correct base.
if (prevWasJoiner && !Classification.IsCombining(ch))
baseChar = ch;
prevWasJoiner = false;
continue;
}

prevWasJoiner = false;

// If we have a glyph it's valid.
if (font.HasCharacter(checked((uint)ch)))
Expand Down