Skip to content

Commit f02b3be

Browse files
TheTrueFlopsy and Johan Sarge authored
Consider the Unicode display width of characters in Composite #117
This PR is intended to improve the Unicode support in the Composite class. The class has not been taking the display width of characters (i.e. code points) into account when it assigns columns to them. All characters have been assumed to occupy a single column, even though they might in fact occupy two (e.g. East Asian characters) or zero (e.g. combining diacritics) when displayed in a terminal. For this, Composite::str() has been rewritten. The new code uses mk_wcwidth() to determine the display width of each character in layer texts and implements a many-to-many relationship between characters and columns (more specifically, many-to-no-more-than-two). This solution should produce correct output strings for any Composite where all layer text characters have a context-independent display width that is correctly reported by mk_wcwidth(). Note that the new code will still fail in cases where the display width of the layer text is not equal to the sum of the reported widths of its characters individually. Signed-off-by: Johan Sarge <rabiteman_2000@yahoo.com> Co-authored-by: Johan Sarge <rabiteman_2000@yahoo.com>
1 parent 7db4f53 commit f02b3be

4 files changed

Lines changed: 237 additions & 40 deletions

File tree

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ The following submitted code, packages or analysis, and deserve special thanks:
2525
Tobias Predel
2626
Andrew Poelstra
2727
thaafox
28+
Johan Sarge
2829

2930
Thanks to the following, who submitted detailed bug reports and excellent
3031
suggestions:

ChangeLog

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
master/HEAD
2+
- #117 Extended Unicode support in Composite, character display width now taken into account
3+
(thanks to Johan Sarge)
24
- #111 Duration: support negative durations by prefixing a '-' before the P in ISO format
35
(thanks to Andrew Poelstra)
46
- #113 Set CMAKE_CURRENT_SOURCE_DIR instead of CMAKE_SOURCE_DIR

src/Composite.cpp

Lines changed: 188 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
////////////////////////////////////////////////////////////////////////////////
22
//
3-
// Copyright 2016 - 2021, 2023, Gothenburg Bit Factory.
3+
// Copyright 2016 - 2021, 2023, 2026 Gothenburg Bit Factory.
44
//
55
// Permission is hereby granted, free of charge, to any person obtaining a copy
66
// of this software and associated documentation files (the "Software"), to deal
@@ -25,10 +25,111 @@
2525
////////////////////////////////////////////////////////////////////////////////
2626

2727
#include <Composite.h>
28+
#include <format.h>
29+
#include <limits>
2830
#include <sstream>
29-
#include <stack>
3031
#include <utf8.h>
3132

33+
34+
////////////////////////////////////////////////////////////////////////////////
35+
36+
namespace
{

// Stores (x) at index (i) of the vector (v).
//
// If (i) is a valid index, the element there is overwritten. Otherwise the
// vector is grown to exactly (i + 1) elements: every new slot between the old
// end of the vector and (i) is filled with (pad), and (x) ends up at index (i).
template <typename T>
void put_or_extend (
  std::vector<T>& v, typename std::vector<T>::size_type i, const T& x, const T& pad = T {})
{
  if (i < v.size ())
  {
    v[i] = x;
    return;
  }

  // Growing the vector may reallocate its storage. (x) is read after that
  // point, so take a copy first; otherwise the call would read a dangling
  // reference whenever (x) refers to an element of (v).
  const T value (x);
  v.resize (i, pad);
  v.push_back (value);
}

// Describes the content of one display column of a Composite.
//
// A column either shows a slice [text_begin_i, text_end_i) of the UTF-8 text of
// one layer, or it is a "padding" column, encoded as text_end_i < text_begin_i.
struct ColumnData
{
  // Number of the topmost layer that overlaps with this column.
  // NOTE: Layer numbers start at 1. "Layer 0" is background not covered by any layer.
  unsigned int layer_num;

  // Byte offset into the UTF-8 text of layer (layer_num): first byte of the
  // first character that belongs to this column.
  std::string::size_type text_begin_i;

  // Byte offset into the UTF-8 text of layer (layer_num): first byte after the
  // last character that belongs to this column.
  std::string::size_type text_end_i;

  // Unicode display width of the first character of this column. Should always
  // be 1 or 2, unless this is a padding column (then it is 0).
  unsigned char char_0_width;

  // The defaults (begin_i = 1, end_i = 0, width 0) produce a padding column,
  // so ColumnData () is padding on "layer 0" and ColumnData (n) is padding
  // that is still attributed to layer (n).
  ColumnData (
    unsigned int layer = 0, std::string::size_type begin_i = 1, std::string::size_type end_i = 0,
    unsigned char c_0_w = 0)
  : layer_num (layer), text_begin_i (begin_i), text_end_i (end_i), char_0_width (c_0_w)
  {}

  // Number of text bytes that belong to this column; negative for padding.
  // Both offsets are converted to the signed type before subtracting, so the
  // negative result does not depend on an out-of-range unsigned-to-signed
  // conversion.
  std::string::difference_type byte_count () const
  {
    return static_cast<std::string::difference_type> (text_end_i)
         - static_cast<std::string::difference_type> (text_begin_i);
  }

  // Turns this column into a padding column. (layer_num) is left untouched, so
  // the column is still attributed to the same layer.
  void make_padding ()
  {
    text_begin_i = 1;
    text_end_i   = 0;
    char_0_width = 0;
  }

  // True if this column carries no text and must be rendered as blank space.
  bool is_padding () const
  {
    return text_end_i < text_begin_i;
  }
};

// ColumnData representing a padding column on "layer 0".
const ColumnData LAYER_0_PAD;

// Special column index value, distinct from any valid column index.
const std::string::size_type INVALID_COLUMN_I = std::numeric_limits<std::string::size_type>::max ();

// Turns the uncovered left half of a half-covered wide character into padding.
//
// Intended to be called just before something is placed at (column_i). If the
// column immediately to the left holds a wide (two-column) character, its
// second half is about to be covered. Half a character cannot be displayed, so
// that character (together with any nonspacing characters attached to it) is
// replaced with padding. Does nothing when (column_i) is 0 or when there is no
// column at (column_i - 1).
inline void do_halfcovered_wide_char_check (
  std::vector<ColumnData>& columns, std::vector<ColumnData>::size_type column_i)
{
  if (column_i == 0 || column_i > columns.size ())
    return;

  ColumnData& left_neighbour = columns[column_i - 1];
  if (left_neighbour.char_0_width == 2)
    left_neighbour.make_padding ();
}

} // anonymous namespace
132+
32133
////////////////////////////////////////////////////////////////////////////////
33134
// Initially assume no text, but infinite virtual space.
34135
//
@@ -74,65 +175,114 @@ void Composite::add (
74175
// bbbbb // Layer 2
75176
// c // Layer 3
76177
//
77-
// Walk all strings left to right, selecting the character and color from the
178+
// Walk all layers left to right, selecting the character and color from the
78179
// highest numbered layer. Emit color codes only on edge detection.
79180
//
80181
std::string Composite::str () const
81182
{
82-
// The strings are broken into a vector of int, for UTF8 support.
83-
std::vector <int> characters;
84-
std::vector <int> colors;
85-
for (unsigned int layer = 0; layer < _layers.size (); ++layer)
183+
std::vector <ColumnData> columns;
184+
185+
for (unsigned int layer_i = 0; layer_i < _layers.size (); ++layer_i)
86186
{
87-
const auto& text = std::get <0> (_layers[layer]);
88-
auto offset = std::get <1> (_layers[layer]);
89-
auto len = utf8_text_length (text);
187+
const auto& text = std::get <0> (_layers[layer_i]);
188+
auto offset = std::get <1> (_layers[layer_i]);
189+
auto len = utf8_text_length (text);
90190

91-
// Make sure the vectors are large enough to support a write operator[].
92-
if (characters.size () < offset + len)
93-
{
94-
characters.resize (offset + len, 32);
95-
colors.resize (offset + len, 0);
96-
}
191+
// Make sure the capacity of the column vector is large enough to support push_back()
192+
// without reallocation.
193+
if (columns.capacity () < offset + len)
194+
columns.reserve (offset + len);
97195

98-
// Copy in the layer characters and color indexes.
196+
// Inspect and decide how to handle each character (i.e. Unicode code point)
197+
// in the current layer's text string.
198+
std::string::size_type prev_cursor = 0;
99199
std::string::size_type cursor = 0;
100-
int character;
101-
int count = 0;
200+
unsigned int column_count = 0;
201+
std::string::size_type prev_spacer_column_i = INVALID_COLUMN_I;
202+
unsigned int character;
102203
while ((character = utf8_next_char (text, cursor)))
103204
{
104-
characters[offset + count] = character;
105-
colors [offset + count] = layer + 1;
106-
++count;
205+
std::string::size_type column_i = offset + column_count;
206+
int ch_width = mk_wcwidth ((wchar_t)character);
207+
208+
switch (ch_width)
209+
{
210+
case 0: // zero-width / nonspacing character
211+
if (prev_spacer_column_i == INVALID_COLUMN_I) // No preceding spacing character on this layer.
212+
; // Skip this character.
213+
else // There is a preceding spacing character on this layer.
214+
{
215+
// Append the nonspacing character to the column of the previous spacing character.
216+
columns[prev_spacer_column_i].text_end_i = cursor;
217+
}
218+
break;
219+
case 1: // ordinary narrow spacing character
220+
if (prev_spacer_column_i == INVALID_COLUMN_I)
221+
do_halfcovered_wide_char_check (columns, column_i);
222+
223+
// Put the character in the appropriate column. Pad out the column list as necessary.
224+
put_or_extend (columns, column_i, ColumnData (layer_i + 1, prev_cursor, cursor, 1), LAYER_0_PAD);
225+
226+
prev_spacer_column_i = column_i;
227+
column_count += 1;
228+
break;
229+
case 2: // graphically wide spacing character
230+
if (prev_spacer_column_i == INVALID_COLUMN_I)
231+
do_halfcovered_wide_char_check (columns, column_i);
232+
233+
// Put the character in the appropriate column. Pad out the column list as necessary.
234+
// Make the column after the current one (which is also covered by the wide character)
235+
// a padding column on the current layer.
236+
put_or_extend (columns, column_i, ColumnData (layer_i + 1, prev_cursor, cursor, 2), LAYER_0_PAD);
237+
put_or_extend (columns, column_i + 1, ColumnData (layer_i + 1), LAYER_0_PAD);
238+
239+
prev_spacer_column_i = column_i;
240+
column_count += 2;
241+
break;
242+
default: // Should not happen.
243+
throw format ("Unexpected character width {1} of code point 0x{2}.", ch_width, formatHex (character));
244+
}
245+
246+
// Remember byte offset of first UTF-8 byte of next character in the layer text.
247+
prev_cursor = cursor;
107248
}
108249
}
109250

110-
// Now walk the character and color vector, emitting every character and
111-
// every detected color change.
251+
// Now walk the column vector, emitting every character and every detected layer change.
112252
std::stringstream out;
113-
int prev_color = 0;
114-
for (unsigned int i = 0; i < characters.size (); ++i)
253+
unsigned int prev_layer = 0;
254+
for (unsigned int column_i = 0; column_i < columns.size (); ++column_i)
115255
{
116-
// A change in color triggers a code emit.
117-
if (prev_color != colors[i])
256+
auto column_data = columns[column_i];
257+
auto curr_layer = column_data.layer_num;
258+
const auto& text = std::get <0> (_layers[curr_layer - 1]);
259+
260+
// A change in layer triggers an ANSI escape code emit.
261+
if (prev_layer != curr_layer)
118262
{
119-
if (prev_color)
120-
out << std::get <2> (_layers[prev_color - 1]).end ();
263+
if (prev_layer) // Reset attributes (if any) of previous layer.
264+
out << std::get <2> (_layers[prev_layer - 1]).end ();
121265

122-
if (colors[i])
123-
out << std::get <2> (_layers[colors[i] - 1]).code ();
124-
else
125-
out << std::get <2> (_layers[prev_color - 1]).end ();
266+
if (curr_layer) // Set attributes (if any) of current layer.
267+
out << std::get <2> (_layers[curr_layer - 1]).code ();
126268

127-
prev_color = colors[i];
269+
prev_layer = curr_layer;
128270
}
129271

130-
out << utf8_character (characters[i]);
272+
// The layer text string is already UTF-8, so we can output its bytes verbatim,
273+
// provided that we're keeping track of character (i.e. code point) boundaries.
274+
if (column_data.is_padding ())
275+
out << ' '; // Display padding columns as spaces.
276+
else // Display a slice of the layer text (Spacer [Nonspacer ...]).
277+
out.write(text.data () + column_data.text_begin_i, column_data.byte_count ());
278+
279+
if (column_data.char_0_width == 2)
280+
++column_i; // Wide characters cover two columns.
131281
}
132282

133283
// Terminate the color codes, if necessary.
134-
if (prev_color)
135-
out << std::get <2> (_layers[prev_color - 1]).end ();
284+
if (prev_layer)
285+
out << std::get <2> (_layers[prev_layer - 1]).end ();
136286

137287
return out.str ();
138288
}

test/composite.t.cpp

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
////////////////////////////////////////////////////////////////////////////////
3131
int main (int, char**)
3232
{
33-
UnitTest t (3);
33+
UnitTest t (4);
3434

3535
Composite c1;
3636
c1.add ("left", 2, Color ());
@@ -130,8 +130,52 @@ int main (int, char**)
130130
c8.add ( "foo", 7, Color ("white on red"));
131131
t.diag (c8.str ());
132132

133+
// Add layers containing characters with non-standard Unicode width.
134+
// Verify that they are composited correctly.
135+
// * Each zero-width character should be included in the column of the
136+
// preceding non-zero-width character on the same layer. (If there is
137+
// no such character, the zero-width character should be skipped.)
138+
// * Each wide character should be treated as occupying two columns of the
139+
// layer, the one corresponding to the array index at which the character
140+
// code is stored, and the next one.
141+
// * If exactly one of the columns occupied by a wide character is also
142+
// occupied by a character in a higher layer (obscuring half of the wide
143+
// character), then the wide character should not be displayed at all.
144+
// The unobscured column should be treated as containing blank space
145+
// (but still be covered by the current layer).
146+
Composite c9;
147+
c9.add ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, Color ()); // BG
148+
c9.add ("a", 50, Color ()); // more BG
149+
c9.add ("😃😃😃", 1, Color ()); // some wide chars
150+
c9.add ("bb", 1, Color ()); // obscure the first of the two wide chars
151+
c9.add ("😖😖😖", 8, Color ()); // a few more wide chars
152+
c9.add ("cc", 9, Color ()); // obscure half of each of the first two
153+
c9.add ("😬😬😬", 15, Color ()); // even more
154+
c9.add ("会会会", 18, Color ()); // obscure the last one-and-half
155+
c9.add ("[èé][ñn̄][öô]", 25, Color ()); // layer with zero-width chars (combining diacritics)
156+
c9.add ("}{", 32, Color ()); // obscure two of the non-zero-width chars
157+
c9.add ("è🐋é🐋", 38, Color ()); // 1-col, 0-col and 2-col chars on same layer
158+
c9.add ("\a\aff", 45, Color ()); // zero-width characters at beginning of layer
159+
t.is (c9.str (), "abb😃😃a cc 😖a😬 会会会a[èé][ñn̄}{öô]aè🐋é🐋affa a", "Composite ... --> 'abb😃😃a cc 😖a😬 会会会a[èé][ñn̄}{öô]aè🐋é🐋affa a'");
160+
161+
// Add colored layers containing characters with non-standard Unicode width.
162+
// Display the result.
163+
Composite c10;
164+
c10.add ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, Color ("black on bright blue")); // BG
165+
c10.add ("a", 50, Color ("black on bright blue")); // more BG
166+
c10.add ("😃😃😃", 1, Color ("yellow on grey10")); // some wide chars
167+
c10.add ("bb", 1, Color ("red on black")); // obscure the first of the two wide chars
168+
c10.add ("😖😖😖", 8, Color ("green on blue")); // a few more wide chars
169+
c10.add ("cc", 9, Color ("grey18 on green")); // obscure half of each of the first two
170+
c10.add ("😬😬😬", 15, Color ("white on red")); // even more
171+
c10.add ("会会会", 18, Color ("magenta on grey6")); // obscure the last one-and-half
172+
c10.add ("[èé][ñn̄][öô]", 25, Color ("blue on white")); // layer with zero-width chars (combining diacritics)
173+
c10.add ("}{", 32, Color ("red on white")); // obscure two of the non-zero-width chars
174+
c10.add ("è🐋é🐋", 38, Color ("yellow on cyan")); // 1-col, 0-col and 2-col chars on same layer
175+
c10.add ("\a\aff", 45, Color ("black on bright yellow")); // zero-width characters at beginning of layer
176+
t.diag (c10.str ());
177+
133178
return 0;
134179
}
135180

136181
////////////////////////////////////////////////////////////////////////////////
137-

0 commit comments

Comments
 (0)