Skip to content

Commit 692a20a

Browse files
author
Johan Sarge
committed
Squashing down PR #117 (consider the Unicode display width of characters in Composite) into a single commit.
Signed-off-by: Johan Sarge <rabiteman_2000@yahoo.com>
1 parent 7db4f53 commit 692a20a

4 files changed

Lines changed: 237 additions & 40 deletions

File tree

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ The following submitted code, packages or analysis, and deserve special thanks:
2525
Tobias Predel
2626
Andrew Poelstra
2727
thaafox
28+
Johan Sarge
2829

2930
Thanks to the following, who submitted detailed bug reports and excellent
3031
suggestions:

ChangeLog

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
master/HEAD
2+
- #117 Extended Unicode support in Composite, character display width now taken into account
3+
(thanks to Johan Sarge)
24
- #111 Duration: support negative durations by prefixing a '-' before the P in ISO format
35
(thanks to Andrew Poelstra)
46
- #113 Set CMAKE_CURRENT_SOURCE_DIR instead of CMAKE_SOURCE_DIR

src/Composite.cpp

Lines changed: 188 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
////////////////////////////////////////////////////////////////////////////////
22
//
3-
// Copyright 2016 - 2021, 2023, Gothenburg Bit Factory.
3+
// Copyright 2016 - 2021, 2023, 2026 Gothenburg Bit Factory.
44
//
55
// Permission is hereby granted, free of charge, to any person obtaining a copy
66
// of this software and associated documentation files (the "Software"), to deal
@@ -25,10 +25,111 @@
2525
////////////////////////////////////////////////////////////////////////////////
2626

2727
#include <Composite.h>
28+
#include <format.h>
29+
#include <limits>
2830
#include <sstream>
29-
#include <stack>
3031
#include <utf8.h>
3132

33+
34+
////////////////////////////////////////////////////////////////////////////////
35+
36+
namespace
{

// Stores the value (x) at index (i) of the vector (v).
//
// If (i) refers to an existing element, that element is overwritten and the
// size of the vector is unchanged. Otherwise the vector is grown to exactly
// (i+1) elements: every new element below index (i) is a copy of (pad), and the
// element at index (i) is a copy of (x).
//
// NOTE: (x) must not refer to an element of (v) when (i) is out of range, since
// growing the vector may reallocate its storage.
template <typename T>
void put_or_extend (
  std::vector<T>& v, typename std::vector<T>::size_type i, const T& x, const T& pad = T {})
{
  if (i < v.size ())
  {
    v[i] = x;
    return;
  }

  v.resize (i, pad);  // Fill the gap [old size, i) with padding ...
  v.push_back (x);    // ... and place the value at index (i).
}

// Describes the content of one display column in a Composite.
//
// A column either shows a slice of a layer's UTF-8 text (one spacing character
// followed by any number of nonspacing characters), or it is a "padding" column
// that is displayed as a single space. Padding is encoded as an inverted byte
// range (text_end_i < text_begin_i).
struct ColumnData
{
  // Number of the topmost layer that overlaps with this column.
  // NOTE: Layer numbers start at 1. "Layer 0" is background not covered by any layer.
  unsigned int layer_num;

  // Byte offset into the UTF-8 text of layer (layer_num): first byte of the
  // first character that belongs to this column.
  std::string::size_type text_begin_i;

  // Byte offset into the UTF-8 text of layer (layer_num): first byte after the
  // last character that belongs to this column.
  std::string::size_type text_end_i;

  // Display width of the first character of this column. 1 or 2 for columns
  // created from text; 0 for padding columns.
  unsigned char char_0_width;

  // With all arguments defaulted this creates a padding column on "layer 0";
  // with only (layer) given, a padding column on that layer.
  ColumnData (
    unsigned int layer = 0, std::string::size_type begin_i = 1, std::string::size_type end_i = 0,
    unsigned char c_0_w = 0)
  :
    layer_num (layer), text_begin_i (begin_i), text_end_i (end_i), char_0_width (c_0_w)
  {}

  // Copy construction and copy assignment are left to the compiler (Rule of Zero).

  // Number of text bytes covered by this column; negative for padding columns.
  // Both offsets are converted before subtracting, so the negative result does
  // not depend on unsigned wrap-around.
  std::string::difference_type byte_count () const
  {
    return static_cast <std::string::difference_type> (text_end_i) -
           static_cast <std::string::difference_type> (text_begin_i);
  }

  // Turns this column into a padding column. The layer number is kept, so the
  // column is still rendered with the attributes of its layer.
  void make_padding ()
  {
    text_begin_i = 1;
    text_end_i = 0;
    char_0_width = 0;
  }

  // True if this column holds no text (inverted byte range).
  bool is_padding () const
  {
    return text_end_i < text_begin_i;
  }
};

const ColumnData LAYER_0_PAD; // ColumnData representing a padding column on "layer 0".

// Special column index value, distinct from any valid column index.
const std::string::size_type INVALID_COLUMN_I = std::numeric_limits<std::string::size_type>::max ();

// Turns the uncovered half of a half-covered wide character into padding.
//
// Call this before placing a character at (column_i). If the column directly to
// the left holds a wide character, the second half of that character is about
// to be covered; since half a character cannot be displayed, the whole
// character (and any nonspacing characters attached to it) is replaced with
// padding. Does nothing if (column_i) is 0 or if the preceding column does not
// exist yet.
inline void do_halfcovered_wide_char_check (
  std::vector<ColumnData>& columns, std::vector<ColumnData>::size_type column_i)
{
  if (column_i == 0 || column_i - 1 >= columns.size ())
    return;

  ColumnData& prev_col_data = columns[column_i - 1];
  if (prev_col_data.char_0_width == 2)
    prev_col_data.make_padding ();
}

}
132+
32133
////////////////////////////////////////////////////////////////////////////////
33134
// Initially assume no text, but infinite virtual space.
34135
//
@@ -74,65 +175,114 @@ void Composite::add (
74175
// bbbbb // Layer 2
75176
// c // Layer 3
76177
//
77-
// Walk all strings left to right, selecting the character and color from the
178+
// Walk all layers left to right, selecting the character and color from the
78179
// highest numbered layer. Emit color codes only on edge detection.
79180
//
80181
std::string Composite::str () const
81182
{
82-
// The strings are broken into a vector of int, for UTF8 support.
83-
std::vector <int> characters;
84-
std::vector <int> colors;
85-
for (unsigned int layer = 0; layer < _layers.size (); ++layer)
183+
std::vector <ColumnData> columns;
184+
185+
for (unsigned int layer_i = 0; layer_i < _layers.size (); ++layer_i)
86186
{
87-
const auto& text = std::get <0> (_layers[layer]);
88-
auto offset = std::get <1> (_layers[layer]);
89-
auto len = utf8_text_length (text);
187+
const auto& text = std::get <0> (_layers[layer_i]);
188+
auto offset = std::get <1> (_layers[layer_i]);
189+
auto len = utf8_text_length (text);
90190

91-
// Make sure the vectors are large enough to support a write operator[].
92-
if (characters.size () < offset + len)
93-
{
94-
characters.resize (offset + len, 32);
95-
colors.resize (offset + len, 0);
96-
}
191+
// Make sure the capacity of the column vector is large enough to support push_back()
192+
// without reallocation.
193+
if (columns.capacity () < offset + len)
194+
columns.reserve (offset + len);
97195

98-
// Copy in the layer characters and color indexes.
196+
// Inspect and decide how to handle each character (i.e. Unicode code point)
197+
// in the current layer's text string.
198+
std::string::size_type prev_cursor = 0;
99199
std::string::size_type cursor = 0;
100-
int character;
101-
int count = 0;
200+
unsigned int column_count = 0;
201+
std::string::size_type prev_spacer_column_i = INVALID_COLUMN_I;
202+
unsigned int character;
102203
while ((character = utf8_next_char (text, cursor)))
103204
{
104-
characters[offset + count] = character;
105-
colors [offset + count] = layer + 1;
106-
++count;
205+
std::string::size_type column_i = offset + column_count;
206+
int ch_width = mk_wcwidth ((wchar_t)character);
207+
208+
switch (ch_width)
209+
{
210+
case 0: // zero-width / nonspacing character
211+
if (prev_spacer_column_i == INVALID_COLUMN_I) // No preceding spacing character on this layer.
212+
; // Skip this character.
213+
else // There is a preceding spacing character on this layer.
214+
{
215+
// Append the nonspacing character to the column of the previous spacing character.
216+
columns[prev_spacer_column_i].text_end_i = cursor;
217+
}
218+
break;
219+
case 1: // ordinary narrow spacing character
220+
if (prev_spacer_column_i == INVALID_COLUMN_I)
221+
do_halfcovered_wide_char_check (columns, column_i);
222+
223+
// Put the character in the appropriate column. Pad out the column list as necessary.
224+
put_or_extend (columns, column_i, ColumnData (layer_i + 1, prev_cursor, cursor, 1), LAYER_0_PAD);
225+
226+
prev_spacer_column_i = column_i;
227+
column_count += 1;
228+
break;
229+
case 2: // graphically wide spacing character
230+
if (prev_spacer_column_i == INVALID_COLUMN_I)
231+
do_halfcovered_wide_char_check (columns, column_i);
232+
233+
// Put the character in the appropriate column. Pad out the column list as necessary.
234+
// Make the column after the current one (which is also covered by the wide character)
235+
// a padding column on the current layer.
236+
put_or_extend (columns, column_i, ColumnData (layer_i + 1, prev_cursor, cursor, 2), LAYER_0_PAD);
237+
put_or_extend (columns, column_i + 1, ColumnData (layer_i + 1), LAYER_0_PAD);
238+
239+
prev_spacer_column_i = column_i;
240+
column_count += 2;
241+
break;
242+
default: // Should not happen.
243+
throw format ("Unexpected character width {1} of code point 0x{2}.", ch_width, formatHex (character));
244+
}
245+
246+
// Remember byte offset of first UTF-8 byte of next character in the layer text.
247+
prev_cursor = cursor;
107248
}
108249
}
109250

110-
// Now walk the character and color vector, emitting every character and
111-
// every detected color change.
251+
// Now walk the column vector, emitting every character and every detected layer change.
112252
std::stringstream out;
113-
int prev_color = 0;
114-
for (unsigned int i = 0; i < characters.size (); ++i)
253+
unsigned int prev_layer = 0;
254+
for (unsigned int column_i = 0; column_i < columns.size (); ++column_i)
115255
{
116-
// A change in color triggers a code emit.
117-
if (prev_color != colors[i])
256+
auto column_data = columns[column_i];
257+
auto curr_layer = column_data.layer_num;
258+
const auto& text = std::get <0> (_layers[curr_layer - 1]);
259+
260+
// A change in layer triggers an ANSI escape code emit.
261+
if (prev_layer != curr_layer)
118262
{
119-
if (prev_color)
120-
out << std::get <2> (_layers[prev_color - 1]).end ();
263+
if (prev_layer) // Reset attributes (if any) of previous layer.
264+
out << std::get <2> (_layers[prev_layer - 1]).end ();
121265

122-
if (colors[i])
123-
out << std::get <2> (_layers[colors[i] - 1]).code ();
124-
else
125-
out << std::get <2> (_layers[prev_color - 1]).end ();
266+
if (curr_layer) // Set attributes (if any) of current layer.
267+
out << std::get <2> (_layers[curr_layer - 1]).code ();
126268

127-
prev_color = colors[i];
269+
prev_layer = curr_layer;
128270
}
129271

130-
out << utf8_character (characters[i]);
272+
// The layer text string is already UTF-8, so we can output its bytes verbatim,
273+
// provided that we're keeping track of character (i.e. code point) boundaries.
274+
if (column_data.is_padding ())
275+
out << ' '; // Display padding columns as spaces.
276+
else // Display a slice of the layer text (Spacer [Nonspacer ...]).
277+
out.write(text.data () + column_data.text_begin_i, column_data.byte_count ());
278+
279+
if (column_data.char_0_width == 2)
280+
++column_i; // Wide characters cover two columns.
131281
}
132282

133283
// Terminate the color codes, if necessary.
134-
if (prev_color)
135-
out << std::get <2> (_layers[prev_color - 1]).end ();
284+
if (prev_layer)
285+
out << std::get <2> (_layers[prev_layer - 1]).end ();
136286

137287
return out.str ();
138288
}

test/composite.t.cpp

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
////////////////////////////////////////////////////////////////////////////////
3131
int main (int, char**)
3232
{
33-
UnitTest t (3);
33+
UnitTest t (4);
3434

3535
Composite c1;
3636
c1.add ("left", 2, Color ());
@@ -130,8 +130,52 @@ int main (int, char**)
130130
c8.add ( "foo", 7, Color ("white on red"));
131131
t.diag (c8.str ());
132132

133+
// Add layers containing characters with non-standard Unicode width.
134+
// Verify that they are composited correctly.
135+
// * Each zero-width character should be included in the column of the
136+
// preceding non-zero-width character on the same layer. (If there is
137+
// no such character, the zero-width character should be skipped.)
138+
// * Each wide character should be treated as occupying two columns of the
139+
// layer, the one corresponding to the array index at which the character
140+
// code is stored, and the next one.
141+
// * If exactly one of the columns occupied by a wide character is also
142+
// occupied by a character in a higher layer (obscuring half of the wide
143+
// character), then the wide character should not be displayed at all.
144+
// The unobscured column should be treated as containing blank space
145+
// (but still be covered by the current layer).
146+
Composite c9;
147+
c9.add ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, Color ()); // BG
148+
c9.add ("a", 50, Color ()); // more BG
149+
c9.add ("😃😃😃", 1, Color ()); // some wide chars
150+
c9.add ("bb", 1, Color ()); // obscure the first of the two wide chars
151+
c9.add ("😖😖😖", 8, Color ()); // a few more wide chars
152+
c9.add ("cc", 9, Color ()); // obscure half of each of the first two
153+
c9.add ("😬😬😬", 15, Color ()); // even more
154+
c9.add ("会会会", 18, Color ()); // obscure the last one-and-half
155+
c9.add ("[èé][ñn̄][öô]", 25, Color ()); // layer with zero-width chars (combining diacritics)
156+
c9.add ("}{", 32, Color ()); // obscure two of the non-zero-width chars
157+
c9.add ("è🐋é🐋", 38, Color ()); // 1-col, 0-col and 2-col chars on same layer
158+
c9.add ("\a\aff", 45, Color ()); // zero-width characters at beginning of layer
159+
t.is (c9.str (), "abb😃😃a cc 😖a😬 会会会a[èé][ñn̄}{öô]aè🐋é🐋affa a", "Composite ... --> 'abb😃😃a cc 😖a😬 会会会a[èé][ñn̄}{öô]aè🐋é🐋affa a'");
160+
161+
// Add colored layers containing characters with non-standard Unicode width.
162+
// Display the result.
163+
Composite c10;
164+
c10.add ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, Color ("black on bright blue")); // BG
165+
c10.add ("a", 50, Color ("black on bright blue")); // more BG
166+
c10.add ("😃😃😃", 1, Color ("yellow on grey10")); // some wide chars
167+
c10.add ("bb", 1, Color ("red on black")); // obscure the first of the two wide chars
168+
c10.add ("😖😖😖", 8, Color ("green on blue")); // a few more wide chars
169+
c10.add ("cc", 9, Color ("grey18 on green")); // obscure half of each of the first two
170+
c10.add ("😬😬😬", 15, Color ("white on red")); // even more
171+
c10.add ("会会会", 18, Color ("magenta on grey6")); // obscure the last one-and-half
172+
c10.add ("[èé][ñn̄][öô]", 25, Color ("blue on white")); // layer with zero-width chars (combining diacritics)
173+
c10.add ("}{", 32, Color ("red on white")); // obscure two of the non-zero-width chars
174+
c10.add ("è🐋é🐋", 38, Color ("yellow on cyan")); // 1-col, 0-col and 2-col chars on same layer
175+
c10.add ("\a\aff", 45, Color ("black on bright yellow")); // zero-width characters at beginning of layer
176+
t.diag (c10.str ());
177+
133178
return 0;
134179
}
135180

136181
////////////////////////////////////////////////////////////////////////////////
137-

0 commit comments

Comments
 (0)