Skip to content

Commit 268ec52

Browse files
committed
Introduce multibyte string scanner functions
* String_makePrintable() * EncodePrintableString() * String_lineBreakWidth() * String_mbswidth() Signed-off-by: Kang-Che Sung <explorer09@gmail.com>
1 parent b83a55b commit 268ec52

2 files changed

Lines changed: 342 additions & 0 deletions

File tree

XUtils.c

Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ in the source distribution for its full text.
1010
#include "XUtils.h"
1111

1212
#include <assert.h>
13+
#include <ctype.h> // IWYU pragma: keep
1314
#include <errno.h>
1415
#include <fcntl.h>
1516
#include <limits.h>
@@ -259,6 +260,297 @@ size_t strnlen(const char* str, size_t maxLen) {
259260
}
260261
#endif
261262

263+
#ifdef HAVE_LIBNCURSESW
264+
static void String_encodeWChar(WCharEncoderState* ps, wchar_t wc) {
265+
assert(!ps->buf || ps->pos < ps->size);
266+
267+
char tempBuf[MB_LEN_MAX];
268+
269+
// This function will null terminate the string only upon a call
270+
// with (wc == 0). It might take more than a single NUL byte to
271+
// terminate a string when using the C multibyte functions and a
272+
// non-Unicode encoding, thus this function won't support truncation
273+
// of a string. The caller must provide the right size in ps->size
274+
// if ps->buf is not NULL.
275+
size_t len = wcrtomb(tempBuf, wc, &ps->mbState);
276+
assert(len != 0);
277+
if (len == (size_t)-1) {
278+
assert(len != (size_t)-1);
279+
fail();
280+
}
281+
if (ps->buf) {
282+
if (len > ps->size - ps->pos) {
283+
fail();
284+
}
285+
memcpy((char*)ps->buf + ps->pos, tempBuf, len);
286+
}
287+
ps->pos += len;
288+
}
289+
#else
290+
static void String_encodeWChar(WCharEncoderState* ps, int c) {
291+
assert(!ps->buf || ps->pos < ps->size);
292+
293+
char* buf = ps->buf;
294+
if (buf)
295+
buf[ps->pos] = (char)c;
296+
297+
ps->pos += 1;
298+
}
299+
#endif
300+
301+
/* Decodes up to "maxLen" bytes of "src" (stopping early at a NUL) and
   feeds a printable version of it, one character at a time, to
   "encodeWChar".
   Invalid or incomplete byte sequences and unprintable characters are
   substituted with a replacement character (U+FFFD when CRT_utf8 is set
   in a wide character build, '?' otherwise); a run of consecutive
   substitutions yields a single replacement character.
   The last callback invocation always passes 0, which terminates the
   output. */
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar) {
   assert(src || maxLen == 0);

#ifdef HAVE_LIBNCURSESW
   const wchar_t substitute = CRT_utf8 ? L'\xFFFD' : L'?';
   wchar_t cur;

   // Decoder state as of the last successfully decoded character
   mbstate_t inState = {0};
#else
   const char substitute = '?';
   char cur;
#endif

   size_t offset = 0;
   bool prevSubstituted = false;

   do {
      size_t consumed = 0;
      bool needsSubstitute = false;
      cur = 0;

      // Fetch the next character; past "maxLen" we behave as if a NUL
      // had been read.
      if (offset < maxLen) {
#ifdef HAVE_LIBNCURSESW
         // Decode with a scratch copy so that a failed conversion does
         // not clobber the last known good state.
         mbstate_t trialState = inState;
         consumed = mbrtowc(&cur, &src[offset], maxLen - offset, &trialState);

         assert(consumed != 0 || cur == 0);
         if (consumed == (size_t)-2) {
            // Incomplete sequence: swallow all the remaining bytes
            errno = EILSEQ;
            needsSubstitute = true;
            consumed = maxLen - offset;
         } else if (consumed == (size_t)-1) {
            // Invalid sequence: skip one byte and try again from there
            needsSubstitute = true;
            consumed = 1;
         } else {
            inState = trialState;
         }
#else
         consumed = 1;
         cur = src[offset];
#endif
      }

      offset += consumed;

      // Successfully decoded characters still need to be printable
      if (cur != 0 && !needsSubstitute) {
#ifdef HAVE_LIBNCURSESW
         needsSubstitute = !iswprint(cur);
#else
         needsSubstitute = !isprint((unsigned char)cur);
#endif
      }

      if (needsSubstitute)
         cur = substitute;

      // Emit the character unless it would extend a run of replacement
      // characters.
      if (!(needsSubstitute && prevSubstituted)) {
         prevSubstituted = needsSubstitute;
         encodeWChar(ps, cur);
      }
   } while (cur != 0);
}
372+
373+
char* String_makePrintable(const char* str, size_t maxLen) {
374+
WCharEncoderState encState = {0};
375+
376+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
377+
size_t size = encState.pos;
378+
assert(size > 0);
379+
380+
memset(&encState, 0, sizeof(encState));
381+
char* buf = xMalloc(size);
382+
encState.size = size;
383+
encState.buf = buf;
384+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
385+
assert(encState.pos == size);
386+
387+
return buf;
388+
}
389+
390+
/* Decodes the next character from decoder->str into decoder->ch.
   Returns true when a character (possibly the terminating 0) was stored.
   Returns false when there is no input left, when the remaining bytes
   form an incomplete sequence, or when an invalid sequence is hit.
   After the terminating 0 is decoded, "str" is set to NULL and "maxLen"
   to 0; otherwise both are advanced past the decoded character. */
bool MBStringDecoder_nextWChar(MBStringDecoder* decoder) {
   if (decoder->maxLen == 0 || !decoder->str)
      return false;

#ifdef HAVE_LIBNCURSESW
   // If the previous call of this function encountered an invalid
   // sequence, do not continue (because the "mbState" object for
   // mbrtowc() is undefined). The caller is supposed to reset the state.
   if (decoder->ch == WEOF)
      return false;

   wchar_t decoded;
   size_t consumed = mbrtowc(&decoded, decoder->str, decoder->maxLen, &decoder->mbState);

   // These assertions ensure the mbrtowc() implementation is correct
   assert(consumed == 0 || consumed >= (size_t)-2 || decoded != 0);
   assert(consumed != 0 || decoded == 0);

   if (consumed == (size_t)-1) {
      // Invalid sequence. decoder->str remains at the position where
      // the first byte of the invalid sequence is found.
      decoder->ch = WEOF;
      return false;
   }

   if (consumed == (size_t)-2) {
      // Incomplete sequence: the rest of the input is used up
      decoder->str += decoder->maxLen;
      decoder->maxLen = 0;
      return false;
   }

   if (consumed == 0) {
      // End of string. This assignment is an optimization hint.
      decoded = 0;
   }
#else
   char decoded = *decoder->str;
   const size_t consumed = 1;
#endif

   decoder->ch = decoded;

   if (decoded != 0) {
      decoder->str += consumed;
      decoder->maxLen -= consumed;
   } else {
      // Setting "str" to NULL prevents subsequent calls from reading
      // out of bounds.
      decoder->str = NULL;
      decoder->maxLen = 0;
   }
   return true;
}
444+
445+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator) {
446+
assert(*str || maxLen == 0);
447+
448+
// The caller should ensure (maxWidth >= 0).
449+
// It's possible for a Unicode string to occupy 0 terminal columns, so this
450+
// function allows (maxWidth == 0).
451+
if (maxWidth < 0)
452+
maxWidth = INT_MAX;
453+
454+
#ifdef HAVE_LIBNCURSESW
455+
// If the character takes zero columns, include the character in the
456+
// substring if the working encoding is UTF-8, and ignore it otherwise.
457+
// In Unicode, combining characters are always placed after the base
458+
// character, but some legacy 8-bit encodings instead place combining
459+
// characters before the base character.
460+
const bool isUnicode = CRT_utf8;
461+
#else
462+
const bool isUnicode = false;
463+
#endif
464+
465+
int totalWidth = 0;
466+
467+
MBStringDecoder decoder = {0};
468+
decoder.str = *str;
469+
decoder.maxLen = maxLen;
470+
471+
bool inSpaces = true;
472+
const char* breakPos = NULL;
473+
int breakWidth = 0;
474+
475+
while (totalWidth < maxWidth || isUnicode) {
476+
assert(totalWidth <= maxWidth);
477+
478+
if (!MBStringDecoder_nextWChar(&decoder))
479+
break;
480+
if (decoder.ch == 0)
481+
break;
482+
483+
if (decoder.ch == ' ' && separator == ' ' && !inSpaces) {
484+
inSpaces = true;
485+
breakPos = *str;
486+
breakWidth = totalWidth;
487+
}
488+
489+
#ifdef HAVE_LIBNCURSESW
490+
int cw = wcwidth((wchar_t)decoder.ch);
491+
if (cw < 0) {
492+
// This function should not be used with string containing unprintable
493+
// characters. Tolerate them on release build, however.
494+
assert(cw >= 0);
495+
break;
496+
}
497+
#else
498+
assert(isprint(decoder.ch));
499+
const int cw = 1;
500+
#endif
501+
502+
if (cw > maxWidth - totalWidth) {
503+
// This character cannot fit the line with the given maxWidth.
504+
if (breakPos) {
505+
// Rewind the scanning state to the last found separator.
506+
totalWidth = breakWidth;
507+
*str = breakPos;
508+
}
509+
break;
510+
}
511+
512+
if (cw <= 0 && !isUnicode)
513+
continue;
514+
515+
totalWidth += cw;
516+
517+
// (*str - start) will represent the length of the substring bounded
518+
// by the width limit.
519+
*str = decoder.str;
520+
521+
if (decoder.ch != ' ')
522+
inSpaces = false;
523+
524+
#ifdef HAVE_LIBNCURSESW
525+
bool isSeparator = decoder.ch == (wint_t)separator;
526+
#else
527+
bool isSeparator = decoder.ch == (int)separator;
528+
#endif
529+
if (isSeparator && separator != ' ') {
530+
breakPos = *str;
531+
breakWidth = totalWidth;
532+
}
533+
}
534+
535+
return totalWidth;
536+
}
537+
538+
/* Measures how many terminal columns "*str" needs, limited to "maxLen"
   bytes and "maxWidth" columns, and advances "*str" past the part that
   was measured. A negative "maxWidth" means no column limit.
   With wide character support this delegates to String_lineBreakWidth()
   with a '\0' separator; otherwise every byte counts as one column. */
int String_mbswidth(const char** str, size_t maxLen, int maxWidth) {
#ifdef HAVE_LIBNCURSESW
   return String_lineBreakWidth(str, maxLen, maxWidth, '\0');
#else
   assert(*str || maxLen == 0);

   if (maxWidth < 0)
      maxWidth = INT_MAX;

   maxLen = MINIMUM((size_t)maxWidth, maxLen);

   // Nothing can be consumed. This also keeps a NULL "*str" (permitted
   // when maxLen == 0) away from strnlen() and pointer arithmetic.
   if (maxLen == 0)
      return 0;

   size_t len = strnlen(*str, maxLen);
   *str += len;
   return (int)len;
#endif
}
553+
262554
int xAsprintf(char** strp, const char* fmt, ...) {
263555
*strp = NULL;
264556

XUtils.h

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,36 @@ in the source distribution for its full text.
2222
#include <string.h> // IWYU pragma: keep
2323

2424
#include "Macros.h"
25+
#include "ProvideCurses.h"
26+
27+
28+
/* Output state shared between EncodePrintableString() and its
   EncodeWChar callback. */
typedef struct WCharEncoderState_ {
29+
size_t pos; /* Bytes produced so far; also the next write offset in "buf" */
30+
size_t size; /* Capacity of "buf" in bytes; only checked when "buf" is not NULL */
31+
void* buf; /* Destination buffer, or NULL to only count bytes in "pos" */
32+
mbstate_t mbState; /* Conversion state handed to wcrtomb() by String_encodeWChar() */
33+
} WCharEncoderState;
34+
35+
/* Object for reading wide characters from a multibyte string.
36+
"str" and "maxLen" are input but will be modified during process.
37+
"str" will be set to NULL when the decoding is finished with the
38+
terminating L'\0' character. */
39+
typedef struct MBStringDecoder_ {
40+
const char* str; /* Next byte to decode; NULL once the terminating 0 was decoded */
41+
size_t maxLen; /* Bytes still available at "str" */
42+
#ifdef HAVE_LIBNCURSESW
43+
wint_t ch; /* Last decoded character; WEOF after an invalid sequence */
44+
mbstate_t mbState; /* Conversion state handed to mbrtowc() */
45+
#else
46+
int ch; /* Last byte read, converted from a plain "char" */
47+
#endif
48+
} MBStringDecoder;
2549

50+
/* Callback invoked by EncodePrintableString() once for every output
   character; the final invocation passes 0 to terminate the output.
   The character type depends on wide character support. */
#ifdef HAVE_LIBNCURSESW
51+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, wchar_t wc);
52+
#else
53+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, int c);
54+
#endif
2655

2756
ATTR_NORETURN
2857
void fail(void);
@@ -108,6 +137,27 @@ size_t String_safeStrncpy(char* restrict dest, const char* restrict src, size_t
108137
size_t strnlen(const char* str, size_t maxLen);
109138
#endif
110139

140+
ATTR_NONNULL_N(1, 4) ATTR_ACCESS2_W(1) ATTR_ACCESS3_R(2, 3)
141+
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar);
142+
143+
ATTR_RETNONNULL ATTR_MALLOC ATTR_ACCESS3_R(1, 2)
144+
char* String_makePrintable(const char* str, size_t maxLen);
145+
146+
ATTR_NONNULL
147+
bool MBStringDecoder_nextWChar(MBStringDecoder* ps);
148+
149+
ATTR_NONNULL ATTR_ACCESS2_RW(1)
150+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator);
151+
152+
/* Count the number of terminal columns needed to display a string, or
153+
count how many characters from the string that can be displayed
154+
with the column limit ("maxWidth").
155+
"maxLen" is in bytes.
156+
maxLen = SIZE_MAX to take the whole string.
157+
maxWidth = INT_MAX for no terminal column limit. */
158+
ATTR_NONNULL ATTR_ACCESS2_RW(1)
159+
int String_mbswidth(const char** str, size_t maxLen, int maxWidth);
160+
111161
ATTR_FORMAT(printf, 2, 3) ATTR_NONNULL_N(1, 2)
112162
int xAsprintf(char** strp, const char* fmt, ...);
113163

0 commit comments

Comments
 (0)