Skip to content

Commit b88d83e

Browse files
committed
Introduce multibyte string scanner functions
* String_makePrintable() * EncodePrintableString() * String_lineBreakWidth() * String_mbswidth() Signed-off-by: Kang-Che Sung <explorer09@gmail.com>
1 parent b7f9df9 commit b88d83e

2 files changed

Lines changed: 341 additions & 0 deletions

File tree

XUtils.c

Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ in the source distribution for its full text.
1010
#include "XUtils.h"
1111

1212
#include <assert.h>
13+
#include <ctype.h> // IWYU pragma: keep
1314
#include <errno.h>
1415
#include <fcntl.h>
1516
#include <limits.h>
@@ -259,6 +260,300 @@ size_t strnlen(const char* str, size_t maxLen) {
259260
}
260261
#endif
261262

263+
#ifdef HAVE_LIBNCURSESW
264+
static void String_encodeWChar(WCharEncoderState* ps, wchar_t wc) {
265+
assert(!ps->buf || ps->pos < ps->size);
266+
267+
char tempBuf[MB_LEN_MAX];
268+
269+
// This function will null terminate the string only upon a call
270+
// with (wc == 0). It might take more than a single NUL byte to
271+
// terminate a string when using the C multibyte functions and a
272+
// non-Unicode encoding, thus this function won't support truncation
273+
// of a string. The caller must provide the right size in ps->size
274+
// if ps->buf is not NULL.
275+
size_t len = wcrtomb(tempBuf, wc, &ps->mbState);
276+
assert(len != 0);
277+
if (len == (size_t)-1) {
278+
assert(len != (size_t)-1);
279+
fail();
280+
}
281+
if (ps->buf) {
282+
if (len > ps->size - ps->pos) {
283+
fail();
284+
}
285+
memcpy((char*)ps->buf + ps->pos, tempBuf, len);
286+
}
287+
ps->pos += len;
288+
}
289+
#else
290+
static void String_encodeWChar(WCharEncoderState* ps, int c) {
291+
assert(!ps->buf || ps->pos < ps->size);
292+
293+
char* buf = ps->buf;
294+
if (buf)
295+
buf[ps->pos] = (char)c;
296+
297+
ps->pos += 1;
298+
}
299+
#endif
300+
301+
/* Feeds a printable rendition of "src" (at most "maxLen" bytes) to the
   "encodeWChar" callback, one character per call.
   - A byte sequence that cannot be decoded, or a character that is not
     printable, is substituted with a replacement character (U+FFFD when
     CRT_utf8 is set in the wide-character build, '?' otherwise).
   - A run of consecutive substitutions yields a single replacement.
   - The last callback invocation always passes 0, either because the
     terminator was read or because "maxLen" bytes were consumed. */
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar) {
   assert(src || maxLen == 0);

#ifdef HAVE_LIBNCURSESW
   const wchar_t substitute = CRT_utf8 ? L'\xFFFD' : L'?';
   wchar_t cur;

   // Decoder state as of the last successfully decoded character
   mbstate_t committedState = {0};
#else
   const char substitute = '?';
   char cur;
#endif

   size_t offset = 0;
   bool prevSubstituted = false;

   for (;;) {
      size_t consumed = 0;
      bool substituteNow = false;
      cur = 0;

      if (offset < maxLen) {
         // Read the next character from the byte sequence
#ifdef HAVE_LIBNCURSESW
         // Decode on a scratch copy so that a failed conversion leaves
         // the committed state untouched.
         mbstate_t trialState = committedState;
         consumed = mbrtowc(&cur, &src[offset], maxLen - offset, &trialState);

         assert(consumed != 0 || cur == 0);
         if (consumed == (size_t)-2) {
            // Sequence truncated by maxLen: swallow the remaining bytes
            errno = EILSEQ;
            substituteNow = true;
            consumed = maxLen - offset;
         } else if (consumed == (size_t)-1) {
            // Invalid sequence: skip one byte and try again from there
            substituteNow = true;
            consumed = 1;
         } else {
            committedState = trialState;
         }
#else
         consumed = 1;
         cur = src[offset];
#endif
      }

      offset += consumed;

      // Filter unprintable characters
      if (!substituteNow && cur != 0) {
#ifdef HAVE_LIBNCURSESW
         substituteNow = !iswprint(cur);
#else
         substituteNow = !isprint((unsigned char)cur);
#endif
      }

      if (substituteNow) {
         // Collapse a run of replacements into the one already emitted
         if (prevSubstituted)
            continue;
         cur = substitute;
      }
      prevSubstituted = substituteNow;

      encodeWChar(ps, cur);
      if (cur == 0)
         break;
   }
}
372+
373+
char* String_makePrintable(const char* str, size_t maxLen) {
374+
WCharEncoderState encState = {0};
375+
376+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
377+
size_t size = encState.pos;
378+
assert(size > 0);
379+
380+
memset(&encState, 0, sizeof(encState));
381+
char* buf = xMalloc(size);
382+
encState.size = size;
383+
encState.buf = buf;
384+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
385+
assert(encState.pos == size);
386+
387+
return buf;
388+
}
389+
390+
/* Decodes one character from ps->str and stores it in ps->ch.
   Returns true when a character was decoded (including the terminating
   null character), false when nothing could be decoded:
   - ps->str is NULL or ps->maxLen is 0;
   - a previous call hit an invalid sequence (ps->ch holds WEOF/EOF);
   - the bytes form an invalid sequence (ps->ch is set to WEOF);
   - the bytes are an incomplete sequence cut by ps->maxLen (the remaining
     bytes are consumed and ps->maxLen becomes 0).
   On success ps->str and ps->maxLen are advanced past the character. After
   the null character is decoded, ps->str is set to NULL and ps->maxLen to 0
   so that further calls return false. */
bool String_decodeNextWChar(MBStringDecoderState* ps) {
   if (!ps->str || ps->maxLen == 0)
      return false;

   // If the previous call of this function encounters an invalid sequence,
   // do not continue (because the "mbState" object for mbrtowc() is
   // undefined). The caller is supposed to reset the state.
#ifdef HAVE_LIBNCURSESW
   bool isStateDefined = ps->ch != WEOF;
#else
   bool isStateDefined = ps->ch != EOF;
#endif
   if (!isStateDefined)
      return false;

#ifdef HAVE_LIBNCURSESW
   wchar_t wc;
   size_t len = mbrtowc(&wc, ps->str, ps->maxLen, &ps->mbState);
   switch (len) {
      case (size_t)-1:
         // Invalid sequence
         ps->ch = WEOF;
         return false;

      case (size_t)-2:
         // Incomplete sequence
         ps->str += ps->maxLen;
         ps->maxLen = 0;
         return false;

      case 0:
         assert(wc == 0);

         ps->str = NULL;
         ps->maxLen = 0;
         ps->ch = wc;
         return true;

      default:
         ps->str += len;
         ps->maxLen -= len;
         ps->ch = wc;
   }
   return true;
#else
   const size_t len = 1;
   // Read the byte as unsigned char. With a signed "char" type, a plain
   // conversion would turn byte 0xFF into -1, which equals EOF and would
   // make the next call report an invalid state; other bytes >= 0x80 would
   // become negative values that are not valid <ctype.h> arguments.
   ps->ch = (unsigned char)*ps->str;
   if (ps->ch == 0) {
      ps->str = NULL;
      ps->maxLen = 0;
   } else {
      ps->str += len;
      ps->maxLen -= len;
   }
   return true;
#endif
}
447+
448+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator) {
449+
assert(*str || maxLen == 0);
450+
451+
// The caller should ensure (maxWidth >= 0).
452+
// It's possible for a Unicode string to occupy 0 terminal columns, so this
453+
// function allows (maxWidth == 0).
454+
if (maxWidth < 0)
455+
maxWidth = INT_MAX;
456+
457+
#ifdef HAVE_LIBNCURSESW
458+
// If the character takes zero columns, include the character in the
459+
// substring if the working encoding is UTF-8, and ignore it otherwise.
460+
// In Unicode, combining characters are always placed after the base
461+
// character, but some legacy 8-bit encodings instead place combining
462+
// characters before the base character.
463+
const bool isUnicode = CRT_utf8;
464+
#else
465+
const bool isUnicode = false;
466+
#endif
467+
468+
int totalWidth = 0;
469+
470+
MBStringDecoderState state = {0};
471+
state.str = *str;
472+
state.maxLen = maxLen;
473+
474+
bool inSpaces = true;
475+
const char* breakPos = NULL;
476+
int breakWidth = 0;
477+
478+
while (totalWidth < maxWidth || isUnicode) {
479+
assert(totalWidth <= maxWidth);
480+
481+
if (!String_decodeNextWChar(&state))
482+
break;
483+
if (state.ch == 0)
484+
break;
485+
486+
if (state.ch == ' ' && separator == ' ' && !inSpaces) {
487+
inSpaces = true;
488+
breakPos = *str;
489+
breakWidth = totalWidth;
490+
}
491+
492+
#ifdef HAVE_LIBNCURSESW
493+
int cw = wcwidth((wchar_t)state.ch);
494+
if (cw < 0) {
495+
// This function should not be used with string containing unprintable
496+
// characters. Tolerate them on release build, however.
497+
assert(cw >= 0);
498+
break;
499+
}
500+
#else
501+
assert(isprint(state.ch));
502+
const int cw = 1;
503+
#endif
504+
505+
if (cw > maxWidth - totalWidth) {
506+
// This character cannot fit the line with the given maxWidth.
507+
if (breakPos) {
508+
// Rewind the scanning state to the last found separator.
509+
totalWidth = breakWidth;
510+
*str = breakPos;
511+
}
512+
break;
513+
}
514+
515+
if (cw <= 0 && !isUnicode)
516+
continue;
517+
518+
totalWidth += cw;
519+
520+
// (*str - start) will represent the length of the substring bounded
521+
// by the width limit.
522+
*str = state.str;
523+
524+
if (state.ch != ' ')
525+
inSpaces = false;
526+
527+
#ifdef HAVE_LIBNCURSESW
528+
bool isSeparator = state.ch == (wint_t)separator;
529+
#else
530+
bool isSeparator = state.ch == (int)separator;
531+
#endif
532+
if (isSeparator && separator != ' ') {
533+
breakPos = *str;
534+
breakWidth = totalWidth;
535+
}
536+
}
537+
538+
return totalWidth;
539+
}
540+
541+
/* Returns the number of terminal columns taken by the leading part of
   "*str" that fits in "maxWidth" columns, reading at most "maxLen" bytes,
   and advances "*str" past that part. A negative "maxWidth" is treated as
   INT_MAX. The wide-character build delegates to String_lineBreakWidth()
   with no separator; the single-byte build counts one column per byte. */
int String_mbswidth(const char** str, size_t maxLen, int maxWidth) {
#ifdef HAVE_LIBNCURSESW
   return String_lineBreakWidth(str, maxLen, maxWidth, '\0');
#else
   assert(*str || maxLen == 0);

   const size_t columnLimit = (maxWidth < 0) ? (size_t)INT_MAX : (size_t)maxWidth;

   // One byte is one column, so the column limit is also a byte limit.
   const size_t byteLimit = MINIMUM(columnLimit, maxLen);
   const size_t consumed = strnlen(*str, byteLimit);

   *str += consumed;
   return (int)consumed;
#endif
}
556+
262557
int xAsprintf(char** strp, const char* fmt, ...) {
263558
*strp = NULL;
264559

XUtils.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,32 @@ in the source distribution for its full text.
2222
#include <string.h> // IWYU pragma: keep
2323

2424
#include "Macros.h"
25+
#include "ProvideCurses.h"
26+
27+
28+
/* Output state handed to an EncodeWChar callback. */
typedef struct WCharEncoderState_ {
29+
/* Number of bytes produced so far. String_encodeWChar() advances it even
   when "buf" is NULL, which String_makePrintable() uses to measure the
   required buffer size. */
size_t pos;
30+
/* Capacity of "buf" in bytes. */
size_t size;
31+
/* Destination buffer, or NULL when the output is only being measured. */
void* buf;
32+
/* Conversion state passed to wcrtomb() by String_encodeWChar(). */
mbstate_t mbState;
33+
} WCharEncoderState;
34+
35+
/* Scanning state for String_decodeNextWChar(). A zero-initialized object
   with "str" and "maxLen" filled in is a valid starting state (this is how
   String_lineBreakWidth() sets it up). */
typedef struct MBStringDecoderState_ {
36+
/* Next byte to decode. String_decodeNextWChar() sets it to NULL once the
   terminating null character has been decoded. */
const char* str;
37+
/* Number of bytes still available at "str". */
size_t maxLen;
38+
#ifdef HAVE_LIBNCURSESW
39+
/* Most recently decoded character. String_decodeNextWChar() stores WEOF
   here after an invalid sequence and refuses to decode further. */
wint_t ch;
40+
/* Conversion state passed to mbrtowc(). */
mbstate_t mbState;
41+
#else
42+
/* Most recently decoded byte. A value of EOF makes
   String_decodeNextWChar() refuse to decode further. */
int ch;
43+
#endif
44+
} MBStringDecoderState;
2545

46+
/* Callback type used by EncodePrintableString() to emit its output.
   It is invoked once per output character and one last time with 0 to
   terminate the output. The character is a wchar_t in the wide-character
   build and an int holding a single byte otherwise. */
#ifdef HAVE_LIBNCURSESW
47+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, wchar_t wc);
48+
#else
49+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, int c);
50+
#endif
2651

2752
ATTR_NORETURN
2853
void fail(void);
@@ -108,6 +133,27 @@ size_t String_safeStrncpy(char* restrict dest, const char* restrict src, size_t
108133
size_t strnlen(const char* str, size_t maxLen);
109134
#endif
110135

136+
/* Scan at most "maxLen" bytes of "src" and pass each resulting character
   to "encodeWChar" together with "ps". Undecodable byte sequences and
   unprintable characters are substituted with a replacement character
   (consecutive substitutions produce only one). The final call to
   "encodeWChar" passes 0. */
ATTR_NONNULL_N(1, 4) ATTR_ACCESS2_W(1) ATTR_ACCESS3_R(2, 3)
137+
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar);
138+
139+
/* Return a buffer allocated with xMalloc() that holds a printable copy of
   "str" (at most "maxLen" bytes are read), as produced by
   EncodePrintableString().
   NOTE(review): the caller presumably owns and frees the buffer - confirm. */
ATTR_RETNONNULL ATTR_MALLOC ATTR_ACCESS3_R(1, 2)
140+
char* String_makePrintable(const char* str, size_t maxLen);
141+
142+
/* Decode the next character at ps->str into ps->ch and advance ps->str and
   ps->maxLen past it. Returns true if a character (possibly the
   terminating null character) was decoded; returns false when the input is
   exhausted, the sequence is invalid or incomplete, or a previous call
   already failed on an invalid sequence. */
ATTR_NONNULL
143+
bool String_decodeNextWChar(MBStringDecoderState* ps);
144+
145+
/* Measure the leading part of "*str" that fits in "maxWidth" terminal
   columns and advance "*str" past it; the return value is the width of
   that part in columns.
   "maxLen" is in bytes. A negative "maxWidth" means no column limit.
   If a character does not fit and a break position at "separator" was
   recorded, the result is rewound to that break position. */
ATTR_NONNULL ATTR_ACCESS2_RW(1)
146+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator);
147+
148+
/* Count the number of terminal columns needed to display a string, or
   count how many columns of the string can be displayed within the
   column limit ("maxWidth"). "*str" is advanced past the part that was
   counted.
   "maxLen" is in bytes.
   maxLen = SIZE_MAX to take the whole string.
   maxWidth = INT_MAX for no terminal column limit (a negative "maxWidth"
   is treated the same way). */
154+
ATTR_NONNULL ATTR_ACCESS2_RW(1)
155+
int String_mbswidth(const char** str, size_t maxLen, int maxWidth);
156+
111157
ATTR_FORMAT(printf, 2, 3) ATTR_NONNULL_N(1, 2)
112158
int xAsprintf(char** strp, const char* fmt, ...);
113159

0 commit comments

Comments
 (0)