Skip to content

Commit ed7c064

Browse files
committed
Introduce multibyte string scanner functions
* String_makePrintable() * EncodePrintableString() * String_lineBreakWidth() * String_mbswidth() Signed-off-by: Kang-Che Sung <explorer09@gmail.com>
1 parent ae424de commit ed7c064

2 files changed

Lines changed: 334 additions & 0 deletions

File tree

XUtils.c

Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ in the source distribution for its full text.
1010
#include "XUtils.h"
1111

1212
#include <assert.h>
13+
#include <ctype.h> // IWYU pragma: keep
1314
#include <errno.h>
1415
#include <fcntl.h>
1516
#include <limits.h>
@@ -259,6 +260,299 @@ size_t strnlen(const char* str, size_t maxLen) {
259260
}
260261
#endif
261262

263+
#ifdef HAVE_LIBNCURSESW
264+
/*
 * Converts one wide character to its multibyte representation and appends
 * it to the encoder output.
 *
 * If ps->buf is NULL nothing is stored and only ps->pos is advanced, which
 * lets a caller measure the required buffer size with a dry run.
 * If ps->buf is set, the bytes are stored at offset ps->pos and must fit
 * within ps->size; truncation is not supported and an undersized buffer
 * terminates the program through fail().
 * A character that cannot be encoded also ends in fail().
 */
static void String_encodeWChar(WCharEncoderState* ps, wchar_t wc) {
   assert(!ps->buf || ps->pos < ps->size);

   // Always convert into a scratch buffer first. wcrtomb() may store up to
   // MB_CUR_MAX bytes, so converting directly into the caller's buffer could
   // write past its end before the length is known and can be checked.
   char tempBuf[MB_LEN_MAX];
   size_t len = wcrtomb(tempBuf, wc, &ps->mbState);
   assert(len > 0);
   if (len == (size_t)-1) {
      assert(len != (size_t)-1);
      fail();
   }

   if (ps->buf) {
      // The first comparison keeps the unsigned subtraction from wrapping
      // around if the position is already beyond the buffer.
      if (ps->pos > ps->size || len > ps->size - ps->pos) {
         assert(ps->pos <= ps->size && len <= ps->size - ps->pos);
         fail();
      }
      memcpy((char*)ps->buf + ps->pos, tempBuf, len);
   }

   ps->pos += len;
}
286+
#else
287+
/*
 * Single-byte variant of the encoder callback: appends the character "c"
 * as one byte.
 *
 * With ps->buf == NULL nothing is stored and only ps->pos is advanced, so
 * the same routine can be used to count the bytes an output would need.
 */
static void String_encodeWChar(WCharEncoderState* ps, int c) {
   assert(!ps->buf || ps->pos < ps->size);

   if (ps->buf) {
      char* out = ps->buf;
      out[ps->pos] = (char)c;
   }

   ps->pos++;
}
296+
#endif
297+
298+
/*
 * Scans at most maxLen bytes of "src" and feeds a printable version of the
 * text, one character at a time, to the "encodeWChar" callback.
 *
 * - Scanning stops at the first null character or after maxLen bytes.
 * - Characters that are not printable, and byte sequences that cannot be
 *   decoded, are substituted with a replacement character ('?', or U+FFFD
 *   when the wide-character build runs with CRT_utf8 set).
 * - Consecutive substitutions are collapsed into one replacement character.
 * - The last character handed to the callback is always the terminating
 *   null character.
 *
 * "src" may be NULL only if maxLen is 0. "ps" is passed through to the
 * callback untouched.
 */
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar) {
   assert(src || maxLen == 0);

#ifdef HAVE_LIBNCURSESW
   const wchar_t replacementChar = CRT_utf8 ? L'\xFFFD' : L'?';
   wchar_t ch;

   mbstate_t decState;
   memset(&decState, 0, sizeof(decState));
#else
   const char replacementChar = '?';
   char ch;
#endif

   size_t offset = 0;
   bool prevReplaced = false;

   for (;;) {
      size_t consumed = 0;
      bool replace = false;

      // Stays 0 (the terminator) when the input is exhausted.
      ch = 0;

      if (offset < maxLen) {
         // Read the next character from the byte sequence
#ifdef HAVE_LIBNCURSESW
         // Decode with a copy of the state so that a failed conversion
         // leaves the real decoder state untouched.
         mbstate_t trialState;
         memcpy(&trialState, &decState, sizeof(trialState));
         consumed = mbrtowc(&ch, &src[offset], maxLen - offset, &trialState);

         assert(consumed != 0 || ch == 0);
         if (consumed == (size_t)-2) {
            // Incomplete sequence at the end of the input: swallow the rest.
            errno = EILSEQ;
            replace = true;
            consumed = maxLen - offset;
         } else if (consumed == (size_t)-1) {
            // Invalid sequence: skip one byte and try again from the next.
            replace = true;
            consumed = 1;
         } else {
            memcpy(&decState, &trialState, sizeof(decState));
         }
#else
         consumed = 1;
         ch = src[offset];
#endif
      }

      offset += consumed;

      // Filter unprintable characters
      if (!replace && ch != 0) {
#ifdef HAVE_LIBNCURSESW
         replace = !iswprint(ch);
#else
         replace = !isprint((unsigned char)ch);
#endif
      }

      if (replace) {
         ch = replacementChar;

         // A replacement was just emitted; do not emit another one.
         if (prevReplaced)
            continue;
      }
      prevReplaced = replace;

      encodeWChar(ps, ch);

      if (ch == 0)
         break;
   }
}
370+
371+
char* String_makePrintable(const char* str, size_t maxLen) {
372+
WCharEncoderState encState;
373+
374+
memset(&encState, 0, sizeof(encState));
375+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
376+
size_t size = encState.pos;
377+
assert(size > 0);
378+
379+
memset(&encState, 0, sizeof(encState));
380+
char* buf = xMalloc(size);
381+
encState.size = size;
382+
encState.buf = buf;
383+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
384+
assert(encState.pos == size);
385+
386+
return buf;
387+
}
388+
389+
/*
 * Decodes the next character of the string described by "ps".
 *
 * On success the character is stored in ps->ch, ps->str / ps->maxLen are
 * advanced past it, and true is returned. When the null terminator is
 * decoded, ps->ch is 0, ps->str is reset to NULL and true is returned.
 *
 * Returns false when there is nothing left to decode (ps->str is NULL or
 * ps->maxLen is 0), when the remaining bytes form an incomplete sequence
 * (the remaining input is consumed), or when an invalid sequence is met.
 * In the invalid case ps->ch is set to WEOF and every further call returns
 * false until the caller resets the state.
 *
 * In the single-byte build ps->ch holds the byte value in the range
 * 0..UCHAR_MAX, so it never collides with EOF; code comparing it with a
 * plain "char" should convert that char through "unsigned char" first.
 */
bool String_decodeNextWChar(MBStringDecoderState* ps) {
   if (!ps->str || ps->maxLen == 0)
      return false;

   // If the previous call of this function encounters an invalid sequence,
   // do not continue (because the "mbState" object for mbrtowc() is
   // undefined). The caller is supposed to reset the state.
#ifdef HAVE_LIBNCURSESW
   bool isStateDefined = ps->ch != WEOF;
#else
   bool isStateDefined = ps->ch != EOF;
#endif
   if (!isStateDefined)
      return false;

#ifdef HAVE_LIBNCURSESW
   wchar_t wc;
   size_t len = mbrtowc(&wc, ps->str, ps->maxLen, &ps->mbState);
   switch (len) {
   case (size_t)-1:
      // Invalid sequence
      ps->ch = WEOF;
      return false;

   case (size_t)-2:
      // Incomplete sequence
      ps->str += ps->maxLen;
      ps->maxLen = 0;
      return false;

   case 0:
      assert(wc == 0);

      ps->str = NULL;
      ps->maxLen = 0;
      ps->ch = wc;
      return true;

   default:
      ps->str += len;
      ps->maxLen -= len;
      ps->ch = wc;
   }
   return true;
#else
   const size_t len = 1;

   // Read the byte as "unsigned char". Where plain "char" is signed, the
   // byte 0xFF would otherwise be sign-extended to -1, which equals EOF and
   // would make the check above reject every later call.
   ps->ch = (unsigned char)*ps->str;
   if (ps->ch == 0) {
      ps->str = NULL;
      ps->maxLen = 0;
   } else {
      ps->str += len;
      ps->maxLen -= len;
   }
   return true;
#endif
}
446+
447+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator) {
448+
assert(*str || maxLen == 0);
449+
450+
// The caller should ensure (maxWidth >= 0).
451+
// It's possible for a Unicode string to occupy 0 terminal columns, so this
452+
// function allows (maxWidth == 0).
453+
if (maxWidth < 0)
454+
maxWidth = INT_MAX;
455+
456+
#ifdef HAVE_LIBNCURSESW
457+
// If the character takes zero columns, include the character in the
458+
// substring if the working encoding is UTF-8, and ignore it otherwise.
459+
// In Unicode, combining characters are always placed after the base
460+
// character, but some legacy 8-bit encodings instead place combining
461+
// characters before the base character.
462+
const bool isUnicode = CRT_utf8;
463+
#else
464+
const bool isUnicode = false;
465+
#endif
466+
467+
int totalWidth = 0;
468+
469+
MBStringDecoderState state = {0};
470+
state.str = *str;
471+
state.maxLen = maxLen;
472+
473+
bool inSpaces = true;
474+
const char* breakPos = NULL;
475+
int breakWidth = 0;
476+
477+
while (totalWidth < maxWidth || isUnicode) {
478+
assert(totalWidth <= maxWidth);
479+
480+
if (!String_decodeNextWChar(&state))
481+
break;
482+
if (state.ch == 0)
483+
break;
484+
485+
if (state.ch == ' ' && separator == ' ' && !inSpaces) {
486+
inSpaces = true;
487+
breakPos = *str;
488+
breakWidth = totalWidth;
489+
}
490+
491+
#ifdef HAVE_LIBNCURSESW
492+
int cw = wcwidth((wchar_t)state.ch);
493+
if (cw < 0) {
494+
// This function should not be used with string containing unprintable
495+
// characters. Tolerate them on release build, however.
496+
assert(cw >= 0);
497+
break;
498+
}
499+
#else
500+
assert(isprint(state.ch));
501+
const int cw = 1;
502+
#endif
503+
504+
if (cw > maxWidth - totalWidth) {
505+
// This character cannot fit the line with the given maxWidth.
506+
if (breakPos) {
507+
// Rewind the scanning state to the last found separator.
508+
totalWidth = breakWidth;
509+
*str = breakPos;
510+
}
511+
break;
512+
}
513+
514+
if (cw <= 0 && !isUnicode)
515+
continue;
516+
517+
totalWidth += cw;
518+
519+
// (*str - start) will represent the length of the substring bounded
520+
// by the width limit.
521+
*str = state.str;
522+
523+
if (state.ch != ' ')
524+
inSpaces = false;
525+
526+
#ifdef HAVE_LIBNCURSESW
527+
bool isSeparator = state.ch == (wint_t)separator;
528+
#else
529+
bool isSeparator = state.ch == (int)separator;
530+
#endif
531+
if (isSeparator && separator != ' ') {
532+
breakPos = *str;
533+
breakWidth = totalWidth;
534+
}
535+
}
536+
537+
return totalWidth;
538+
}
539+
540+
/*
 * Computes the width in terminal columns of the string at *str, reading at
 * most maxLen bytes and stopping before the width would exceed maxWidth
 * (a negative maxWidth means "no limit").
 *
 * On return *str points just past the measured part of the string.
 * *str may be NULL only if maxLen is 0; in that case 0 is returned and
 * *str is left untouched.
 *
 * With wide-character support the work is delegated to
 * String_lineBreakWidth() without a separator. Otherwise every byte counts
 * as one column.
 */
int String_mbswidth(const char** str, size_t maxLen, int maxWidth) {
#ifdef HAVE_LIBNCURSESW
   return String_lineBreakWidth(str, maxLen, maxWidth, '\0');
#else
   assert(*str || maxLen == 0);

   if (maxWidth < 0)
      maxWidth = INT_MAX;

   // One byte is one column here, so the width limit also caps the number
   // of bytes worth looking at.
   if ((size_t)maxWidth < maxLen)
      maxLen = (size_t)maxWidth;

   // Nothing to scan. This also keeps a NULL *str (permitted when there are
   // no bytes to read) away from the library call and the pointer
   // arithmetic below, both of which are undefined for a null pointer.
   if (maxLen == 0)
      return 0;

   // memchr() stops reading at the first match, so no byte beyond the
   // terminator is examined.
   const char* terminator = memchr(*str, '\0', maxLen);
   size_t len = terminator ? (size_t)(terminator - *str) : maxLen;

   *str += len;

   // len <= maxWidth <= INT_MAX, so the conversion cannot overflow.
   return (int)len;
#endif
}
555+
262556
int xAsprintf(char** strp, const char* fmt, ...) {
263557
*strp = NULL;
264558

XUtils.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,32 @@ in the source distribution for its full text.
2222
#include <string.h> // IWYU pragma: keep
2323

2424
#include "Macros.h"
25+
#include "ProvideCurses.h"
26+
27+
28+
/*
 * Output state handed to an EncodeWChar callback while a string is being
 * encoded character by character.
 */
typedef struct WCharEncoderState_ {
   /* Number of bytes produced so far; also the write offset into "buf". */
   size_t pos;

   /* Capacity of "buf" in bytes. Only consulted when "buf" is set. */
   size_t size;

   /* Destination buffer, or NULL to count the output bytes without storing them. */
   void* buf;

   /* Conversion state carried between calls when encoding wide characters. */
   mbstate_t mbState;
} WCharEncoderState;
34+
35+
/*
 * Cursor used by String_decodeNextWChar() to walk through a string one
 * character at a time. A zero-initialized object with "str" and "maxLen"
 * filled in is a valid starting state.
 */
typedef struct MBStringDecoderState_ {
   /* Next byte to decode. Set to NULL once the null terminator was decoded. */
   const char* str;

   /* Number of bytes that may still be read from "str". */
   size_t maxLen;

#ifdef HAVE_LIBNCURSESW
   /* Most recently decoded character; WEOF after an invalid sequence. */
   wint_t ch;

   /* Multibyte conversion state carried between decoding calls. */
   mbstate_t mbState;
#else
   /* Most recently decoded character; EOF marks an unusable state. */
   int ch;
#endif
} MBStringDecoderState;
2545

46+
#ifdef HAVE_LIBNCURSESW
47+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, wchar_t wc);
48+
#else
49+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, int c);
50+
#endif
2651

2752
ATTR_NORETURN
2853
void fail(void);
@@ -108,6 +133,21 @@ size_t String_safeStrncpy(char* restrict dest, const char* restrict src, size_t
108133
size_t strnlen(const char* str, size_t maxLen);
109134
#endif
110135

136+
ATTR_NONNULL_N(1, 4) ATTR_ACCESS2_W(1) ATTR_ACCESS3_R(2, 3)
137+
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar);
138+
139+
ATTR_RETNONNULL ATTR_MALLOC ATTR_ACCESS3_R(1, 2)
140+
char* String_makePrintable(const char* str, size_t maxLen);
141+
142+
ATTR_NONNULL
143+
bool String_decodeNextWChar(MBStringDecoderState* ps);
144+
145+
ATTR_NONNULL ATTR_ACCESS2_RW(1)
146+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator);
147+
148+
ATTR_NONNULL ATTR_ACCESS2_RW(1)
149+
int String_mbswidth(const char** str, size_t maxLen, int maxWidth);
150+
111151
ATTR_FORMAT(printf, 2, 3) ATTR_NONNULL_N(1, 2)
112152
int xAsprintf(char** strp, const char* fmt, ...);
113153

0 commit comments

Comments
 (0)