-
-
Notifications
You must be signed in to change notification settings - Fork 589
Unicode character support in screen tab names #1642
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ in the source distribution for its full text. | |
| #include "XUtils.h" | ||
|
|
||
| #include <assert.h> | ||
| #include <ctype.h> // IWYU pragma: keep | ||
| #include <errno.h> | ||
| #include <fcntl.h> | ||
| #include <limits.h> | ||
|
|
@@ -259,6 +260,297 @@ size_t strnlen(const char* str, size_t maxLen) { | |
| } | ||
| #endif | ||
|
|
||
| #ifdef HAVE_LIBNCURSESW | ||
| static void String_encodeWChar(WCharEncoderState* ps, wchar_t wc) { | ||
| assert(!ps->buf || ps->pos < ps->size); | ||
|
|
||
| char tempBuf[MB_LEN_MAX]; | ||
|
|
||
| // This function will null terminate the string only upon a call | ||
| // with (wc == 0). It might take more than a single NUL byte to | ||
| // terminate a string when using the C multibyte functions and a | ||
| // non-Unicode encoding, thus this function won't support truncation | ||
| // of a string. The caller must provide the right size in ps->size | ||
| // if ps->buf is not NULL. | ||
| size_t len = wcrtomb(tempBuf, wc, &ps->mbState); | ||
| assert(len != 0); | ||
| if (len == (size_t)-1) { | ||
| assert(len != (size_t)-1); | ||
| fail(); | ||
| } | ||
| if (ps->buf) { | ||
| if (len > ps->size - ps->pos) { | ||
| fail(); | ||
| } | ||
| memcpy((char*)ps->buf + ps->pos, tempBuf, len); | ||
| } | ||
| ps->pos += len; | ||
| } | ||
| #else | ||
| static void String_encodeWChar(WCharEncoderState* ps, int c) { | ||
| assert(!ps->buf || ps->pos < ps->size); | ||
|
|
||
| char* buf = ps->buf; | ||
| if (buf) | ||
| buf[ps->pos] = (char)c; | ||
|
|
||
| ps->pos += 1; | ||
| } | ||
| #endif | ||
|
|
||
| void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar) { | ||
| assert(src || maxLen == 0); | ||
|
|
||
| size_t pos = 0; | ||
| bool wasReplaced = false; | ||
|
|
||
| #ifdef HAVE_LIBNCURSESW | ||
|
Explorer09 marked this conversation as resolved.
|
||
| const wchar_t replacementChar = CRT_utf8 ? L'\xFFFD' : L'?'; | ||
| wchar_t ch; | ||
|
|
||
| mbstate_t decState = {0}; | ||
| #else | ||
| const char replacementChar = '?'; | ||
| char ch; | ||
| #endif | ||
|
|
||
| do { | ||
| size_t len = 0; | ||
| bool shouldReplace = false; | ||
| ch = 0; | ||
|
|
||
| if (pos < maxLen) { | ||
| // Read the next character from the byte sequence | ||
| #ifdef HAVE_LIBNCURSESW | ||
| mbstate_t newState; | ||
| memcpy(&newState, &decState, sizeof(newState)); | ||
| len = mbrtowc(&ch, &src[pos], maxLen - pos, &newState); | ||
|
|
||
| assert(len != 0 || ch == 0); | ||
| switch (len) { | ||
| case (size_t)-2: | ||
| errno = EILSEQ; | ||
| shouldReplace = true; | ||
| len = maxLen - pos; | ||
| break; | ||
|
|
||
| case (size_t)-1: | ||
| shouldReplace = true; | ||
| len = 1; | ||
| break; | ||
|
|
||
| default: | ||
| memcpy(&decState, &newState, sizeof(decState)); | ||
| } | ||
| #else | ||
| len = 1; | ||
| ch = src[pos]; | ||
| #endif | ||
| } | ||
|
|
||
| pos += len; | ||
|
|
||
| // Filter unprintable characters | ||
| if (!shouldReplace && ch != 0) { | ||
| #ifdef HAVE_LIBNCURSESW | ||
| shouldReplace = !iswprint(ch); | ||
| #else | ||
| shouldReplace = !isprint((unsigned char)ch); | ||
| #endif | ||
| } | ||
|
|
||
| if (shouldReplace) { | ||
| ch = replacementChar; | ||
| if (wasReplaced) | ||
| continue; | ||
| } | ||
| wasReplaced = shouldReplace; | ||
|
|
||
| encodeWChar(ps, ch); | ||
| } while (ch != 0); | ||
| } | ||
|
|
||
| char* String_makePrintable(const char* str, size_t maxLen) { | ||
| WCharEncoderState encState = {0}; | ||
|
|
||
| EncodePrintableString(&encState, str, maxLen, String_encodeWChar); | ||
| size_t size = encState.pos; | ||
| assert(size > 0); | ||
|
|
||
| memset(&encState, 0, sizeof(encState)); | ||
| char* buf = xMalloc(size); | ||
| encState.size = size; | ||
| encState.buf = buf; | ||
| EncodePrintableString(&encState, str, maxLen, String_encodeWChar); | ||
| assert(encState.pos == size); | ||
|
|
||
| return buf; | ||
| } | ||
|
|
||
| bool MBStringDecoder_nextWChar(MBStringDecoder* decoder) { | ||
| if (!decoder->str || decoder->maxLen == 0) | ||
| return false; | ||
|
|
||
| // If the previous call of this function encounters an invalid sequence, | ||
| // do not continue (because the "mbState" object for mbrtowc() is | ||
| // undefined). The caller is supposed to reset the state. | ||
| #ifdef HAVE_LIBNCURSESW | ||
| if (decoder->ch == WEOF) | ||
| return false; | ||
| #endif | ||
|
|
||
| #ifdef HAVE_LIBNCURSESW | ||
| wchar_t ch; | ||
| size_t len = mbrtowc(&ch, decoder->str, decoder->maxLen, &decoder->mbState); | ||
|
|
||
| // These assertions ensure the mbrtowc() implementation is correct | ||
| assert(len == 0 || len >= (size_t)-2 || ch != 0); | ||
| assert(len != 0 || ch == 0); | ||
|
|
||
| switch (len) { | ||
| case (size_t)-1: | ||
| // Invalid sequence. decoder->str remains at the position where | ||
| // the first byte of the invalid sequence is found. | ||
| decoder->ch = WEOF; | ||
| return false; | ||
|
|
||
|
Comment on lines
+411
to
+416
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Verify callers that may loop on String_lineBreakWidth/String_mbswidth
# and require forward progress when malformed bytes are present.
rg -n -C3 '\bString_lineBreakWidth\s*\(|\bString_mbswidth\s*\('Repository: htop-dev/htop Length of output: 2394 🏁 Script executed: # Get the size of XUtils.c first to determine reading strategy
wc -l XUtils.cRepository: htop-dev/htop Length of output: 70 🏁 Script executed: # Read the decoder function around lines 411-415
sed -n '385,420p' XUtils.cRepository: htop-dev/htop Length of output: 969 🏁 Script executed: # Read the String_lineBreakWidth function around lines 442-480
sed -n '442,480p' XUtils.cRepository: htop-dev/htop Length of output: 1278 🏁 Script executed: # Read the full String_decodeNextWChar to understand the complete context
sed -n '370,440p' XUtils.cRepository: htop-dev/htop Length of output: 1725 🏁 Script executed: sed -n '442,533p' XUtils.cRepository: htop-dev/htop Length of output: 2586 Consume bytes on invalid multibyte sequences to ensure forward progress. Invalid UTF-8 at line 411 returns Proposed fix switch (len) {
case (size_t)-1:
- // Invalid sequence
- ps->ch = WEOF;
- return false;
+ // Invalid sequence: consume one byte to guarantee progress
+ ps->str += 1;
+ ps->maxLen -= 1;
+ ps->ch = WEOF;
+ return false;
case (size_t)-2:And in - if (!String_decodeNextWChar(&state))
+ if (!String_decodeNextWChar(&state)) {
+ *str = state.str;
break;
+ } |
||
| case (size_t)-2: | ||
| // Incomplete sequence | ||
| decoder->str += decoder->maxLen; | ||
| decoder->maxLen = 0; | ||
| return false; | ||
|
|
||
| case 0: | ||
| // End of string. This assignment is an optimization hint. | ||
| ch = 0; | ||
| } | ||
| #else | ||
| char ch = *decoder->str; | ||
| const size_t len = 1; | ||
| #endif | ||
|
|
||
| if (ch == 0) { | ||
| // Setting "str" to NULL prevents subsequent calls from reading | ||
| // out of bounds. | ||
| decoder->str = NULL; | ||
| decoder->maxLen = 0; | ||
| } else { | ||
| decoder->str += len; | ||
| decoder->maxLen -= len; | ||
| } | ||
| decoder->ch = ch; | ||
| return true; | ||
| } | ||
|
|
||
| int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator) { | ||
| assert(*str || maxLen == 0); | ||
|
|
||
| // The caller should ensure (maxWidth >= 0). | ||
| // It's possible for a Unicode string to occupy 0 terminal columns, so this | ||
| // function allows (maxWidth == 0). | ||
| if (maxWidth < 0) | ||
| maxWidth = INT_MAX; | ||
|
|
||
| #ifdef HAVE_LIBNCURSESW | ||
| // If the character takes zero columns, include the character in the | ||
| // substring if the working encoding is UTF-8, and ignore it otherwise. | ||
| // In Unicode, combining characters are always placed after the base | ||
| // character, but some legacy 8-bit encodings instead place combining | ||
| // characters before the base character. | ||
| const bool isUnicode = CRT_utf8; | ||
| #else | ||
| const bool isUnicode = false; | ||
| #endif | ||
|
|
||
| int totalWidth = 0; | ||
|
|
||
| MBStringDecoder decoder = {0}; | ||
| decoder.str = *str; | ||
| decoder.maxLen = maxLen; | ||
|
|
||
| bool inSpaces = true; | ||
| const char* breakPos = NULL; | ||
| int breakWidth = 0; | ||
|
|
||
| while (totalWidth < maxWidth || isUnicode) { | ||
| assert(totalWidth <= maxWidth); | ||
|
|
||
| if (!MBStringDecoder_nextWChar(&decoder)) | ||
| break; | ||
| if (decoder.ch == 0) | ||
| break; | ||
|
|
||
| if (decoder.ch == ' ' && separator == ' ' && !inSpaces) { | ||
| inSpaces = true; | ||
| breakPos = *str; | ||
| breakWidth = totalWidth; | ||
| } | ||
|
|
||
| #ifdef HAVE_LIBNCURSESW | ||
| int cw = wcwidth((wchar_t)decoder.ch); | ||
| if (cw < 0) { | ||
| // This function should not be used with string containing unprintable | ||
| // characters. Tolerate them on release build, however. | ||
| assert(cw >= 0); | ||
| break; | ||
| } | ||
| #else | ||
| assert(isprint(decoder.ch)); | ||
| const int cw = 1; | ||
| #endif | ||
|
|
||
| if (cw > maxWidth - totalWidth) { | ||
| // This character cannot fit the line with the given maxWidth. | ||
| if (breakPos) { | ||
| // Rewind the scanning state to the last found separator. | ||
| totalWidth = breakWidth; | ||
| *str = breakPos; | ||
| } | ||
| break; | ||
| } | ||
|
|
||
| if (cw <= 0 && !isUnicode) | ||
| continue; | ||
|
|
||
| totalWidth += cw; | ||
|
|
||
| // (*str - start) will represent the length of the substring bounded | ||
| // by the width limit. | ||
| *str = decoder.str; | ||
|
|
||
| if (decoder.ch != ' ') | ||
| inSpaces = false; | ||
|
|
||
| #ifdef HAVE_LIBNCURSESW | ||
| bool isSeparator = decoder.ch == (wint_t)separator; | ||
| #else | ||
| bool isSeparator = decoder.ch == (int)separator; | ||
| #endif | ||
| if (isSeparator && separator != ' ') { | ||
| breakPos = *str; | ||
| breakWidth = totalWidth; | ||
| } | ||
| } | ||
|
|
||
| return totalWidth; | ||
| } | ||
|
|
||
| int String_mbswidth(const char** str, size_t maxLen, int maxWidth) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why have the public API of this function modify an input argument to move to the end of the function?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function has two outputs: One is the terminal width (i.e. num of terminal columns) and the other is the end character position after calculating the width. The second output is useful to tell, for example, how many Unicode characters would it take to fit the number of terminal columns. This functions has two uses combined in a single API: (1) to count the number of terminal columns for a given multibyte string; (2) to count the number characters it takes to fit the given terminal width.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might seem redundant, but I'd prefer if this output parameter was a separate (NULL-able) argument to the function. This separates concerns (input argument vs. optional additional output) and makes the API clearer. This also allows to just skip the extra local variable in case you don't care AND don't have it unexpectedly be changed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
A By the way, the "input pointer that would be modified on output" is not my original idea. There are precedents in mbsrtowcs(3) and mbsnrtowcs(3) (compare mbsrtowcs(3) with mbstowcs(3) and I would argue that the
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I gave both variants of the API to Copilot to compare (without initially disclosing what the function would do) and except for "if it runs in a loop for parsing" the additional out argument was the preferred variant. This only solidified further once I gave Copilot also some more semantics of the function it just had rated. One interesting aspect was that modern compiler tend to optimize the additional out parameter variant better. So there's several aspects that lean towards that variant of the API. Using the C standard library's string function API as a baseline for API quality sets a very low bar for quality … ;-)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
What variants of the API did you give to Copilot in particular?
Again I'm not sure what variant you are referring to.
I personally value utility of the API over the "quality" which I'm not sure what it is. Convenience of passing the arguments and calling the function? Register and stack usage? It's not I disagree with changing the API, but I didn't see an API suggestion to compare with, thus I cannot judge anything regarding what Copilot said. |
||
| #ifdef HAVE_LIBNCURSESW | ||
| return String_lineBreakWidth(str, maxLen, maxWidth, '\0'); | ||
| #else | ||
| assert(*str || maxLen == 0); | ||
|
|
||
| if (maxWidth < 0) | ||
| maxWidth = INT_MAX; | ||
|
|
||
| maxLen = MINIMUM((size_t)maxWidth, maxLen); | ||
| size_t len = strnlen(*str, maxLen); | ||
| *str += len; | ||
| return (int)len; | ||
| #endif | ||
| } | ||
|
|
||
| int xAsprintf(char** strp, const char* fmt, ...) { | ||
| *strp = NULL; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,36 @@ in the source distribution for its full text. | |
| #include <string.h> // IWYU pragma: keep | ||
|
|
||
| #include "Macros.h" | ||
| #include "ProvideCurses.h" | ||
|
|
||
|
|
||
| typedef struct WCharEncoderState_ { | ||
| size_t pos; | ||
| size_t size; | ||
| void* buf; | ||
| mbstate_t mbState; | ||
| } WCharEncoderState; | ||
|
|
||
| /* Object for reading wide characters from a multibyte string. | ||
| "str" and "maxLen" are input but will be modified during process. | ||
| "str" will be set to NULL when the decoding is finished with the | ||
| terminating L'\0' character. */ | ||
| typedef struct MBStringDecoder_ { | ||
| const char* str; | ||
| size_t maxLen; | ||
| #ifdef HAVE_LIBNCURSESW | ||
| wint_t ch; | ||
| mbstate_t mbState; | ||
| #else | ||
| int ch; | ||
| #endif | ||
| } MBStringDecoder; | ||
|
|
||
| #ifdef HAVE_LIBNCURSESW | ||
| typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, wchar_t wc); | ||
| #else | ||
| typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, int c); | ||
| #endif | ||
|
|
||
| ATTR_NORETURN | ||
| void fail(void); | ||
|
|
@@ -108,6 +137,27 @@ size_t String_safeStrncpy(char* restrict dest, const char* restrict src, size_t | |
| size_t strnlen(const char* str, size_t maxLen); | ||
| #endif | ||
|
|
||
| ATTR_NONNULL_N(1, 4) ATTR_ACCESS2_W(1) ATTR_ACCESS3_R(2, 3) | ||
| void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win Rename This new public function name breaks the project’s C naming convention and makes the API inconsistent with surrounding As per coding guidelines, use |
||
|
|
||
| ATTR_RETNONNULL ATTR_MALLOC ATTR_ACCESS3_R(1, 2) | ||
| char* String_makePrintable(const char* str, size_t maxLen); | ||
|
|
||
| ATTR_NONNULL | ||
| bool MBStringDecoder_nextWChar(MBStringDecoder* ps); | ||
|
|
||
| ATTR_NONNULL ATTR_ACCESS2_RW(1) | ||
| int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator); | ||
|
|
||
| /* Count the number of terminal columns needed to display a string, or | ||
| count how many characters from the string that can be displayed | ||
| with the column limit ("maxWidth"). | ||
| "maxLen" is in bytes. | ||
| maxLen = SIZE_MAX to take the whole string. | ||
| maxWidth = INT_MAX for no terminal column limit. */ | ||
| ATTR_NONNULL ATTR_ACCESS2_RW(1) | ||
| int String_mbswidth(const char** str, size_t maxLen, int maxWidth); | ||
|
|
||
| ATTR_FORMAT(printf, 2, 3) ATTR_NONNULL_N(1, 2) | ||
| int xAsprintf(char** strp, const char* fmt, ...); | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why introduce this extra variable? Was this patch written by AI? ;-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically no AI is used :)
The
ptris a temporary variable holding a pointer because of the API design I explained in https://github.com/htop-dev/htop/pull/1642/changes/BASE..6b82bcef3b8f7aecd9be6c83d66e84af152a1d58#r3211053429