View Issue Details
ID | Project | Category | View Status | Date Submitted | Last Update |
---|---|---|---|---|---|
0000363 | file | General | public | 2022-07-01 01:55 | 2022-08-31 13:53 |
Reporter | dimich | Assigned To | christos | ||
Priority | normal | Severity | major | Reproducibility | always |
Status | resolved | Resolution | fixed | ||
Platform | x86_64 | OS | Linux | OS Version | Arch Linux |
Product Version | 5.42 | ||||
Fixed in Version | 5.43 | ||||
Summary | 0000363: Truncated filenames containing multibyte characters | ||||
Description | The bugfix for issue 351 introduced a new bug: filenames are truncated due to an incorrect calculation of the printable filename width. The filename width is first calculated in file_mbswidth() with respect to multibyte characters (the first branch of the #if/#endif), which uses iswprint() and handles multibyte characters correctly. The filename is then passed to file_printable(), which uses plain isprint() and replaces every byte of a multibyte character with 4 characters. The output is limited to the previously calculated width (the wid argument) and is therefore truncated, even in raw mode. | ||||
Steps To Reproduce | $ touch файл.txt $ $ ls --zero | hexdump -b 0000000 321 204 320 260 320 271 320 273 056 164 170 164 000 000000d $ $ file файл.txt \321\204\320\260\320\271\320\273: empty $ $ file -r файл.txt файл: empty | ||||
Additional Information | I think the --raw option should not affect filenames at all. Non-printable characters may be replaced, but this should at least be done with respect to multibyte encodings. See also issue 362. | ||||
Tags | bug, filename, multibyte | ||||
|
This patch seems to fix the issue. issue363.patch (2,036 bytes)
diff --git a/src/file.c b/src/file.c index 5300e5af..239bcdcf 100644 --- a/src/file.c +++ b/src/file.c @@ -538,6 +538,52 @@ unwrap(struct magic_set *ms, const char *fn) return e; } +#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ + defined(HAVE_WCTYPE_H) +/* + * convert string to multibyte printable format. + */ +private char * +file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz, + const char *str, size_t slen) +{ + char *ptr, *eptr = buf + bufsiz - 1; + size_t len = strlen(str); + size_t bytesconsumed; + mbstate_t state; + wchar_t nextchar; + + (void)memset(&state, 0, sizeof(mbstate_t)); + + for (ptr = buf; ptr < eptr && len && slen && *str; --slen) { + bytesconsumed = mbrtowc(&nextchar, str, len, &state); + if (bytesconsumed == CAST(size_t, -1) || + bytesconsumed == CAST(size_t, -2)) { + /* Something went wrong */ + break; + } + if ((ms->flags & MAGIC_RAW) != 0 || iswprint(nextchar)) { + if (ptr >= eptr - bytesconsumed) { + break; + } + memcpy(ptr, str, bytesconsumed); + ptr += bytesconsumed; + } else { + if (ptr >= eptr - 3) + break; + *ptr++ = '\\'; + *ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0'; + *ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0'; + *ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0'; + } + str += bytesconsumed; + len -= bytesconsumed; + } + *ptr = '\0'; + return buf; +} +#endif + /* * Called for each input file on the command line (or in a list of files) */ @@ -554,7 +600,12 @@ process(struct magic_set *ms, const char *inname, int wid) file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen); if (wid > 0 && !bflag) { +#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ + defined(HAVE_WCTYPE_H) + pname = file_name_wprintable(ms, pbuf, plen, inname, wid); +#else pname = file_printable(ms, pbuf, plen, inname, wid); +#endif (void)printf("%s", std_in ? "/dev/stdin" : pname); if (nulsep) (void)putc('\0', stdout); |
|
Updated patch: handle invalid sequences in filenames and multicolumn characters. issue363-upd1.patch (3,308 bytes)
diff --git a/src/file.c b/src/file.c index 5300e5af..a64c004b 100644 --- a/src/file.c +++ b/src/file.c @@ -538,6 +538,59 @@ unwrap(struct magic_set *ms, const char *fn) return e; } +#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ + defined(HAVE_WCTYPE_H) +/* + * convert string to multibyte printable format. + */ +private char * +file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz, + const char *str, size_t slen) +{ + char *ptr, *eptr = buf + bufsiz - 1; + size_t n; + size_t bytesconsumed; + mbstate_t state; + wchar_t nextchar; + int valid; + (void)memset(&state, 0, sizeof(mbstate_t)); + n = strlen(str); + + for (ptr = buf; ptr < eptr && n && slen && *str;) { + valid = 1; + bytesconsumed = mbrtowc(&nextchar, str, n, &state); + if (bytesconsumed == CAST(size_t, -1) || + bytesconsumed == CAST(size_t, -2)) { + /* Something went wrong */ + valid = 0; + bytesconsumed = 1; + (void)memset(&state, 0, sizeof(mbstate_t)); + } + if (valid && + ((ms->flags & MAGIC_RAW) != 0 || iswprint(nextchar))) { + if (ptr >= eptr - bytesconsumed) { + break; + } + memcpy(ptr, str, bytesconsumed); + ptr += bytesconsumed; + slen -= wcwidth(nextchar); + } else { + if (ptr >= eptr - 3) + break; + *ptr++ = '\\'; + *ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0'; + *ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0'; + *ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0'; + slen -= 4; + } + str += bytesconsumed; + n -= bytesconsumed; + } + *ptr = '\0'; + return buf; +} +#endif + /* * Called for each input file on the command line (or in a list of files) */ @@ -554,7 +607,12 @@ process(struct magic_set *ms, const char *inname, int wid) file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen); if (wid > 0 && !bflag) { +#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ + defined(HAVE_WCTYPE_H) + pname = file_name_wprintable(ms, pbuf, plen, inname, wid); +#else pname = file_printable(ms, pbuf, plen, inname, wid); 
+#endif (void)printf("%s", std_in ? "/dev/stdin" : pname); if (nulsep) (void)putc('\0', stdout); @@ -584,22 +642,26 @@ file_mbswidth(struct magic_set *ms, const char *s) size_t width = 0; #if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ defined(HAVE_WCTYPE_H) - size_t bytesconsumed, old_n, n; + size_t bytesconsumed, n; mbstate_t state; wchar_t nextchar; (void)memset(&state, 0, sizeof(mbstate_t)); - old_n = n = strlen(s); + n = strlen(s); while (n > 0) { bytesconsumed = mbrtowc(&nextchar, s, n, &state); if (bytesconsumed == CAST(size_t, -1) || bytesconsumed == CAST(size_t, -2)) { - /* Something went wrong, return something reasonable */ - return old_n; + /* Something went wrong */ + /* Invalid code will be replaced with octal value */ + /* Try next byte in sequence */ + width += 4; + bytesconsumed = 1; + (void)memset(&state, 0, sizeof(mbstate_t)); + } else { + width += ((ms->flags & MAGIC_RAW) != 0 + || iswprint(nextchar)) ? wcwidth(nextchar) : 4; } - width += ((ms->flags & MAGIC_RAW) != 0 - || iswprint(nextchar)) ? wcwidth(nextchar) : 4; - s += bytesconsumed, n -= bytesconsumed; } return width; |
|
One more try :) Do not replace invalid sequence characters in raw mode; print them as is. The only issue I found is when --raw mode is on, --no-pad is off, LC_CTYPE=C (or another 1-byte encoding) and the console is UTF-8. In this case the field width cannot be calculated correctly: we don't know how many character cells a sequence will take. A possible solution is to force --no-pad in --raw mode. issue363-upd2.patch (3,406 bytes)
diff --git a/src/file.c b/src/file.c index 5300e5af..56a97b4f 100644 --- a/src/file.c +++ b/src/file.c @@ -538,6 +538,63 @@ unwrap(struct magic_set *ms, const char *fn) return e; } +#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ + defined(HAVE_WCTYPE_H) +/* + * convert string to multibyte printable format. + */ +private char * +file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz, + const char *str, size_t slen) +{ + char *ptr, *eptr = buf + bufsiz - 1; + size_t n; + size_t bytesconsumed; + mbstate_t state; + wchar_t nextchar; + int valid; + + if ((ms->flags & MAGIC_RAW) != 0) { + strncpy(buf, str, bufsiz); + return buf; + } + + (void)memset(&state, 0, sizeof(mbstate_t)); + n = strlen(str); + + for (ptr = buf; ptr < eptr && n && slen && *str;) { + valid = 1; + bytesconsumed = mbrtowc(&nextchar, str, n, &state); + if (bytesconsumed == CAST(size_t, -1) || + bytesconsumed == CAST(size_t, -2)) { + /* Something went wrong */ + valid = 0; + bytesconsumed = 1; + (void)memset(&state, 0, sizeof(mbstate_t)); + } + if (valid && iswprint(nextchar)) { + if (ptr >= eptr - bytesconsumed) + break; + memcpy(ptr, str, bytesconsumed); + ptr += bytesconsumed; + slen -= wcwidth(nextchar); + } else { + if (ptr >= eptr - 3) + break; + *ptr++ = '\\'; + *ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0'; + *ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0'; + *ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0'; + slen -= 4; + } + str += bytesconsumed; + n -= bytesconsumed; + } + *ptr = '\0'; + return buf; +} +#endif + /* * Called for each input file on the command line (or in a list of files) */ @@ -554,7 +611,12 @@ process(struct magic_set *ms, const char *inname, int wid) file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen); if (wid > 0 && !bflag) { +#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ + defined(HAVE_WCTYPE_H) + pname = file_name_wprintable(ms, pbuf, plen, inname, wid); +#else pname = 
file_printable(ms, pbuf, plen, inname, wid); +#endif (void)printf("%s", std_in ? "/dev/stdin" : pname); if (nulsep) (void)putc('\0', stdout); @@ -584,22 +646,27 @@ file_mbswidth(struct magic_set *ms, const char *s) size_t width = 0; #if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \ defined(HAVE_WCTYPE_H) - size_t bytesconsumed, old_n, n; + size_t bytesconsumed, n; mbstate_t state; wchar_t nextchar; (void)memset(&state, 0, sizeof(mbstate_t)); - old_n = n = strlen(s); + n = strlen(s); while (n > 0) { bytesconsumed = mbrtowc(&nextchar, s, n, &state); if (bytesconsumed == CAST(size_t, -1) || bytesconsumed == CAST(size_t, -2)) { - /* Something went wrong, return something reasonable */ - return old_n; + /* Something went wrong. + Invalid code will be replaced with octal value + unless raw mode. + Try next byte in sequence */ + width += ((ms->flags & MAGIC_RAW) != 0) ? 1 : 4; + bytesconsumed = 1; + (void)memset(&state, 0, sizeof(mbstate_t)); + } else { + width += ((ms->flags & MAGIC_RAW) != 0 + || iswprint(nextchar)) ? wcwidth(nextchar) : 4; } - width += ((ms->flags & MAGIC_RAW) != 0 - || iswprint(nextchar)) ? wcwidth(nextchar) : 4; - s += bytesconsumed, n -= bytesconsumed; } return width; |
|
Dup for PR/362 |
|
I like your idea to print invalid sequences as octal, so I applied it to my patch. |
Date Modified | Username | Field | Change |
---|---|---|---|
2022-07-01 01:55 | dimich | New Issue | |
2022-07-01 01:55 | dimich | Tag Attached: bug | |
2022-07-01 01:55 | dimich | Tag Attached: filename | |
2022-07-01 01:55 | dimich | Tag Attached: multibyte | |
2022-07-01 03:10 | dimich | Note Added: 0003771 | |
2022-07-01 03:10 | dimich | File Added: issue363.patch | |
2022-07-01 04:20 | dimich | Note Added: 0003772 | |
2022-07-01 04:20 | dimich | File Added: issue363-upd1.patch | |
2022-07-01 06:43 | dimich | Note Added: 0003773 | |
2022-07-01 06:43 | dimich | File Added: issue363-upd2.patch | |
2022-07-04 19:45 | christos | Assigned To | => christos |
2022-07-04 19:45 | christos | Status | new => assigned |
2022-07-04 19:46 | christos | Status | assigned => resolved |
2022-07-04 19:46 | christos | Resolution | open => fixed |
2022-07-04 19:46 | christos | Fixed in Version | => 5.43 |
2022-07-04 19:46 | christos | Note Added: 0003780 | |
2022-07-04 20:16 | christos | Note Added: 0003782 |