View Issue Details
| ID | Project | Category | View Status | Date Submitted | Last Update |
|---|---|---|---|---|---|
| 0000363 | file | General | public | 2022-07-01 01:55 | 2022-08-31 13:53 |
| Reporter | dimich | Assigned To | christos | ||
| Priority | normal | Severity | major | Reproducibility | always |
| Status | resolved | Resolution | fixed | ||
| Platform | x86_64 | OS | Linux | OS Version | Arch Linux |
| Product Version | 5.42 | ||||
| Fixed in Version | 5.43 | ||||
| Summary | 0000363: Truncated filenames containing multibyte characters | ||||
| Description | The bugfix for issue 351 introduced a new bug: filenames are truncated due to incorrect calculation of the printable filename width. The filename width is first calculated in file_mbswidth() with respect to multibyte characters (first branch of the #if/#endif), which uses iswprint() and handles multibyte characters correctly. The filename is then passed to file_printable(), which uses plain isprint() and replaces every byte of a multibyte character with 4 characters. The output is limited to the previously calculated width (the wid argument) and is therefore truncated, even in raw mode. | ||||
| Steps To Reproduce | $ touch файл.txt $ $ ls --zero | hexdump -b 0000000 321 204 320 260 320 271 320 273 056 164 170 164 000 000000d $ $ file файл.txt \321\204\320\260\320\271\320\273: empty $ $ file -r файл.txt файл: empty | ||||
| Additional Information | I think the --raw option should not affect filenames at all. Non-printable characters may be replaced, but the replacement should at least respect multibyte encodings. See also issue 362. | ||||
| Tags | bug, filename, multibyte | ||||
|
|
This patch seems to fix the issue. issue363.patch (2,036 bytes)
diff --git a/src/file.c b/src/file.c
index 5300e5af..239bcdcf 100644
--- a/src/file.c
+++ b/src/file.c
@@ -538,6 +538,52 @@ unwrap(struct magic_set *ms, const char *fn)
return e;
}
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+ defined(HAVE_WCTYPE_H)
+/*
+ * convert string to multibyte printable format.
+ */
+private char *
+file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz,
+ const char *str, size_t slen)
+{
+ char *ptr, *eptr = buf + bufsiz - 1;
+ size_t len = strlen(str);
+ size_t bytesconsumed;
+ mbstate_t state;
+ wchar_t nextchar;
+
+ (void)memset(&state, 0, sizeof(mbstate_t));
+
+ for (ptr = buf; ptr < eptr && len && slen && *str; --slen) {
+ bytesconsumed = mbrtowc(&nextchar, str, len, &state);
+ if (bytesconsumed == CAST(size_t, -1) ||
+ bytesconsumed == CAST(size_t, -2)) {
+ /* Something went wrong */
+ break;
+ }
+ if ((ms->flags & MAGIC_RAW) != 0 || iswprint(nextchar)) {
+ if (ptr >= eptr - bytesconsumed) {
+ break;
+ }
+ memcpy(ptr, str, bytesconsumed);
+ ptr += bytesconsumed;
+ } else {
+ if (ptr >= eptr - 3)
+ break;
+ *ptr++ = '\\';
+ *ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0';
+ *ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0';
+ *ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0';
+ }
+ str += bytesconsumed;
+ len -= bytesconsumed;
+ }
+ *ptr = '\0';
+ return buf;
+}
+#endif
+
/*
* Called for each input file on the command line (or in a list of files)
*/
@@ -554,7 +600,12 @@ process(struct magic_set *ms, const char *inname, int wid)
file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen);
if (wid > 0 && !bflag) {
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+ defined(HAVE_WCTYPE_H)
+ pname = file_name_wprintable(ms, pbuf, plen, inname, wid);
+#else
pname = file_printable(ms, pbuf, plen, inname, wid);
+#endif
(void)printf("%s", std_in ? "/dev/stdin" : pname);
if (nulsep)
(void)putc('\0', stdout);
|
|
|
Updated patch: handle invalid sequences in filenames and multicolumn characters. issue363-upd1.patch (3,308 bytes)
diff --git a/src/file.c b/src/file.c
index 5300e5af..a64c004b 100644
--- a/src/file.c
+++ b/src/file.c
@@ -538,6 +538,59 @@ unwrap(struct magic_set *ms, const char *fn)
return e;
}
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+ defined(HAVE_WCTYPE_H)
+/*
+ * convert string to multibyte printable format.
+ */
+private char *
+file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz,
+ const char *str, size_t slen)
+{
+ char *ptr, *eptr = buf + bufsiz - 1;
+ size_t n;
+ size_t bytesconsumed;
+ mbstate_t state;
+ wchar_t nextchar;
+ int valid;
+ (void)memset(&state, 0, sizeof(mbstate_t));
+ n = strlen(str);
+
+ for (ptr = buf; ptr < eptr && n && slen && *str;) {
+ valid = 1;
+ bytesconsumed = mbrtowc(&nextchar, str, n, &state);
+ if (bytesconsumed == CAST(size_t, -1) ||
+ bytesconsumed == CAST(size_t, -2)) {
+ /* Something went wrong */
+ valid = 0;
+ bytesconsumed = 1;
+ (void)memset(&state, 0, sizeof(mbstate_t));
+ }
+ if (valid &&
+ ((ms->flags & MAGIC_RAW) != 0 || iswprint(nextchar))) {
+ if (ptr >= eptr - bytesconsumed) {
+ break;
+ }
+ memcpy(ptr, str, bytesconsumed);
+ ptr += bytesconsumed;
+ slen -= wcwidth(nextchar);
+ } else {
+ if (ptr >= eptr - 3)
+ break;
+ *ptr++ = '\\';
+ *ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0';
+ *ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0';
+ *ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0';
+ slen -= 4;
+ }
+ str += bytesconsumed;
+ n -= bytesconsumed;
+ }
+ *ptr = '\0';
+ return buf;
+}
+#endif
+
/*
* Called for each input file on the command line (or in a list of files)
*/
@@ -554,7 +607,12 @@ process(struct magic_set *ms, const char *inname, int wid)
file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen);
if (wid > 0 && !bflag) {
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+ defined(HAVE_WCTYPE_H)
+ pname = file_name_wprintable(ms, pbuf, plen, inname, wid);
+#else
pname = file_printable(ms, pbuf, plen, inname, wid);
+#endif
(void)printf("%s", std_in ? "/dev/stdin" : pname);
if (nulsep)
(void)putc('\0', stdout);
@@ -584,22 +642,26 @@ file_mbswidth(struct magic_set *ms, const char *s)
size_t width = 0;
#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
defined(HAVE_WCTYPE_H)
- size_t bytesconsumed, old_n, n;
+ size_t bytesconsumed, n;
mbstate_t state;
wchar_t nextchar;
(void)memset(&state, 0, sizeof(mbstate_t));
- old_n = n = strlen(s);
+ n = strlen(s);
while (n > 0) {
bytesconsumed = mbrtowc(&nextchar, s, n, &state);
if (bytesconsumed == CAST(size_t, -1) ||
bytesconsumed == CAST(size_t, -2)) {
- /* Something went wrong, return something reasonable */
- return old_n;
+ /* Something went wrong */
+ /* Invalid code will be replaced with octal value */
+ /* Try next byte in sequence */
+ width += 4;
+ bytesconsumed = 1;
+ (void)memset(&state, 0, sizeof(mbstate_t));
+ } else {
+ width += ((ms->flags & MAGIC_RAW) != 0
+ || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
}
- width += ((ms->flags & MAGIC_RAW) != 0
- || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
-
s += bytesconsumed, n -= bytesconsumed;
}
return width;
|
|
|
One more try :) Do not replace the bytes of invalid sequences in raw mode; print them as is. The only issue I found is when --raw mode is on, --no-pad is off, LC_CTYPE=C (or another single-byte encoding) and the console is UTF-8. In this case the field width cannot be calculated correctly: we don't know how many character cells a sequence will take. A possible solution is to force --no-pad in --raw mode. issue363-upd2.patch (3,406 bytes)
diff --git a/src/file.c b/src/file.c
index 5300e5af..56a97b4f 100644
--- a/src/file.c
+++ b/src/file.c
@@ -538,6 +538,63 @@ unwrap(struct magic_set *ms, const char *fn)
return e;
}
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+ defined(HAVE_WCTYPE_H)
+/*
+ * convert string to multibyte printable format.
+ */
+private char *
+file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz,
+ const char *str, size_t slen)
+{
+ char *ptr, *eptr = buf + bufsiz - 1;
+ size_t n;
+ size_t bytesconsumed;
+ mbstate_t state;
+ wchar_t nextchar;
+ int valid;
+
+ if ((ms->flags & MAGIC_RAW) != 0) {
+ strncpy(buf, str, bufsiz);
+ return buf;
+ }
+
+ (void)memset(&state, 0, sizeof(mbstate_t));
+ n = strlen(str);
+
+ for (ptr = buf; ptr < eptr && n && slen && *str;) {
+ valid = 1;
+ bytesconsumed = mbrtowc(&nextchar, str, n, &state);
+ if (bytesconsumed == CAST(size_t, -1) ||
+ bytesconsumed == CAST(size_t, -2)) {
+ /* Something went wrong */
+ valid = 0;
+ bytesconsumed = 1;
+ (void)memset(&state, 0, sizeof(mbstate_t));
+ }
+ if (valid && iswprint(nextchar)) {
+ if (ptr >= eptr - bytesconsumed)
+ break;
+ memcpy(ptr, str, bytesconsumed);
+ ptr += bytesconsumed;
+ slen -= wcwidth(nextchar);
+ } else {
+ if (ptr >= eptr - 3)
+ break;
+ *ptr++ = '\\';
+ *ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0';
+ *ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0';
+ *ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0';
+ slen -= 4;
+ }
+ str += bytesconsumed;
+ n -= bytesconsumed;
+ }
+ *ptr = '\0';
+ return buf;
+}
+#endif
+
/*
* Called for each input file on the command line (or in a list of files)
*/
@@ -554,7 +611,12 @@ process(struct magic_set *ms, const char *inname, int wid)
file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen);
if (wid > 0 && !bflag) {
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+ defined(HAVE_WCTYPE_H)
+ pname = file_name_wprintable(ms, pbuf, plen, inname, wid);
+#else
pname = file_printable(ms, pbuf, plen, inname, wid);
+#endif
(void)printf("%s", std_in ? "/dev/stdin" : pname);
if (nulsep)
(void)putc('\0', stdout);
@@ -584,22 +646,27 @@ file_mbswidth(struct magic_set *ms, const char *s)
size_t width = 0;
#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
defined(HAVE_WCTYPE_H)
- size_t bytesconsumed, old_n, n;
+ size_t bytesconsumed, n;
mbstate_t state;
wchar_t nextchar;
(void)memset(&state, 0, sizeof(mbstate_t));
- old_n = n = strlen(s);
+ n = strlen(s);
while (n > 0) {
bytesconsumed = mbrtowc(&nextchar, s, n, &state);
if (bytesconsumed == CAST(size_t, -1) ||
bytesconsumed == CAST(size_t, -2)) {
- /* Something went wrong, return something reasonable */
- return old_n;
+ /* Something went wrong.
+ Invalid code will be replaced with octal value
+ unless raw mode.
+ Try next byte in sequence */
+ width += ((ms->flags & MAGIC_RAW) != 0) ? 1 : 4;
+ bytesconsumed = 1;
+ (void)memset(&state, 0, sizeof(mbstate_t));
+ } else {
+ width += ((ms->flags & MAGIC_RAW) != 0
+ || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
}
- width += ((ms->flags & MAGIC_RAW) != 0
- || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
-
s += bytesconsumed, n -= bytesconsumed;
}
return width;
|
|
|
Dup for PR/362 |
|
|
I like your idea of printing invalid sequences as octal, so I applied it to my patch. |
| Date Modified | Username | Field | Change |
|---|---|---|---|
| 2022-07-01 01:55 | dimich | New Issue | |
| 2022-07-01 01:55 | dimich | Tag Attached: bug | |
| 2022-07-01 01:55 | dimich | Tag Attached: filename | |
| 2022-07-01 01:55 | dimich | Tag Attached: multibyte | |
| 2022-07-01 03:10 | dimich | Note Added: 0003771 | |
| 2022-07-01 03:10 | dimich | File Added: issue363.patch | |
| 2022-07-01 04:20 | dimich | Note Added: 0003772 | |
| 2022-07-01 04:20 | dimich | File Added: issue363-upd1.patch | |
| 2022-07-01 06:43 | dimich | Note Added: 0003773 | |
| 2022-07-01 06:43 | dimich | File Added: issue363-upd2.patch | |
| 2022-07-04 19:45 | christos | Assigned To | => christos |
| 2022-07-04 19:45 | christos | Status | new => assigned |
| 2022-07-04 19:46 | christos | Status | assigned => resolved |
| 2022-07-04 19:46 | christos | Resolution | open => fixed |
| 2022-07-04 19:46 | christos | Fixed in Version | => 5.43 |
| 2022-07-04 19:46 | christos | Note Added: 0003780 | |
| 2022-07-04 20:16 | christos | Note Added: 0003782 |