View Issue Details

IDProjectCategoryView StatusLast Update
0000363fileGeneralpublic2022-07-04 20:16
Reporterdimich Assigned Tochristos  
PrioritynormalSeveritymajorReproducibilityalways
Status resolvedResolutionfixed 
Platformx86_64OSLinuxOS VersionArch Linux
Product Version5.42 
Fixed in VersionHEAD 
Summary0000363: Truncated filenames containing multibyte characters
DescriptionThe bugfix for issue 351 introduced a new bug: filenames are truncated due to an incorrect calculation of the printable filename width.

The filename width is first calculated in file_mbswidth() with respect to multibyte characters (the first branch of the #if/#endif), which uses iswprint() and handles multibyte characters correctly. The filename is then passed to file_printable(), which uses plain isprint() and replaces every byte of a multibyte character with 4 characters. The output is limited to the previously calculated width (the wid argument) and is therefore truncated, even in raw mode.
Steps To Reproduce$ touch файл.txt
$
$ ls --zero | hexdump -b
0000000 321 204 320 260 320 271 320 273 056 164 170 164 000
000000d
$
$ file файл.txt
\321\204\320\260\320\271\320\273: empty
$
$ file -r файл.txt
файл: empty
Additional InformationI think the --raw option should not affect filenames at all. Non-printable characters may be replaced, but the replacement should at least respect multibyte encodings.
See also issue 362.
Tagsbug, filename, multibyte

Activities

dimich

2022-07-01 03:10

reporter   ~0003771

This patch seems to fix the issue.
issue363.patch (2,036 bytes)   
diff --git a/src/file.c b/src/file.c
index 5300e5af..239bcdcf 100644
--- a/src/file.c
+++ b/src/file.c
@@ -538,6 +538,52 @@ unwrap(struct magic_set *ms, const char *fn)
 	return e;
 }
 
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+   defined(HAVE_WCTYPE_H)
+/*
+ * convert string to multibyte printable format.
+ */
+private char *
+file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz,
+    const char *str, size_t slen)
+{
+	char *ptr, *eptr = buf + bufsiz - 1;
+	size_t len = strlen(str);
+	size_t bytesconsumed;
+	mbstate_t state;
+	wchar_t nextchar;
+
+	(void)memset(&state, 0, sizeof(mbstate_t));
+
+	for (ptr = buf;  ptr < eptr && len && slen && *str; --slen) {
+		bytesconsumed = mbrtowc(&nextchar, str, len, &state);
+		if (bytesconsumed == CAST(size_t, -1) ||
+		    bytesconsumed == CAST(size_t, -2)) {
+			/* Something went wrong */
+			break;
+		}
+		if ((ms->flags & MAGIC_RAW) != 0 || iswprint(nextchar)) {
+			if (ptr >= eptr - bytesconsumed) {
+				break;
+			}
+			memcpy(ptr, str, bytesconsumed);
+			ptr += bytesconsumed;
+		} else {
+			if (ptr >= eptr - 3)
+				break;
+			*ptr++ = '\\';
+			*ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0';
+			*ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0';
+			*ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0';
+		}
+		str += bytesconsumed;
+		len -= bytesconsumed;
+	}
+	*ptr = '\0';
+	return buf;
+}
+#endif
+
 /*
  * Called for each input file on the command line (or in a list of files)
  */
@@ -554,7 +600,12 @@ process(struct magic_set *ms, const char *inname, int wid)
 	    file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen);
 
 	if (wid > 0 && !bflag) {
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+   defined(HAVE_WCTYPE_H)
+		pname = file_name_wprintable(ms, pbuf, plen, inname, wid);
+#else
 		pname = file_printable(ms, pbuf, plen, inname, wid);
+#endif
 		(void)printf("%s", std_in ? "/dev/stdin" : pname);
 		if (nulsep)
 			(void)putc('\0', stdout);
issue363.patch (2,036 bytes)   

dimich

2022-07-01 04:20

reporter   ~0003772

Updated patch: handle invalid sequences in filenames and multicolumn characters.
issue363-upd1.patch (3,308 bytes)   
diff --git a/src/file.c b/src/file.c
index 5300e5af..a64c004b 100644
--- a/src/file.c
+++ b/src/file.c
@@ -538,6 +538,59 @@ unwrap(struct magic_set *ms, const char *fn)
 	return e;
 }
 
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+   defined(HAVE_WCTYPE_H)
+/*
+ * convert string to multibyte printable format.
+ */
+private char *
+file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz,
+    const char *str, size_t slen)
+{
+	char *ptr, *eptr = buf + bufsiz - 1;
+	size_t n;
+	size_t bytesconsumed;
+	mbstate_t state;
+	wchar_t nextchar;
+	int valid;
+	(void)memset(&state, 0, sizeof(mbstate_t));
+	n = strlen(str);
+
+	for (ptr = buf;  ptr < eptr && n && slen && *str;) {
+		valid = 1;
+		bytesconsumed = mbrtowc(&nextchar, str, n, &state);
+		if (bytesconsumed == CAST(size_t, -1) ||
+		    bytesconsumed == CAST(size_t, -2)) {
+			/* Something went wrong */
+			valid = 0;
+			bytesconsumed = 1;
+			(void)memset(&state, 0, sizeof(mbstate_t));
+		}
+		if (valid &&
+		    ((ms->flags & MAGIC_RAW) != 0 || iswprint(nextchar))) {
+			if (ptr >= eptr - bytesconsumed) {
+				break;
+			}
+			memcpy(ptr, str, bytesconsumed);
+			ptr += bytesconsumed;
+			slen -= wcwidth(nextchar);
+		} else {
+			if (ptr >= eptr - 3)
+				break;
+			*ptr++ = '\\';
+			*ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0';
+			*ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0';
+			*ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0';
+			slen -= 4;
+		}
+		str += bytesconsumed;
+		n -= bytesconsumed;
+	}
+	*ptr = '\0';
+	return buf;
+}
+#endif
+
 /*
  * Called for each input file on the command line (or in a list of files)
  */
@@ -554,7 +607,12 @@ process(struct magic_set *ms, const char *inname, int wid)
 	    file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen);
 
 	if (wid > 0 && !bflag) {
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+   defined(HAVE_WCTYPE_H)
+		pname = file_name_wprintable(ms, pbuf, plen, inname, wid);
+#else
 		pname = file_printable(ms, pbuf, plen, inname, wid);
+#endif
 		(void)printf("%s", std_in ? "/dev/stdin" : pname);
 		if (nulsep)
 			(void)putc('\0', stdout);
@@ -584,22 +642,26 @@ file_mbswidth(struct magic_set *ms, const char *s)
 	size_t width = 0;
 #if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
    defined(HAVE_WCTYPE_H)
-	size_t bytesconsumed, old_n, n;
+	size_t bytesconsumed, n;
 	mbstate_t state;
 	wchar_t nextchar;
 	(void)memset(&state, 0, sizeof(mbstate_t));
-	old_n = n = strlen(s);
+	n = strlen(s);
 
 	while (n > 0) {
 		bytesconsumed = mbrtowc(&nextchar, s, n, &state);
 		if (bytesconsumed == CAST(size_t, -1) ||
 		    bytesconsumed == CAST(size_t, -2)) {
-			/* Something went wrong, return something reasonable */
-			return old_n;
+			/* Something went wrong */
+			/* Invalid code will be replaced with octal value */
+			/* Try next byte in sequence */
+			width += 4;
+			bytesconsumed = 1;
+			(void)memset(&state, 0, sizeof(mbstate_t));
+		} else {
+			width += ((ms->flags & MAGIC_RAW) != 0
+			    || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
 		}
-		width += ((ms->flags & MAGIC_RAW) != 0
-		    || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
-
 		s += bytesconsumed, n -= bytesconsumed;
 	}
 	return width;
issue363-upd1.patch (3,308 bytes)   

dimich

2022-07-01 06:43

reporter   ~0003773

One more try :) Do not replace invalid sequence characters in raw mode; print them as is.
The only issue I found is when --raw mode is on, --no-pad is off, LC_CTYPE=C (or another single-byte encoding), and the console is UTF-8. In this case the field width cannot be calculated correctly: we don't know how many character cells a sequence will occupy.
A possible solution is to force --no-pad in --raw mode.
issue363-upd2.patch (3,406 bytes)   
diff --git a/src/file.c b/src/file.c
index 5300e5af..56a97b4f 100644
--- a/src/file.c
+++ b/src/file.c
@@ -538,6 +538,63 @@ unwrap(struct magic_set *ms, const char *fn)
 	return e;
 }
 
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+   defined(HAVE_WCTYPE_H)
+/*
+ * convert string to multibyte printable format.
+ */
+private char *
+file_name_wprintable(struct magic_set *ms, char *buf, size_t bufsiz,
+    const char *str, size_t slen)
+{
+	char *ptr, *eptr = buf + bufsiz - 1;
+	size_t n;
+	size_t bytesconsumed;
+	mbstate_t state;
+	wchar_t nextchar;
+	int valid;
+
+	if ((ms->flags & MAGIC_RAW) != 0) {
+		strncpy(buf, str, bufsiz);
+		return buf;
+	}
+
+	(void)memset(&state, 0, sizeof(mbstate_t));
+	n = strlen(str);
+
+	for (ptr = buf;  ptr < eptr && n && slen && *str;) {
+		valid = 1;
+		bytesconsumed = mbrtowc(&nextchar, str, n, &state);
+		if (bytesconsumed == CAST(size_t, -1) ||
+		    bytesconsumed == CAST(size_t, -2)) {
+			/* Something went wrong */
+			valid = 0;
+			bytesconsumed = 1;
+			(void)memset(&state, 0, sizeof(mbstate_t));
+		}
+		if (valid && iswprint(nextchar)) {
+			if (ptr >= eptr - bytesconsumed)
+				break;
+			memcpy(ptr, str, bytesconsumed);
+			ptr += bytesconsumed;
+			slen -= wcwidth(nextchar);
+		} else {
+			if (ptr >= eptr - 3)
+				break;
+			*ptr++ = '\\';
+			*ptr++ = ((CAST(unsigned int, *str) >> 6) & 7) + '0';
+			*ptr++ = ((CAST(unsigned int, *str) >> 3) & 7) + '0';
+			*ptr++ = ((CAST(unsigned int, *str) >> 0) & 7) + '0';
+			slen -= 4;
+		}
+		str += bytesconsumed;
+		n -= bytesconsumed;
+	}
+	*ptr = '\0';
+	return buf;
+}
+#endif
+
 /*
  * Called for each input file on the command line (or in a list of files)
  */
@@ -554,7 +611,12 @@ process(struct magic_set *ms, const char *inname, int wid)
 	    file_err(EXIT_FAILURE, "Can't allocate %zu bytes", plen);
 
 	if (wid > 0 && !bflag) {
+#if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
+   defined(HAVE_WCTYPE_H)
+		pname = file_name_wprintable(ms, pbuf, plen, inname, wid);
+#else
 		pname = file_printable(ms, pbuf, plen, inname, wid);
+#endif
 		(void)printf("%s", std_in ? "/dev/stdin" : pname);
 		if (nulsep)
 			(void)putc('\0', stdout);
@@ -584,22 +646,27 @@ file_mbswidth(struct magic_set *ms, const char *s)
 	size_t width = 0;
 #if defined(HAVE_WCHAR_H) && defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) && \
    defined(HAVE_WCTYPE_H)
-	size_t bytesconsumed, old_n, n;
+	size_t bytesconsumed, n;
 	mbstate_t state;
 	wchar_t nextchar;
 	(void)memset(&state, 0, sizeof(mbstate_t));
-	old_n = n = strlen(s);
+	n = strlen(s);
 
 	while (n > 0) {
 		bytesconsumed = mbrtowc(&nextchar, s, n, &state);
 		if (bytesconsumed == CAST(size_t, -1) ||
 		    bytesconsumed == CAST(size_t, -2)) {
-			/* Something went wrong, return something reasonable */
-			return old_n;
+			/* Something went wrong.
+			   Invalid code will be replaced with octal value
+			   unless raw mode.
+			   Try next byte in sequence */
+			width += ((ms->flags & MAGIC_RAW) != 0) ? 1 : 4;
+			bytesconsumed = 1;
+			(void)memset(&state, 0, sizeof(mbstate_t));
+		} else {
+			width += ((ms->flags & MAGIC_RAW) != 0
+			    || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
 		}
-		width += ((ms->flags & MAGIC_RAW) != 0
-		    || iswprint(nextchar)) ? wcwidth(nextchar) : 4;
-
 		s += bytesconsumed, n -= bytesconsumed;
 	}
 	return width;
issue363-upd2.patch (3,406 bytes)   

christos

2022-07-04 19:46

manager   ~0003780

Dup for PR/362

christos

2022-07-04 20:16

manager   ~0003782

I like your idea of printing invalid sequences as octal, so I applied it to my patch.

Issue History

Date Modified Username Field Change
2022-07-01 01:55 dimich New Issue
2022-07-01 01:55 dimich Tag Attached: bug
2022-07-01 01:55 dimich Tag Attached: filename
2022-07-01 01:55 dimich Tag Attached: multibyte
2022-07-01 03:10 dimich Note Added: 0003771
2022-07-01 03:10 dimich File Added: issue363.patch
2022-07-01 04:20 dimich Note Added: 0003772
2022-07-01 04:20 dimich File Added: issue363-upd1.patch
2022-07-01 06:43 dimich Note Added: 0003773
2022-07-01 06:43 dimich File Added: issue363-upd2.patch
2022-07-04 19:45 christos Assigned To => christos
2022-07-04 19:45 christos Status new => assigned
2022-07-04 19:46 christos Status assigned => resolved
2022-07-04 19:46 christos Resolution open => fixed
2022-07-04 19:46 christos Fixed in Version => HEAD
2022-07-04 19:46 christos Note Added: 0003780
2022-07-04 20:16 christos Note Added: 0003782