Merge pull request #8268 from tgrez/main

RenjiSann · web-flow · commit dcc5492f8e7a · 2025-06-27T13:38:07.000+02:00
od: fix handling of non-ascii chars
diff --git a/src/uu/od/src/prn_char.rs b/src/uu/od/src/prn_char.rs
@@ -2,7 +2,6 @@
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
-use std::str::from_utf8;
 
 use crate::formatteriteminfo::{FormatWriter, FormatterItemInfo};
 
@@ -51,33 +50,13 @@ fn format_item_c(bytes: &[u8]) -> String {
     let b = bytes[0];
 
     if b & 0x80 == 0x00 {
+        // ASCII byte (0xxxxxxx)
         match C_CHARS.get(b as usize) {
             Some(s) => format!("{s:>4}"),
             None => format!("{b:>4}"),
         }
-    } else if (b & 0xc0) == 0x80 {
-        // second or subsequent octet of an utf-8 sequence
-        String::from("  **")
-    } else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) {
-        // start of a 2 octet utf-8 sequence
-        match from_utf8(&bytes[0..2]) {
-            Ok(s) => format!("{s:>4}"),
-            Err(_) => format!(" {b:03o}"),
-        }
-    } else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) {
-        // start of a 3 octet utf-8 sequence
-        match from_utf8(&bytes[0..3]) {
-            Ok(s) => format!("{s:>4}"),
-            Err(_) => format!(" {b:03o}"),
-        }
-    } else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) {
-        // start of a 4 octet utf-8 sequence
-        match from_utf8(&bytes[0..4]) {
-            Ok(s) => format!("{s:>4}"),
-            Err(_) => format!(" {b:03o}"),
-        }
     } else {
-        // invalid utf-8
+        // Non-ASCII byte (high bit set), valid UTF-8 or not: print as 3-digit octal
         format!(" {b:03o}")
     }
 }
@@ -125,27 +104,22 @@ fn test_format_item_c() {
     assert_eq!(" 177", format_item_c(&[0x7f]));
     assert_eq!("   A", format_item_c(&[0x41, 0x21]));
 
-    assert_eq!("  **", format_item_c(&[0x80]));
-    assert_eq!("  **", format_item_c(&[0x9f]));
+    assert_eq!(" 200", format_item_c(&[0x80]));
+    assert_eq!(" 237", format_item_c(&[0x9f]));
 
-    assert_eq!("   ß", format_item_c(&[0xc3, 0x9f]));
-    assert_eq!("   ß", format_item_c(&[0xc3, 0x9f, 0x21]));
+    assert_eq!(" 303", format_item_c(&[0xc3, 0x9f]));
+    assert_eq!(" 303", format_item_c(&[0xc3, 0x9f, 0x21]));
 
-    assert_eq!("   \u{1000}", format_item_c(&[0xe1, 0x80, 0x80]));
-    assert_eq!("   \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21]));
+    assert_eq!(" 341", format_item_c(&[0xe1, 0x80, 0x80]));
 
-    assert_eq!("   \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
-    assert_eq!(
-        "   \u{1f496}",
-        format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21])
-    );
+    assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
 
     assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (UTF-8 null)
     assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
     assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
     assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
     assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
-    assert_eq!("   \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8   // spell-checker:ignore 10FFFD FFFD
+    assert_eq!(" 364", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8   // spell-checker:ignore 10FFFD FFFD
     assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
     assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
     assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8
diff --git a/tests/by-util/test_od.rs b/tests/by-util/test_od.rs
@@ -279,18 +279,19 @@ fn test_f64() {
 
 #[test]
 fn test_multibyte() {
+    let input = "’‐ˆ‘˜語🙂✅🐶𝛑Universität Tübingen \u{1B000}"; // spell-checker:disable-line
     new_ucmd!()
-        .arg("-c")
-        .arg("-w12")
-        .run_piped_stdin("Universität Tübingen \u{1B000}".as_bytes()) // spell-checker:disable-line
+        .args(&["-t", "c"])
+        .run_piped_stdin(input.as_bytes())
         .success()
         .no_stderr()
         .stdout_is(unindent(
-            "
-            0000000   U   n   i   v   e   r   s   i   t   ä  **   t
-            0000014       T   ü  **   b   i   n   g   e   n       \u{1B000}
-            0000030  **  **  **
-            0000033
+            r"
+            0000000 342 200 231 342 200 220 313 206 342 200 230 313 234 350 252 236
+            0000020 360 237 231 202 342 234 205 360 237 220 266 360 235 233 221   U
+            0000040   n   i   v   e   r   s   i   t 303 244   t       T 303 274   b
+            0000060   i   n   g   e   n     360 233 200 200
+            0000072
             ",
         ));
 }
@@ -714,10 +715,10 @@ fn test_ascii_dump() {
             r"
             0000000  00  01  0a  0d  10  1f  20  61  62  63  7d  7e  7f  80  90  a0  >...... abc}~....<
                     nul soh  nl  cr dle  us  sp   a   b   c   }   ~ del nul dle  sp
-                     \0 001  \n  \r 020 037       a   b   c   }   ~ 177  **  **  **  >...... abc}~....<
+                     \0 001  \n  \r 020 037       a   b   c   }   ~ 177 200 220 240  >...... abc}~....<
             0000020  b0  c0  d0  e0  f0  ff                                          >......<
                       0   @   P   `   p del
-                     ** 300 320 340 360 377                                          >......<
+                    260 300 320 340 360 377                                          >......<
             0000026
             ",
         ));