Skip to content

Commit dcc5492

Browse files
authored
Merge pull request #8268 from tgrez/main
od: fix handling of non-ascii chars
2 parents fb2399f + cad9224 commit dcc5492

File tree

2 files changed: 20 additions (+20) and 45 deletions (-45)

src/uu/od/src/prn_char.rs

Lines changed: 9 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
//
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
5-
use std::str::from_utf8;
65

76
use crate::formatteriteminfo::{FormatWriter, FormatterItemInfo};
87

@@ -51,33 +50,13 @@ fn format_item_c(bytes: &[u8]) -> String {
5150
let b = bytes[0];
5251

5352
if b & 0x80 == 0x00 {
53+
// ASCII byte (0xxxxxxx)
5454
match C_CHARS.get(b as usize) {
5555
Some(s) => format!("{s:>4}"),
5656
None => format!("{b:>4}"),
5757
}
58-
} else if (b & 0xc0) == 0x80 {
59-
// second or subsequent octet of an utf-8 sequence
60-
String::from(" **")
61-
} else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) {
62-
// start of a 2 octet utf-8 sequence
63-
match from_utf8(&bytes[0..2]) {
64-
Ok(s) => format!("{s:>4}"),
65-
Err(_) => format!(" {b:03o}"),
66-
}
67-
} else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) {
68-
// start of a 3 octet utf-8 sequence
69-
match from_utf8(&bytes[0..3]) {
70-
Ok(s) => format!("{s:>4}"),
71-
Err(_) => format!(" {b:03o}"),
72-
}
73-
} else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) {
74-
// start of a 4 octet utf-8 sequence
75-
match from_utf8(&bytes[0..4]) {
76-
Ok(s) => format!("{s:>4}"),
77-
Err(_) => format!(" {b:03o}"),
78-
}
7958
} else {
80-
// invalid utf-8
59+
// Continuation or leading byte of a multibyte UTF-8 sequence — treat as raw byte
8160
format!(" {b:03o}")
8261
}
8362
}
@@ -125,27 +104,22 @@ fn test_format_item_c() {
125104
assert_eq!(" 177", format_item_c(&[0x7f]));
126105
assert_eq!(" A", format_item_c(&[0x41, 0x21]));
127106

128-
assert_eq!(" **", format_item_c(&[0x80]));
129-
assert_eq!(" **", format_item_c(&[0x9f]));
107+
assert_eq!(" 200", format_item_c(&[0x80]));
108+
assert_eq!(" 237", format_item_c(&[0x9f]));
130109

131-
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f]));
132-
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f, 0x21]));
110+
assert_eq!(" 303", format_item_c(&[0xc3, 0x9f]));
111+
assert_eq!(" 303", format_item_c(&[0xc3, 0x9f, 0x21]));
133112

134-
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80]));
135-
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21]));
113+
assert_eq!(" 341", format_item_c(&[0xe1, 0x80, 0x80]));
136114

137-
assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
138-
assert_eq!(
139-
" \u{1f496}",
140-
format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21])
141-
);
115+
assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
142116

143117
assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (UTF-8 null)
144118
assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
145119
assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
146120
assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
147121
assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
148-
assert_eq!(" \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD
122+
assert_eq!(" 364", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD
149123
assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
150124
assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
151125
assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8

tests/by-util/test_od.rs

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -279,18 +279,19 @@ fn test_f64() {
279279

280280
#[test]
281281
fn test_multibyte() {
282+
let input = "’‐ˆ‘˜語🙂✅🐶𝛑Universität Tübingen \u{1B000}"; // spell-checker:disable-line
282283
new_ucmd!()
283-
.arg("-c")
284-
.arg("-w12")
285-
.run_piped_stdin("Universität Tübingen \u{1B000}".as_bytes()) // spell-checker:disable-line
284+
.args(&["-t", "c"])
285+
.run_piped_stdin(input.as_bytes())
286286
.success()
287287
.no_stderr()
288288
.stdout_is(unindent(
289-
"
290-
0000000 U n i v e r s i t ä ** t
291-
0000014 T ü ** b i n g e n \u{1B000}
292-
0000030 ** ** **
293-
0000033
289+
r"
290+
0000000 342 200 231 342 200 220 313 206 342 200 230 313 234 350 252 236
291+
0000020 360 237 231 202 342 234 205 360 237 220 266 360 235 233 221 U
292+
0000040 n i v e r s i t 303 244 t T 303 274 b
293+
0000060 i n g e n 360 233 200 200
294+
0000072
294295
",
295296
));
296297
}
@@ -714,10 +715,10 @@ fn test_ascii_dump() {
714715
r"
715716
0000000 00 01 0a 0d 10 1f 20 61 62 63 7d 7e 7f 80 90 a0 >...... abc}~....<
716717
nul soh nl cr dle us sp a b c } ~ del nul dle sp
717-
\0 001 \n \r 020 037 a b c } ~ 177 ** ** ** >...... abc}~....<
718+
\0 001 \n \r 020 037 a b c } ~ 177 200 220 240 >...... abc}~....<
718719
0000020 b0 c0 d0 e0 f0 ff >......<
719720
0 @ P ` p del
720-
** 300 320 340 360 377 >......<
721+
260 300 320 340 360 377 >......<
721722
0000026
722723
",
723724
));

0 commit comments

Comments
 (0)