|
2 | 2 | //
|
3 | 3 | // For the full copyright and license information, please view the LICENSE
|
4 | 4 | // file that was distributed with this source code.
|
5 |
| -use std::str::from_utf8; |
6 | 5 |
|
7 | 6 | use crate::formatteriteminfo::{FormatWriter, FormatterItemInfo};
|
8 | 7 |
|
@@ -51,33 +50,13 @@ fn format_item_c(bytes: &[u8]) -> String {
|
51 | 50 | let b = bytes[0];
|
52 | 51 |
|
53 | 52 | if b & 0x80 == 0x00 {
|
| 53 | + // ASCII byte (0xxxxxxx) |
54 | 54 | match C_CHARS.get(b as usize) {
|
55 | 55 | Some(s) => format!("{s:>4}"),
|
56 | 56 | None => format!("{b:>4}"),
|
57 | 57 | }
|
58 |
| - } else if (b & 0xc0) == 0x80 { |
59 |
| - // second or subsequent octet of an utf-8 sequence |
60 |
| - String::from(" **") |
61 |
| - } else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) { |
62 |
| - // start of a 2 octet utf-8 sequence |
63 |
| - match from_utf8(&bytes[0..2]) { |
64 |
| - Ok(s) => format!("{s:>4}"), |
65 |
| - Err(_) => format!(" {b:03o}"), |
66 |
| - } |
67 |
| - } else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) { |
68 |
| - // start of a 3 octet utf-8 sequence |
69 |
| - match from_utf8(&bytes[0..3]) { |
70 |
| - Ok(s) => format!("{s:>4}"), |
71 |
| - Err(_) => format!(" {b:03o}"), |
72 |
| - } |
73 |
| - } else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) { |
74 |
| - // start of a 4 octet utf-8 sequence |
75 |
| - match from_utf8(&bytes[0..4]) { |
76 |
| - Ok(s) => format!("{s:>4}"), |
77 |
| - Err(_) => format!(" {b:03o}"), |
78 |
| - } |
79 | 58 | } else {
|
80 |
| - // invalid utf-8 |
| 59 | + // Continuation or leading byte of a multibyte UTF-8 sequence — treat as raw byte |
81 | 60 | format!(" {b:03o}")
|
82 | 61 | }
|
83 | 62 | }
|
@@ -125,27 +104,22 @@ fn test_format_item_c() {
|
125 | 104 | assert_eq!(" 177", format_item_c(&[0x7f]));
|
126 | 105 | assert_eq!(" A", format_item_c(&[0x41, 0x21]));
|
127 | 106 |
|
128 |
| - assert_eq!(" **", format_item_c(&[0x80])); |
129 |
| - assert_eq!(" **", format_item_c(&[0x9f])); |
| 107 | + assert_eq!(" 200", format_item_c(&[0x80])); |
| 108 | + assert_eq!(" 237", format_item_c(&[0x9f])); |
130 | 109 |
|
131 |
| - assert_eq!(" ß", format_item_c(&[0xc3, 0x9f])); |
132 |
| - assert_eq!(" ß", format_item_c(&[0xc3, 0x9f, 0x21])); |
| 110 | + assert_eq!(" 303", format_item_c(&[0xc3, 0x9f])); |
| 111 | + assert_eq!(" 303", format_item_c(&[0xc3, 0x9f, 0x21])); |
133 | 112 |
|
134 |
| - assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80])); |
135 |
| - assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21])); |
| 113 | + assert_eq!(" 341", format_item_c(&[0xe1, 0x80, 0x80])); |
136 | 114 |
|
137 |
| - assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96])); |
138 |
| - assert_eq!( |
139 |
| - " \u{1f496}", |
140 |
| - format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21]) |
141 |
| - ); |
| 115 | + assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92, 0x96])); |
142 | 116 |
|
143 | 117 | assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (UTF-8 null)
|
144 | 118 | assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
|
145 | 119 | assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
|
146 | 120 | assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
|
147 | 121 | assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
|
148 |
| - assert_eq!(" \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD |
| 122 | + assert_eq!(" 364", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD |
149 | 123 | assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
|
150 | 124 | assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
|
151 | 125 | assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8
|
|
0 commit comments