Skip to content

Commit b8228fb

Browse files
authored
Merge pull request #8241 from phinjensen/fold-non-utf8
fold: process streams as bytes, not strings, to handle non-utf8 data
2 parents b084bad + faa6a9b commit b8228fb

File tree

7 files changed

+104
-23
lines changed

src/uu/fold/src/fold.rs

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,17 @@
88
use clap::{Arg, ArgAction, Command};
99
use std::collections::HashMap;
1010
use std::fs::File;
11-
use std::io::{BufRead, BufReader, Read, stdin};
11+
use std::io::{BufRead, BufReader, Read, Write, stdin, stdout};
1212
use std::path::Path;
1313
use uucore::display::Quotable;
1414
use uucore::error::{FromIo, UResult, USimpleError};
1515
use uucore::format_usage;
1616
use uucore::locale::{get_message, get_message_with_args};
1717

1818
const TAB_WIDTH: usize = 8;
19+
const NL: u8 = b'\n';
20+
const CR: u8 = b'\r';
21+
const TAB: u8 = b'\t';
1922

2023
mod options {
2124
pub const BYTES: &str = "bytes";
@@ -141,18 +144,18 @@ fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResul
141144
///
142145
/// If `spaces` is `true`, attempt to break lines at whitespace boundaries.
143146
fn fold_file_bytewise<T: Read>(mut file: BufReader<T>, spaces: bool, width: usize) -> UResult<()> {
144-
let mut line = String::new();
147+
let mut line = Vec::new();
145148

146149
loop {
147150
if file
148-
.read_line(&mut line)
151+
.read_until(NL, &mut line)
149152
.map_err_context(|| get_message("fold-error-readline"))?
150153
== 0
151154
{
152155
break;
153156
}
154157

155-
if line == "\n" {
158+
if line == [NL] {
156159
println!();
157160
line.truncate(0);
158161
continue;
@@ -166,8 +169,13 @@ fn fold_file_bytewise<T: Read>(mut file: BufReader<T>, spaces: bool, width: usiz
166169
let slice = {
167170
let slice = &line[i..i + width];
168171
if spaces && i + width < len {
169-
match slice.rfind(|c: char| c.is_whitespace() && c != '\r') {
170-
Some(m) => &slice[..=m],
172+
match slice
173+
.iter()
174+
.enumerate()
175+
.rev()
176+
.find(|(_, c)| c.is_ascii_whitespace() && **c != CR)
177+
{
178+
Some((m, _)) => &slice[..=m],
171179
None => slice,
172180
}
173181
} else {
@@ -178,7 +186,7 @@ fn fold_file_bytewise<T: Read>(mut file: BufReader<T>, spaces: bool, width: usiz
178186
// Don't duplicate trailing newlines: if the slice is "\n", the
179187
// previous iteration folded just before the end of the line and
180188
// has already printed this newline.
181-
if slice == "\n" {
189+
if slice == [NL] {
182190
break;
183191
}
184192

@@ -187,9 +195,10 @@ fn fold_file_bytewise<T: Read>(mut file: BufReader<T>, spaces: bool, width: usiz
187195
let at_eol = i >= len;
188196

189197
if at_eol {
190-
print!("{slice}");
198+
stdout().write_all(slice)?;
191199
} else {
192-
println!("{slice}");
200+
stdout().write_all(slice)?;
201+
stdout().write_all(&[NL])?;
193202
}
194203
}
195204

@@ -209,8 +218,8 @@ fn fold_file_bytewise<T: Read>(mut file: BufReader<T>, spaces: bool, width: usiz
209218
#[allow(unused_assignments)]
210219
#[allow(clippy::cognitive_complexity)]
211220
fn fold_file<T: Read>(mut file: BufReader<T>, spaces: bool, width: usize) -> UResult<()> {
212-
let mut line = String::new();
213-
let mut output = String::new();
221+
let mut line = Vec::new();
222+
let mut output = Vec::new();
214223
let mut col_count = 0;
215224
let mut last_space = None;
216225

@@ -226,8 +235,9 @@ fn fold_file<T: Read>(mut file: BufReader<T>, spaces: bool, width: usize) -> URe
226235
None => output.len(),
227236
};
228237

229-
println!("{}", &output[..consume]);
230-
output.replace_range(..consume, "");
238+
stdout().write_all(&output[..consume])?;
239+
stdout().write_all(&[NL])?;
240+
output.drain(..consume);
231241

232242
// we know there are no tabs left in output, so each char counts
233243
// as 1 column
@@ -239,15 +249,15 @@ fn fold_file<T: Read>(mut file: BufReader<T>, spaces: bool, width: usize) -> URe
239249

240250
loop {
241251
if file
242-
.read_line(&mut line)
252+
.read_until(NL, &mut line)
243253
.map_err_context(|| get_message("fold-error-readline"))?
244254
== 0
245255
{
246256
break;
247257
}
248258

249-
for ch in line.chars() {
250-
if ch == '\n' {
259+
for ch in &line {
260+
if *ch == NL {
251261
// make sure to _not_ split output at whitespace, since we
252262
// know the entire output will fit
253263
last_space = None;
@@ -259,9 +269,9 @@ fn fold_file<T: Read>(mut file: BufReader<T>, spaces: bool, width: usize) -> URe
259269
emit_output!();
260270
}
261271

262-
match ch {
263-
'\r' => col_count = 0,
264-
'\t' => {
272+
match *ch {
273+
CR => col_count = 0,
274+
TAB => {
265275
let next_tab_stop = col_count + TAB_WIDTH - col_count % TAB_WIDTH;
266276

267277
if next_tab_stop > width && !output.is_empty() {
@@ -271,21 +281,21 @@ fn fold_file<T: Read>(mut file: BufReader<T>, spaces: bool, width: usize) -> URe
271281
col_count = next_tab_stop;
272282
last_space = if spaces { Some(output.len()) } else { None };
273283
}
274-
'\x08' => {
284+
0x08 => {
275285
col_count = col_count.saturating_sub(1);
276286
}
277-
_ if spaces && ch.is_whitespace() => {
287+
_ if spaces && ch.is_ascii_whitespace() => {
278288
last_space = Some(output.len());
279289
col_count += 1;
280290
}
281291
_ => col_count += 1,
282292
}
283293

284-
output.push(ch);
294+
output.push(*ch);
285295
}
286296

287297
if !output.is_empty() {
288-
print!("{output}");
298+
stdout().write_all(&output)?;
289299
output.truncate(0);
290300
}
291301

tests/by-util/test_fold.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,3 +554,30 @@ fn test_obsolete_syntax() {
554554
.succeeds()
555555
.stdout_is("test1\n \ntest2\n \ntest3\n \ntest4\n \ntest5\n \ntest6\n ");
556556
}
557+
#[test]
558+
fn test_byte_break_at_non_utf8_character() {
559+
new_ucmd!()
560+
.arg("-b")
561+
.arg("-s")
562+
.arg("-w")
563+
.arg("40")
564+
.arg("non_utf8.input")
565+
.succeeds()
566+
.stdout_is_fixture_bytes("non_utf8.expected");
567+
}
568+
#[test]
569+
fn test_tab_advances_at_non_utf8_character() {
570+
new_ucmd!()
571+
.arg("-w8")
572+
.arg("non_utf8_tab_stops.input")
573+
.succeeds()
574+
.stdout_is_fixture_bytes("non_utf8_tab_stops_w8.expected");
575+
}
576+
#[test]
577+
fn test_all_tab_advances_at_non_utf8_character() {
578+
new_ucmd!()
579+
.arg("-w16")
580+
.arg("non_utf8_tab_stops.input")
581+
.succeeds()
582+
.stdout_is_fixture_bytes("non_utf8_tab_stops_w16.expected");
583+
}

tests/fixtures/fold/non_utf8.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Alle Menschen sind frei und gleich an
2+
W�rde und Rechten geboren

tests/fixtures/fold/non_utf8.input

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Alle Menschen sind frei und gleich an W�rde und Rechten geboren

tests/fixtures/fold/non_utf8_tab_stops.input

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
��
3+
���
4+
����
5+
�����
6+
������
7+
�������
8+
��������
9+
���������
10+
�������� �
11+
�������� � �

tests/fixtures/fold/non_utf8_tab_stops_w16.expected

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
2+
��
3+
���
4+
����
5+
�����
6+
������
7+
�������
8+
��������
9+
���������
10+
��������
11+
12+
��������
13+
� �

tests/fixtures/fold/non_utf8_tab_stops_w8.expected

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
2+
��
3+
���
4+
����
5+
�����
6+
������
7+
�������
8+
��������
9+
��������
10+
11+
��������
12+
13+
14+
��������
15+
16+
17+

0 commit comments

Comments
 (0)