Skip to content

Commit 1bb7930

Browse files
authored
Merge pull request #8329 from drinkcat/printf-7209-update
printf: accept non-UTF-8 input in FORMAT and ARGUMENT arguments
2 parents 50704da + ccad817 commit 1bb7930

File tree

11 files changed

+510
-290
lines changed

11 files changed

+510
-290
lines changed

src/uu/echo/src/echo.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@ use clap::{Arg, ArgAction, Command};
88
use std::env;
99
use std::ffi::{OsStr, OsString};
1010
use std::io::{self, StdoutLock, Write};
11-
use uucore::error::{UResult, USimpleError};
11+
use uucore::error::UResult;
1212
use uucore::format::{FormatChar, OctalParsing, parse_escape_only};
13-
use uucore::format_usage;
14-
use uucore::os_str_as_bytes;
13+
use uucore::{format_usage, os_str_as_bytes};
1514

1615
use uucore::locale::get_message;
1716

@@ -223,9 +222,9 @@ pub fn uu_app() -> Command {
223222

224223
fn execute(stdout: &mut StdoutLock, args: Vec<OsString>, options: Options) -> UResult<()> {
225224
for (i, arg) in args.into_iter().enumerate() {
226-
let bytes = os_str_as_bytes(arg.as_os_str())
227-
.map_err(|_| USimpleError::new(1, get_message("echo-error-non-utf8")))?;
225+
let bytes = os_str_as_bytes(&arg)?;
228226

227+
// Don't print a space before the first argument
229228
if i > 0 {
230229
stdout.write_all(b" ")?;
231230
}

src/uu/printf/src/printf.rs

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
// file that was distributed with this source code.
55
use clap::{Arg, ArgAction, Command};
66
use std::collections::HashMap;
7+
use std::ffi::OsString;
78
use std::io::stdout;
89
use std::ops::ControlFlow;
910
use uucore::error::{UResult, UUsageError};
@@ -18,21 +19,19 @@ mod options {
1819
pub const FORMAT: &str = "FORMAT";
1920
pub const ARGUMENT: &str = "ARGUMENT";
2021
}
22+
2123
#[uucore::main]
2224
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
2325
let matches = uu_app().get_matches_from(args);
2426

2527
let format = matches
26-
.get_one::<std::ffi::OsString>(options::FORMAT)
28+
.get_one::<OsString>(options::FORMAT)
2729
.ok_or_else(|| UUsageError::new(1, get_message("printf-error-missing-operand")))?;
2830
let format = os_str_as_bytes(format)?;
2931

30-
let values: Vec<_> = match matches.get_many::<std::ffi::OsString>(options::ARGUMENT) {
31-
// FIXME: use os_str_as_bytes once FormatArgument supports Vec<u8>
32+
let values: Vec<_> = match matches.get_many::<OsString>(options::ARGUMENT) {
3233
Some(s) => s
33-
.map(|os_string| {
34-
FormatArgument::Unparsed(std::ffi::OsStr::to_string_lossy(os_string).to_string())
35-
})
34+
.map(|os_string| FormatArgument::Unparsed(os_string.to_owned()))
3635
.collect(),
3736
None => vec![],
3837
};
@@ -62,7 +61,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
6261
"{}",
6362
get_message_with_args(
6463
"printf-warning-ignoring-excess-arguments",
65-
HashMap::from([("arg".to_string(), arg_str.to_string())])
64+
HashMap::from([("arg".to_string(), arg_str.to_string_lossy().to_string())])
6665
)
6766
);
6867
}
@@ -103,10 +102,10 @@ pub fn uu_app() -> Command {
103102
.help(get_message("printf-help-version"))
104103
.action(ArgAction::Version),
105104
)
106-
.arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(std::ffi::OsString)))
105+
.arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(OsString)))
107106
.arg(
108107
Arg::new(options::ARGUMENT)
109108
.action(ArgAction::Append)
110-
.value_parser(clap::value_parser!(std::ffi::OsString)),
109+
.value_parser(clap::value_parser!(OsString)),
111110
)
112111
}

src/uucore/src/lib/features/checksum.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -968,7 +968,7 @@ fn process_checksum_line(
968968
cached_line_format: &mut Option<LineFormat>,
969969
last_algo: &mut Option<String>,
970970
) -> Result<(), LineCheckError> {
971-
let line_bytes = os_str_as_bytes(line)?;
971+
let line_bytes = os_str_as_bytes(line).map_err(|e| LineCheckError::UError(Box::new(e)))?;
972972

973973
// Early return on empty or commented lines.
974974
if line.is_empty() || line_bytes.starts_with(b"#") {

src/uucore/src/lib/features/extendedbigdecimal.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,18 @@ impl From<f64> for ExtendedBigDecimal {
101101
}
102102
}
103103

104+
impl From<u8> for ExtendedBigDecimal {
105+
fn from(val: u8) -> Self {
106+
Self::BigDecimal(val.into())
107+
}
108+
}
109+
110+
impl From<u32> for ExtendedBigDecimal {
111+
fn from(val: u32) -> Self {
112+
Self::BigDecimal(val.into())
113+
}
114+
}
115+
104116
impl ExtendedBigDecimal {
105117
pub fn zero() -> Self {
106118
Self::BigDecimal(0.into())

src/uucore/src/lib/features/format/argument.rs

Lines changed: 105 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,16 @@ use super::ExtendedBigDecimal;
77
use crate::format::spec::ArgumentLocation;
88
use crate::{
99
error::set_exit_code,
10+
os_str_as_bytes,
1011
parser::num_parser::{ExtendedParser, ExtendedParserError},
1112
quoting_style::{QuotingStyle, locale_aware_escape_name},
1213
show_error, show_warning,
1314
};
1415
use os_display::Quotable;
15-
use std::{ffi::OsStr, num::NonZero};
16+
use std::{
17+
ffi::{OsStr, OsString},
18+
num::NonZero,
19+
};
1620

1721
/// An argument for formatting
1822
///
@@ -24,12 +28,12 @@ use std::{ffi::OsStr, num::NonZero};
2428
#[derive(Clone, Debug, PartialEq)]
2529
pub enum FormatArgument {
2630
Char(char),
27-
String(String),
31+
String(OsString),
2832
UnsignedInt(u64),
2933
SignedInt(i64),
3034
Float(ExtendedBigDecimal),
3135
/// Special argument that gets coerced into the other variants
32-
Unparsed(String),
36+
Unparsed(OsString),
3337
}
3438

3539
/// A struct that holds a slice of format arguments and provides methods to access them
@@ -72,62 +76,115 @@ impl<'a> FormatArguments<'a> {
7276
pub fn next_char(&mut self, position: &ArgumentLocation) -> u8 {
7377
match self.next_arg(position) {
7478
Some(FormatArgument::Char(c)) => *c as u8,
75-
Some(FormatArgument::Unparsed(s)) => s.bytes().next().unwrap_or(b'\0'),
79+
Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes(os) {
80+
Ok(bytes) => bytes.first().copied().unwrap_or(b'\0'),
81+
Err(_) => b'\0',
82+
},
7683
_ => b'\0',
7784
}
7885
}
7986

80-
pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a str {
87+
pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a OsStr {
8188
match self.next_arg(position) {
82-
Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s,
83-
_ => "",
89+
Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os,
90+
_ => "".as_ref(),
8491
}
8592
}
8693

8794
pub fn next_i64(&mut self, position: &ArgumentLocation) -> i64 {
8895
match self.next_arg(position) {
8996
Some(FormatArgument::SignedInt(n)) => *n,
90-
Some(FormatArgument::Unparsed(s)) => extract_value(i64::extended_parse(s), s),
97+
Some(FormatArgument::Unparsed(os)) => Self::get_num::<i64>(os),
9198
_ => 0,
9299
}
93100
}
94101

95102
pub fn next_u64(&mut self, position: &ArgumentLocation) -> u64 {
96103
match self.next_arg(position) {
97104
Some(FormatArgument::UnsignedInt(n)) => *n,
98-
Some(FormatArgument::Unparsed(s)) => {
99-
// Check if the string is a character literal enclosed in quotes
100-
if s.starts_with(['"', '\'']) {
101-
// Extract the content between the quotes safely using chars
102-
let mut chars = s.trim_matches(|c| c == '"' || c == '\'').chars();
103-
if let Some(first_char) = chars.next() {
104-
if chars.clone().count() > 0 {
105-
// Emit a warning if there are additional characters
106-
let remaining: String = chars.collect();
107-
show_warning!(
108-
"{remaining}: character(s) following character constant have been ignored"
109-
);
110-
}
111-
return first_char as u64; // Use only the first character
112-
}
113-
return 0; // Empty quotes
114-
}
115-
extract_value(u64::extended_parse(s), s)
116-
}
105+
Some(FormatArgument::Unparsed(os)) => Self::get_num::<u64>(os),
117106
_ => 0,
118107
}
119108
}
120109

121110
pub fn next_extended_big_decimal(&mut self, position: &ArgumentLocation) -> ExtendedBigDecimal {
122111
match self.next_arg(position) {
123112
Some(FormatArgument::Float(n)) => n.clone(),
124-
Some(FormatArgument::Unparsed(s)) => {
125-
extract_value(ExtendedBigDecimal::extended_parse(s), s)
126-
}
113+
Some(FormatArgument::Unparsed(os)) => Self::get_num::<ExtendedBigDecimal>(os),
127114
_ => ExtendedBigDecimal::zero(),
128115
}
129116
}
130117

118+
// Parse an OsStr that we know to start with a '/"
119+
fn parse_quote_start<T>(os: &OsStr) -> Result<T, ExtendedParserError<T>>
120+
where
121+
T: ExtendedParser + From<u8> + From<u32> + Default,
122+
{
123+
// If this fails (this can only happens on Windows), then just
124+
// return NotNumeric.
125+
let s = match os_str_as_bytes(os) {
126+
Ok(s) => s,
127+
Err(_) => return Err(ExtendedParserError::NotNumeric),
128+
};
129+
130+
let bytes = match s.split_first() {
131+
Some((b'"', bytes)) | Some((b'\'', bytes)) => bytes,
132+
_ => {
133+
// This really can't happen, the string we are given must start with '/".
134+
debug_assert!(false);
135+
return Err(ExtendedParserError::NotNumeric);
136+
}
137+
};
138+
139+
if bytes.is_empty() {
140+
return Err(ExtendedParserError::NotNumeric);
141+
}
142+
143+
let (val, len) = if let Some(c) = bytes
144+
.utf8_chunks()
145+
.next()
146+
.expect("bytes should not be empty")
147+
.valid()
148+
.chars()
149+
.next()
150+
{
151+
// Valid UTF-8 character, cast the codepoint to u32 then T
152+
// (largest unicode codepoint is only 3 bytes, so this is safe)
153+
((c as u32).into(), c.len_utf8())
154+
} else {
155+
// Not a valid UTF-8 character, use the first byte
156+
(bytes[0].into(), 1)
157+
};
158+
// Emit a warning if there are additional characters
159+
if bytes.len() > len {
160+
return Err(ExtendedParserError::PartialMatch(
161+
val,
162+
String::from_utf8_lossy(&bytes[len..]).into_owned(),
163+
));
164+
}
165+
166+
Ok(val)
167+
}
168+
169+
fn get_num<T>(os: &OsStr) -> T
170+
where
171+
T: ExtendedParser + From<u8> + From<u32> + Default,
172+
{
173+
let s = os.to_string_lossy();
174+
let first = s.as_bytes().first().copied();
175+
176+
let quote_start = first == Some(b'"') || first == Some(b'\'');
177+
let parsed = if quote_start {
178+
// The string begins with a quote
179+
Self::parse_quote_start(os)
180+
} else {
181+
T::extended_parse(&s)
182+
};
183+
184+
// Get the best possible value, even if parsed was an error.
185+
extract_value(parsed, &s, quote_start)
186+
}
187+
131188
fn get_at_relative_position(&mut self, pos: NonZero<usize>) -> Option<&'a FormatArgument> {
132189
let pos: usize = pos.into();
133190
let pos = (pos - 1).saturating_add(self.current_offset);
@@ -147,7 +204,11 @@ impl<'a> FormatArguments<'a> {
147204
}
148205
}
149206

150-
fn extract_value<T: Default>(p: Result<T, ExtendedParserError<'_, T>>, input: &str) -> T {
207+
fn extract_value<T: Default>(
208+
p: Result<T, ExtendedParserError<T>>,
209+
input: &str,
210+
quote_start: bool,
211+
) -> T {
151212
match p {
152213
Ok(v) => v,
153214
Err(e) => {
@@ -167,14 +228,15 @@ fn extract_value<T: Default>(p: Result<T, ExtendedParserError<'_, T>>, input: &s
167228
Default::default()
168229
}
169230
ExtendedParserError::PartialMatch(v, rest) => {
170-
let bytes = input.as_encoded_bytes();
171-
if !bytes.is_empty() && (bytes[0] == b'\'' || bytes[0] == b'"') {
231+
if quote_start {
232+
set_exit_code(0);
172233
show_warning!(
173234
"{rest}: character(s) following character constant have been ignored"
174235
);
175236
} else {
176237
show_error!("{}: value not completely converted", input.quote());
177238
}
239+
178240
v
179241
}
180242
}
@@ -249,11 +311,11 @@ mod tests {
249311
// Test with different method types in sequence
250312
let args = [
251313
FormatArgument::Char('a'),
252-
FormatArgument::String("hello".to_string()),
253-
FormatArgument::Unparsed("123".to_string()),
254-
FormatArgument::String("world".to_string()),
314+
FormatArgument::String("hello".into()),
315+
FormatArgument::Unparsed("123".into()),
316+
FormatArgument::String("world".into()),
255317
FormatArgument::Char('z'),
256-
FormatArgument::String("test".to_string()),
318+
FormatArgument::String("test".into()),
257319
];
258320
let mut args = FormatArguments::new(&args);
259321

@@ -384,10 +446,10 @@ mod tests {
384446
fn test_unparsed_arguments() {
385447
// Test with unparsed arguments that get coerced
386448
let args = [
387-
FormatArgument::Unparsed("hello".to_string()),
388-
FormatArgument::Unparsed("123".to_string()),
389-
FormatArgument::Unparsed("hello".to_string()),
390-
FormatArgument::Unparsed("456".to_string()),
449+
FormatArgument::Unparsed("hello".into()),
450+
FormatArgument::Unparsed("123".into()),
451+
FormatArgument::Unparsed("hello".into()),
452+
FormatArgument::Unparsed("456".into()),
391453
];
392454
let mut args = FormatArguments::new(&args);
393455

@@ -409,10 +471,10 @@ mod tests {
409471
// Test with mixed types and positional access
410472
let args = [
411473
FormatArgument::Char('a'),
412-
FormatArgument::String("test".to_string()),
474+
FormatArgument::String("test".into()),
413475
FormatArgument::UnsignedInt(42),
414476
FormatArgument::Char('b'),
415-
FormatArgument::String("more".to_string()),
477+
FormatArgument::String("more".into()),
416478
FormatArgument::UnsignedInt(99),
417479
];
418480
let mut args = FormatArguments::new(&args);

0 commit comments

Comments
 (0)