Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions docs/src/extensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,15 @@ See also comments under `printf` for formatting precision and differences.

`seq` provides `-t`/`--terminator` to set the terminator character.

## `sort`

When sorting with `-g`/`--general-numeric-sort`, arbitrary precision decimal numbers
are parsed and compared, unlike GNU coreutils that uses platform-specific long
double floating point numbers.

Extremely large or small values can still overflow or underflow to infinity or zero,
see note in `seq`.

## `ls`

GNU `ls` provides two ways to use a long listing format: `-l` and `--format=long`. We support a
Expand Down
1 change: 1 addition & 0 deletions fuzz/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/uu/sort/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# spell-checker:ignore bigdecimal

[package]
name = "uu_sort"
description = "sort ~ (uutils) sort input lines"
Expand All @@ -18,6 +20,7 @@ workspace = true
path = "src/sort.rs"

[dependencies]
bigdecimal = { workspace = true }
binary-heap-plus = { workspace = true }
clap = { workspace = true }
compare = { workspace = true }
Expand Down
8 changes: 5 additions & 3 deletions src/uu/sort/src/chunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ use memchr::memchr_iter;
use self_cell::self_cell;
use uucore::error::{UResult, USimpleError};

use crate::{GeneralF64ParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo};
use crate::{
GeneralBigDecimalParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo,
};

self_cell!(
/// The chunk that is passed around between threads.
Expand All @@ -41,7 +43,7 @@ pub struct ChunkContents<'a> {
pub struct LineData<'a> {
pub selections: Vec<&'a str>,
pub num_infos: Vec<NumInfo>,
pub parsed_floats: Vec<GeneralF64ParseResult>,
pub parsed_floats: Vec<GeneralBigDecimalParseResult>,
pub line_num_floats: Vec<Option<f64>>,
}

Expand Down Expand Up @@ -100,7 +102,7 @@ pub struct RecycledChunk {
lines: Vec<Line<'static>>,
selections: Vec<&'static str>,
num_infos: Vec<NumInfo>,
parsed_floats: Vec<GeneralF64ParseResult>,
parsed_floats: Vec<GeneralBigDecimalParseResult>,
line_num_floats: Vec<Option<f64>>,
buffer: Vec<u8>,
}
Expand Down
59 changes: 36 additions & 23 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html

// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal

mod check;
mod chunks;
Expand All @@ -17,6 +17,7 @@
mod numeric_str_cmp;
mod tmp_dir;

use bigdecimal::BigDecimal;
use chunks::LineData;
use clap::builder::ValueParser;
use clap::{Arg, ArgAction, Command};
Expand Down Expand Up @@ -44,7 +45,9 @@
use uucore::display::Quotable;
use uucore::error::{FromIo, strip_errno};
use uucore::error::{UError, UResult, USimpleError, UUsageError, set_exit_code};
use uucore::extendedbigdecimal::ExtendedBigDecimal;
use uucore::line_ending::LineEnding;
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
use uucore::parser::parse_size::{ParseSizeError, Parser};
use uucore::parser::shortcut_value_parser::ShortcutValueParser;
use uucore::version_cmp::version_cmp;
Expand Down Expand Up @@ -450,7 +453,7 @@
}
}
enum Selection<'a> {
AsF64(GeneralF64ParseResult),
AsBigDecimal(GeneralBigDecimalParseResult),
WithNumInfo(&'a str, NumInfo),
Str(&'a str),
}
Expand Down Expand Up @@ -492,7 +495,7 @@
.map(|selector| (selector, selector.get_selection(line, token_buffer)))
{
match selection {
Selection::AsF64(parsed_float) => line_data.parsed_floats.push(parsed_float),
Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
Selection::WithNumInfo(str, num_info) => {
line_data.num_infos.push(num_info);
line_data.selections.push(str);
Expand Down Expand Up @@ -904,8 +907,8 @@
range = &range[num_range];
Selection::WithNumInfo(range, info)
} else if self.settings.mode == SortMode::GeneralNumeric {
// Parse this number as f64, as this is the requirement for general numeric sorting.
Selection::AsF64(general_f64_parse(&range[get_leading_gen(range)]))
// Parse this number as BigDecimal, as this is the requirement for general numeric sorting.
Selection::AsBigDecimal(general_bd_parse(&range[get_leading_gen(range)]))
} else {
// This is not a numeric sort, so we don't need a NumCache.
Selection::Str(range)
Expand Down Expand Up @@ -1791,35 +1794,45 @@
leading_whitespace_len..input.len()
}

#[derive(Copy, Clone, PartialEq, PartialOrd, Debug)]
pub enum GeneralF64ParseResult {
#[derive(Clone, PartialEq, PartialOrd, Debug)]
pub enum GeneralBigDecimalParseResult {
Invalid,
NaN,
NegInfinity,
Number(f64),
Nan,
MinusInfinity,
Number(BigDecimal),
Infinity,
}

/// Parse the beginning string into a GeneralF64ParseResult.
/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats.
/// Parse the beginning string into a GeneralBigDecimalParseResult.
/// Using a GeneralBigDecimalParseResult instead of ExtendedBigDecimal is necessary to correctly order floats.
#[inline(always)]
fn general_f64_parse(a: &str) -> GeneralF64ParseResult {
// The actual behavior here relies on Rust's implementation of parsing floating points.
// For example "nan", "inf" (ignoring the case) and "infinity" are only parsed to floats starting from 1.53.
// TODO: Once our minimum supported Rust version is 1.53 or above, we should add tests for those cases.
match a.parse::<f64>() {
Ok(a) if a.is_nan() => GeneralF64ParseResult::NaN,
Ok(a) if a == f64::NEG_INFINITY => GeneralF64ParseResult::NegInfinity,
Ok(a) if a == f64::INFINITY => GeneralF64ParseResult::Infinity,
Ok(a) => GeneralF64ParseResult::Number(a),
Err(_) => GeneralF64ParseResult::Invalid,
fn general_bd_parse(a: &str) -> GeneralBigDecimalParseResult {
// Parse digits, and fold in recoverable errors
let ebd = match ExtendedBigDecimal::extended_parse(a) {
Err(ExtendedParserError::NotNumeric) => return GeneralBigDecimalParseResult::Invalid,
Err(ExtendedParserError::PartialMatch(ebd, _))
| Err(ExtendedParserError::Overflow(ebd))
| Err(ExtendedParserError::Underflow(ebd))

Check warning on line 1815 in src/uu/sort/src/sort.rs

View check run for this annotation

Codecov / codecov/patch

src/uu/sort/src/sort.rs#L1813-L1815

Added lines #L1813 - L1815 were not covered by tests
| Ok(ebd) => ebd,
};

match ebd {
ExtendedBigDecimal::BigDecimal(bd) => GeneralBigDecimalParseResult::Number(bd),
ExtendedBigDecimal::Infinity => GeneralBigDecimalParseResult::Infinity,
ExtendedBigDecimal::MinusInfinity => GeneralBigDecimalParseResult::MinusInfinity,
// Minus zero and zero are equal
ExtendedBigDecimal::MinusZero => GeneralBigDecimalParseResult::Number(0.into()),

Check warning on line 1824 in src/uu/sort/src/sort.rs

View check run for this annotation

Codecov / codecov/patch

src/uu/sort/src/sort.rs#L1824

Added line #L1824 was not covered by tests
ExtendedBigDecimal::Nan | ExtendedBigDecimal::MinusNan => GeneralBigDecimalParseResult::Nan,
}
}

/// Compares two floats, with errors and non-numerics assumed to be -inf.
/// Stops coercing at the first non-numeric char.
/// We explicitly need to convert to f64 in this case.
fn general_numeric_compare(a: &GeneralF64ParseResult, b: &GeneralF64ParseResult) -> Ordering {
fn general_numeric_compare(
a: &GeneralBigDecimalParseResult,
b: &GeneralBigDecimalParseResult,
) -> Ordering {
a.partial_cmp(b).unwrap()
}

Expand Down
61 changes: 45 additions & 16 deletions src/uucore/src/lib/features/parser/num_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,16 +67,41 @@ impl Base {
&self,
str: &'a str,
digits: Option<BigUint>,
) -> (Option<BigUint>, u64, &'a str) {
) -> (Option<BigUint>, i64, &'a str) {
let mut digits: Option<BigUint> = digits;
let mut count: u64 = 0;
let mut count: i64 = 0;
let mut rest = str;

// Doing operations on BigUint is really expensive, so we do as much as we
// can on u64, then add them to the BigUint.
let mut digits_tmp: u64 = 0;
let mut count_tmp: i64 = 0;
let mut mul_tmp: u64 = 1;
while let Some(d) = rest.chars().next().and_then(|c| self.digit(c)) {
(digits, count) = (
Some(digits.unwrap_or_default() * *self as u8 + d),
count + 1,
(digits_tmp, count_tmp, mul_tmp) = (
digits_tmp * *self as u64 + d,
count_tmp + 1,
mul_tmp * *self as u64,
);
rest = &rest[1..];
// In base 16, we parse 4 bits at a time, so we can parse 16 digits at most in a u64.
if count_tmp >= 15 {
// Accumulate what we have so far
(digits, count) = (
Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
count + count_tmp,
);
// Reset state
(digits_tmp, count_tmp, mul_tmp) = (0, 0, 1);
}
}

// Accumulate the leftovers (if any)
if mul_tmp > 1 {
(digits, count) = (
Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
count + count_tmp,
);
}
(digits, count, rest)
}
Expand Down Expand Up @@ -265,7 +290,7 @@ impl ExtendedParser for ExtendedBigDecimal {
}
}

fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, u64, &str) {
fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, i64, &str) {
// Parse the integral part of the number
let (digits, rest) = base.parse_digits(str);

Expand Down Expand Up @@ -447,7 +472,7 @@ fn construct_extended_big_decimal<'a>(
digits: BigUint,
negative: bool,
base: Base,
scale: u64,
scale: i64,
exponent: BigInt,
) -> Result<ExtendedBigDecimal, ExtendedParserError<'a, ExtendedBigDecimal>> {
if digits == BigUint::zero() {
Expand All @@ -465,16 +490,20 @@ fn construct_extended_big_decimal<'a>(
let bd = if scale == 0 && exponent.is_zero() {
BigDecimal::from_bigint(signed_digits, 0)
} else if base == Base::Decimal {
let new_scale = BigInt::from(scale) - exponent;

// BigDecimal "only" supports i64 scale.
// Note that new_scale is a negative exponent: large value causes an underflow, small value an overflow.
if new_scale > i64::MAX.into() {
return Err(make_error(false, negative));
} else if new_scale < i64::MIN.into() {
return Err(make_error(true, negative));
if exponent.is_zero() {
// Optimization: Converting scale to Bigint and back is relatively slow.
BigDecimal::from_bigint(signed_digits, scale)
} else {
let new_scale = -exponent + scale;

// BigDecimal "only" supports i64 scale.
// Note that new_scale is a negative exponent: large positive value causes an underflow, large negative values an overflow.
if let Some(new_scale) = new_scale.to_i64() {
BigDecimal::from_bigint(signed_digits, new_scale)
} else {
return Err(make_error(new_scale.is_negative(), negative));
}
}
BigDecimal::from_bigint(signed_digits, new_scale.to_i64().unwrap())
} else if base == Base::Hexadecimal {
// pow "only" supports u32 values, just error out if given more than 2**32 fractional digits.
if scale > u32::MAX.into() {
Expand Down
64 changes: 64 additions & 0 deletions tests/by-util/test_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1503,3 +1503,67 @@ fn test_files0_from_zero_length() {
.fails_with_code(2)
.stderr_only("sort: -:2: invalid zero-length file name\n");
}

#[test]
// Test for GNU tests/sort/sort-float.sh
fn test_g_float() {
let input = "0\n-3.3621031431120935063e-4932\n3.3621031431120935063e-4932\n";
let output = "-3.3621031431120935063e-4932\n0\n3.3621031431120935063e-4932\n";
new_ucmd!()
.args(&["-g"])
.pipe_in(input)
.succeeds()
.stdout_is(output);
}

#[test]
// Test misc numbers ("'a" is not interpreted as literal, trailing text is ignored...)
fn test_g_misc() {
let input = "1\n100\n90\n'a\n85hello\n";
let output = "'a\n1\n85hello\n90\n100\n";
new_ucmd!()
.args(&["-g"])
.pipe_in(input)
.succeeds()
.stdout_is(output);
}

#[test]
// Test numbers with a large number of digits, where only the last digit is different.
// We use scientific notation to make sure string sorting does not correctly order them.
fn test_g_arbitrary() {
let input = [
// GNU coreutils doesn't handle those correctly as they don't fit exactly in long double
"3",
"3.000000000000000000000000000000000000000000000000000000000000000004",
"0.3000000000000000000000000000000000000000000000000000000000000000002e1",
"0.03000000000000000000000000000000000000000000000000000000000000000003e2",
"0.003000000000000000000000000000000000000000000000000000000000000000001e3",
// GNU coreutils does handle those correctly though
"10",
"10.000000000000004",
"1.0000000000000002e1",
"0.10000000000000003e2",
"0.010000000000000001e3",
]
.join("\n");
let output = [
"3",
"0.003000000000000000000000000000000000000000000000000000000000000000001e3",
"0.3000000000000000000000000000000000000000000000000000000000000000002e1",
"0.03000000000000000000000000000000000000000000000000000000000000000003e2",
"3.000000000000000000000000000000000000000000000000000000000000000004",
"10",
"0.010000000000000001e3",
"1.0000000000000002e1",
"0.10000000000000003e2",
"10.000000000000004",
]
.join("\n")
+ "\n";
new_ucmd!()
.args(&["-g"])
.pipe_in(input)
.succeeds()
.stdout_is(output);
}
Loading