Skip to content

Commit 9e21259

Browse files
authored
Merge pull request #8062 from drinkcat/sort-float
sort: Make use of ExtendedBigDecimal in -g sorting, then attempt to recover some performance
2 parents 858db60 + 8c98f43 commit 9e21259

File tree

8 files changed

+164
-42
lines changed

8 files changed

+164
-42
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/src/extensions.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,15 @@ See also comments under `printf` for formatting precision and differences.
153153

154154
`seq` provides `-t`/`--terminator` to set the terminator character.
155155

156+
## `sort`
157+
158+
When sorting with `-g`/`--general-numeric-sort`, arbitrary precision decimal numbers
159+
are parsed and compared, unlike GNU coreutils that uses platform-specific long
160+
double floating point numbers.
161+
162+
Extremely large or small values can still overflow or underflow to infinity or zero,
163+
see note in `seq`.
164+
156165
## `ls`
157166

158167
GNU `ls` provides two ways to use a long listing format: `-l` and `--format=long`. We support a

fuzz/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/uu/sort/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# spell-checker:ignore bigdecimal
2+
13
[package]
24
name = "uu_sort"
35
description = "sort ~ (uutils) sort input lines"
@@ -18,6 +20,7 @@ workspace = true
1820
path = "src/sort.rs"
1921

2022
[dependencies]
23+
bigdecimal = { workspace = true }
2124
binary-heap-plus = { workspace = true }
2225
clap = { workspace = true }
2326
compare = { workspace = true }

src/uu/sort/src/chunks.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ use memchr::memchr_iter;
1717
use self_cell::self_cell;
1818
use uucore::error::{UResult, USimpleError};
1919

20-
use crate::{GeneralF64ParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo};
20+
use crate::{
21+
GeneralBigDecimalParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo,
22+
};
2123

2224
self_cell!(
2325
/// The chunk that is passed around between threads.
@@ -41,7 +43,7 @@ pub struct ChunkContents<'a> {
4143
pub struct LineData<'a> {
4244
pub selections: Vec<&'a str>,
4345
pub num_infos: Vec<NumInfo>,
44-
pub parsed_floats: Vec<GeneralF64ParseResult>,
46+
pub parsed_floats: Vec<GeneralBigDecimalParseResult>,
4547
pub line_num_floats: Vec<Option<f64>>,
4648
}
4749

@@ -100,7 +102,7 @@ pub struct RecycledChunk {
100102
lines: Vec<Line<'static>>,
101103
selections: Vec<&'static str>,
102104
num_infos: Vec<NumInfo>,
103-
parsed_floats: Vec<GeneralF64ParseResult>,
105+
parsed_floats: Vec<GeneralBigDecimalParseResult>,
104106
line_num_floats: Vec<Option<f64>>,
105107
buffer: Vec<u8>,
106108
}

src/uu/sort/src/sort.rs

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
88
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
99

10-
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim
10+
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal
1111

1212
mod check;
1313
mod chunks;
@@ -17,6 +17,7 @@ mod merge;
1717
mod numeric_str_cmp;
1818
mod tmp_dir;
1919

20+
use bigdecimal::BigDecimal;
2021
use chunks::LineData;
2122
use clap::builder::ValueParser;
2223
use clap::{Arg, ArgAction, Command};
@@ -44,7 +45,9 @@ use unicode_width::UnicodeWidthStr;
4445
use uucore::display::Quotable;
4546
use uucore::error::{FromIo, strip_errno};
4647
use uucore::error::{UError, UResult, USimpleError, UUsageError, set_exit_code};
48+
use uucore::extendedbigdecimal::ExtendedBigDecimal;
4749
use uucore::line_ending::LineEnding;
50+
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
4851
use uucore::parser::parse_size::{ParseSizeError, Parser};
4952
use uucore::parser::shortcut_value_parser::ShortcutValueParser;
5053
use uucore::version_cmp::version_cmp;
@@ -448,7 +451,7 @@ impl Default for KeySettings {
448451
}
449452
}
450453
enum Selection<'a> {
451-
AsF64(GeneralF64ParseResult),
454+
AsBigDecimal(GeneralBigDecimalParseResult),
452455
WithNumInfo(&'a str, NumInfo),
453456
Str(&'a str),
454457
}
@@ -490,7 +493,7 @@ impl<'a> Line<'a> {
490493
.map(|selector| (selector, selector.get_selection(line, token_buffer)))
491494
{
492495
match selection {
493-
Selection::AsF64(parsed_float) => line_data.parsed_floats.push(parsed_float),
496+
Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
494497
Selection::WithNumInfo(str, num_info) => {
495498
line_data.num_infos.push(num_info);
496499
line_data.selections.push(str);
@@ -902,8 +905,8 @@ impl FieldSelector {
902905
range = &range[num_range];
903906
Selection::WithNumInfo(range, info)
904907
} else if self.settings.mode == SortMode::GeneralNumeric {
905-
// Parse this number as f64, as this is the requirement for general numeric sorting.
906-
Selection::AsF64(general_f64_parse(&range[get_leading_gen(range)]))
908+
// Parse this number as BigDecimal, as this is the requirement for general numeric sorting.
909+
Selection::AsBigDecimal(general_bd_parse(&range[get_leading_gen(range)]))
907910
} else {
908911
// This is not a numeric sort, so we don't need a NumCache.
909912
Selection::Str(range)
@@ -1789,35 +1792,45 @@ fn get_leading_gen(input: &str) -> Range<usize> {
17891792
leading_whitespace_len..input.len()
17901793
}
17911794

1792-
#[derive(Copy, Clone, PartialEq, PartialOrd, Debug)]
1793-
pub enum GeneralF64ParseResult {
1795+
#[derive(Clone, PartialEq, PartialOrd, Debug)]
1796+
pub enum GeneralBigDecimalParseResult {
17941797
Invalid,
1795-
NaN,
1796-
NegInfinity,
1797-
Number(f64),
1798+
Nan,
1799+
MinusInfinity,
1800+
Number(BigDecimal),
17981801
Infinity,
17991802
}
18001803

1801-
/// Parse the beginning string into a GeneralF64ParseResult.
1802-
/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats.
1804+
/// Parse the beginning string into a GeneralBigDecimalParseResult.
1805+
/// Using a GeneralBigDecimalParseResult instead of ExtendedBigDecimal is necessary to correctly order floats.
18031806
#[inline(always)]
1804-
fn general_f64_parse(a: &str) -> GeneralF64ParseResult {
1805-
// The actual behavior here relies on Rust's implementation of parsing floating points.
1806-
// For example "nan", "inf" (ignoring the case) and "infinity" are only parsed to floats starting from 1.53.
1807-
// TODO: Once our minimum supported Rust version is 1.53 or above, we should add tests for those cases.
1808-
match a.parse::<f64>() {
1809-
Ok(a) if a.is_nan() => GeneralF64ParseResult::NaN,
1810-
Ok(a) if a == f64::NEG_INFINITY => GeneralF64ParseResult::NegInfinity,
1811-
Ok(a) if a == f64::INFINITY => GeneralF64ParseResult::Infinity,
1812-
Ok(a) => GeneralF64ParseResult::Number(a),
1813-
Err(_) => GeneralF64ParseResult::Invalid,
1807+
fn general_bd_parse(a: &str) -> GeneralBigDecimalParseResult {
1808+
// Parse digits, and fold in recoverable errors
1809+
let ebd = match ExtendedBigDecimal::extended_parse(a) {
1810+
Err(ExtendedParserError::NotNumeric) => return GeneralBigDecimalParseResult::Invalid,
1811+
Err(ExtendedParserError::PartialMatch(ebd, _))
1812+
| Err(ExtendedParserError::Overflow(ebd))
1813+
| Err(ExtendedParserError::Underflow(ebd))
1814+
| Ok(ebd) => ebd,
1815+
};
1816+
1817+
match ebd {
1818+
ExtendedBigDecimal::BigDecimal(bd) => GeneralBigDecimalParseResult::Number(bd),
1819+
ExtendedBigDecimal::Infinity => GeneralBigDecimalParseResult::Infinity,
1820+
ExtendedBigDecimal::MinusInfinity => GeneralBigDecimalParseResult::MinusInfinity,
1821+
// Minus zero and zero are equal
1822+
ExtendedBigDecimal::MinusZero => GeneralBigDecimalParseResult::Number(0.into()),
1823+
ExtendedBigDecimal::Nan | ExtendedBigDecimal::MinusNan => GeneralBigDecimalParseResult::Nan,
18141824
}
18151825
}
18161826

18171827
/// Compares two floats, with errors and non-numerics assumed to be -inf.
18181828
/// Stops coercing at the first non-numeric char.
18191829
/// We explicitly need to convert to f64 in this case.
1820-
fn general_numeric_compare(a: &GeneralF64ParseResult, b: &GeneralF64ParseResult) -> Ordering {
1830+
fn general_numeric_compare(
1831+
a: &GeneralBigDecimalParseResult,
1832+
b: &GeneralBigDecimalParseResult,
1833+
) -> Ordering {
18211834
a.partial_cmp(b).unwrap()
18221835
}
18231836

src/uucore/src/lib/features/parser/num_parser.rs

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,41 @@ impl Base {
6767
&self,
6868
str: &'a str,
6969
digits: Option<BigUint>,
70-
) -> (Option<BigUint>, u64, &'a str) {
70+
) -> (Option<BigUint>, i64, &'a str) {
7171
let mut digits: Option<BigUint> = digits;
72-
let mut count: u64 = 0;
72+
let mut count: i64 = 0;
7373
let mut rest = str;
74+
75+
// Doing operations on BigUint is really expensive, so we do as much as we
76+
// can on u64, then add them to the BigUint.
77+
let mut digits_tmp: u64 = 0;
78+
let mut count_tmp: i64 = 0;
79+
let mut mul_tmp: u64 = 1;
7480
while let Some(d) = rest.chars().next().and_then(|c| self.digit(c)) {
75-
(digits, count) = (
76-
Some(digits.unwrap_or_default() * *self as u8 + d),
77-
count + 1,
81+
(digits_tmp, count_tmp, mul_tmp) = (
82+
digits_tmp * *self as u64 + d,
83+
count_tmp + 1,
84+
mul_tmp * *self as u64,
7885
);
7986
rest = &rest[1..];
87+
// In base 16, we parse 4 bits at a time, so we can parse 16 digits at most in a u64.
88+
if count_tmp >= 15 {
89+
// Accumulate what we have so far
90+
(digits, count) = (
91+
Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
92+
count + count_tmp,
93+
);
94+
// Reset state
95+
(digits_tmp, count_tmp, mul_tmp) = (0, 0, 1);
96+
}
97+
}
98+
99+
// Accumulate the leftovers (if any)
100+
if mul_tmp > 1 {
101+
(digits, count) = (
102+
Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
103+
count + count_tmp,
104+
);
80105
}
81106
(digits, count, rest)
82107
}
@@ -265,7 +290,7 @@ impl ExtendedParser for ExtendedBigDecimal {
265290
}
266291
}
267292

268-
fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, u64, &str) {
293+
fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, i64, &str) {
269294
// Parse the integral part of the number
270295
let (digits, rest) = base.parse_digits(str);
271296

@@ -447,7 +472,7 @@ fn construct_extended_big_decimal<'a>(
447472
digits: BigUint,
448473
negative: bool,
449474
base: Base,
450-
scale: u64,
475+
scale: i64,
451476
exponent: BigInt,
452477
) -> Result<ExtendedBigDecimal, ExtendedParserError<'a, ExtendedBigDecimal>> {
453478
if digits == BigUint::zero() {
@@ -465,16 +490,20 @@ fn construct_extended_big_decimal<'a>(
465490
let bd = if scale == 0 && exponent.is_zero() {
466491
BigDecimal::from_bigint(signed_digits, 0)
467492
} else if base == Base::Decimal {
468-
let new_scale = BigInt::from(scale) - exponent;
469-
470-
// BigDecimal "only" supports i64 scale.
471-
// Note that new_scale is a negative exponent: large value causes an underflow, small value an overflow.
472-
if new_scale > i64::MAX.into() {
473-
return Err(make_error(false, negative));
474-
} else if new_scale < i64::MIN.into() {
475-
return Err(make_error(true, negative));
493+
if exponent.is_zero() {
494+
// Optimization: Converting scale to Bigint and back is relatively slow.
495+
BigDecimal::from_bigint(signed_digits, scale)
496+
} else {
497+
let new_scale = -exponent + scale;
498+
499+
// BigDecimal "only" supports i64 scale.
500+
// Note that new_scale is a negative exponent: large positive value causes an underflow, large negative values an overflow.
501+
if let Some(new_scale) = new_scale.to_i64() {
502+
BigDecimal::from_bigint(signed_digits, new_scale)
503+
} else {
504+
return Err(make_error(new_scale.is_negative(), negative));
505+
}
476506
}
477-
BigDecimal::from_bigint(signed_digits, new_scale.to_i64().unwrap())
478507
} else if base == Base::Hexadecimal {
479508
// pow "only" supports u32 values, just error out if given more than 2**32 fractional digits.
480509
if scale > u32::MAX.into() {

tests/by-util/test_sort.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1503,3 +1503,67 @@ fn test_files0_from_zero_length() {
15031503
.fails_with_code(2)
15041504
.stderr_only("sort: -:2: invalid zero-length file name\n");
15051505
}
1506+
1507+
#[test]
1508+
// Test for GNU tests/sort/sort-float.sh
1509+
fn test_g_float() {
1510+
let input = "0\n-3.3621031431120935063e-4932\n3.3621031431120935063e-4932\n";
1511+
let output = "-3.3621031431120935063e-4932\n0\n3.3621031431120935063e-4932\n";
1512+
new_ucmd!()
1513+
.args(&["-g"])
1514+
.pipe_in(input)
1515+
.succeeds()
1516+
.stdout_is(output);
1517+
}
1518+
1519+
#[test]
1520+
// Test misc numbers ("'a" is not interpreted as literal, trailing text is ignored...)
1521+
fn test_g_misc() {
1522+
let input = "1\n100\n90\n'a\n85hello\n";
1523+
let output = "'a\n1\n85hello\n90\n100\n";
1524+
new_ucmd!()
1525+
.args(&["-g"])
1526+
.pipe_in(input)
1527+
.succeeds()
1528+
.stdout_is(output);
1529+
}
1530+
1531+
#[test]
1532+
// Test numbers with a large number of digits, where only the last digit is different.
1533+
// We use scientific notation to make sure string sorting does not correctly order them.
1534+
fn test_g_arbitrary() {
1535+
let input = [
1536+
// GNU coreutils doesn't handle those correctly as they don't fit exactly in long double
1537+
"3",
1538+
"3.000000000000000000000000000000000000000000000000000000000000000004",
1539+
"0.3000000000000000000000000000000000000000000000000000000000000000002e1",
1540+
"0.03000000000000000000000000000000000000000000000000000000000000000003e2",
1541+
"0.003000000000000000000000000000000000000000000000000000000000000000001e3",
1542+
// GNU coreutils does handle those correctly though
1543+
"10",
1544+
"10.000000000000004",
1545+
"1.0000000000000002e1",
1546+
"0.10000000000000003e2",
1547+
"0.010000000000000001e3",
1548+
]
1549+
.join("\n");
1550+
let output = [
1551+
"3",
1552+
"0.003000000000000000000000000000000000000000000000000000000000000000001e3",
1553+
"0.3000000000000000000000000000000000000000000000000000000000000000002e1",
1554+
"0.03000000000000000000000000000000000000000000000000000000000000000003e2",
1555+
"3.000000000000000000000000000000000000000000000000000000000000000004",
1556+
"10",
1557+
"0.010000000000000001e3",
1558+
"1.0000000000000002e1",
1559+
"0.10000000000000003e2",
1560+
"10.000000000000004",
1561+
]
1562+
.join("\n")
1563+
+ "\n";
1564+
new_ucmd!()
1565+
.args(&["-g"])
1566+
.pipe_in(input)
1567+
.succeeds()
1568+
.stdout_is(output);
1569+
}

0 commit comments

Comments
 (0)