Merge pull request #8062 from drinkcat/sort-float

sylvestre · web-flow · commit 9e21259e2d62 · 2025-06-06T12:01:04.000+02:00
sort: Make use of ExtendedBigDecimal in -g sorting, then attempt to recover some performance
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/docs/src/extensions.md b/docs/src/extensions.md
@@ -153,6 +153,15 @@ See also comments under `printf` for formatting precision and differences.
 
 `seq` provides `-t`/`--terminator` to set the terminator character.
 
+## `sort`
+
+When sorting with `-g`/`--general-numeric-sort`, arbitrary precision decimal numbers
+are parsed and compared, unlike GNU coreutils that uses platform-specific long
+double floating point numbers.
+
+Extremely large or small values can still overflow or underflow to infinity or zero,
+see note in `seq`.
+
 ## `ls`
 
 GNU `ls` provides two ways to use a long listing format: `-l` and `--format=long`. We support a
diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock
diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml
@@ -1,3 +1,5 @@
+# spell-checker:ignore bigdecimal
+
 [package]
 name = "uu_sort"
 description = "sort ~ (uutils) sort input lines"
@@ -18,6 +20,7 @@ workspace = true
 path = "src/sort.rs"
 
 [dependencies]
+bigdecimal = { workspace = true }
 binary-heap-plus = { workspace = true }
 clap = { workspace = true }
 compare = { workspace = true }
diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs
@@ -17,7 +17,9 @@ use memchr::memchr_iter;
 use self_cell::self_cell;
 use uucore::error::{UResult, USimpleError};
 
-use crate::{GeneralF64ParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo};
+use crate::{
+    GeneralBigDecimalParseResult, GlobalSettings, Line, SortError, numeric_str_cmp::NumInfo,
+};
 
 self_cell!(
     /// The chunk that is passed around between threads.
@@ -41,7 +43,7 @@ pub struct ChunkContents<'a> {
 pub struct LineData<'a> {
     pub selections: Vec<&'a str>,
     pub num_infos: Vec<NumInfo>,
-    pub parsed_floats: Vec<GeneralF64ParseResult>,
+    pub parsed_floats: Vec<GeneralBigDecimalParseResult>,
     pub line_num_floats: Vec<Option<f64>>,
 }
 
@@ -100,7 +102,7 @@ pub struct RecycledChunk {
     lines: Vec<Line<'static>>,
     selections: Vec<&'static str>,
     num_infos: Vec<NumInfo>,
-    parsed_floats: Vec<GeneralF64ParseResult>,
+    parsed_floats: Vec<GeneralBigDecimalParseResult>,
     line_num_floats: Vec<Option<f64>>,
     buffer: Vec<u8>,
 }
diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs
@@ -7,7 +7,7 @@
 // https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
 // https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
 
-// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim
+// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal
 
 mod check;
 mod chunks;
@@ -17,6 +17,7 @@ mod merge;
 mod numeric_str_cmp;
 mod tmp_dir;
 
+use bigdecimal::BigDecimal;
 use chunks::LineData;
 use clap::builder::ValueParser;
 use clap::{Arg, ArgAction, Command};
@@ -44,7 +45,9 @@ use unicode_width::UnicodeWidthStr;
 use uucore::display::Quotable;
 use uucore::error::{FromIo, strip_errno};
 use uucore::error::{UError, UResult, USimpleError, UUsageError, set_exit_code};
+use uucore::extendedbigdecimal::ExtendedBigDecimal;
 use uucore::line_ending::LineEnding;
+use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
 use uucore::parser::parse_size::{ParseSizeError, Parser};
 use uucore::parser::shortcut_value_parser::ShortcutValueParser;
 use uucore::version_cmp::version_cmp;
@@ -448,7 +451,7 @@ impl Default for KeySettings {
     }
 }
 enum Selection<'a> {
-    AsF64(GeneralF64ParseResult),
+    AsBigDecimal(GeneralBigDecimalParseResult),
     WithNumInfo(&'a str, NumInfo),
     Str(&'a str),
 }
@@ -490,7 +493,7 @@ impl<'a> Line<'a> {
             .map(|selector| (selector, selector.get_selection(line, token_buffer)))
         {
             match selection {
-                Selection::AsF64(parsed_float) => line_data.parsed_floats.push(parsed_float),
+                Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
                 Selection::WithNumInfo(str, num_info) => {
                     line_data.num_infos.push(num_info);
                     line_data.selections.push(str);
@@ -902,8 +905,8 @@ impl FieldSelector {
             range = &range[num_range];
             Selection::WithNumInfo(range, info)
         } else if self.settings.mode == SortMode::GeneralNumeric {
-            // Parse this number as f64, as this is the requirement for general numeric sorting.
-            Selection::AsF64(general_f64_parse(&range[get_leading_gen(range)]))
+            // Parse this number as BigDecimal, as this is the requirement for general numeric sorting.
+            Selection::AsBigDecimal(general_bd_parse(&range[get_leading_gen(range)]))
         } else {
             // This is not a numeric sort, so we don't need a NumCache.
             Selection::Str(range)
@@ -1789,35 +1792,45 @@ fn get_leading_gen(input: &str) -> Range<usize> {
     leading_whitespace_len..input.len()
 }
 
-#[derive(Copy, Clone, PartialEq, PartialOrd, Debug)]
-pub enum GeneralF64ParseResult {
+#[derive(Clone, PartialEq, PartialOrd, Debug)]
+pub enum GeneralBigDecimalParseResult {
     Invalid,
-    NaN,
-    NegInfinity,
-    Number(f64),
+    Nan,
+    MinusInfinity,
+    Number(BigDecimal),
     Infinity,
 }
 
-/// Parse the beginning string into a GeneralF64ParseResult.
-/// Using a GeneralF64ParseResult instead of f64 is necessary to correctly order floats.
+/// Parse the beginning string into a GeneralBigDecimalParseResult.
+/// Using a GeneralBigDecimalParseResult instead of ExtendedBigDecimal is necessary to correctly order floats.
 #[inline(always)]
-fn general_f64_parse(a: &str) -> GeneralF64ParseResult {
-    // The actual behavior here relies on Rust's implementation of parsing floating points.
-    // For example "nan", "inf" (ignoring the case) and "infinity" are only parsed to floats starting from 1.53.
-    // TODO: Once our minimum supported Rust version is 1.53 or above, we should add tests for those cases.
-    match a.parse::<f64>() {
-        Ok(a) if a.is_nan() => GeneralF64ParseResult::NaN,
-        Ok(a) if a == f64::NEG_INFINITY => GeneralF64ParseResult::NegInfinity,
-        Ok(a) if a == f64::INFINITY => GeneralF64ParseResult::Infinity,
-        Ok(a) => GeneralF64ParseResult::Number(a),
-        Err(_) => GeneralF64ParseResult::Invalid,
+fn general_bd_parse(a: &str) -> GeneralBigDecimalParseResult {
+    // Parse digits, and fold in recoverable errors
+    let ebd = match ExtendedBigDecimal::extended_parse(a) {
+        Err(ExtendedParserError::NotNumeric) => return GeneralBigDecimalParseResult::Invalid,
+        Err(ExtendedParserError::PartialMatch(ebd, _))
+        | Err(ExtendedParserError::Overflow(ebd))
+        | Err(ExtendedParserError::Underflow(ebd))
+        | Ok(ebd) => ebd,
+    };
+
+    match ebd {
+        ExtendedBigDecimal::BigDecimal(bd) => GeneralBigDecimalParseResult::Number(bd),
+        ExtendedBigDecimal::Infinity => GeneralBigDecimalParseResult::Infinity,
+        ExtendedBigDecimal::MinusInfinity => GeneralBigDecimalParseResult::MinusInfinity,
+        // Minus zero and zero are equal
+        ExtendedBigDecimal::MinusZero => GeneralBigDecimalParseResult::Number(0.into()),
+        ExtendedBigDecimal::Nan | ExtendedBigDecimal::MinusNan => GeneralBigDecimalParseResult::Nan,
     }
 }
 
 /// Compares two floats, with errors and non-numerics assumed to be -inf.
 /// Stops coercing at the first non-numeric char.
 /// We explicitly need to convert to f64 in this case.
-fn general_numeric_compare(a: &GeneralF64ParseResult, b: &GeneralF64ParseResult) -> Ordering {
+fn general_numeric_compare(
+    a: &GeneralBigDecimalParseResult,
+    b: &GeneralBigDecimalParseResult,
+) -> Ordering {
     a.partial_cmp(b).unwrap()
 }
 
diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs
@@ -67,16 +67,41 @@ impl Base {
         &self,
         str: &'a str,
         digits: Option<BigUint>,
-    ) -> (Option<BigUint>, u64, &'a str) {
+    ) -> (Option<BigUint>, i64, &'a str) {
         let mut digits: Option<BigUint> = digits;
-        let mut count: u64 = 0;
+        let mut count: i64 = 0;
         let mut rest = str;
+
+        // Doing operations on BigUint is really expensive, so we do as much as we
+        // can on u64, then add them to the BigUint.
+        let mut digits_tmp: u64 = 0;
+        let mut count_tmp: i64 = 0;
+        let mut mul_tmp: u64 = 1;
         while let Some(d) = rest.chars().next().and_then(|c| self.digit(c)) {
-            (digits, count) = (
-                Some(digits.unwrap_or_default() * *self as u8 + d),
-                count + 1,
+            (digits_tmp, count_tmp, mul_tmp) = (
+                digits_tmp * *self as u64 + d,
+                count_tmp + 1,
+                mul_tmp * *self as u64,
             );
             rest = &rest[1..];
+            // In base 16, we parse 4 bits at a time, so we can parse 16 digits at most in a u64.
+            if count_tmp >= 15 {
+                // Accumulate what we have so far
+                (digits, count) = (
+                    Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
+                    count + count_tmp,
+                );
+                // Reset state
+                (digits_tmp, count_tmp, mul_tmp) = (0, 0, 1);
+            }
+        }
+
+        // Accumulate the leftovers (if any)
+        if mul_tmp > 1 {
+            (digits, count) = (
+                Some(digits.unwrap_or_default() * mul_tmp + digits_tmp),
+                count + count_tmp,
+            );
         }
         (digits, count, rest)
     }
@@ -265,7 +290,7 @@ impl ExtendedParser for ExtendedBigDecimal {
     }
 }
 
-fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, u64, &str) {
+fn parse_digits(base: Base, str: &str, fractional: bool) -> (Option<BigUint>, i64, &str) {
     // Parse the integral part of the number
     let (digits, rest) = base.parse_digits(str);
 
@@ -447,7 +472,7 @@ fn construct_extended_big_decimal<'a>(
     digits: BigUint,
     negative: bool,
     base: Base,
-    scale: u64,
+    scale: i64,
     exponent: BigInt,
 ) -> Result<ExtendedBigDecimal, ExtendedParserError<'a, ExtendedBigDecimal>> {
     if digits == BigUint::zero() {
@@ -465,16 +490,20 @@ fn construct_extended_big_decimal<'a>(
     let bd = if scale == 0 && exponent.is_zero() {
         BigDecimal::from_bigint(signed_digits, 0)
     } else if base == Base::Decimal {
-        let new_scale = BigInt::from(scale) - exponent;
-
-        // BigDecimal "only" supports i64 scale.
-        // Note that new_scale is a negative exponent: large value causes an underflow, small value an overflow.
-        if new_scale > i64::MAX.into() {
-            return Err(make_error(false, negative));
-        } else if new_scale < i64::MIN.into() {
-            return Err(make_error(true, negative));
+        if exponent.is_zero() {
+            // Optimization: Converting scale to Bigint and back is relatively slow.
+            BigDecimal::from_bigint(signed_digits, scale)
+        } else {
+            let new_scale = -exponent + scale;
+
+            // BigDecimal "only" supports i64 scale.
+            // Note that new_scale is a negative exponent: large positive value causes an underflow, large negative values an overflow.
+            if let Some(new_scale) = new_scale.to_i64() {
+                BigDecimal::from_bigint(signed_digits, new_scale)
+            } else {
+                return Err(make_error(new_scale.is_negative(), negative));
+            }
         }
-        BigDecimal::from_bigint(signed_digits, new_scale.to_i64().unwrap())
     } else if base == Base::Hexadecimal {
         // pow "only" supports u32 values, just error out if given more than 2**32 fractional digits.
         if scale > u32::MAX.into() {
diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
@@ -1503,3 +1503,67 @@ fn test_files0_from_zero_length() {
         .fails_with_code(2)
         .stderr_only("sort: -:2: invalid zero-length file name\n");
 }
+
+#[test]
+// Test for GNU tests/sort/sort-float.sh
+fn test_g_float() {
+    let input = "0\n-3.3621031431120935063e-4932\n3.3621031431120935063e-4932\n";
+    let output = "-3.3621031431120935063e-4932\n0\n3.3621031431120935063e-4932\n";
+    new_ucmd!()
+        .args(&["-g"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is(output);
+}
+
+#[test]
+// Test misc numbers ("'a" is not interpreted as literal, trailing text is ignored...)
+fn test_g_misc() {
+    let input = "1\n100\n90\n'a\n85hello\n";
+    let output = "'a\n1\n85hello\n90\n100\n";
+    new_ucmd!()
+        .args(&["-g"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is(output);
+}
+
+#[test]
+// Test numbers with a large number of digits, where only the last digit is different.
+// We use scientific notation to make sure string sorting does not correctly order them.
+fn test_g_arbitrary() {
+    let input = [
+        // GNU coreutils doesn't handle those correctly as they don't fit exactly in long double
+        "3",
+        "3.000000000000000000000000000000000000000000000000000000000000000004",
+        "0.3000000000000000000000000000000000000000000000000000000000000000002e1",
+        "0.03000000000000000000000000000000000000000000000000000000000000000003e2",
+        "0.003000000000000000000000000000000000000000000000000000000000000000001e3",
+        // GNU coreutils does handle those correctly though
+        "10",
+        "10.000000000000004",
+        "1.0000000000000002e1",
+        "0.10000000000000003e2",
+        "0.010000000000000001e3",
+    ]
+    .join("\n");
+    let output = [
+        "3",
+        "0.003000000000000000000000000000000000000000000000000000000000000000001e3",
+        "0.3000000000000000000000000000000000000000000000000000000000000000002e1",
+        "0.03000000000000000000000000000000000000000000000000000000000000000003e2",
+        "3.000000000000000000000000000000000000000000000000000000000000000004",
+        "10",
+        "0.010000000000000001e3",
+        "1.0000000000000002e1",
+        "0.10000000000000003e2",
+        "10.000000000000004",
+    ]
+    .join("\n")
+        + "\n";
+    new_ucmd!()
+        .args(&["-g"])
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is(output);
+}