Skip to content

Commit 56233d6

Browse files
Correct case folding
Case folding had a serious bug that caused some code points to fold to the wrong values. It affected code points in FoldRanges with a modulo of 2 or more: code points in those ranges that should be left unchanged by case folding were erroneously changed.

The affected code points are:
- The 24 odd code points in the range U+100 - U+12E
- The 3 odd code points in the range U+132 - U+136
- The 7 even code points in the range U+139 - U+147
- The 23 odd code points in the range U+14A - U+176
- The 3 odd code points in the range U+1A0 - U+1A4
- The 8 even code points in the range U+1CB - U+1DB
- The 9 odd code points in the range U+1DE - U+1EE
- The 20 odd code points in the range U+1F8 - U+21E
- The 9 odd code points in the range U+222 - U+232
- The 5 odd code points in the range U+246 - U+24E
- The 12 odd code points in the range U+3D8 - U+3EE
- The 17 odd code points in the range U+460 - U+480
- The 27 odd code points in the range U+48A - U+4BE
- The 6 even code points in the range U+4C1 - U+4CD
- The 48 odd code points in the range U+4D0 - U+52E
- The 75 odd code points in the range U+1E00 - U+1E94
- The 48 odd code points in the range U+1EA0 - U+1EFE
- The 3 even code points in the range U+1F59 - U+1F5F
- The 50 odd code points in the range U+2C80 - U+2CE2
- The 23 odd code points in the range U+A640 - U+A66C
- The 14 odd code points in the range U+A680 - U+A69A
- The 7 odd code points in the range U+A722 - U+A72E
- The 31 odd code points in the range U+A732 - U+A76E
- The 5 odd code points in the range U+A77E - U+A786
- The 10 odd code points in the range U+A796 - U+A7A8
- The 8 odd code points in the range U+A7B4 - U+A7C2
- The code points U+17A, U+17C, U+182, U+184, U+1B4, U+1B9, U+1BA, U+1BB, U+1F2, U+1F4, U+370, U+372, U+2C68, U+2C6A, U+2CEC, U+A77A, U+A790, U+A792, U+A7C8, U+A7D6, U+A7D8
1 parent 8b842f1 commit 56233d6

File tree

2 files changed

+116
-2
lines changed

2 files changed

+116
-2
lines changed

src/unicode.rs

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ use crate::util::SliceHelp;
44
#[cfg(not(feature = "std"))]
55
use alloc::vec::Vec;
66
use core::cmp::Ordering;
7+
#[cfg(test)]
8+
use std::collections::HashMap;
79

810
// CodePointRange packs a code point and a length together into a u32.
911
// We currently do not need to store any information about code points in plane 16 (U+100000),
@@ -100,7 +102,8 @@ pub(crate) struct FoldRange {
100102
impl FoldRange {
101103
#[inline(always)]
102104
pub const fn from(start: u32, length: u32, delta: i32, modulo: u8) -> Self {
103-
let mask = (1 << modulo) - 1;
105+
const_assert_true!(modulo.is_power_of_two());
106+
let mask = (modulo - 1) as i32;
104107
const_assert_true!(mask < (1 << PREDICATE_MASK_BITS));
105108
const_assert_true!(((delta << PREDICATE_MASK_BITS) >> PREDICATE_MASK_BITS) == delta);
106109
let extra = mask | (delta << PREDICATE_MASK_BITS);
@@ -126,7 +129,7 @@ impl FoldRange {
126129

127130
#[inline(always)]
128131
fn predicate_mask(&self) -> u32 {
129-
(self.extra as u32) & PREDICATE_MASK_BITS
132+
(self.extra as u32) & ((1 << PREDICATE_MASK_BITS) - 1)
130133
}
131134

132135
fn add_delta(&self, cu: u32) -> u32 {
@@ -346,3 +349,71 @@ pub(crate) fn is_character_class(c: u32, property_escape: &PropertyEscape) -> bo
346349
false
347350
}
348351
}
352+
353+
#[test]
354+
fn test_folds() {
355+
for c in 0..0x41 {
356+
assert_eq!(fold(c), c);
357+
}
358+
for c in 0x41..=0x5A {
359+
assert_eq!(fold(c), c + 0x20);
360+
}
361+
assert_eq!(fold(0xB5), 0x3BC);
362+
assert_eq!(fold(0xC0), 0xE0);
363+
364+
assert_eq!(fold(0x1B8), 0x1B9);
365+
assert_eq!(fold(0x1B9), 0x1B9);
366+
assert_eq!(fold(0x1BA), 0x1BA);
367+
assert_eq!(fold(0x1BB), 0x1BB);
368+
assert_eq!(fold(0x1BC), 0x1BD);
369+
assert_eq!(fold(0x1BD), 0x1BD);
370+
371+
for c in 0x1F8..0x21F {
372+
if c % 2 == 0 {
373+
assert_eq!(fold(c), c + 1);
374+
} else {
375+
assert_eq!(fold(c), c);
376+
}
377+
}
378+
379+
assert_eq!(fold(0x37F), 0x3F3);
380+
assert_eq!(fold(0x380), 0x380);
381+
assert_eq!(fold(0x16E40), 0x16E60);
382+
assert_eq!(fold(0x16E41), 0x16E61);
383+
assert_eq!(fold(0x16E42), 0x16E62);
384+
assert_eq!(fold(0x1E900), 0x1E922);
385+
assert_eq!(fold(0x1E901), 0x1E923);
386+
for c in 0xF0000..=0x10FFFF {
387+
assert_eq!(fold(c), c);
388+
}
389+
}
390+
391+
#[test]
392+
fn test_fold_idempotent() {
393+
for c in 0..=0x10FFFF {
394+
let fc = fold(c);
395+
let ffc = fold(fc);
396+
assert_eq!(ffc, fc);
397+
}
398+
}
399+
400+
#[test]
401+
fn test_unfold_chars() {
402+
// Map from folded char to the chars that folded to it.
403+
let mut fold_map: HashMap<u32, Vec<u32>> = HashMap::new();
404+
for c in 0..=0x10FFFF {
405+
let fc = fold(c);
406+
fold_map.entry(fc).or_insert_with(Vec::new).push(c);
407+
}
408+
409+
// Sort them all.
410+
for v in fold_map.values_mut() {
411+
v.sort_unstable();
412+
}
413+
414+
for c in 0..=0x10FFFF {
415+
let mut unfolded = unfold_char(c);
416+
unfolded.sort_unstable();
417+
assert_eq!(unfolded, fold_map[&fold(c)]);
418+
}
419+
}

tests/tests.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1537,3 +1537,46 @@ fn test_escapes_folding_tc(tc: TestConfig) {
15371537
tc.test_match_succeeds(r"\u{61}", "i", "a");
15381538
tc.test_match_succeeds(r"\u{61}", "i", "A");
15391539
}
1540+
1541+
#[test]
1542+
fn test_high_folds() {
1543+
test_with_configs(test_high_folds_tc)
1544+
}
1545+
1546+
fn test_high_folds_tc(tc: TestConfig) {
1547+
// Regression test for bogus folding.
1548+
// We incorrectly folded certain characters in delta blocks:
1549+
// we folded U+100 to U+101 (correctly) but then U+101 to U+102 (wrong).
1550+
tc.test_match_succeeds(r"\u{100}", "", "\u{100}");
1551+
tc.test_match_succeeds(r"\u{100}", "i", "\u{100}");
1552+
1553+
tc.test_match_fails(r"\u{100}", "", "\u{101}");
1554+
tc.test_match_succeeds(r"\u{100}", "i", "\u{101}");
1555+
1556+
tc.test_match_succeeds(r"\u{101}", "", "\u{101}");
1557+
tc.test_match_succeeds(r"\u{101}", "i", "\u{101}");
1558+
1559+
tc.test_match_fails(r"\u{101}", "", "\u{102}");
1560+
tc.test_match_fails(r"\u{101}", "i", "\u{102}");
1561+
1562+
// Exercise a "mod 4 range":
1563+
// U+1B8 folds to U+1B9
1564+
// U+1BC folds to U+1BD
1565+
// Codepoints between fold to themselves.
1566+
tc.test_match_succeeds(r"\u{1B8}", "", "\u{1B8}");
1567+
tc.test_match_succeeds(r"\u{1B8}", "i", "\u{1B8}");
1568+
tc.test_match_fails(r"\u{1B8}", "", "\u{1B9}");
1569+
tc.test_match_succeeds(r"\u{1B8}", "i", "\u{1B9}");
1570+
tc.test_match_succeeds(r"\u{1B9}", "", "\u{1B9}");
1571+
tc.test_match_succeeds(r"\u{1B9}", "i", "\u{1B9}");
1572+
tc.test_match_fails(r"\u{1B9}", "", "\u{1BA}");
1573+
tc.test_match_fails(r"\u{1B9}", "i", "\u{1BA}");
1574+
tc.test_match_succeeds(r"\u{1BC}", "", "\u{1BC}");
1575+
tc.test_match_succeeds(r"\u{1BC}", "i", "\u{1BC}");
1576+
tc.test_match_fails(r"\u{1BC}", "", "\u{1BD}");
1577+
tc.test_match_succeeds(r"\u{1BC}", "i", "\u{1BD}");
1578+
tc.test_match_succeeds(r"\u{1BD}", "", "\u{1BD}");
1579+
tc.test_match_succeeds(r"\u{1BD}", "i", "\u{1BD}");
1580+
tc.test_match_fails(r"\u{1BD}", "", "\u{1BE}");
1581+
tc.test_match_fails(r"\u{1BD}", "i", "\u{1BE}");
1582+
}

0 commit comments

Comments
 (0)