Skip to content

Commit 3472eeb

Browse files
author
Andy C
committed
[eggex] Separate range checks for bytes and code points
- disallow NUL byte \y00 - aka code point 0 \u{0} - keep the ASCII limitation on bytes, so 128-255 are disallowed - relax limitation on code points - we can match arbitrary code points now - because of the recent setlocale() change, which checks that the libc locale is UTF-8 - if OILS_LOCALE_OK, then all bets are off TODO: document this in doc/eggex.md, and the reference
1 parent 582f9a5 commit 3472eeb

File tree

2 files changed

+57
-19
lines changed

2 files changed

+57
-19
lines changed

spec/ysh-regex-bytes-chars.test.sh

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ no
130130

131131
#### Can't match code points \u{ff} because they don't translate to valid ERE
132132
shopt -s ysh:all
133-
var pat2 = /[ \u{00} - \u{ff} ]/;
133+
134+
var pat2 = /[ \u{1} - \u{3bc} ]/;
134135

135136
# This causes an error
136137
echo $pat2
@@ -141,8 +142,8 @@ echo $pat2
141142
var pat1 = /[ \u{ff} ]/;
142143

143144
echo $pat1 | od -A n -t x1
144-
if (b'\x7f' ~ pat) { echo yes } else { echo no }
145-
if (b'\x7e' ~ pat) { echo yes } else { echo no }
145+
if (b'\y7f' ~ pat1) { echo yes } else { echo no }
146+
if (b'\y7e' ~ pat1) { echo yes } else { echo no }
146147

147148
## status: 1
148149
## stdout-json: ""
@@ -209,21 +210,16 @@ shopt -s ysh:all
209210
# ascii works
210211
pp test_ (b'\y7f' ~ / [\x7f] /)
211212
pp test_ (b'\y7e' ~ / [\x7f] /)
212-
echo
213-
214-
# BUG before disabling: high byte doesn't work? Because of utf-8 locale
215-
#
216-
# We translate \x80 to a the byte b'\y80', but it still doesn't work
217-
#
218-
# We COULD allow this with LANG=C, but for now don't bother. I think using an
219-
# ERE string is probably better.
220213

221-
pp test_ (b'\y80' ~ / [\x80] /)
222-
pp test_ (b'\yff' ~ / [\xff] /)
214+
= str( / [\y80]/ )
223215

224-
= str( / [\x80]/ )
216+
#pp test_ (b'\y80' ~ / [\y80] /)
217+
#pp test_ (b'\yff' ~ / [\yff] /)
225218

219+
## status: 1
226220
## STDOUT:
221+
(Bool) true
222+
(Bool) false
227223
## END
228224

229225
#### High bytes 0x80 0xff can be matched with plain ERE and LC_ALL=C
@@ -245,3 +241,39 @@ EOF
245241
(Bool) true
246242
(Bool) false
247243
## END
244+
245+
#### Code points like \u{3bc} can be matched
246+
247+
var pat = / [\u{3bc}] /
248+
pp test_ (b'a' ~ pat)
249+
pp test_ (b'\u{3bc}' ~ / [\u{3bc}] /)
250+
echo
251+
252+
var pat = / [\u{10ffff}] /
253+
pp test_ (b'a' ~ pat)
254+
pp test_ (b'\u{10ffff}' ~ pat)
255+
256+
#echo "-$pat-"
257+
258+
## STDOUT:
259+
(Bool) false
260+
(Bool) true
261+
262+
(Bool) false
263+
(Bool) true
264+
## END
265+
266+
#### Max code point is disallowed at parse time
267+
268+
pp test_ (/ [\u{10ffff}] /)
269+
pp test_ (/ [\u{110000}] /)
270+
271+
## STDOUT:
272+
(Bool) false
273+
(Bool) true
274+
275+
(Bool) false
276+
(Bool) true
277+
## END
278+
279+

ysh/regex_translate.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from _devbuild.gen.id_kind_asdl import Id
2020
from _devbuild.gen.value_asdl import value
2121
from core.error import e_die, p_die
22+
from data_lang import j8
2223
from frontend import lexer
2324
from mycpp.mylib import log, tagswitch, switch
2425
from osh import glob_ # for ExtendedRegexEscape
@@ -90,10 +91,14 @@ def _CharCodeToEre(term, parts, special_char_flags):
9091
if char_int == 0:
9192
e_die("ERE can't express char code %d" % char_int, term.blame_tok)
9293

93-
if char_int >= 128 and term.u_braced: # 128 is 0x80
94-
# \u{ff} can't be represented in ERE because we don't know the encoding
95-
# \xff can be represented
96-
e_die("ERE can't express char code %d" % char_int, term.blame_tok)
94+
if term.u_braced: # \u{3bc}
95+
# the max, \u{10ffff}, is checked at PARSE time
96+
pass
97+
else:
98+
if char_int >= 128: # 128 is 0x80
99+
# \yff can't be represented in ERE if LC_ALL=utf-8
100+
# It should be \u{ff} instead
101+
e_die("ERE can't express high bytes %d" % char_int, term.blame_tok)
97102

98103
# note: mycpp doesn't handle
99104
# special_char_flags[0] |= FLAG_HYPHEN
@@ -108,7 +113,8 @@ def _CharCodeToEre(term, parts, special_char_flags):
108113
elif char_int == CH_BACKSLASH:
109114
mask |= FLAG_BACKSLASH
110115
else:
111-
parts.append(chr(char_int))
116+
s = j8.Utf8Encode(char_int)
117+
parts.append(s)
112118

113119
special_char_flags[0] = mask
114120

0 commit comments

Comments
 (0)