Skip to content

Commit 582f9a5

Browse files
author
Andy C
committed
[eggex] Disallow NUL byte when translating to ERE
POSIX ERE has no \x00-style escapes, and libc regcomp() takes a NUL-terminated string, so a literal NUL byte can't be expressed either. Also: test that high bytes such as \yff can be matched when LC_ALL=C.
1 parent e5a4dde commit 582f9a5

File tree

2 files changed

+59
-13
lines changed

2 files changed

+59
-13
lines changed

spec/ysh-regex-bytes-chars.test.sh

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -154,16 +154,56 @@ echo $pat
154154
## status: 1
155155
## stdout-json: ""
156156

157-
#### Match NUL byte - failing because of CPython API right now
158-
shopt -s ysh:all
157+
#### Bytes are denoted \y01 in Eggex char classes (not \x01)
158+
159+
# That is, eggex does have MODES like re.UNICODE
160+
#
161+
# We UNAMBIGUOUSLY accept
162+
# - \y01 or \u{1} - these are the same
163+
# - \yff or \u{ff} - these are DIFFERENT
164+
165+
var pat = / [\y01] /
166+
pp test_ (b'\y01' ~ pat)
167+
pp test_ ('a' ~ pat)
168+
169+
## STDOUT:
170+
(Bool) true
171+
(Bool) false
172+
## END
173+
174+
#### NUL byte can be expressed in Eggex, but not in ERE
175+
176+
$SH <<'EOF'
177+
pp test_ (b'\y01' ~ / [\y01] /)
178+
pp test_ (b'\y00' ~ / [\y00] /)
179+
EOF
180+
echo status=$?
159181

160-
# BUG in osh-cpython, literal NUL is not accepted by regex API
182+
$SH <<'EOF'
183+
pp test_ (b'\y01' ~ / [\u{1}] /)
184+
pp test_ (b'\y00' ~ / [\u{0}] /)
185+
EOF
186+
echo status=$?
187+
188+
189+
# legacy synonym
190+
191+
$SH <<'EOF'
192+
pp test_ (b'\y01' ~ / [\x01] /)
161193
pp test_ (b'\y00' ~ / [\x00] /)
194+
EOF
195+
echo status=$?
162196

163197
## STDOUT:
198+
(Bool) true
199+
status=1
200+
(Bool) true
201+
status=1
202+
(Bool) true
203+
status=1
164204
## END
165205

166-
#### BUG: Can you match high bytes 0x80 0xff, which are not UTF-8?
206+
#### High bytes 0x80 0xff usually can't be matched - Eggex is UTF-8
167207
shopt -s ysh:all
168208

169209
# ascii works
@@ -186,17 +226,20 @@ pp test_ (b'\yff' ~ / [\xff] /)
186226
## STDOUT:
187227
## END
188228

189-
#### Bytes are denoted \y01 in Eggex char classes (not \x01)
229+
#### High bytes 0x80 0xff can be matched with plain ERE and LC_ALL=C
190230

191-
# That is, eggex does have MODES like re.UNICODE
192-
#
193-
# We UNAMBIGUOUSLY accept
194-
# - \y01 or \u{1} - these are the same
195-
# - \yff or \u{ff} - these are DIFFERENT
231+
export LC_ALL=C
196232

197-
var pat = / [\y01] /
198-
pp test_ (b'\y01' ~ pat)
199-
pp test_ ('a' ~ pat)
233+
$SH <<'EOF'
234+
var yes = b'foo \yff'
235+
var no = b'foo'
236+
237+
# POSIX ERE string
238+
var ere = b'[\yff]'
239+
240+
pp test_ (yes ~ ere)
241+
pp test_ (no ~ ere)
242+
EOF
200243

201244
## STDOUT:
202245
(Bool) true

ysh/regex_translate.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ def _CharCodeToEre(term, parts, special_char_flags):
8787
"""
8888

8989
char_int = term.i
90+
if char_int == 0:
91+
e_die("ERE can't express char code %d" % char_int, term.blame_tok)
92+
9093
if char_int >= 128 and term.u_braced: # 128 is 0x80
9194
# \u{ff} can't be represented in ERE because we don't know the encoding
9295
# \xff can be represented

0 commit comments

Comments
 (0)