Skip to content

Commit d822d3e

Browse files
akinomyogaAndy C
andauthored
[osh] Rewrite IFS algorithm to fix several bugs
* [osh/word_eval] Fix the IFS='\' bug of word splitting with a new splitter * [osh/split] Use new splitter to fix the IFS='\' bug of shSplit(s) * [builtin/read_osh] Use new splitter to fix the IFS=\ bug of the "read" builtin This implements the behavior closest to Bash's. Bash trims whitespace IFS characters from the right after the end of the last word. Changes by Andy: - Implement rstrip() with multiple chars - this is a special case, for now - In the "fixup", use `self.args.pop()` instead of slicing. Slicing creates a new object --------- Co-authored-by: Andy C <[email protected]>
1 parent ecfcb25 commit d822d3e

File tree

9 files changed

+487
-93
lines changed

9 files changed

+487
-93
lines changed

builtin/read_osh.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -505,26 +505,17 @@ def _Read(self, arg, names):
505505
else:
506506
delim_byte = pyos.NEWLINE_CH # read a line
507507

508-
# Read MORE THAN ONE line for \ line continuation (and not read -r)
509-
parts = [] # type: List[mylib.BufWriter]
510508
chunk, eof = _ReadPortion(delim_byte, mops.BigTruncate(arg.n), not raw,
511-
self.cmd_ev)
509+
self.cmd_ev)
512510

513511
# status 1 to terminate loop. (This is true even though we set
514512
# variables).
515513
status = 1 if eof else 0
516514

517515
#log('LINE %r', chunk)
518-
if len(chunk) > 0:
519-
join_next = False
520-
spans = self.splitter.SplitForRead(chunk, not raw, do_split)
521-
done, join_next = _AppendParts(chunk, spans, max_results,
522-
join_next, parts)
516+
entries = self.splitter.SplitForRead(chunk, not raw, do_split,
517+
max_results)
523518

524-
#log('PARTS %s continued %s', parts, continued)
525-
assert done
526-
527-
entries = [buf.getvalue() for buf in parts]
528519
num_parts = len(entries)
529520
if arg.a is not None:
530521
state.BuiltinSetArray(self.mem, arg.a, entries)

mycpp/gc_str.cc

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,15 @@ bool OmitChar(int ch, int what) {
341341
}
342342
}
343343

344+
bool OmitCharMany(int ch, BigStr* chars) {
345+
for (int i = 0; i < len(chars); ++i) {
346+
if (ch == chars->data_[i]) {
347+
return true;
348+
}
349+
}
350+
return false;
351+
}
352+
344353
// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
345354
// implement 6 functions:
346355
//
@@ -391,7 +400,27 @@ BigStr* BigStr::strip() {
391400

392401
// Used for CommandSub in osh/cmd_exec.py
393402
BigStr* BigStr::rstrip(BigStr* chars) {
394-
DCHECK(len(chars) == 1);
403+
int num_chars = len(chars);
404+
if (num_chars == 0) {
405+
return this;
406+
}
407+
408+
// multiple chars, for word splitting
409+
if (num_chars > 1) {
410+
const char* char_data = data_;
411+
int j = len(this);
412+
do {
413+
j--;
414+
} while (j >= 0 && OmitCharMany(data_[j], chars));
415+
j++;
416+
417+
int new_len = j;
418+
BigStr* result = NewStr(new_len);
419+
memcpy(result->data(), data_, new_len);
420+
return result;
421+
}
422+
423+
// exactly 1 char
395424
int c = chars->data_[0];
396425
return StripAny(this, StripWhere::Right, c);
397426
}

mycpp/gc_str_test.cc

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -314,11 +314,45 @@ TEST test_str_strip() {
314314
ASSERT(str_equals(result, StrFromC("hi")));
315315
}
316316

317+
ASSERT(str_equals0(" abc", StrFromC(" abc ")->rstrip()));
318+
ASSERT(str_equals0(" def", StrFromC(" def")->rstrip()));
319+
320+
ASSERT(str_equals0("", kEmptyString->rstrip()));
321+
ASSERT(str_equals0("", kEmptyString->strip()));
322+
323+
ASSERT(str_equals0("123", StrFromC(" 123 ")->strip()));
324+
ASSERT(str_equals0("123", StrFromC(" 123")->strip()));
325+
ASSERT(str_equals0("123", StrFromC("123 ")->strip()));
326+
317327
printf("---------- Done ----------\n");
318328

319329
PASS();
320330
}
321331

332+
TEST test_rstrip() {
333+
// rstrip() with multiple characters
334+
335+
BigStr* s;
336+
337+
s = StrFromC(" axx")->rstrip(StrFromC("x"));
338+
ASSERT(str_equals0(" a", s));
339+
340+
s = StrFromC(" a ")->rstrip(StrFromC(" \t"));
341+
ASSERT(str_equals0(" a", s));
342+
343+
s = StrFromC(" axx")->rstrip(StrFromC(" x"));
344+
ASSERT(str_equals0(" a", s));
345+
346+
s = StrFromC(" a\t\t")->rstrip(StrFromC(" \t"));
347+
ASSERT(str_equals0(" a", s));
348+
349+
// Empty string allowed too
350+
s = StrFromC(" a\t\t")->rstrip(kEmptyString);
351+
ASSERT(str_equals0(" a\t\t", s));
352+
353+
PASS();
354+
}
355+
322356
TEST test_str_upper_lower() {
323357
printf("\n");
324358

@@ -1338,17 +1372,6 @@ TEST str_methods_test() {
13381372
ASSERT(str_equals0("o", kStrFood->slice(-3, -2)));
13391373
ASSERT(str_equals0("fo", kStrFood->slice(-4, -2)));
13401374

1341-
log("strip()");
1342-
ASSERT(str_equals0(" abc", StrFromC(" abc ")->rstrip()));
1343-
ASSERT(str_equals0(" def", StrFromC(" def")->rstrip()));
1344-
1345-
ASSERT(str_equals0("", kEmptyString->rstrip()));
1346-
ASSERT(str_equals0("", kEmptyString->strip()));
1347-
1348-
ASSERT(str_equals0("123", StrFromC(" 123 ")->strip()));
1349-
ASSERT(str_equals0("123", StrFromC(" 123")->strip()));
1350-
ASSERT(str_equals0("123", StrFromC("123 ")->strip()));
1351-
13521375
BigStr* input = nullptr;
13531376
BigStr* arg = nullptr;
13541377
BigStr* expected = nullptr;
@@ -1522,6 +1545,7 @@ int main(int argc, char** argv) {
15221545
// Members
15231546
RUN_TEST(test_str_find);
15241547
RUN_TEST(test_str_strip);
1548+
RUN_TEST(test_rstrip);
15251549
RUN_TEST(test_str_upper_lower);
15261550
RUN_TEST(test_str_replace);
15271551
RUN_TEST(test_str_just);

osh/glob_.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,6 @@ def GlobEscapeBackslash(s):
110110
\* evaluates to '\*'
111111
- that is, the \ is preserved literally
112112
"""
113-
# XXX--This representation is affected by the known IFS='\' bug, but the
114-
# bug will be fixed in the coming PR.
115113
return s.replace('\\', r'\@')
116114

117115

osh/split.py

Lines changed: 147 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,11 @@
3030
state_i)
3131
from _devbuild.gen.value_asdl import (value, value_e, value_t)
3232
from mycpp.mylib import log
33-
from core import pyutil
33+
from core import pyutil, pyos
3434
from frontend import consts
3535
from mycpp import mylib
3636
from mycpp.mylib import tagswitch
37+
from osh import glob_
3738

3839
from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
3940
if TYPE_CHECKING:
@@ -175,32 +176,35 @@ def Escape(self, s):
175176
sp = self._GetSplitter()
176177
return sp.Escape(s)
177178

179+
def CreateSplitterState(self, ifs=None):
180+
# type: (Optional[str]) -> IfsSplitterState
181+
sp = self._GetSplitter(ifs=ifs)
182+
return IfsSplitterState(sp.ifs_whitespace, sp.ifs_other)
183+
178184
def SplitForWordEval(self, s, ifs=None):
179185
# type: (str, Optional[str]) -> List[str]
180-
"""Split used by word evaluation.
181-
182-
Also used by the explicit shSplit() function.
186+
"""Split used by the explicit shSplit() function.
183187
"""
184-
sp = self._GetSplitter(ifs=ifs)
185-
spans = sp.Split(s, True)
188+
sp = self.CreateSplitterState(ifs=ifs)
189+
sp.SetAllowEscape(True)
190+
sp.PushFragment(s)
191+
return sp.PushTerminator()
186192

187-
# Note: pass allow_escape=False so \ isn't special
188-
#spans = sp.Split(s, False)
193+
def SplitForRead(self, line, allow_escape, do_split, max_parts):
194+
# type: (str, bool, bool, int) -> List[str]
189195

190-
if 0:
191-
for span in spans:
192-
log('SPAN %s', span)
193-
return _SpansToParts(s, spans)
194-
195-
def SplitForRead(self, line, allow_escape, do_split):
196-
# type: (str, bool, bool) -> List[Span]
196+
if len(line) == 0:
197+
return []
197198

198199
# None: use the default splitter, consulting $IFS
199200
# '' : forces IFS='' behavior
200201
ifs = None if do_split else ''
201202

202-
sp = self._GetSplitter(ifs=ifs)
203-
return sp.Split(line, allow_escape)
203+
sp = self.CreateSplitterState(ifs=ifs)
204+
sp.SetAllowEscape(allow_escape)
205+
sp.SetMaxSplit(max_parts - 1)
206+
sp.PushFragment(line)
207+
return sp.PushTerminator()
204208

205209

206210
class _BaseSplitter(object):
@@ -317,3 +321,129 @@ def Split(self, s, allow_escape):
317321
i += 1
318322

319323
return spans
324+
325+
326+
class IfsSplitterState(object):
327+
328+
def __init__(self, ifs_space, ifs_other):
329+
# type: (str, str) -> None
330+
self.ifs_space = ifs_space
331+
self.ifs_other = ifs_other
332+
self.glob_escape = False
333+
self.allow_escape = False
334+
self.max_split = -1
335+
336+
self.state = state_i.Start
337+
self.args = [] # type: List[str] # generated words
338+
self.frags = [] # type: List[str] # str fragments of the current word
339+
self.char_buff = [] # type: List[int] # chars in the current fragment
340+
self.white_buff = None # type: Optional[List[int]] # chars for max_split space
341+
342+
def SetGlobEscape(self, glob_escape):
343+
# type: (bool) -> None
344+
self.glob_escape = glob_escape
345+
346+
def SetAllowEscape(self, allow_escape):
347+
# type: (bool) -> None
348+
self.allow_escape = allow_escape
349+
350+
def SetMaxSplit(self, max_split):
351+
# type: (int) -> None
352+
self.max_split = max_split
353+
if max_split >= 0 and self.white_buff is None:
354+
self.white_buff = []
355+
356+
def _FlushCharBuff(self):
357+
# type: () -> None
358+
359+
if len(self.char_buff) >= 1:
360+
frag = mylib.JoinBytes(self.char_buff)
361+
if self.glob_escape:
362+
frag = glob_.GlobEscapeBackslash(frag)
363+
self.frags.append(frag)
364+
del self.char_buff[:]
365+
366+
def _GenerateWord(self):
367+
# type: () -> None
368+
self._FlushCharBuff()
369+
self.args.append(''.join(self.frags))
370+
del self.frags[:]
371+
372+
if self.max_split >= 0 and len(self.white_buff) >= 1:
373+
self.char_buff.extend(self.white_buff)
374+
del self.white_buff[:]
375+
376+
def PushLiteral(self, s):
377+
# type: (str) -> None
378+
"""
379+
Args:
380+
s: word fragment that should be literally added
381+
"""
382+
if self.state == state_i.DE_White1:
383+
self._GenerateWord()
384+
else:
385+
self._FlushCharBuff()
386+
self.frags.append(s)
387+
self.state = state_i.Black
388+
389+
def PushFragment(self, s):
390+
# type: (str) -> None
391+
"""
392+
Args:
393+
s: word fragment to split
394+
"""
395+
ifs_space = self.ifs_space
396+
ifs_other = self.ifs_other
397+
allow_escape = self.allow_escape
398+
max_split = self.max_split
399+
n = len(s)
400+
401+
for i in xrange(n):
402+
byte = mylib.ByteAt(s, i)
403+
404+
if self.state == state_i.Backslash:
405+
pass
406+
407+
elif max_split >= 0 and len(self.args) == max_split + 1:
408+
# When max_split is reached, the processing is modified.
409+
if allow_escape and byte == pyos.BACKSLASH_CH:
410+
self.state = state_i.Backslash
411+
continue
412+
elif mylib.ByteInSet(byte, ifs_space):
413+
if self.state == state_i.Start:
414+
self.char_buff.append(byte)
415+
continue
416+
417+
elif allow_escape and byte == pyos.BACKSLASH_CH:
418+
if self.state == state_i.DE_White1:
419+
self._GenerateWord()
420+
self.state = state_i.Backslash
421+
continue
422+
elif mylib.ByteInSet(byte, ifs_space):
423+
if self.state != state_i.Start:
424+
if len(self.args) == max_split:
425+
self.white_buff.append(byte)
426+
self.state = state_i.DE_White1
427+
continue
428+
elif mylib.ByteInSet(byte, ifs_other):
429+
if len(self.args) == max_split:
430+
self.white_buff.append(byte)
431+
self._GenerateWord()
432+
self.state = state_i.Start
433+
continue
434+
435+
if self.state == state_i.DE_White1:
436+
self._GenerateWord()
437+
self.char_buff.append(byte)
438+
self.state = state_i.Black
439+
440+
def PushTerminator(self):
441+
# type: () -> List[str]
442+
if self.state in (state_i.DE_White1, state_i.Black):
443+
self._GenerateWord()
444+
if self.max_split >= 0 and len(self.args) == self.max_split + 2:
445+
# TODO: is there an algorithm without this "fix up"?
446+
last = self.args.pop()
447+
self.args[-1] = self.args[-1] + last.rstrip(self.ifs_space)
448+
self.state = state_i.Start
449+
return self.args

0 commit comments

Comments
 (0)