Skip to content

Commit cf02216

Browse files
committed
tq: add exponential backoff for retries
Previously, retries happened as fast as possible unless the server provided the Retry-After header. This is effective for certain types of errors, but not when the LFS server is experiencing a temporary but not instantaneous failure. Delaying between retries lets the server recover and the LFS operation complete. Delays start at a fixed 250ms for the first retry and double with each successive retry up to a configurable maximum delay, 10s by default. The maximum retry is configurable using lfs.transfer.maxretrydelay. Delays can be disabled by setting the max delay to 0.
1 parent 379e1ba commit cf02216

File tree

5 files changed

+121
-33
lines changed

5 files changed

+121
-33
lines changed

docs/man/git-lfs-config.5.ronn

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,18 @@ be scoped inside the configuration for a remote.
160160
not an integer, is less than one, or is not given, a value of eight will be
161161
used instead.
162162

163+
* `lfs.transfer.maxretrydelay`
164+
165+
Specifies the maximum time in seconds LFS will wait between each retry
166+
attempt. LFS uses exponential backoff for retries, doubling the time between
167+
each retry until reaching this limit. If a server requests a delay using the
168+
`Retry-After` header, the header value overrides the exponential delay for
169+
that attempt and is not limited by this option.
170+
171+
Must be an integer which is not negative. Use zero to disable delays between
172+
retries unless requested by a server. If the value is not an integer, is
173+
negative, or is not given, a value of ten will be used instead.
174+
163175
* `lfs.transfer.maxverifies`
164176

165177
Specifies how many verification requests LFS will attempt per OID before

t/t-push.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,7 @@ begin_test "push (retry with expired actions)"
575575

576576
GIT_TRACE=1 git push origin master 2>&1 | tee push.log
577577

578-
expected="enqueue retry #1 for \"$contents_oid\" (size: $contents_size): LFS: tq: action \"upload\" expires at"
578+
expected="enqueue retry #1 after 0.25s for \"$contents_oid\" (size: $contents_size): LFS: tq: action \"upload\" expires at"
579579

580580
grep "$expected" push.log
581581
grep "Uploading LFS objects: 100% (1/1), 21 B" push.log

tq/manifest.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,16 @@ import (
1212

1313
const (
1414
defaultMaxRetries = 8
15+
defaultMaxRetryDelay = 10
1516
defaultConcurrentTransfers = 8
1617
)
1718

1819
type Manifest struct {
1920
// maxRetries is the maximum number of retries a single object can
20-
// attempt to make before it will be dropped.
21+
// attempt to make before it will be dropped. maxRetryDelay is the maximum
22+
// time in seconds to wait between retry attempts when using backoff.
2123
maxRetries int
24+
maxRetryDelay int
2225
concurrentTransfers int
2326
basicTransfersOnly bool
2427
standaloneTransferAgent string
@@ -39,6 +42,10 @@ func (m *Manifest) MaxRetries() int {
3942
return m.maxRetries
4043
}
4144

45+
func (m *Manifest) MaxRetryDelay() int {
46+
return m.maxRetryDelay
47+
}
48+
4249
func (m *Manifest) ConcurrentTransfers() int {
4350
return m.concurrentTransfers
4451
}
@@ -77,6 +84,9 @@ func NewManifest(f *fs.Filesystem, apiClient *lfsapi.Client, operation, remote s
7784
if v := git.Int("lfs.transfer.maxretries", 0); v > 0 {
7885
m.maxRetries = v
7986
}
87+
if v := git.Int("lfs.transfer.maxretrydelay", -1); v > -1 {
88+
m.maxRetryDelay = v
89+
}
8090
if v := git.Int("lfs.concurrenttransfers", 0); v > 0 {
8191
m.concurrentTransfers = v
8292
}
@@ -91,6 +101,9 @@ func NewManifest(f *fs.Filesystem, apiClient *lfsapi.Client, operation, remote s
91101
if m.maxRetries < 1 {
92102
m.maxRetries = defaultMaxRetries
93103
}
104+
if m.maxRetryDelay < 1 {
105+
m.maxRetryDelay = defaultMaxRetryDelay
106+
}
94107

95108
if m.concurrentTransfers < 1 {
96109
m.concurrentTransfers = defaultConcurrentTransfers

tq/transfer_queue.go

Lines changed: 50 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,37 +17,36 @@ import (
1717

1818
const (
1919
defaultBatchSize = 100
20+
baseRetryDelayMs = 250
2021
)
2122

2223
type retryCounter struct {
23-
MaxRetries int `git:"lfs.transfer.maxretries"`
24+
MaxRetries int
25+
MaxRetryDelay int
2426

2527
// cmu guards count
2628
cmu sync.Mutex
2729
// count maps OIDs to number of retry attempts
2830
count map[string]int
2931
}
3032

31-
// newRetryCounter instantiates a new *retryCounter. It parses the gitconfig
32-
// value: `lfs.transfer.maxretries`, and falls back to defaultMaxRetries if none
33-
// was provided.
34-
//
35-
// If it encountered an error in Unmarshaling the *config.Configuration, it will
36-
// be returned, otherwise nil.
33+
// newRetryCounter instantiates a new *retryCounter.
3734
func newRetryCounter() *retryCounter {
3835
return &retryCounter{
39-
MaxRetries: defaultMaxRetries,
40-
count: make(map[string]int),
36+
MaxRetries: defaultMaxRetries,
37+
MaxRetryDelay: defaultMaxRetryDelay,
38+
count: make(map[string]int),
4139
}
4240
}
4341

44-
// Increment increments the number of retries for a given OID. It is safe to
45-
// call across multiple goroutines.
46-
func (r *retryCounter) Increment(oid string) {
42+
// Increment increments the number of retries for a given OID and returns the
43+
// new value. It is safe to call across multiple goroutines.
44+
func (r *retryCounter) Increment(oid string) int {
4745
r.cmu.Lock()
4846
defer r.cmu.Unlock()
4947

5048
r.count[oid]++
49+
return r.count[oid]
5150
}
5251

5352
// CountFor returns the current number of retries for a given OID. It is safe to
@@ -66,6 +65,22 @@ func (r *retryCounter) CanRetry(oid string) (int, bool) {
6665
return count, count < r.MaxRetries
6766
}
6867

68+
// ReadyTime returns the time from now when the current retry can occur or the
69+
// zero time if the retry can occur immediately.
70+
func (r *retryCounter) ReadyTime(oid string) time.Time {
71+
count := r.CountFor(oid)
72+
if count < 1 {
73+
return time.Time{}
74+
}
75+
76+
maxDelayMs := 1000 * uint64(r.MaxRetryDelay)
77+
delay := uint64(baseRetryDelayMs) * (1 << uint(count-1))
78+
if delay == 0 || delay > maxDelayMs {
79+
delay = maxDelayMs
80+
}
81+
return time.Now().Add(time.Duration(delay) * time.Millisecond)
82+
}
83+
6984
// batch implements the sort.Interface interface and enables sorting on a slice
7085
// of `*Transfer`s by object size.
7186
//
@@ -295,6 +310,7 @@ func NewTransferQueue(dir Direction, manifest *Manifest, remote string, options
295310
}
296311

297312
q.rc.MaxRetries = q.manifest.maxRetries
313+
q.rc.MaxRetryDelay = q.manifest.maxRetryDelay
298314
q.client.MaxRetries = q.manifest.maxRetries
299315

300316
if q.batchSize <= 0 {
@@ -506,6 +522,24 @@ func (q *TransferQueue) enqueueAndCollectRetriesFor(batch batch) (batch, error)
506522
next := q.makeBatch()
507523
tracerx.Printf("tq: sending batch of size %d", len(batch))
508524

525+
enqueueRetry := func(t *objectTuple, err error, readyTime *time.Time) {
526+
count := q.rc.Increment(t.Oid)
527+
528+
if readyTime == nil {
529+
t.ReadyTime = q.rc.ReadyTime(t.Oid)
530+
} else {
531+
t.ReadyTime = *readyTime
532+
}
533+
delay := time.Until(t.ReadyTime).Seconds()
534+
535+
var errMsg string
536+
if err != nil {
537+
errMsg = fmt.Sprintf(": %s", err)
538+
}
539+
tracerx.Printf("tq: enqueue retry #%d after %.2fs for %q (size: %d)%s", count, delay, t.Oid, t.Size, errMsg)
540+
next = append(next, t)
541+
}
542+
509543
q.meter.Pause()
510544
var bRes *BatchResponse
511545
if q.manifest.standaloneTransferAgent != "" {
@@ -530,14 +564,10 @@ func (q *TransferQueue) enqueueAndCollectRetriesFor(batch batch) (batch, error)
530564
// retried, they will be marked as failed.
531565
for _, t := range batch {
532566
if q.canRetryObject(t.Oid, err) {
533-
q.rc.Increment(t.Oid)
534-
535-
next = append(next, t)
567+
enqueueRetry(t, err, nil)
536568
} else if readyTime, canRetry := q.canRetryObjectLater(t.Oid, err); canRetry {
537-
tracerx.Printf("tq: retrying object %s after %s seconds.", t.Oid, time.Until(readyTime).Seconds())
538569
err = nil
539-
t.ReadyTime = readyTime
540-
next = append(next, t)
570+
enqueueRetry(t, err, &readyTime)
541571
} else {
542572
q.wait.Done()
543573
}
@@ -599,13 +629,8 @@ func (q *TransferQueue) enqueueAndCollectRetriesFor(batch batch) (batch, error)
599629
tr := newTransfer(o, objects.First().Name, objects.First().Path)
600630

601631
if a, err := tr.Rel(q.direction.String()); err != nil {
602-
// XXX(taylor): duplication
603632
if q.canRetryObject(tr.Oid, err) {
604-
q.rc.Increment(tr.Oid)
605-
count := q.rc.CountFor(tr.Oid)
606-
607-
tracerx.Printf("tq: enqueue retry #%d for %q (size: %d): %s", count, tr.Oid, tr.Size, err)
608-
next = append(next, objects.First())
633+
enqueueRetry(objects.First(), err, nil)
609634
} else {
610635
q.errorc <- errors.Errorf("[%v] %v", tr.Name, err)
611636

@@ -624,12 +649,7 @@ func (q *TransferQueue) enqueueAndCollectRetriesFor(batch batch) (batch, error)
624649

625650
retries := q.addToAdapter(bRes.endpoint, toTransfer)
626651
for t := range retries {
627-
q.rc.Increment(t.Oid)
628-
count := q.rc.CountFor(t.Oid)
629-
630-
tracerx.Printf("tq: enqueue retry #%d for %q (size: %d)", count, t.Oid, t.Size)
631-
632-
next = append(next, t)
652+
enqueueRetry(t, nil, nil)
633653
}
634654

635655
return next, nil

tq/transfer_queue_test.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tq
22

33
import (
44
"testing"
5+
"time"
56

67
"github.com/stretchr/testify/assert"
78
)
@@ -10,15 +11,27 @@ func TestManifestDefaultsToFixedRetries(t *testing.T) {
1011
assert.Equal(t, 8, NewManifest(nil, nil, "", "").MaxRetries())
1112
}
1213

14+
func TestManifestDefaultsToFixedRetryDelay(t *testing.T) {
15+
assert.Equal(t, 10, NewManifest(nil, nil, "", "").MaxRetryDelay())
16+
}
17+
1318
func TestRetryCounterDefaultsToFixedRetries(t *testing.T) {
1419
rc := newRetryCounter()
1520
assert.Equal(t, 8, rc.MaxRetries)
1621
}
1722

23+
func TestRetryCounterDefaultsToFixedRetryDelay(t *testing.T) {
24+
rc := newRetryCounter()
25+
assert.Equal(t, 10, rc.MaxRetryDelay)
26+
}
27+
1828
func TestRetryCounterIncrementsObjects(t *testing.T) {
1929
rc := newRetryCounter()
20-
rc.Increment("oid")
30+
assert.Equal(t, 1, rc.Increment("oid"))
2131
assert.Equal(t, 1, rc.CountFor("oid"))
32+
33+
assert.Equal(t, 2, rc.Increment("oid"))
34+
assert.Equal(t, 2, rc.CountFor("oid"))
2235
}
2336

2437
func TestRetryCounterCanNotRetryAfterExceedingRetryCount(t *testing.T) {
@@ -31,6 +44,36 @@ func TestRetryCounterCanNotRetryAfterExceedingRetryCount(t *testing.T) {
3144
assert.False(t, canRetry)
3245
}
3346

47+
func TestRetryCounterDoesNotDelayFirstAttempt(t *testing.T) {
48+
rc := newRetryCounter()
49+
assert.Equal(t, time.Time{}, rc.ReadyTime("oid"))
50+
}
51+
52+
func TestRetryCounterDelaysExponentially(t *testing.T) {
53+
rc := newRetryCounter()
54+
start := time.Now()
55+
56+
rc.Increment("oid")
57+
ready1 := rc.ReadyTime("oid")
58+
assert.GreaterOrEqual(t, int64(ready1.Sub(start)/time.Millisecond), int64(baseRetryDelayMs))
59+
60+
rc.Increment("oid")
61+
ready2 := rc.ReadyTime("oid")
62+
assert.GreaterOrEqual(t, int64(ready2.Sub(start)/time.Millisecond), int64(2*baseRetryDelayMs))
63+
}
64+
65+
func TestRetryCounterLimitsDelay(t *testing.T) {
66+
rc := newRetryCounter()
67+
rc.MaxRetryDelay = 1
68+
69+
for i := 0; i < 4; i++ {
70+
rc.Increment("oid")
71+
}
72+
73+
rt := rc.ReadyTime("oid")
74+
assert.WithinDuration(t, time.Now(), rt, 1*time.Second)
75+
}
76+
3477
func TestBatchSizeReturnsBatchSize(t *testing.T) {
3578
q := NewTransferQueue(
3679
Upload, NewManifest(nil, nil, "", ""), "origin", WithBatchSize(3))

0 commit comments

Comments
 (0)