Skip to content

Commit 608bc8d

Browse files
committed
lfs: find invalid pointers
In the future, we'll want to support detecting various problems with pointers. These fall into two types: pointers which are non-canonical and files which should be pointers but are not. Our existing scanning functions are not well suited to this, unfortunately, so we add some additional functions. We first scan all of the commits in the range we want and then, having found their object IDs, call git ls-tree on each commit to enumerate every item in that commit's root tree. We accumulate the patterns in every .gitattributes file we find, and we keep track of every other file we process, checking small files for being a pointer. Once we've processed the entire tree, we compute the set of patterns from the accumulated .gitattributes files and check each file against it. If the file matches the patterns and is a pointer, we emit the pointer to our callback, and if it matches the patterns but is not a pointer, then we emit an error indicating that it should have been a pointer.
1 parent 6bfbde8 commit 608bc8d

File tree

3 files changed

+171
-0
lines changed

3 files changed

+171
-0
lines changed

lfs/gitscanner.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,19 @@ func (s *GitScanner) ScanRef(ref string, cb GitScannerFoundPointer) error {
159159
return scanLeftRightToChan(s, callback, ref, "", s.cfg.GitEnv(), s.cfg.OSEnv(), opts)
160160
}
161161

162+
// ScanRefByTree scans through all trees in the current ref.
163+
func (s *GitScanner) ScanRefByTree(ref string, cb GitScannerFoundPointer) error {
164+
callback, err := firstGitScannerCallback(cb, s.FoundPointer)
165+
if err != nil {
166+
return err
167+
}
168+
169+
opts := s.opts(ScanRefsMode)
170+
opts.SkipDeletedBlobs = true
171+
opts.CommitsOnly = true
172+
return scanRefsByTree(s, callback, []string{ref}, []string{}, s.cfg.GitEnv(), s.cfg.OSEnv(), opts)
173+
}
174+
162175
// ScanAll scans through all objects in the git repository.
163176
func (s *GitScanner) ScanAll(cb GitScannerFoundPointer) error {
164177
callback, err := firstGitScannerCallback(cb, s.FoundPointer)
@@ -257,6 +270,7 @@ type ScanRefsOptions struct {
257270
ScanMode ScanningMode
258271
RemoteName string
259272
SkipDeletedBlobs bool
273+
CommitsOnly bool
260274
skippedRefs []string
261275
nameMap map[string]string
262276
mutex *sync.Mutex

lfs/gitscanner_refs.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package lfs
22

33
import (
44
"encoding/hex"
5+
"sync"
56

67
"github.com/git-lfs/git-lfs/config"
78
"github.com/git-lfs/git-lfs/git"
@@ -105,6 +106,45 @@ func scanMultiLeftRightToChan(scanner *GitScanner, pointerCb GitScannerFoundPoin
105106
return scanRefsToChan(scanner, pointerCb, []string{refLeft}, bases, gitEnv, osEnv, opt)
106107
}
107108

109+
// scanRefsByTree scans through all commits reachable by refs contained in
110+
// "include" and not reachable by any refs included in "exclude" and invokes
111+
// the provided callback for each pointer file, valid or invalid, that it finds.
112+
// Reports unique oids once only, not multiple times if >1 file uses the same content
113+
func scanRefsByTree(scanner *GitScanner, pointerCb GitScannerFoundPointer, include, exclude []string, gitEnv, osEnv config.Environment, opt *ScanRefsOptions) error {
114+
if opt == nil {
115+
panic("no scan ref options")
116+
}
117+
118+
revs, err := revListShas(include, exclude, opt)
119+
if err != nil {
120+
return err
121+
}
122+
123+
errchan := make(chan error, 20) // multiple errors possible
124+
wg := &sync.WaitGroup{}
125+
126+
for r := range revs.Results {
127+
wg.Add(1)
128+
go func(rev string) {
129+
defer wg.Done()
130+
err := runScanTreeForPointers(pointerCb, rev, gitEnv, osEnv)
131+
if err != nil {
132+
errchan <- err
133+
}
134+
}(r)
135+
}
136+
137+
wg.Wait()
138+
close(errchan)
139+
for err := range errchan {
140+
if err != nil {
141+
return err
142+
}
143+
}
144+
145+
return revs.Wait()
146+
}
147+
108148
// revListShas uses git rev-list to return the list of object sha1s
109149
// for the given ref. If all is true, ref is ignored. It returns a
110150
// channel from which sha1 strings can be read.
@@ -116,6 +156,7 @@ func revListShas(include, exclude []string, opt *ScanRefsOptions) (*StringChanne
116156
SkippedRefs: opt.skippedRefs,
117157
Mutex: opt.mutex,
118158
Names: opt.nameMap,
159+
CommitsOnly: opt.CommitsOnly,
119160
})
120161

121162
if err != nil {

lfs/gitscanner_tree.go

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@ package lfs
33
import (
44
"fmt"
55
"io/ioutil"
6+
"path"
7+
"path/filepath"
68

79
"github.com/git-lfs/git-lfs/config"
10+
"github.com/git-lfs/git-lfs/errors"
811
"github.com/git-lfs/git-lfs/filepathfilter"
912
"github.com/git-lfs/git-lfs/git"
13+
"github.com/git-lfs/git-lfs/git/gitattr"
1014
)
1115

1216
func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error {
@@ -120,3 +124,115 @@ func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChann
120124

121125
return NewTreeBlobChannelWrapper(blobs, errchan), nil
122126
}
127+
128+
func catFileBatchTreeForPointers(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.Environment) (map[string]*WrappedPointer, *filepathfilter.Filter, error) {
129+
pscanner, err := NewPointerScanner(gitEnv, osEnv)
130+
if err != nil {
131+
return nil, nil, err
132+
}
133+
oscanner, err := git.NewObjectScanner(gitEnv, osEnv)
134+
if err != nil {
135+
return nil, nil, err
136+
}
137+
138+
pointers := make(map[string]*WrappedPointer)
139+
140+
paths := make([]git.AttributePath, 0)
141+
processor := gitattr.NewMacroProcessor()
142+
143+
hasNext := true
144+
for t := range treeblobs.Results {
145+
if path.Base(t.Filename) == ".gitattributes" {
146+
hasNext = oscanner.Scan(t.Oid)
147+
148+
if rdr := oscanner.Contents(); rdr != nil {
149+
paths = append(paths, git.AttrPathsFromReader(
150+
processor,
151+
t.Filename,
152+
"",
153+
rdr,
154+
t.Filename == ".gitattributes", // Read macros from the top-level attributes
155+
)...)
156+
}
157+
158+
if err := oscanner.Err(); err != nil {
159+
return nil, nil, err
160+
}
161+
} else if t.Size < blobSizeCutoff {
162+
hasNext = pscanner.Scan(t.Oid)
163+
164+
// It's intentional that we insert nil for
165+
// non-pointers; we want to keep track of them
166+
// as well as pointers.
167+
p := pscanner.Pointer()
168+
if p != nil {
169+
p.Name = t.Filename
170+
}
171+
pointers[t.Filename] = p
172+
173+
if err := pscanner.Err(); err != nil {
174+
return nil, nil, err
175+
}
176+
} else {
177+
pointers[t.Filename] = nil
178+
}
179+
180+
if !hasNext {
181+
break
182+
}
183+
}
184+
185+
// If the scanner quit early, we may still have treeblobs to
186+
// read, so waiting for it to close will cause a deadlock.
187+
if hasNext {
188+
// Deal with nested error from incoming treeblobs
189+
err := treeblobs.Wait()
190+
if err != nil {
191+
return nil, nil, err
192+
}
193+
}
194+
195+
if err = pscanner.Close(); err != nil {
196+
return nil, nil, err
197+
}
198+
if err = oscanner.Close(); err != nil {
199+
return nil, nil, err
200+
}
201+
202+
patterns := make([]filepathfilter.Pattern, 0, len(paths))
203+
for _, path := range paths {
204+
// Convert all separators to `/` before creating a pattern to
205+
// avoid characters being escaped in situations like `subtree\*.md`
206+
patterns = append(patterns, filepathfilter.NewPattern(filepath.ToSlash(path.Path), filepathfilter.Strict(true)))
207+
}
208+
209+
return pointers, filepathfilter.NewFromPatterns(patterns, nil), nil
210+
}
211+
212+
func runScanTreeForPointers(cb GitScannerFoundPointer, tree string, gitEnv, osEnv config.Environment) error {
213+
treeShas, err := lsTreeBlobs(tree, func(t *git.TreeBlob) bool {
214+
return t != nil
215+
})
216+
if err != nil {
217+
return err
218+
}
219+
220+
pointers, filter, err := catFileBatchTreeForPointers(treeShas, gitEnv, osEnv)
221+
if err != nil {
222+
return err
223+
}
224+
225+
for name, p := range pointers {
226+
// This file matches the patterns in .gitattributes, so it
227+
// should be a pointer. If it is not, then it is a plain Git
228+
// blob, which we report as an error.
229+
if filter.Allows(name) {
230+
if p == nil {
231+
cb(nil, errors.NewPointerScanError(errors.NewNotAPointerError(nil), tree, name))
232+
} else {
233+
cb(p, nil)
234+
}
235+
}
236+
}
237+
return nil
238+
}

0 commit comments

Comments
 (0)