[repograph] Add tracking of commit indexes, branch membership

Change-Id: I01917a10364655477e3d4d95a9e520d65079fe0a
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/234901
Commit-Queue: Eric Boren <borenet@google.com>
Reviewed-by: Kevin Lubick <kjlubick@google.com>
diff --git a/go/git/repograph/graph.go b/go/git/repograph/graph.go
index 0238141..7d70ed5 100644
--- a/go/git/repograph/graph.go
+++ b/go/git/repograph/graph.go
@@ -14,7 +14,9 @@
 
 	"go.skia.org/infra/go/git"
 	"go.skia.org/infra/go/metrics2"
+	"go.skia.org/infra/go/skerr"
 	"go.skia.org/infra/go/sklog"
+	"go.skia.org/infra/go/util"
 	"go.skia.org/infra/go/vcsinfo"
 )
 
@@ -26,12 +28,6 @@
 type Commit struct {
 	*vcsinfo.LongCommit
 	parents []*Commit
-
-	// HistoryLen is the number of commits in the longest line of history
-	// reachable from this Commit. It is only used during Recurse as an
-	// estimate of the number of commits which might be visited, to prevent
-	// excessive resizing of the visited map.
-	HistoryLen int
 }
 
 // Parents returns the parents of this commit.
@@ -52,8 +48,11 @@
 // 	sklog.Info(c.Hash)
 // 	return nil
 // })
+//
+// If the passed-in function returns no errors other than ErrStopRecursing,
+// Recurse() will never return an error.
 func (c *Commit) Recurse(f func(*Commit) error) error {
-	return c.recurse(f, make(map[*Commit]bool, c.HistoryLen))
+	return c.recurse(f, make(map[*Commit]bool, c.Index+1))
 }
 
 // recurse is a helper function used by Recurse.
@@ -93,10 +92,28 @@
 	return nil
 }
 
+// RecurseFirstParent is like Recurse, but it only follows the first parent of
+// each commit. Like Recurse, if the passed-in function returns no errors other
+// than ErrStopRecursing, RecurseFirstParent() will never return an error.
+func (c *Commit) RecurseFirstParent(f func(*Commit) error) error {
+	for {
+		if err := f(c); err == ErrStopRecursing {
+			return nil
+		} else if err != nil {
+			return err
+		}
+		if len(c.parents) == 0 {
+			return nil
+		} else {
+			c = c.parents[0]
+		}
+	}
+}
+
 // AllCommits returns the hashes of all commits reachable from this Commit, in
 // reverse topological order.
 func (c *Commit) AllCommits() ([]string, error) {
-	commits := make(map[*Commit]bool, c.HistoryLen)
+	commits := make(map[*Commit]bool, c.Index+1)
 	if err := c.recurse(func(c *Commit) error {
 		return nil
 	}, commits); err != nil {
@@ -162,7 +179,8 @@
 	// Update the local view of the repo.
 	Update(context.Context) error
 
-	// Return the details for the given commit.
+	// Return the details for the given commit. The Graph takes ownership
+	// of the returned LongCommit and may modify it.
 	Details(context.Context, string) (*vcsinfo.LongCommit, error)
 
 	// Return the branch heads, as of the last call to Update().
@@ -179,8 +197,9 @@
 	commits  map[string]*Commit
 	graphMtx sync.RWMutex
 
-	updateMtx sync.Mutex
-	repoImpl  RepoImpl
+	updateMtx     sync.Mutex
+	repoImpl      RepoImpl
+	trackBranches bool
 }
 
 // NewLocalGraph returns a Graph instance, creating a git.Repo from the repoUrl
@@ -216,11 +235,50 @@
 	return len(r.commits)
 }
 
+// EnableBranchTracking enables tracking of commit membership in branches. A
+// commit is considered a member of a branch if and only if it is reachable by
+// tracing the *first parent* lineage backward from the branch head. Notably,
+// this will result in different branch membership than would be obtained by
+// `git branch --all --list --contains <hash` as is used elsewhere. This process
+// is somewhat expensive and is thus not enabled by default.
+func (r *Graph) EnableBranchTracking() {
+	r.updateMtx.Lock()
+	defer r.updateMtx.Unlock()
+	r.graphMtx.Lock()
+	defer r.graphMtx.Unlock()
+	r.trackBranches = true
+	r.updateBranchInfo()
+}
+
+// updateBranchInfo updates the membership of all commits in all branches. The
+// caller must hold r.updateMtx AND r.graphMtx.
+func (r *Graph) updateBranchInfo() {
+	// TODO(borenet): We have to obtain this information from scratch when
+	// EnableBranchTracking() is called, but we should be able to update it
+	// incrementally in updateFrom() in the common case. It seemed that
+	// there were enough edge cases that updateFrom() would become
+	// significantly more complicated, so I did not attempt that.
+	membership := make(map[*Commit]util.StringSet, len(r.commits))
+	for _, branch := range r.branches {
+		_ = r.commits[branch.Head].RecurseFirstParent(func(c *Commit) error {
+			m := membership[c]
+			if m == nil {
+				m = util.NewStringSet()
+				membership[c] = m
+			}
+			m[branch.Name] = true
+			return nil
+		})
+	}
+	for _, c := range r.commits {
+		c.Branches = membership[c]
+	}
+}
+
 // addCommit adds the given commit to the Graph. Requires that the commit's
 // parents are already in the Graph. Assumes that the caller holds r.graphMtx.
 func (r *Graph) addCommit(lc *vcsinfo.LongCommit) error {
 	defer metrics2.FuncTimer().Stop()
-	maxParentHistoryLen := 0
 	var parents []*Commit
 	if len(lc.Parents) > 0 {
 		for _, h := range lc.Parents {
@@ -232,8 +290,8 @@
 				return fmt.Errorf("repograph.Graph: Could not find parent commit %q", h)
 			}
 			parents = append(parents, p)
-			if p.HistoryLen > maxParentHistoryLen {
-				maxParentHistoryLen = p.HistoryLen
+			if lc.Index == 0 {
+				lc.Index = p.Index + 1
 			}
 		}
 	}
@@ -241,7 +299,6 @@
 	c := &Commit{
 		LongCommit: lc,
 		parents:    parents,
-		HistoryLen: maxParentHistoryLen + 1,
 	}
 	r.commits[c.Hash] = c
 	return nil
@@ -417,6 +474,9 @@
 	defer r.graphMtx.Unlock()
 	r.branches = newGraph.branches
 	r.commits = newGraph.commits
+	if r.trackBranches {
+		r.updateBranchInfo()
+	}
 	return added, removed, nil
 }
 
@@ -701,6 +761,39 @@
 	return rv
 }
 
+// LogLinear is equivalent to "git log --first-parent --ancestry-path from..to",
+// ie. it only returns commits which are on the direct path from A to B, and
+// only on the "main" branch. This is as opposed to "git log from..to" which
+// returns all commits which are ancestors of 'to' but not 'from'. The 'from'
+// commit may be the empty string, in which case all commits in the first-parent
+// line are returned.
+func (r *Graph) LogLinear(from, to string) ([]*Commit, error) {
+	fromCommit := r.Get(from)
+	if fromCommit == nil && from != "" {
+		return nil, skerr.Fmt("No such commit %q", from)
+	}
+	toCommit := r.Get(to)
+	if toCommit == nil {
+		return nil, skerr.Fmt("No such commit %q", to)
+	}
+	rv := []*Commit{}
+	found := false
+	_ = toCommit.RecurseFirstParent(func(c *Commit) error {
+		if c == fromCommit {
+			found = true
+			return ErrStopRecursing
+		}
+		rv = append(rv, c)
+		return nil
+	})
+	if !found && from != "" {
+		// If the "from" commit was not found, then there is no ancestry
+		// path from "from" to "to".
+		return nil, nil
+	}
+	return rv, nil
+}
+
 // IsAncestor returns true iff A is an ancestor of B, where A and B are either
 // commit hashes or branch names.
 func (r *Graph) IsAncestor(a, b string) (bool, error) {
@@ -818,6 +911,9 @@
 		newGraph := newGraphs[repoUrl]
 		r.branches = newGraph.branches
 		r.commits = newGraph.commits
+		if r.trackBranches {
+			r.updateBranchInfo()
+		}
 	}
 	return added, removed, nil
 }
diff --git a/go/git/repograph/local_test.go b/go/git/repograph/local_test.go
index 321f1bb..3a823a6 100644
--- a/go/git/repograph/local_test.go
+++ b/go/git/repograph/local_test.go
@@ -57,6 +57,13 @@
 	shared_tests.TestRecurseAllBranches(t, ctx, g, repo, ud)
 }
 
+func TestLogLinearRepo(t *testing.T) {
+	unittest.MediumTest(t)
+	ctx, g, repo, ud, cleanup := setupRepo(t)
+	defer cleanup()
+	shared_tests.TestLogLinear(t, ctx, g, repo, ud)
+}
+
 func TestUpdateHistoryChangedRepo(t *testing.T) {
 	unittest.MediumTest(t)
 	ctx, g, repo, ud, cleanup := setupRepo(t)
@@ -77,3 +84,10 @@
 	defer cleanup()
 	shared_tests.TestRevList(t, ctx, g, repo, ud)
 }
+
+func TestBranchMembershipRepo(t *testing.T) {
+	unittest.MediumTest(t)
+	ctx, g, repo, ud, cleanup := setupRepo(t)
+	defer cleanup()
+	shared_tests.TestBranchMembership(t, ctx, g, repo, ud)
+}
diff --git a/go/git/repograph/shared_tests/shared_tests.go b/go/git/repograph/shared_tests/shared_tests.go
index f662c77..e4ee171 100644
--- a/go/git/repograph/shared_tests/shared_tests.go
+++ b/go/git/repograph/shared_tests/shared_tests.go
@@ -7,6 +7,7 @@
 	"os"
 	"path"
 	"sort"
+	"strings"
 
 	assert "github.com/stretchr/testify/require"
 	"go.skia.org/infra/go/deepequal"
@@ -214,6 +215,13 @@
 	assert.Equal(t, []*repograph.Commit(nil), c1.GetParents())
 	assert.Equal(t, []*repograph.Commit{c2}, c3.GetParents())
 
+	// Assert that each of the commits has the correct index.
+	assert.Equal(t, 0, c1.Index)
+	assert.Equal(t, 1, c2.Index)
+	assert.Equal(t, 2, c3.Index)
+	assert.Equal(t, 2, c4.Index)
+	assert.Equal(t, 3, c5.Index)
+
 	// Ensure that we can start in an empty dir and check out from scratch properly.
 	tmp2, err := ioutil.TempDir("", "")
 	assert.NoError(t, err)
@@ -358,6 +366,50 @@
 	assert.True(t, gotCommits[c2])
 }
 
+func TestLogLinear(t sktest.TestingT, ctx context.Context, g *git_testutils.GitBuilder, repo *repograph.Graph, rf RepoImplRefresher) {
+	commits := GitSetup(t, ctx, g, repo, rf)
+
+	c1 := commits[0]
+	c2 := commits[1]
+	c3 := commits[2]
+	c4 := commits[3]
+	c5 := commits[4]
+
+	gitdir := git.GitDir(g.Dir())
+	test := func(from, to string, checkAgainstGit bool, expect ...*repograph.Commit) {
+		if checkAgainstGit {
+			// Ensure that our expectations match actual git results.
+			cmd := []string{"--first-parent"}
+			if from == "" {
+				cmd = append(cmd, to)
+			} else {
+				cmd = append(cmd, "--ancestry-path", fmt.Sprintf("%s..%s", from, to))
+			}
+			hashes, err := gitdir.RevList(ctx, cmd...)
+			assert.NoError(t, err)
+			assert.Equal(t, len(hashes), len(expect))
+			for i, h := range hashes {
+				assert.Equal(t, h, expect[i].Hash)
+			}
+		}
+
+		// Ensure that we get the expected results from the Graph.
+		actual, err := repo.LogLinear(from, to)
+		assert.NoError(t, err)
+		deepequal.AssertDeepEqual(t, expect, actual)
+	}
+
+	// Get the full linear history from c5.
+	test("", c5.Hash, true, c5, c4, c2, c1)
+	// Get the linear history from c1 to c5. Like "git log", we don't
+	// include the "from" commit in the results.
+	test(c1.Hash, c5.Hash, true, c5, c4, c2)
+	// c3 is not reachable via first-parents from c5. For some reason, git
+	// actually returns c5.Hash, even though c5 has c4 as its first parent.
+	// Ignore the check against git in this case.
+	test(c3.Hash, c5.Hash, false)
+}
+
 func TestUpdateHistoryChanged(t sktest.TestingT, ctx context.Context, g *git_testutils.GitBuilder, repo *repograph.Graph, rf RepoImplRefresher) {
 	commits := GitSetup(t, ctx, g, repo, rf)
 
@@ -671,3 +723,87 @@
 	check(commits[2], commits[4], commits[3:5])
 	check(commits[3], commits[4], []string{commits[2], commits[4]})
 }
+
+func TestBranchMembership(t sktest.TestingT, ctx context.Context, gb *git_testutils.GitBuilder, repo *repograph.Graph, rf RepoImplRefresher) {
+	commits := GitSetup(t, ctx, gb, repo, rf)
+	c1 := commits[0]
+	c2 := commits[1]
+	c3 := commits[2]
+	c4 := commits[3]
+	c5 := commits[4]
+	test := func(c *repograph.Commit, branches ...string) {
+		sklog.Errorf("%d: %s", c.Index, c.Hash)
+		sklog.Errorf("Expect:\n%s", strings.Join(branches, "\n"))
+		actualStr := ""
+		for branch := range c.Branches {
+			actualStr += branch + "\n"
+		}
+		sklog.Errorf("Actual:\n%s", actualStr)
+		assert.Equal(t, len(branches), len(c.Branches))
+		for _, b := range branches {
+			assert.True(t, c.Branches[b])
+		}
+	}
+
+	// Branches should be nil at first.
+	for _, c := range commits {
+		assert.Nil(t, c.Branches)
+	}
+
+	// Enable branch tracking. Ensure that all commits were updated with the
+	// correct branch membership.
+	repo.EnableBranchTracking()
+	test(c1, "master", "branch2")
+	test(c2, "master", "branch2")
+	test(c3, "branch2") // c3 is reachable from master, but not via first-parent.
+	test(c4, "master")
+	test(c5, "master")
+
+	// Ensure that subsequent calls to Update() cause the branch membership
+	// to be updated.
+	gb.CreateBranchTrackBranch(ctx, "b3", "master")
+	rf.Refresh()
+	assert.NoError(t, repo.Update(ctx))
+	test(c1, "master", "branch2", "b3")
+	test(c2, "master", "branch2", "b3")
+	test(c3, "branch2") // c3 is reachable from b3, but not via first-parent.
+	test(c4, "master", "b3")
+	test(c5, "master", "b3")
+
+	// Reset b3 to branch2.
+	gb.Reset(ctx, "--hard", "branch2")
+	rf.Refresh()
+	assert.NoError(t, repo.Update(ctx))
+	test(c1, "master", "branch2", "b3")
+	test(c2, "master", "branch2", "b3")
+	test(c3, "branch2", "b3")
+	test(c4, "master")
+	test(c5, "master")
+
+	// Delete b3. We should get the same results as before it was added.
+	gb.CheckoutBranch(ctx, "master")
+	gb.UpdateRef(ctx, "-d", "refs/heads/b3")
+	rf.Refresh()
+	assert.NoError(t, repo.Update(ctx))
+	test(c1, "master", "branch2")
+	test(c2, "master", "branch2")
+	test(c3, "branch2")
+	test(c4, "master")
+	test(c5, "master")
+
+	// Ensure that repograph.Map updates branch membership as well.
+	m := repograph.Map{"dummy": repo}
+	c6hash := gb.CommitGen(ctx, "blah")
+	c6details, err := git.GitDir(gb.Dir()).Details(ctx, c6hash)
+	assert.NoError(t, err)
+	rf.Refresh(c6details)
+	assert.NoError(t, m.Update(ctx))
+	c6 := repo.Get(c6hash)
+	assert.NotNil(t, c6)
+	test(c1, "master", "branch2")
+	test(c2, "master", "branch2")
+	test(c3, "branch2")
+	test(c4, "master")
+	test(c5, "master")
+	test(c6, "master")
+}
diff --git a/go/gitstore/bt_gitstore/repo_impl_test.go b/go/gitstore/bt_gitstore/repo_impl_test.go
index 50a695b..3494e18 100644
--- a/go/gitstore/bt_gitstore/repo_impl_test.go
+++ b/go/gitstore/bt_gitstore/repo_impl_test.go
@@ -119,6 +119,13 @@
 	repograph_shared_tests.TestRecurseAllBranches(t, ctx, g, repo, ud)
 }
 
+func TestLogLinearBTGitStore(t *testing.T) {
+	unittest.LargeTest(t)
+	ctx, g, repo, ud, cleanup := setupGitStore(t)
+	defer cleanup()
+	repograph_shared_tests.TestLogLinear(t, ctx, g, repo, ud)
+}
+
 func TestUpdateHistoryChangedBTGitStore(t *testing.T) {
 	unittest.LargeTest(t)
 	ctx, g, repo, ud, cleanup := setupGitStore(t)
@@ -139,3 +146,10 @@
 	defer cleanup()
 	repograph_shared_tests.TestRevList(t, ctx, g, repo, ud)
 }
+
+func TestBranchMembershipBTGitStore(t *testing.T) {
+	unittest.LargeTest(t)
+	ctx, g, repo, ud, cleanup := setupGitStore(t)
+	defer cleanup()
+	repograph_shared_tests.TestBranchMembership(t, ctx, g, repo, ud)
+}
diff --git a/go/gitstore/repo_impl_test.go b/go/gitstore/repo_impl_test.go
index 596dec1..2d1233f 100644
--- a/go/gitstore/repo_impl_test.go
+++ b/go/gitstore/repo_impl_test.go
@@ -166,6 +166,13 @@
 	repograph_shared_tests.TestRecurseAllBranches(t, ctx, g, repo, ud)
 }
 
+func TestLogLinearGitStore(t *testing.T) {
+	unittest.LargeTest(t)
+	ctx, g, repo, ud, cleanup := setupGitStore(t)
+	defer cleanup()
+	repograph_shared_tests.TestLogLinear(t, ctx, g, repo, ud)
+}
+
 func TestUpdateHistoryChangedGitStore(t *testing.T) {
 	unittest.LargeTest(t)
 	ctx, g, repo, ud, cleanup := setupGitStore(t)
@@ -186,3 +193,10 @@
 	defer cleanup()
 	repograph_shared_tests.TestRevList(t, ctx, g, repo, ud)
 }
+
+func TestBranchMembershipGitStore(t *testing.T) {
+	unittest.LargeTest(t)
+	ctx, g, repo, ud, cleanup := setupGitStore(t)
+	defer cleanup()
+	repograph_shared_tests.TestBranchMembership(t, ctx, g, repo, ud)
+}
diff --git a/go/vcsinfo/types.go b/go/vcsinfo/types.go
index 593ff23..857848e 100644
--- a/go/vcsinfo/types.go
+++ b/go/vcsinfo/types.go
@@ -16,9 +16,9 @@
 	MaxTime = time.Unix(9999999999, 0)
 )
 
-// IndexCommit is information about a commit that includes the offset from
-// the first commit in the repository. The first commit in the branch has Index 0.
-// Usually the indexing makes most sense the commits on a branch in first-parent order.
+// IndexCommit is information about a commit that includes the commit's index
+// in the linear ancestry path obtained by following each commit's first parent
+// backward in history. The first commit in a given branch has Index 0.
 type IndexCommit struct {
 	Hash      string
 	Index     int
@@ -35,10 +35,16 @@
 // LongCommit gives more detailed information about a commit.
 type LongCommit struct {
 	*ShortCommit
-	Parents   []string        `json:"parent"`
-	Body      string          `json:"body"`
-	Timestamp time.Time       `json:"timestamp"`
-	Branches  map[string]bool `json:"-"`
+	Parents   []string  `json:"parent"`
+	Body      string    `json:"body"`
+	Timestamp time.Time `json:"timestamp"`
+	// Index is this Commit's index in the linear ancestry path obtained by
+	// following each commit's first parent backward in history. The first
+	// commit in a given branch has Index 0. This field is not set by
+	// default.
+	Index int `json:"-"`
+	// Branches indicates which branches can reach this commit.
+	Branches map[string]bool `json:"-"`
 }
 
 func NewLongCommit() *LongCommit {