Fix synchronization bug in repo indexer (#3455)

This commit is contained in:
Ethan Koenig 2018-02-05 00:39:51 -08:00 committed by Lauris BH
parent 17655cdf1b
commit b16c84de7b

View file

@ -5,9 +5,7 @@
package models package models
import ( import (
"io/ioutil" "fmt"
"os"
"path"
"strconv" "strconv"
"strings" "strings"
@ -16,8 +14,6 @@ import (
"code.gitea.io/gitea/modules/indexer" "code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/setting"
"github.com/Unknwon/com"
) )
// RepoIndexerStatus status of a repo's entry in the repo indexer // RepoIndexerStatus status of a repo's entry in the repo indexer
@ -132,7 +128,11 @@ func populateRepoIndexer(maxRepoID int64) {
} }
func updateRepoIndexer(repo *Repository) error { func updateRepoIndexer(repo *Repository) error {
changes, err := getRepoChanges(repo) sha, err := getDefaultBranchSha(repo)
if err != nil {
return err
}
changes, err := getRepoChanges(repo, sha)
if err != nil { if err != nil {
return err return err
} else if changes == nil { } else if changes == nil {
@ -140,12 +140,12 @@ func updateRepoIndexer(repo *Repository) error {
} }
batch := indexer.RepoIndexerBatch() batch := indexer.RepoIndexerBatch()
for _, filename := range changes.UpdatedFiles { for _, update := range changes.Updates {
if err := addUpdate(filename, repo, batch); err != nil { if err := addUpdate(update, repo, batch); err != nil {
return err return err
} }
} }
for _, filename := range changes.RemovedFiles { for _, filename := range changes.RemovedFilenames {
if err := addDelete(filename, repo, batch); err != nil { if err := addDelete(filename, repo, batch); err != nil {
return err return err
} }
@ -153,56 +153,61 @@ func updateRepoIndexer(repo *Repository) error {
if err = batch.Flush(); err != nil { if err = batch.Flush(); err != nil {
return err return err
} }
return updateLastIndexSync(repo) return repo.updateIndexerStatus(sha)
} }
// repoChanges changes (file additions/updates/removals) to a repo // repoChanges changes (file additions/updates/removals) to a repo
type repoChanges struct { type repoChanges struct {
UpdatedFiles []string Updates []fileUpdate
RemovedFiles []string RemovedFilenames []string
}
// fileUpdate pairs a file's path with the sha of the git blob holding the
// contents that should be indexed for it.
type fileUpdate struct {
	Filename, BlobSha string
}
func getDefaultBranchSha(repo *Repository) (string, error) {
stdout, err := git.NewCommand("show-ref", "-s", repo.DefaultBranch).RunInDir(repo.RepoPath())
if err != nil {
return "", err
}
return strings.TrimSpace(stdout), nil
} }
// getRepoChanges returns changes to repo since last indexer update // getRepoChanges returns changes to repo since last indexer update
func getRepoChanges(repo *Repository) (*repoChanges, error) { func getRepoChanges(repo *Repository, revision string) (*repoChanges, error) {
repoWorkingPool.CheckIn(com.ToStr(repo.ID)) if err := repo.getIndexerStatus(); err != nil {
defer repoWorkingPool.CheckOut(com.ToStr(repo.ID))
if err := repo.UpdateLocalCopyBranch(""); err != nil {
return nil, err
} else if !git.IsBranchExist(repo.LocalCopyPath(), repo.DefaultBranch) {
// repo does not have any commits yet, so nothing to update
return nil, nil
} else if err = repo.UpdateLocalCopyBranch(repo.DefaultBranch); err != nil {
return nil, err
} else if err = repo.getIndexerStatus(); err != nil {
return nil, err return nil, err
} }
if len(repo.IndexerStatus.CommitSha) == 0 { if len(repo.IndexerStatus.CommitSha) == 0 {
return genesisChanges(repo) return genesisChanges(repo, revision)
} }
return nonGenesisChanges(repo) return nonGenesisChanges(repo, revision)
} }
func addUpdate(filename string, repo *Repository, batch *indexer.Batch) error { func addUpdate(update fileUpdate, repo *Repository, batch *indexer.Batch) error {
filepath := path.Join(repo.LocalCopyPath(), filename) stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
if stat, err := os.Stat(filepath); err != nil { RunInDir(repo.RepoPath())
if err != nil {
return err return err
} else if stat.Size() > setting.Indexer.MaxIndexerFileSize { }
return nil if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
} else if stat.IsDir() { return fmt.Errorf("Misformatted git cat-file output: %v", err)
// file could actually be a directory, if it is the root of a submodule. } else if int64(size) > setting.Indexer.MaxIndexerFileSize {
// We do not index submodule contents, so don't do anything.
return nil return nil
} }
fileContents, err := ioutil.ReadFile(filepath)
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil { if err != nil {
return err return err
} else if !base.IsTextFile(fileContents) { } else if !base.IsTextFile(fileContents) {
return nil return nil
} }
return batch.Add(indexer.RepoIndexerUpdate{ return batch.Add(indexer.RepoIndexerUpdate{
Filepath: filename, Filepath: update.Filename,
Op: indexer.RepoIndexerOpUpdate, Op: indexer.RepoIndexerOpUpdate,
Data: &indexer.RepoIndexerData{ Data: &indexer.RepoIndexerData{
RepoID: repo.ID, RepoID: repo.ID,
@ -221,42 +226,76 @@ func addDelete(filename string, repo *Repository, batch *indexer.Batch) error {
}) })
} }
// genesisChanges get changes to add repo to the indexer for the first time // parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
func genesisChanges(repo *Repository) (*repoChanges, error) { func parseGitLsTreeOutput(stdout string) ([]fileUpdate, error) {
var changes repoChanges lines := strings.Split(stdout, "\n")
stdout, err := git.NewCommand("ls-files").RunInDir(repo.LocalCopyPath()) updates := make([]fileUpdate, 0, len(lines))
if err != nil { for _, line := range lines {
return nil, err // expect line to be "<mode> <object-type> <object-sha>\t<filename>"
} line = strings.TrimSpace(line)
for _, line := range strings.Split(stdout, "\n") { if len(line) == 0 {
filename := strings.TrimSpace(line)
if len(filename) == 0 {
continue continue
} else if filename[0] == '"' { }
firstSpaceIndex := strings.IndexByte(line, ' ')
if firstSpaceIndex < 0 {
log.Error(4, "Misformatted git ls-tree output: %s", line)
continue
}
tabIndex := strings.IndexByte(line, '\t')
if tabIndex < 42+firstSpaceIndex || tabIndex == len(line)-1 {
log.Error(4, "Misformatted git ls-tree output: %s", line)
continue
}
if objectType := line[firstSpaceIndex+1 : tabIndex-41]; objectType != "blob" {
// submodules appear as commit objects, we do not index submodules
continue
}
blobSha := line[tabIndex-40 : tabIndex]
filename := line[tabIndex+1:]
if filename[0] == '"' {
var err error
filename, err = strconv.Unquote(filename) filename, err = strconv.Unquote(filename)
if err != nil { if err != nil {
return nil, err return nil, err
} }
} }
changes.UpdatedFiles = append(changes.UpdatedFiles, filename) updates = append(updates, fileUpdate{
Filename: filename,
BlobSha: blobSha,
})
} }
return &changes, nil return updates, nil
}
// genesisChanges get changes to add repo to the indexer for the first time
func genesisChanges(repo *Repository, revision string) (*repoChanges, error) {
var changes repoChanges
stdout, err := git.NewCommand("ls-tree", "--full-tree", "-r", revision).
RunInDir(repo.RepoPath())
if err != nil {
return nil, err
}
changes.Updates, err = parseGitLsTreeOutput(stdout)
return &changes, err
} }
// nonGenesisChanges get changes since the previous indexer update // nonGenesisChanges get changes since the previous indexer update
func nonGenesisChanges(repo *Repository) (*repoChanges, error) { func nonGenesisChanges(repo *Repository, revision string) (*repoChanges, error) {
diffCmd := git.NewCommand("diff", "--name-status", diffCmd := git.NewCommand("diff", "--name-status",
repo.IndexerStatus.CommitSha, "HEAD") repo.IndexerStatus.CommitSha, revision)
stdout, err := diffCmd.RunInDir(repo.LocalCopyPath()) stdout, err := diffCmd.RunInDir(repo.RepoPath())
if err != nil { if err != nil {
// previous commit sha may have been removed by a force push, so // previous commit sha may have been removed by a force push, so
// try rebuilding from scratch // try rebuilding from scratch
log.Warn("git diff: %v", err)
if err = indexer.DeleteRepoFromIndexer(repo.ID); err != nil { if err = indexer.DeleteRepoFromIndexer(repo.ID); err != nil {
return nil, err return nil, err
} }
return genesisChanges(repo) return genesisChanges(repo, revision)
} }
var changes repoChanges var changes repoChanges
updatedFilenames := make([]string, 0, 10)
for _, line := range strings.Split(stdout, "\n") { for _, line := range strings.Split(stdout, "\n") {
line = strings.TrimSpace(line) line = strings.TrimSpace(line)
if len(line) == 0 { if len(line) == 0 {
@ -274,23 +313,22 @@ func nonGenesisChanges(repo *Repository) (*repoChanges, error) {
switch status := line[0]; status { switch status := line[0]; status {
case 'M', 'A': case 'M', 'A':
changes.UpdatedFiles = append(changes.UpdatedFiles, filename) updatedFilenames = append(updatedFilenames, filename)
case 'D': case 'D':
changes.RemovedFiles = append(changes.RemovedFiles, filename) changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
default: default:
log.Warn("Unrecognized status: %c (line=%s)", status, line) log.Warn("Unrecognized status: %c (line=%s)", status, line)
} }
} }
return &changes, nil
}
func updateLastIndexSync(repo *Repository) error { cmd := git.NewCommand("ls-tree", "--full-tree", revision, "--")
stdout, err := git.NewCommand("rev-parse", "HEAD").RunInDir(repo.LocalCopyPath()) cmd.AddArguments(updatedFilenames...)
stdout, err = cmd.RunInDir(repo.RepoPath())
if err != nil { if err != nil {
return err return nil, err
} }
sha := strings.TrimSpace(stdout) changes.Updates, err = parseGitLsTreeOutput(stdout)
return repo.updateIndexerStatus(sha) return &changes, err
} }
func processRepoIndexerOperationQueue() { func processRepoIndexerOperationQueue() {