From 19d4531d087e6dd291f746db859876fc28ccd5d6 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 30 Mar 2021 21:13:20 +0100 Subject: [PATCH 1/3] Speed up `enry.IsVendor` `enry.IsVendor` is kinda slow as it simply iterates across all regexps. This PR ajdusts the regexps to combine them to make this process a little quicker. Related #15143 Signed-off-by: Andrew Thornton --- modules/analyze/vendor.go | 70 ++++++++++++++++++++++ modules/git/repo_language_stats_gogit.go | 2 +- modules/git/repo_language_stats_nogogit.go | 2 +- modules/indexer/code/bleve.go | 2 +- modules/indexer/code/elastic_search.go | 2 +- 5 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 modules/analyze/vendor.go diff --git a/modules/analyze/vendor.go b/modules/analyze/vendor.go new file mode 100644 index 0000000000000..dc0f861f0b455 --- /dev/null +++ b/modules/analyze/vendor.go @@ -0,0 +1,70 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package analyze + +import ( + "regexp" + "sort" + "strings" + + "github.com/go-enry/go-enry/v2/data" +) + +var isVendorRegExp *regexp.Regexp + +func init() { + matchers := data.VendorMatchers + + caretStrings := make([]string, 0, 10) + caretShareStrings := make([]string, 0, 10) + + matcherStrings := make([]string, 0, len(matchers)) + for _, matcher := range matchers { + str := matcher.String() + if str[0] == '^' { + caretStrings = append(caretStrings, str[1:]) + } else if str[0:5] == "(^|/)" { + caretShareStrings = append(caretShareStrings, str[5:]) + } else { + matcherStrings = append(matcherStrings, matcher.String()) + } + } + + sort.Strings(caretShareStrings) + sort.Strings(caretStrings) + sort.Strings(matcherStrings) + + sb := &strings.Builder{} + sb.WriteString("(?:^(?:") + sb.WriteString(caretStrings[0]) + for _, matcher := range caretStrings[1:] { + sb.WriteString(")|(?:") + sb.WriteString(matcher) + } + sb.WriteString("))") + sb.WriteString("|") + sb.WriteString("(?:(?:^|/)(?:") + sb.WriteString(caretShareStrings[0]) + for _, matcher := range caretShareStrings[1:] { + sb.WriteString(")|(?:") + sb.WriteString(matcher) + } + sb.WriteString("))") + sb.WriteString("|") + sb.WriteString("(?:") + sb.WriteString(matcherStrings[0]) + for _, matcher := range matcherStrings[1:] { + sb.WriteString(")|(?:") + sb.WriteString(matcher) + } + sb.WriteString(")") + combined := sb.String() + isVendorRegExp = regexp.MustCompile(combined) +} + +// IsVendor returns whether or not path is a vendor path. +func IsVendor(path string) bool { + return isVendorRegExp.MatchString(path) +} diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go index b5a235921c8ae..20a7b061f2107 100644 --- a/modules/git/repo_language_stats_gogit.go +++ b/modules/git/repo_language_stats_gogit.go @@ -43,7 +43,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err sizes := make(map[string]int64) err = tree.Files().ForEach(func(f *object.File) error { - if f.Size == 0 || enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || + if f.Size == 0 || analyze.IsVendor(f.Name) || enry.IsDotFile(f.Name) || enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { return nil } diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index a929d7953b54b..3f197f8d74e9d 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -67,7 +67,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err for _, f := range entries { contentBuf.Reset() content = contentBuf.Bytes() - if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) || + if f.Size() == 0 || analyze.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) || enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) { continue } diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go index 573ea8b88cbc4..416adeea74f2c 100644 --- a/modules/indexer/code/bleve.go +++ b/modules/indexer/code/bleve.go @@ -178,7 +178,7 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) { func (b *BleveIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { // Ignore vendored files in code search - if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { return nil } diff --git a/modules/indexer/code/elastic_search.go b/modules/indexer/code/elastic_search.go index 5327eb1e51e9d..ebb7910fdcb40 100644 --- a/modules/indexer/code/elastic_search.go +++ b/modules/indexer/code/elastic_search.go @@ -177,7 +177,7 @@ func (b *ElasticSearchIndexer) init() (bool, error) { func (b *ElasticSearchIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) { // Ignore vendored files in code search - if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { return nil, nil } From ef156246150b516bceaeb54c31fce37bc34e6bb8 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Wed, 31 Mar 2021 18:22:06 +0100 Subject: [PATCH 2/3] add testcase as per @lunny Signed-off-by: Andrew Thornton --- modules/analyze/vendor_test.go | 42 ++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 modules/analyze/vendor_test.go diff --git a/modules/analyze/vendor_test.go b/modules/analyze/vendor_test.go new file mode 100644 index 0000000000000..2784e49d34604 --- /dev/null +++ b/modules/analyze/vendor_test.go @@ -0,0 +1,42 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package analyze + +import "testing" + +func TestIsVendor(t *testing.T) { + tests := []struct { + path string + want bool + }{ + {"cache/", true}, + {"random/cache/", true}, + {"cache", false}, + {"dependencies/", true}, + {"Dependencies/", true}, + {"dependency/", false}, + {"dist/", true}, + {"dist", false}, + {"random/dist/", true}, + {"random/dist", false}, + {"deps/", true}, + {"configure", true}, + {"a/configure", true}, + {"config.guess", true}, + {"config.guess/", false}, + {".vscode/", true}, + {"doc/_build/", true}, + {"a/docs/_build/", true}, + {"a/dasdocs/_build-vsdoc.js", true}, + {"a/dasdocs/_build-vsdoc.j", false}, + } + for _, tt := range tests { + t.Run(tt.path, func(t *testing.T) { + if got := IsVendor(tt.path); got != tt.want { + t.Errorf("IsVendor() = %v, want %v", got, tt.want) + } + }) + } +} From ff64a6f19c74cc2947923de30d4604894e23534d Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Wed, 31 Mar 2021 19:49:48 +0100 Subject: [PATCH 3/3] use the already made str Signed-off-by: Andrew Thornton --- modules/analyze/vendor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/analyze/vendor.go b/modules/analyze/vendor.go index dc0f861f0b455..12ae8dbd8075c 100644 --- a/modules/analyze/vendor.go +++ b/modules/analyze/vendor.go @@ -28,7 +28,7 @@ func init() { } else if str[0:5] == "(^|/)" { caretShareStrings = append(caretShareStrings, str[5:]) } else { - matcherStrings = append(matcherStrings, matcher.String()) + matcherStrings = append(matcherStrings, str) } }