// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package elasticsearch

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"strconv"
	"strings"

	repo_model "code.gitea.io/gitea/models/repo"
	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/charset"
	"code.gitea.io/gitea/modules/git"
	"code.gitea.io/gitea/modules/git/gitcmd"
	"code.gitea.io/gitea/modules/gitrepo"
	"code.gitea.io/gitea/modules/indexer"
	"code.gitea.io/gitea/modules/indexer/code/internal"
	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
	inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
	"code.gitea.io/gitea/modules/json"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/typesniffer"
	"code.gitea.io/gitea/modules/util"

	"github.com/go-enry/go-enry/v2"
	"github.com/olivere/elastic/v7"
)

const (
	esRepoIndexerLatestVersion = 3
	// multi-match-types, currently only 2 types are used
	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
	esMultiMatchTypeBestFields   = "best_fields"
	esMultiMatchTypePhrasePrefix = "phrase_prefix"
)

var _ internal.Indexer = &Indexer{}

// Indexer implements the code search Indexer interface for Elasticsearch.
type Indexer struct {
	inner *inner_elasticsearch.Indexer
	indexer_internal.Indexer // do not embed inner_elasticsearch.Indexer directly, to avoid exposing too much
}

func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
	return indexer.SearchModesExactWords()
}

// NewIndexer creates a new elasticsearch indexer.
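// A rough usage sketch (illustrative only; in Gitea the connection URL and index name
// come from the code indexer settings, and lifecycle methods such as Init and Close
// come from the embedded indexer_internal.Indexer):
//
//	idx := NewIndexer("http://elasticsearch:9200", "gitea_codes")
//	defer idx.Close()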
func NewIndexer(url, indexerName string) *Indexer {
	inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
	indexer := &Indexer{
		inner:   inner,
		Indexer: inner,
	}
	return indexer
}

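// defaultMapping defines the index settings and field mappings:
//   - "content" is analyzed with a custom analyzer that splits on any
//     non-alphanumeric character and lowercases the tokens;
//   - "filename" additionally gets "path" / "path_reversed" sub-fields backed by
//     forward and reversed path_hierarchy tokenizers, so path prefixes and
//     suffixes can be matched;
//   - "repo_id", "commit_id", "language" and "updated_at" are plain long/keyword
//     fields used for filtering, aggregation and sorting.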
const (
	defaultMapping = `{
		"settings": {
			"analysis": {
				"analyzer": {
					"content_analyzer": {
						"tokenizer": "content_tokenizer",
						"filter": ["lowercase"]
					},
					"filename_path_analyzer": {
						"tokenizer": "path_tokenizer"
					},
					"reversed_filename_path_analyzer": {
						"tokenizer": "reversed_path_tokenizer"
					}
				},
				"tokenizer": {
					"content_tokenizer": {
						"type": "simple_pattern_split",
						"pattern": "[^a-zA-Z0-9]"
					},
					"path_tokenizer": {
						"type": "path_hierarchy",
						"delimiter": "/"
					},
					"reversed_path_tokenizer": {
						"type": "path_hierarchy",
						"delimiter": "/",
						"reverse": true
					}
				}
			}
		},
		"mappings": {
			"properties": {
				"repo_id": {
					"type": "long",
					"index": true
				},
				"filename": {
					"type": "text",
					"term_vector": "with_positions_offsets",
					"index": true,
					"fields": {
						"path": {
							"type": "text",
							"analyzer": "reversed_filename_path_analyzer"
						},
						"path_reversed": {
							"type": "text",
							"analyzer": "filename_path_analyzer"
						}
					}
				},
				"content": {
					"type": "text",
					"term_vector": "with_positions_offsets",
					"index": true,
					"analyzer": "content_analyzer"
				},
				"commit_id": {
					"type": "keyword",
					"index": true
				},
				"language": {
					"type": "keyword",
					"index": true
				},
				"updated_at": {
					"type": "long",
					"index": true
				}
			}
		}
	}`
)

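// addUpdate builds the bulk index request(s) for a single changed file. The blob is read
// through the long-running `git cat-file --batch` pair (batchWriter/batchReader): the blob
// SHA is written to the batch writer, the object header is read back with git.ReadBatchLine,
// the content is read up to the reported size, and the trailing newline of the batch output
// is discarded. Vendored files and non-text files are skipped; an over-sized file yields a
// delete request instead, removing any previously indexed copy.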
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil, nil
	}

	size := update.Size
	var err error
	if !update.Sized {
		var stdout string
		stdout, err = gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha))
		if err != nil {
			return nil, err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	if size > setting.Indexer.MaxIndexerFileSize {
		return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
	}

	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return nil, err
	}

	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return nil, err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return nil, err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		return nil, nil
	}

	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}
	id := internal.FilenameIndexerID(repo.ID, update.Filename)

	return []elastic.BulkableRequest{
		elastic.NewBulkIndexRequest().
			Index(b.inner.VersionedIndexName()).
			Id(id).
			Doc(map[string]any{
				"repo_id":    repo.ID,
				"filename":   update.Filename,
				"content":    string(charset.ToUTF8DropErrors(fileContents)),
				"commit_id":  sha,
				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
				"updated_at": timeutil.TimeStampNow(),
			}),
	}, nil
}

func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
	id := internal.FilenameIndexerID(repo.ID, filename)
	return elastic.NewBulkDeleteRequest().
		Index(b.inner.VersionedIndexName()).
		Id(id)
}

// Index will save the index data
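// Updates are read through one long-running git cat-file --batch session, turned into
// bulk index requests, and combined with delete requests for removed files. The collected
// requests are then submitted to Elasticsearch in chunks of esBatchSize per Bulk call, so
// that a single Bulk request stays reasonably small.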
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
	reqs := make([]elastic.BulkableRequest, 0)
	if len(changes.Updates) > 0 {
		batch, err := gitrepo.NewBatch(ctx, repo)
		if err != nil {
			return err
		}
		defer batch.Close()

		for _, update := range changes.Updates {
			updateReqs, err := b.addUpdate(ctx, batch.Writer, batch.Reader, sha, update, repo)
			if err != nil {
				return err
			}
			if len(updateReqs) > 0 {
				reqs = append(reqs, updateReqs...)
			}
		}
		batch.Close()
	}

	for _, filename := range changes.RemovedFilenames {
		reqs = append(reqs, b.addDelete(filename, repo))
	}

	if len(reqs) > 0 {
		esBatchSize := 50

		for i := 0; i < len(reqs); i += esBatchSize {
			_, err := b.inner.Client.Bulk().
				Index(b.inner.VersionedIndexName()).
				Add(reqs[i:min(i+esBatchSize, len(reqs))]...).
				Do(ctx)
			if err != nil {
				return err
			}
		}
	}
	return nil
}

// Delete deletes all entries of a repository from the index, identified by repoID.
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
	if err := b.doDelete(ctx, repoID); err != nil {
		// Maybe there is a conflict during the delete operation, so we should retry after a refresh
		log.Warn("Deletion of entries of repo %v within index %v failed. Trying to refresh the index before trying again: %v", repoID, b.inner.VersionedIndexName(), err)
		if err := b.refreshIndex(ctx); err != nil {
			return err
		}
		if err := b.doDelete(ctx, repoID); err != nil {
			log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName())
			return err
		}
	}
	return nil
}

func (b *Indexer) refreshIndex(ctx context.Context) error {
	if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil {
		log.Error("Error while trying to refresh index %v: %v", b.inner.VersionedIndexName(), err)
		return err
	}

	return nil
}

// doDelete removes all entries of the given repository (by repoID) from the index.
func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
	_, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
		Query(elastic.NewTermsQuery("repo_id", repoID)).
		Do(ctx)
	return err
}

// contentMatchIndexPos finds, within content, the position of the first occurrence of
// start and of the first end that follows it. It returns the beginning position of
// start and the ending position of that end.
// If either marker is not found, it returns -1, -1.
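// For example, for the highlighted fragment "return <em>foo</em>()" it reports (7, 10),
// i.e. the span of "foo" in the original, un-highlighted "return foo()".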
func contentMatchIndexPos(content, start, end string) (int, int) {
	startIdx := strings.Index(content, start)
	if startIdx < 0 {
		return -1, -1
	}
	endIdx := strings.Index(content[startIdx+len(start):], end)
	if endIdx < 0 {
		return -1, -1
	}
	return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // subtract the length of "<em></em>" (9), since Content holds the original, un-highlighted data
}

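// convertResult converts an Elasticsearch search result into internal search results,
// using the returned highlight fragments to locate the matched keyword in the filename
// or in the file content.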
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	hits := make([]*internal.SearchResult, 0, pageSize)
	for _, hit := range searchResult.Hits.Hits {
		repoID, fileName := internal.ParseIndexerID(hit.Id)
		res := make(map[string]any)
		if err := json.Unmarshal(hit.Source, &res); err != nil {
			return 0, nil, nil, err
		}

		// FIXME: There is currently no way to get the position of the keyword in the content within the same request,
		// so we recover it from the content itself, which may make the query slower. See
		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
		var startIndex, endIndex int
		if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
			startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
		} else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
			// FIXME: Since the highlighted content wraps the keywords in <em> and </em>,
			// we have to find the positions ourselves. But how do we avoid HTML content that
			// already contains <em> and </em> tags? Has Elasticsearch handled that?
			startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
			if startIndex == -1 {
				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
			}
		} else {
			panic(fmt.Sprintf("2===%#v", hit.Highlight))
		}

		language := res["language"].(string)

		hits = append(hits, &internal.SearchResult{
			RepoID:      repoID,
			Filename:    fileName,
			CommitID:    res["commit_id"].(string),
			Content:     res["content"].(string),
			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
			Language:    language,
			StartIndex:  startIndex,
			EndIndex:    endIndex,
			Color:       enry.GetColor(language),
		})
	}

	return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
}

func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
	var searchResultLanguages []*internal.SearchResultLanguages
	agg, found := searchResult.Aggregations.Terms("language")
	if found {
		searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10)

		for _, bucket := range agg.Buckets {
			searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
				Language: bucket.Key.(string),
				Color:    enry.GetColor(bucket.Key.(string)),
				Count:    int(bucket.DocCount),
			})
		}
	}
	return searchResultLanguages
}

// Search searches for code and language statistics matching the given conditions.
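// The generated query is roughly: a bool "should" over the content match and a boosted
// "filename^10" phrase-prefix match, wrapped in a bool "must" and optionally filtered by
// repo_id and language. Results carry a terms aggregation on "language", are sorted by
// score and then by updated_at, and are highlighted with the fast vector highlighter
// ("fvh") so that keyword positions can be recovered by convertResult.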
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	var contentQuery elastic.Query
	searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
	if searchMode == indexer.SearchModeExact {
		// 1.21 used NewMultiMatchQuery().Type(esMultiMatchTypePhrasePrefix), but later releases changed to NewMatchPhraseQuery
		contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword)
	} else /* words */ {
		contentQuery = elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(esMultiMatchTypeBestFields).Operator("and")
	}
	kwQuery := elastic.NewBoolQuery().Should(
		contentQuery,
		elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
	)
	query := elastic.NewBoolQuery()
	query = query.Must(kwQuery)
	if len(opts.RepoIDs) > 0 {
		repoStrs := make([]any, 0, len(opts.RepoIDs))
		for _, repoID := range opts.RepoIDs {
			repoStrs = append(repoStrs, repoID)
		}
		repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
		query = query.Must(repoQuery)
	}

	var (
		start, pageSize = opts.GetSkipTake()
		kw              = "<em>" + opts.Keyword + "</em>"
		aggregation     = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
	)

	if len(opts.Language) == 0 {
		searchResult, err := b.inner.Client.Search().
			Index(b.inner.VersionedIndexName()).
			Aggregation("language", aggregation).
			Query(query).
			Highlight(
				elastic.NewHighlight().
					Field("content").
					Field("filename").
					NumOfFragments(0). // return all the highlighted content in the fragments
					HighlighterType("fvh"),
			).
			Sort("_score", false).
			Sort("updated_at", true).
			From(start).Size(pageSize).
			Do(ctx)
		if err != nil {
			return 0, nil, nil, err
		}

		return convertResult(searchResult, kw, pageSize)
	}

	langQuery := elastic.NewMatchQuery("language", opts.Language)
	countResult, err := b.inner.Client.Search().
		Index(b.inner.VersionedIndexName()).
		Aggregation("language", aggregation).
		Query(query).
		Size(0). // We only need stats information
		Do(ctx)
	if err != nil {
		return 0, nil, nil, err
	}

	query = query.Must(langQuery)
	searchResult, err := b.inner.Client.Search().
		Index(b.inner.VersionedIndexName()).
		Query(query).
		Highlight(
			elastic.NewHighlight().
				Field("content").
				Field("filename").
				NumOfFragments(0). // return all the highlighted content in the fragments
				HighlighterType("fvh"),
		).
		Sort("_score", false).
		Sort("updated_at", true).
		From(start).Size(pageSize).
		Do(ctx)
	if err != nil {
		return 0, nil, nil, err
	}

	total, hits, _, err := convertResult(searchResult, kw, pageSize)

	return total, hits, extractAggs(countResult), err
}