Refactor indexer (#25174)

Refactor `modules/indexer` to make it more maintainable, so that it is easier to support more features. I'm trying to solve some issue-searching problems; this is a precursor to making functional changes.

Currently supported engines and their index versions:

| engines | issues | code |
| - | - | - |
| db | Just a wrapper for database queries, doesn't need version | - |
| bleve | The version of index is **2** | The version of index is **6** |
| elasticsearch | The old index has no version, will be treated as version **0** in this PR | The version of index is **1** |
| meilisearch | The old index has no version, will be treated as version **0** in this PR | - |

## Changes

### Split

Split it into multiple packages:

```text
indexer
├── internal
│   ├── bleve
│   ├── db
│   ├── elasticsearch
│   └── meilisearch
├── code
│   ├── bleve
│   ├── elasticsearch
│   └── internal
└── issues
    ├── bleve
    ├── db
    ├── elasticsearch
    ├── internal
    └── meilisearch
```

- `indexer/internal`: Internal shared package for indexer.
- `indexer/internal/[engine]`: Internal shared package for each engine (bleve/db/elasticsearch/meilisearch).
- `indexer/code`: Implementations for code indexer.
- `indexer/code/internal`: Internal shared package for code indexer.
- `indexer/code/[engine]`: Implementation via each engine for code indexer.
- `indexer/issues`: Implementations for issues indexer.

### Deduplication

- Combine `Init/Ping/Close` for code indexer and issues indexer.
- ~Combine `issues.indexerHolder` and `code.wrappedIndexer` to `internal.IndexHolder`.~ Remove it, use dummy indexer instead when the indexer is not ready.
- Deduplicate two copies of creating ES clients.
- Deduplicate two copies of `indexerID()`.

### Enhancement

- [x] Support index version for elasticsearch issues indexer, the old index without version will be treated as version 0.
- [x] Fix spelling of `elastic_search/ElasticSearch`, it should be `Elasticsearch`.
- [x] Improve versioning of ES index.
We don't need `Aliases`:
  - Gitea doesn't need aliases for "Zero Downtime" because it never deletes old indexes.
  - The old code of issues indexer uses the original name to create issue index, so it's tricky to convert it to an alias.
- [x] Support index version for meilisearch issues indexer, the old index without version will be treated as version 0.
- [x] Do "ping" only when `Ping` has been called, don't ping periodically and cache the status.
- [x] Support the context parameter whenever possible.
- [x] Fix outdated example config.
- [x] Give up the requeue logic of issues indexer: When indexing fails, call Ping to check if it was caused by the engine being unavailable, and only requeue the task if the engine is unavailable.
  - It is fragile and tricky, and could cause data loss (it did happen when I was doing some tests for this PR). And it works for ES only.
  - Just always requeue the failed task; if it is caused by bad data, it's a bug of Gitea which should be fixed.

---------

Co-authored-by: Giteabot <teabot@gitea.io>
2025-08-02 16:35:19 +02:00 · 2023-06-23 20:37:56 +08:00 · 2023-06-23 20:37:56 +08:00 · 375fd15fbf
commit 375fd15fbf
parent b0215c40cd
43 changed files with 1374 additions and 1426 deletions
--- a/modules/indexer/code/elastic_search.go
+++ b/modules/indexer/code/elastic_search.go
@@ -1,512 +0,0 @@
-// Copyright 2020 The Gitea Authors. All rights reserved.
-// SPDX-License-Identifier: MIT
-
-package code
-
-import (
-	"bufio"
-	"context"
-	"errors"
-	"fmt"
-	"io"
-	"net"
-	"strconv"
-	"strings"
-	"sync"
-	"time"
-
-	repo_model "code.gitea.io/gitea/models/repo"
-	"code.gitea.io/gitea/modules/analyze"
-	"code.gitea.io/gitea/modules/charset"
-	"code.gitea.io/gitea/modules/git"
-	"code.gitea.io/gitea/modules/graceful"
-	"code.gitea.io/gitea/modules/json"
-	"code.gitea.io/gitea/modules/log"
-	"code.gitea.io/gitea/modules/setting"
-	"code.gitea.io/gitea/modules/timeutil"
-	"code.gitea.io/gitea/modules/typesniffer"
-
-	"github.com/go-enry/go-enry/v2"
-	"github.com/olivere/elastic/v7"
-)
-
-const (
-	esRepoIndexerLatestVersion = 1
-	// multi-match-types, currently only 2 types are used
-	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
-	esMultiMatchTypeBestFields   = "best_fields"
-	esMultiMatchTypePhrasePrefix = "phrase_prefix"
-)
-
-var _ Indexer = &ElasticSearchIndexer{}
-
-// ElasticSearchIndexer implements Indexer interface
-type ElasticSearchIndexer struct {
-	client           *elastic.Client
-	indexerAliasName string
-	available        bool
-	stopTimer        chan struct{}
-	lock             sync.RWMutex
-}
-
-// NewElasticSearchIndexer creates a new elasticsearch indexer
-func NewElasticSearchIndexer(url, indexerName string) (*ElasticSearchIndexer, bool, error) {
-	opts := []elastic.ClientOptionFunc{
-		elastic.SetURL(url),
-		elastic.SetSniff(false),
-		elastic.SetHealthcheckInterval(10 * time.Second),
-		elastic.SetGzip(false),
-	}
-
-	logger := log.GetLogger(log.DEFAULT)
-
-	opts = append(opts, elastic.SetTraceLog(&log.PrintfLogger{Logf: logger.Trace}))
-	opts = append(opts, elastic.SetInfoLog(&log.PrintfLogger{Logf: logger.Info}))
-	opts = append(opts, elastic.SetErrorLog(&log.PrintfLogger{Logf: logger.Error}))
-
-	client, err := elastic.NewClient(opts...)
-	if err != nil {
-		return nil, false, err
-	}
-
-	indexer := &ElasticSearchIndexer{
-		client:           client,
-		indexerAliasName: indexerName,
-		available:        true,
-		stopTimer:        make(chan struct{}),
-	}
-
-	ticker := time.NewTicker(10 * time.Second)
-	go func() {
-		for {
-			select {
-			case <-ticker.C:
-				indexer.checkAvailability()
-			case <-indexer.stopTimer:
-				ticker.Stop()
-				return
-			}
-		}
-	}()
-
-	exists, err := indexer.init()
-	if err != nil {
-		indexer.Close()
-		return nil, false, err
-	}
-	return indexer, !exists, err
-}
-
-const (
-	defaultMapping = `{
-		"mappings": {
-			"properties": {
-				"repo_id": {
-					"type": "long",
-					"index": true
-				},
-				"content": {
-					"type": "text",
-					"term_vector": "with_positions_offsets",
-					"index": true
-				},
-				"commit_id": {
-					"type": "keyword",
-					"index": true
-				},
-				"language": {
-					"type": "keyword",
-					"index": true
-				},
-				"updated_at": {
-					"type": "long",
-					"index": true
-				}
-			}
-		}
-	}`
-)
-
-func (b *ElasticSearchIndexer) realIndexerName() string {
-	return fmt.Sprintf("%s.v%d", b.indexerAliasName, esRepoIndexerLatestVersion)
-}
-
-// Init will initialize the indexer
-func (b *ElasticSearchIndexer) init() (bool, error) {
-	ctx := graceful.GetManager().HammerContext()
-	exists, err := b.client.IndexExists(b.realIndexerName()).Do(ctx)
-	if err != nil {
-		return false, b.checkError(err)
-	}
-	if !exists {
-		mapping := defaultMapping
-
-		createIndex, err := b.client.CreateIndex(b.realIndexerName()).BodyString(mapping).Do(ctx)
-		if err != nil {
-			return false, b.checkError(err)
-		}
-		if !createIndex.Acknowledged {
-			return false, fmt.Errorf("create index %s with %s failed", b.realIndexerName(), mapping)
-		}
-	}
-
-	// check version
-	r, err := b.client.Aliases().Do(ctx)
-	if err != nil {
-		return false, b.checkError(err)
-	}
-
-	realIndexerNames := r.IndicesByAlias(b.indexerAliasName)
-	if len(realIndexerNames) < 1 {
-		res, err := b.client.Alias().
-			Add(b.realIndexerName(), b.indexerAliasName).
-			Do(ctx)
-		if err != nil {
-			return false, b.checkError(err)
-		}
-		if !res.Acknowledged {
-			return false, fmt.Errorf("create alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
-		}
-	} else if len(realIndexerNames) >= 1 && realIndexerNames[0] < b.realIndexerName() {
-		log.Warn("Found older gitea indexer named %s, but we will create a new one %s and keep the old NOT DELETED. You can delete the old version after the upgrade succeed.",
-			realIndexerNames[0], b.realIndexerName())
-		res, err := b.client.Alias().
-			Remove(realIndexerNames[0], b.indexerAliasName).
-			Add(b.realIndexerName(), b.indexerAliasName).
-			Do(ctx)
-		if err != nil {
-			return false, b.checkError(err)
-		}
-		if !res.Acknowledged {
-			return false, fmt.Errorf("change alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
-		}
-	}
-
-	return exists, nil
-}
-
-// Ping checks if elastic is available
-func (b *ElasticSearchIndexer) Ping() bool {
-	b.lock.RLock()
-	defer b.lock.RUnlock()
-	return b.available
-}
-
-func (b *ElasticSearchIndexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update fileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
-	// Ignore vendored files in code search
-	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
-		return nil, nil
-	}
-
-	size := update.Size
-	var err error
-	if !update.Sized {
-		var stdout string
-		stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
-		if err != nil {
-			return nil, err
-		}
-		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
-			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
-		}
-	}
-
-	if size > setting.Indexer.MaxIndexerFileSize {
-		return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
-	}
-
-	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
-		return nil, err
-	}
-
-	_, _, size, err = git.ReadBatchLine(batchReader)
-	if err != nil {
-		return nil, err
-	}
-
-	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
-	if err != nil {
-		return nil, err
-	} else if !typesniffer.DetectContentType(fileContents).IsText() {
-		// FIXME: UTF-16 files will probably fail here
-		return nil, nil
-	}
-
-	if _, err = batchReader.Discard(1); err != nil {
-		return nil, err
-	}
-	id := filenameIndexerID(repo.ID, update.Filename)
-
-	return []elastic.BulkableRequest{
-		elastic.NewBulkIndexRequest().
-			Index(b.indexerAliasName).
-			Id(id).
-			Doc(map[string]interface{}{
-				"repo_id":    repo.ID,
-				"content":    string(charset.ToUTF8DropErrors(fileContents)),
-				"commit_id":  sha,
-				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
-				"updated_at": timeutil.TimeStampNow(),
-			}),
-	}, nil
-}
-
-func (b *ElasticSearchIndexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
-	id := filenameIndexerID(repo.ID, filename)
-	return elastic.NewBulkDeleteRequest().
-		Index(b.indexerAliasName).
-		Id(id)
-}
-
-// Index will save the index data
-func (b *ElasticSearchIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *repoChanges) error {
-	reqs := make([]elastic.BulkableRequest, 0)
-	if len(changes.Updates) > 0 {
-		// Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
-		if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
-			log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
-			return err
-		}
-
-		batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
-		defer cancel()
-
-		for _, update := range changes.Updates {
-			updateReqs, err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo)
-			if err != nil {
-				return err
-			}
-			if len(updateReqs) > 0 {
-				reqs = append(reqs, updateReqs...)
-			}
-		}
-		cancel()
-	}
-
-	for _, filename := range changes.RemovedFilenames {
-		reqs = append(reqs, b.addDelete(filename, repo))
-	}
-
-	if len(reqs) > 0 {
-		_, err := b.client.Bulk().
-			Index(b.indexerAliasName).
-			Add(reqs...).
-			Do(ctx)
-		return b.checkError(err)
-	}
-	return nil
-}
-
-// Delete deletes indexes by ids
-func (b *ElasticSearchIndexer) Delete(repoID int64) error {
-	_, err := b.client.DeleteByQuery(b.indexerAliasName).
-		Query(elastic.NewTermsQuery("repo_id", repoID)).
-		Do(graceful.GetManager().HammerContext())
-	return b.checkError(err)
-}
-
-// indexPos find words positions for start and the following end on content. It will
-// return the beginning position of the first start and the ending position of the
-// first end following the start string.
-// If not found any of the positions, it will return -1, -1.
-func indexPos(content, start, end string) (int, int) {
-	startIdx := strings.Index(content, start)
-	if startIdx < 0 {
-		return -1, -1
-	}
-	endIdx := strings.Index(content[startIdx+len(start):], end)
-	if endIdx < 0 {
-		return -1, -1
-	}
-	return startIdx, startIdx + len(start) + endIdx + len(end)
-}
-
-func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
-	hits := make([]*SearchResult, 0, pageSize)
-	for _, hit := range searchResult.Hits.Hits {
-		// FIXME: There is no way to get the position the keyword on the content currently on the same request.
-		// So we get it from content, this may made the query slower. See
-		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
-		var startIndex, endIndex int
-		c, ok := hit.Highlight["content"]
-		if ok && len(c) > 0 {
-			// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
-			// now we should find the positions. But how to avoid html content which contains the
-			// <em> and </em> tags? If elastic search has handled that?
-			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
-			if startIndex == -1 {
-				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
-			}
-		} else {
-			panic(fmt.Sprintf("2===%#v", hit.Highlight))
-		}
-
-		repoID, fileName := parseIndexerID(hit.Id)
-		res := make(map[string]interface{})
-		if err := json.Unmarshal(hit.Source, &res); err != nil {
-			return 0, nil, nil, err
-		}
-
-		language := res["language"].(string)
-
-		hits = append(hits, &SearchResult{
-			RepoID:      repoID,
-			Filename:    fileName,
-			CommitID:    res["commit_id"].(string),
-			Content:     res["content"].(string),
-			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
-			Language:    language,
-			StartIndex:  startIndex,
-			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
-			Color:       enry.GetColor(language),
-		})
-	}
-
-	return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
-}
-
-func extractAggs(searchResult *elastic.SearchResult) []*SearchResultLanguages {
-	var searchResultLanguages []*SearchResultLanguages
-	agg, found := searchResult.Aggregations.Terms("language")
-	if found {
-		searchResultLanguages = make([]*SearchResultLanguages, 0, 10)
-
-		for _, bucket := range agg.Buckets {
-			searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
-				Language: bucket.Key.(string),
-				Color:    enry.GetColor(bucket.Key.(string)),
-				Count:    int(bucket.DocCount),
-			})
-		}
-	}
-	return searchResultLanguages
-}
-
-// Search searches for codes and language stats by given conditions.
-func (b *ElasticSearchIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
-	searchType := esMultiMatchTypeBestFields
-	if isMatch {
-		searchType = esMultiMatchTypePhrasePrefix
-	}
-
-	kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
-	query := elastic.NewBoolQuery()
-	query = query.Must(kwQuery)
-	if len(repoIDs) > 0 {
-		repoStrs := make([]interface{}, 0, len(repoIDs))
-		for _, repoID := range repoIDs {
-			repoStrs = append(repoStrs, repoID)
-		}
-		repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
-		query = query.Must(repoQuery)
-	}
-
-	var (
-		start       int
-		kw          = "<em>" + keyword + "</em>"
-		aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
-	)
-
-	if page > 0 {
-		start = (page - 1) * pageSize
-	}
-
-	if len(language) == 0 {
-		searchResult, err := b.client.Search().
-			Index(b.indexerAliasName).
-			Aggregation("language", aggregation).
-			Query(query).
-			Highlight(
-				elastic.NewHighlight().
-					Field("content").
-					NumOfFragments(0). // return all highting content on fragments
-					HighlighterType("fvh"),
-			).
-			Sort("repo_id", true).
-			From(start).Size(pageSize).
-			Do(ctx)
-		if err != nil {
-			return 0, nil, nil, b.checkError(err)
-		}
-
-		return convertResult(searchResult, kw, pageSize)
-	}
-
-	langQuery := elastic.NewMatchQuery("language", language)
-	countResult, err := b.client.Search().
-		Index(b.indexerAliasName).
-		Aggregation("language", aggregation).
-		Query(query).
-		Size(0). // We only needs stats information
-		Do(ctx)
-	if err != nil {
-		return 0, nil, nil, b.checkError(err)
-	}
-
-	query = query.Must(langQuery)
-	searchResult, err := b.client.Search().
-		Index(b.indexerAliasName).
-		Query(query).
-		Highlight(
-			elastic.NewHighlight().
-				Field("content").
-				NumOfFragments(0). // return all highting content on fragments
-				HighlighterType("fvh"),
-		).
-		Sort("repo_id", true).
-		From(start).Size(pageSize).
-		Do(ctx)
-	if err != nil {
-		return 0, nil, nil, b.checkError(err)
-	}
-
-	total, hits, _, err := convertResult(searchResult, kw, pageSize)
-
-	return total, hits, extractAggs(countResult), err
-}
-
-// Close implements indexer
-func (b *ElasticSearchIndexer) Close() {
-	select {
-	case <-b.stopTimer:
-	default:
-		close(b.stopTimer)
-	}
-}
-
-func (b *ElasticSearchIndexer) checkError(err error) error {
-	var opErr *net.OpError
-	if !(elastic.IsConnErr(err) || (errors.As(err, &opErr) && (opErr.Op == "dial" || opErr.Op == "read"))) {
-		return err
-	}
-
-	b.setAvailability(false)
-
-	return err
-}
-
-func (b *ElasticSearchIndexer) checkAvailability() {
-	if b.Ping() {
-		return
-	}
-
-	// Request cluster state to check if elastic is available again
-	_, err := b.client.ClusterState().Do(graceful.GetManager().ShutdownContext())
-	if err != nil {
-		b.setAvailability(false)
-		return
-	}
-
-	b.setAvailability(true)
-}
-
-func (b *ElasticSearchIndexer) setAvailability(available bool) {
-	b.lock.Lock()
-	defer b.lock.Unlock()
-
-	if b.available == available {
-		return
-	}
-
-	b.available = available
-}