forgejo/modules/indexer/issues/elasticsearch/elasticsearch.go
Jason Song 1e76a824bc
Refactor and enhance issue indexer to support both searching, filtering and paging (#26012)
Fix #24662.

Replace #24822 and #25708 (although it has been merged)


## Background

In the past, Gitea supported issue searching with a keyword and
conditions in a less efficient way. It worked by searching for issues
with the keyword and obtaining limited IDs (as it is heavy to get all)
on the indexer (bleve/elasticsearch/meilisearch), and then querying with
conditions on the database to find a subset of the found IDs. This is
why the results could be incomplete.

To solve this issue, we need to store all fields that could be used as
conditions in the indexer and support both keyword and additional
conditions when searching with the indexer.

## Major changes

- Redefine `IndexerData` to include all fields that could be used as
filter conditions.
- Refactor `Search(ctx context.Context, kw string, repoIDs []int64,
limit, start int, state string)` to `Search(ctx context.Context, options
*SearchOptions)`, so it supports more conditions now.
- Change the data type stored in `issueIndexerQueue`. Use
`IndexerMetadata` instead of `IndexerData` in case the data has been
updated while it is in the queue. This also reduces the storage size of
the queue.
- Enhance searching with Bleve/Elasticsearch/Meilisearch, make them
fully support `SearchOptions`. Also, update the data versions.
- Keep most logic of database indexer, but remove
`issues.SearchIssueIDsByKeyword` in `models` to avoid confusion where is
the entry point to search issues.
- Start a Meilisearch instance to test it in unit tests.
- Add unit tests with almost full coverage to test
Bleve/Elasticsearch/Meilisearch indexer.

---------

Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
2023-07-31 06:28:53 +00:00

281 lines
7.7 KiB
Go

// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"context"
"fmt"
"strconv"
"strings"
"code.gitea.io/gitea/modules/graceful"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
"code.gitea.io/gitea/modules/indexer/issues/internal"
"github.com/olivere/elastic/v7"
)
const (
issueIndexerLatestVersion = 1
)
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
inner *inner_elasticsearch.Indexer
indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
}
// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
inner := inner_elasticsearch.NewIndexer(url, indexerName, issueIndexerLatestVersion, defaultMapping)
indexer := &Indexer{
inner: inner,
Indexer: inner,
}
return indexer
}
const (
defaultMapping = `
{
"mappings": {
"properties": {
"id": { "type": "integer", "index": true },
"repo_id": { "type": "integer", "index": true },
"is_public": { "type": "boolean", "index": true },
"title": { "type": "text", "index": true },
"content": { "type": "text", "index": true },
"comments": { "type" : "text", "index": true },
"is_pull": { "type": "boolean", "index": true },
"is_closed": { "type": "boolean", "index": true },
"label_ids": { "type": "integer", "index": true },
"no_label": { "type": "boolean", "index": true },
"milestone_id": { "type": "integer", "index": true },
"project_id": { "type": "integer", "index": true },
"project_board_id": { "type": "integer", "index": true },
"poster_id": { "type": "integer", "index": true },
"assignee_id": { "type": "integer", "index": true },
"mention_ids": { "type": "integer", "index": true },
"reviewed_ids": { "type": "integer", "index": true },
"review_requested_ids": { "type": "integer", "index": true },
"subscriber_ids": { "type": "integer", "index": true },
"updated_unix": { "type": "integer", "index": true },
"created_unix": { "type": "integer", "index": true },
"deadline_unix": { "type": "integer", "index": true },
"comment_count": { "type": "integer", "index": true }
}
}
}
`
)
// Index will save the index data
func (b *Indexer) Index(ctx context.Context, issues ...*internal.IndexerData) error {
if len(issues) == 0 {
return nil
} else if len(issues) == 1 {
issue := issues[0]
_, err := b.inner.Client.Index().
Index(b.inner.VersionedIndexName()).
Id(fmt.Sprintf("%d", issue.ID)).
BodyJson(issue).
Do(ctx)
return err
}
reqs := make([]elastic.BulkableRequest, 0)
for _, issue := range issues {
reqs = append(reqs,
elastic.NewBulkIndexRequest().
Index(b.inner.VersionedIndexName()).
Id(fmt.Sprintf("%d", issue.ID)).
Doc(issue),
)
}
_, err := b.inner.Client.Bulk().
Index(b.inner.VersionedIndexName()).
Add(reqs...).
Do(graceful.GetManager().HammerContext())
return err
}
// Delete deletes indexes by ids
func (b *Indexer) Delete(ctx context.Context, ids ...int64) error {
if len(ids) == 0 {
return nil
} else if len(ids) == 1 {
_, err := b.inner.Client.Delete().
Index(b.inner.VersionedIndexName()).
Id(fmt.Sprintf("%d", ids[0])).
Do(ctx)
return err
}
reqs := make([]elastic.BulkableRequest, 0)
for _, id := range ids {
reqs = append(reqs,
elastic.NewBulkDeleteRequest().
Index(b.inner.VersionedIndexName()).
Id(fmt.Sprintf("%d", id)),
)
}
_, err := b.inner.Client.Bulk().
Index(b.inner.VersionedIndexName()).
Add(reqs...).
Do(graceful.GetManager().HammerContext())
return err
}
// Search searches for issues by given conditions.
// Returns the matching issue IDs
func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
query := elastic.NewBoolQuery()
if options.Keyword != "" {
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))
}
if len(options.RepoIDs) > 0 {
q := elastic.NewBoolQuery()
q.Should(elastic.NewTermsQuery("repo_id", toAnySlice(options.RepoIDs)...))
if options.AllPublic {
q.Should(elastic.NewTermQuery("is_public", true))
}
query.Must(q)
}
if !options.IsPull.IsNone() {
query.Must(elastic.NewTermQuery("is_pull", options.IsPull.IsTrue()))
}
if !options.IsClosed.IsNone() {
query.Must(elastic.NewTermQuery("is_closed", options.IsClosed.IsTrue()))
}
if options.NoLabelOnly {
query.Must(elastic.NewTermQuery("no_label", true))
} else {
if len(options.IncludedLabelIDs) > 0 {
q := elastic.NewBoolQuery()
for _, labelID := range options.IncludedLabelIDs {
q.Must(elastic.NewTermQuery("label_ids", labelID))
}
query.Must(q)
} else if len(options.IncludedAnyLabelIDs) > 0 {
query.Must(elastic.NewTermsQuery("label_ids", toAnySlice(options.IncludedAnyLabelIDs)...))
}
if len(options.ExcludedLabelIDs) > 0 {
q := elastic.NewBoolQuery()
for _, labelID := range options.ExcludedLabelIDs {
q.MustNot(elastic.NewTermQuery("label_ids", labelID))
}
query.Must(q)
}
}
if len(options.MilestoneIDs) > 0 {
query.Must(elastic.NewTermsQuery("milestone_id", toAnySlice(options.MilestoneIDs)...))
}
if options.ProjectID != nil {
query.Must(elastic.NewTermQuery("project_id", *options.ProjectID))
}
if options.ProjectBoardID != nil {
query.Must(elastic.NewTermQuery("project_board_id", *options.ProjectBoardID))
}
if options.PosterID != nil {
query.Must(elastic.NewTermQuery("poster_id", *options.PosterID))
}
if options.AssigneeID != nil {
query.Must(elastic.NewTermQuery("assignee_id", *options.AssigneeID))
}
if options.MentionID != nil {
query.Must(elastic.NewTermQuery("mention_ids", *options.MentionID))
}
if options.ReviewedID != nil {
query.Must(elastic.NewTermQuery("reviewed_ids", *options.ReviewedID))
}
if options.ReviewRequestedID != nil {
query.Must(elastic.NewTermQuery("review_requested_ids", *options.ReviewRequestedID))
}
if options.SubscriberID != nil {
query.Must(elastic.NewTermQuery("subscriber_ids", *options.SubscriberID))
}
if options.UpdatedAfterUnix != nil || options.UpdatedBeforeUnix != nil {
q := elastic.NewRangeQuery("updated_unix")
if options.UpdatedAfterUnix != nil {
q.Gte(*options.UpdatedAfterUnix)
}
if options.UpdatedBeforeUnix != nil {
q.Lte(*options.UpdatedBeforeUnix)
}
query.Must(q)
}
if options.SortBy == "" {
options.SortBy = internal.SortByCreatedAsc
}
sortBy := []elastic.Sorter{
parseSortBy(options.SortBy),
elastic.NewFieldSort("id").Desc(),
}
// See https://stackoverflow.com/questions/35206409/elasticsearch-2-1-result-window-is-too-large-index-max-result-window/35221900
// TODO: make it configurable since it's configurable in elasticsearch
const maxPageSize = 10000
skip, limit := indexer_internal.ParsePaginator(options.Paginator, maxPageSize)
searchResult, err := b.inner.Client.Search().
Index(b.inner.VersionedIndexName()).
Query(query).
SortBy(sortBy...).
From(skip).Size(limit).
Do(ctx)
if err != nil {
return nil, err
}
hits := make([]internal.Match, 0, limit)
for _, hit := range searchResult.Hits.Hits {
id, _ := strconv.ParseInt(hit.Id, 10, 64)
hits = append(hits, internal.Match{
ID: id,
})
}
return &internal.SearchResult{
Total: searchResult.TotalHits(),
Hits: hits,
}, nil
}
func toAnySlice[T any](s []T) []any {
ret := make([]any, 0, len(s))
for _, item := range s {
ret = append(ret, item)
}
return ret
}
func parseSortBy(sortBy internal.SortBy) elastic.Sorter {
field := strings.TrimPrefix(string(sortBy), "-")
ret := elastic.NewFieldSort(field)
if strings.HasPrefix(string(sortBy), "-") {
ret.Desc()
}
return ret
}