forgejo/modules/indexer/code/internal/highlight.go
Alex619829 e35880e7ac Add code search with zoekt support (#8827)
This PR adds zoekt as a code search engine for forgejo. This Pull Request is a continuation of the discussion #8302.
The meilisearch search engine was not suitable, as it is not designed for searching code. The zoekt project was proposed instead. Zoekt copes well with code indexing, but it works on a different principle from search engines such as elasticsearch.
While elasticsearch can return a result in a ready-made form (with pagination, ready-made snippets, etc.) and forgejo only needs to show this result in the interface with a little work with the data, zoekt works completely differently.

Zoekt finds matches in the repository index and returns a response. The response contains the line with the matched term, its line number in the file, and surrounding context if the request asks for it. This response is not directly usable by Forgejo, so it has to be assembled into the expected form. To do that, I had to write some code and create a new function `searchZoektResult`, since the existing `searchResult` function is completely unsuitable for this search engine. I also had to write logic for pagination, highlighting, and correct display of lines in found snippets with a match; this is a consequence of how Zoekt works.
At the moment, Zoekt does not support deleting a repository index by repo_id, it only supports complete deletion of all repositories. But I still implemented the Delete function, which deletes a specific repository by its ID.

Co-authored-by: Aleksandr Gamzin <gamzin@altlinux.org>
Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/8827
Reviewed-by: Shiny Nematoda <snematoda@noreply.codeberg.org>
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
2026-05-28 20:52:34 +02:00

114 lines
3 KiB
Go

// Copyright 2026 The Forgejo Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"bytes"
"html/template"
"strings"
"forgejo.org/modules/highlight"
"forgejo.org/services/gitdiff"
)
// WriteStrings appends every string in strs to buf, in the order given.
// It stops at the first write that reports an error and returns that error;
// when all writes succeed (or strs is empty) it returns nil.
func WriteStrings(buf *bytes.Buffer, strs ...string) error {
	for i := 0; i < len(strs); i++ {
		if _, writeErr := buf.WriteString(strs[i]); writeErr != nil {
			return writeErr
		}
	}
	return nil
}
// HTML tags written around each search match in the rendered result.
const (
	// highlightTagStart opens the element that marks a search match.
	highlightTagStart = `<span class="search-highlight">`
	// highlightTagEnd closes the element opened by highlightTagStart.
	highlightTagEnd = `</span>`
)
// HighlightSearchResultCode highlights code via highlight.Code and wraps each
// requested match range in a <span class="search-highlight"> element.
//
// Parameters:
//   - filename is forwarded to highlight.Code (presumably to select the
//     lexer; confirm against that package).
//   - lineNums supplies the Num of each returned line, in order.
//   - highlightRanges entries have the form
//     [line index, start byte offset, end byte offset], where the line index
//     is a 0-based index into the "\n"-separated lines of the highlighted
//     code. A range is skipped when its line index is outside the highlighted
//     output, or when the converted line is empty or too short for the
//     offsets.
//   - code is the snippet to render.
//
// It returns min(number of highlighted lines, len(lineNums)) entries, each
// carrying the line number and the HTML for that line.
func HighlightSearchResultCode(filename string, lineNums []int, highlightRanges [][3]int, code string) []ResultLine {
	hcd := gitdiff.NewHighlightCodeDiff()
	hcd.CollectUsedRunes(code)

	// Reserve two placeholder runes that stand in for the search-highlight
	// open/close tags until hcd.Recover expands them at the end.
	startTag, endTag := hcd.NextPlaceholder(), hcd.NextPlaceholder()
	hcd.PlaceholderTokenMap[startTag] = highlightTagStart
	hcd.PlaceholderTokenMap[endTag] = highlightTagEnd

	// we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting
	// NOTE(review): the second return value of highlight.Code is discarded
	// here, as in the original; confirm that is intentional.
	hl, _ := highlight.Code(filename, "", code)
	conv := hcd.ConvertToPlaceholders(string(hl))
	convLines := strings.Split(conv, "\n")

	// each highlightRange is of the form [line number, start byte offset, end byte offset]
	for _, highlightRange := range highlightRanges {
		ln, start, end := highlightRange[0], highlightRange[1], highlightRange[2]

		// The highlighter may emit fewer lines than the input (see the note
		// near the end of this function), so a range can point past the end
		// of convLines. Skip it instead of panicking on the index below.
		if ln < 0 || ln >= len(convLines) {
			continue
		}

		line := convLines[ln]
		if line == "" || len(line) <= start || len(line) < end {
			continue
		}

		sr := strings.NewReader(line)
		var sb strings.Builder
		// count starts at -1 and is advanced before the comparisons below, so
		// while a rune is being handled it holds the offset accumulated up to
		// and including that rune.
		count := -1
		isOpen := false
		for r, size, err := sr.ReadRune(); err == nil; r, size, err = sr.ReadRune() {
			token, isPlaceholder := hcd.PlaceholderTokenMap[r]
			if !isPlaceholder {
				// token was not found: advance by the rune's encoded size
				count += size
			} else if token == "" || !(len(token) > 6 && (token[0:5] == "<span" || token[0:6] == "</span")) {
				// token was marked as used, or
				// the token is not a valid html tag emitted by chroma:
				// advance by one
				count++
			} else if !isOpen {
				// open the tag only after all other placeholders
				sb.WriteRune(r)
				continue
			} else if count < end {
				// if the tag is open, but a placeholder exists in between
				// close the tag
				sb.WriteRune(endTag)
				// write the placeholder
				sb.WriteRune(r)
				// reopen the tag
				sb.WriteRune(startTag)
				continue
			}

			switch {
			case count >= end:
				// if tag is not open, no need to close
				if isOpen {
					sb.WriteRune(endTag)
					isOpen = false
				}
			case count >= start:
				// if tag is open, do not open again
				if !isOpen {
					isOpen = true
					sb.WriteRune(startTag)
				}
			}
			sb.WriteRune(r)
		}
		// The match ran to the end of the line: close the tag explicitly.
		if isOpen {
			sb.WriteRune(endTag)
		}
		convLines[ln] = sb.String()
	}

	conv = strings.Join(convLines, "\n")
	highlightedLines := strings.Split(hcd.Recover(conv), "\n")

	// The lineNums outputted by highlight.Code might not match the original lineNums, because "highlight" removes the last `\n`
	lines := make([]ResultLine, min(len(highlightedLines), len(lineNums)))
	for i := range len(lines) {
		lines[i].Num = lineNums[i]
		lines[i].FormattedContent = template.HTML(highlightedLines[i])
	}
	return lines
}