Skip to content

Commit

Permalink
Upgrade vellum and accommodate change to Reader interfaces
Browse files Browse the repository at this point in the history
  • Loading branch information
abhinavdangeti committed Jan 7, 2025
1 parent 5536602 commit ec5723c
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 14 deletions.
2 changes: 1 addition & 1 deletion fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

package sear

func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (dist int, exceeded bool, reuse []int) {
func levenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (dist int, exceeded bool, reuse []int) {
la := len(a)
lb := len(b)

Expand Down
4 changes: 2 additions & 2 deletions fuzzy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,13 @@ func TestFuzzyMatch(t *testing.T) {
test := test
t.Run(fmt.Sprintf("%s-%d", test.searchTerm, test.fuzziness), func(t *testing.T) {
for _, sm := range test.shouldMatch {
dist, exceeded, _ := LevenshteinDistanceMaxReuseSlice(test.searchTerm, sm, test.fuzziness, nil)
dist, exceeded, _ := levenshteinDistanceMaxReuseSlice(test.searchTerm, sm, test.fuzziness, nil)
if dist > test.fuzziness || exceeded {
t.Errorf("expected %s to match, did not", sm)
}
}
for _, snm := range test.shouldNotMatch {
dist, exceeded, _ := LevenshteinDistanceMaxReuseSlice(test.searchTerm, snm, test.fuzziness, nil)
dist, exceeded, _ := levenshteinDistanceMaxReuseSlice(test.searchTerm, snm, test.fuzziness, nil)
if dist <= test.fuzziness && !exceeded {
t.Errorf("expected %s not to match, did", snm)
}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.21

require (
github.com/blevesearch/bleve_index_api v1.2.1
github.com/blevesearch/vellum v1.0.10
github.com/blevesearch/vellum v1.1.0
)

require (
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ github.com/blevesearch/bleve_index_api v1.2.1 h1:IuXwLvmyp7I7+e0FOA68gcHHLfzSQ4A
github.com/blevesearch/bleve_index_api v1.2.1/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI=
github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a h1:dGzPydgVsqGcTRVwiLJ1jVbufYwmzD3LfVPLKsKg+0k=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
71 changes: 63 additions & 8 deletions reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ import (
"sort"
"strings"

"github.com/blevesearch/vellum"

index "github.com/blevesearch/bleve_index_api"
"github.com/blevesearch/vellum"
vellev "github.com/blevesearch/vellum/levenshtein"
velreg "github.com/blevesearch/vellum/regexp"
)

Expand Down Expand Up @@ -150,29 +150,41 @@ func automatonMatch(la vellum.Automaton, termStr string) bool {
}

// FieldDictRegexp returns the field dictionary for field restricted to the
// terms matched by regexStr. It delegates to fieldDictRegexp and drops the
// automaton that helper also returns.
func (r *Reader) FieldDictRegexp(field, regexStr string) (index.FieldDict, error) {
	dict, _, err := r.fieldDictRegexp(field, regexStr)
	return dict, err
}

// FieldDictRegexpAutomaton is the automaton-returning variant of
// FieldDictRegexp: it forwards to fieldDictRegexp and hands back all three of
// its results (dictionary, regex automaton, error) unchanged.
func (r *Reader) FieldDictRegexpAutomaton(field, regexStr string) (
	index.FieldDict, index.RegexAutomaton, error) {
	dict, automaton, err := r.fieldDictRegexp(field, regexStr)
	return dict, automaton, err
}

func (r *Reader) fieldDictRegexp(field, regexStr string) (
index.FieldDict, index.RegexAutomaton, error) {
regex, cached := r.velregCache[regexStr]
if !cached {
var err error
regex, err = velreg.New(regexStr)
if err != nil {
return nil, fmt.Errorf("error compiling regexp: %v", err)
return nil, nil, fmt.Errorf("error compiling regexp: %v", err)
}
r.velregCache[regexStr] = regex
}
if r.s.doc == nil {
return fieldDictEmpty, nil
return fieldDictEmpty, regex, nil
}
fieldSortedTerms, err := r.s.doc.SortedTermsForField(field)
if err != nil {
// only error is field doesn't exist in doc
return fieldDictEmpty, nil
return fieldDictEmpty, regex, nil
}
return NewFieldDictWithTerms(fieldSortedTerms, func(s string) bool {
return automatonMatch(regex, s)
}), nil
}), regex, nil
}

func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string) (index.FieldDict, error) {
func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string) (
index.FieldDict, error) {
if r.s.doc == nil {
return fieldDictEmpty, nil
}
Expand All @@ -184,14 +196,30 @@ func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string
return NewFieldDictWithTerms(fieldSortedTerms, func(indexTerm string) bool {
var dist int
var exceeded bool
dist, exceeded, r.levSlice = LevenshteinDistanceMaxReuseSlice(term, indexTerm, fuzziness, r.levSlice)
dist, exceeded, r.levSlice = levenshteinDistanceMaxReuseSlice(
term, indexTerm, fuzziness, r.levSlice)
if dist <= fuzziness && !exceeded {
return true
}
return false
}), nil
}

// FieldDictFuzzyAutomaton returns the fuzzy field dictionary for term (the
// same one FieldDictFuzzy produces) together with the Levenshtein automaton
// built for that term and fuzziness.
//
// The automaton is built first, so a fuzziness that getLevAutomaton rejects
// is reported as an error before the dictionary is computed.
//
// fuzziness is range-checked before being narrowed to uint8. A bare
// conversion wraps out-of-range values (257 -> 1, -254 -> 2), which would
// silently build an automaton for a different edit distance than the one the
// dictionary is filtered with.
//
// If the automaton does not implement vellum.FuzzyAutomaton, the returned
// index.FuzzyAutomaton is nil.
func (r *Reader) FieldDictFuzzyAutomaton(field, term string, fuzziness int, prefix string) (
	index.FieldDict, index.FuzzyAutomaton, error) {
	// Largest value representable by the uint8 parameter of getLevAutomaton.
	const maxUint8 = 1<<8 - 1
	if fuzziness < 0 || fuzziness > maxUint8 {
		return nil, nil, fmt.Errorf("fuzziness %d is out of the supported range", fuzziness)
	}

	a, err := getLevAutomaton(term, uint8(fuzziness))
	if err != nil {
		return nil, nil, err
	}

	// Leave fa as a nil interface when the assertion fails, rather than
	// wrapping a typed nil.
	var fa index.FuzzyAutomaton
	if vfa, ok := a.(vellum.FuzzyAutomaton); ok {
		fa = vfa
	}

	fd, err := r.FieldDictFuzzy(field, term, fuzziness, prefix)
	return fd, fa, err
}

func (r *Reader) FieldDictContains(field string) (index.FieldDictContains, error) {
if r.s.doc == nil {
return fieldDictContainsEmpty, nil
Expand Down Expand Up @@ -253,3 +281,30 @@ func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
// Close always returns nil; the method performs no other work.
func (r *Reader) Close() error {
return nil
}

// -----------------------------------------------------------------------------

// Reusable Levenshtein automaton builders for edit distances 1 and 2,
// populated once by init and shared by every call to getLevAutomaton
// (noted by the original author as thread-safe).
var lb1, lb2 *vellev.LevenshteinAutomatonBuilder

// init constructs both builders at package load and panics if either one
// cannot be created, since getLevAutomaton depends on them being non-nil.
func init() {
	var err error
	if lb1, err = vellev.NewLevenshteinAutomatonBuilder(1, true); err != nil {
		panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
	}
	if lb2, err = vellev.NewLevenshteinAutomatonBuilder(2, true); err != nil {
		panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
	}
}

// getLevAutomaton builds the Levenshtein DFA for term at the requested edit
// distance using the package-level builder for that distance. Only distances
// 1 and 2 have a builder; any other value yields an error.
//
// See: https://github.com/blevesearch/bleve/blob/77458c4/index/scorch/snapshot_index.go#L291
func getLevAutomaton(term string, fuzziness uint8) (vellum.Automaton, error) {
	switch fuzziness {
	case 1:
		return lb1.BuildDfa(term, fuzziness)
	case 2:
		return lb2.BuildDfa(term, fuzziness)
	default:
		return nil, fmt.Errorf("fuzziness exceeds the max limit")
	}
}

0 comments on commit ec5723c

Please sign in to comment.