From ec5723c72f69edda7acf50de011711b68bf8a298 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Tue, 7 Jan 2025 14:35:14 -0700 Subject: [PATCH] Upgrade vellum and accommodate change to Reader interfaces --- fuzzy.go | 2 +- fuzzy_test.go | 4 +-- go.mod | 2 +- go.sum | 4 +-- reader.go | 71 +++++++++++++++++++++++++++++++++++++++++++++------ 5 files changed, 69 insertions(+), 14 deletions(-) diff --git a/fuzzy.go b/fuzzy.go index b231731..cedd24d 100644 --- a/fuzzy.go +++ b/fuzzy.go @@ -14,7 +14,7 @@ package sear -func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (dist int, exceeded bool, reuse []int) { +func levenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (dist int, exceeded bool, reuse []int) { la := len(a) lb := len(b) diff --git a/fuzzy_test.go b/fuzzy_test.go index caf34ed..db158d1 100644 --- a/fuzzy_test.go +++ b/fuzzy_test.go @@ -70,13 +70,13 @@ func TestFuzzyMatch(t *testing.T) { test := test t.Run(fmt.Sprintf("%s-%d", test.searchTerm, test.fuzziness), func(t *testing.T) { for _, sm := range test.shouldMatch { - dist, exceeded, _ := LevenshteinDistanceMaxReuseSlice(test.searchTerm, sm, test.fuzziness, nil) + dist, exceeded, _ := levenshteinDistanceMaxReuseSlice(test.searchTerm, sm, test.fuzziness, nil) if dist > test.fuzziness || exceeded { t.Errorf("expected %s to match, did not", sm) } } for _, snm := range test.shouldNotMatch { - dist, exceeded, _ := LevenshteinDistanceMaxReuseSlice(test.searchTerm, snm, test.fuzziness, nil) + dist, exceeded, _ := levenshteinDistanceMaxReuseSlice(test.searchTerm, snm, test.fuzziness, nil) if dist <= test.fuzziness && !exceeded { t.Errorf("expected %s not to match, did", snm) } diff --git a/go.mod b/go.mod index 2746678..e5b2160 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.21 require ( github.com/blevesearch/bleve_index_api v1.2.1 - github.com/blevesearch/vellum v1.0.10 + github.com/blevesearch/vellum v1.1.0 ) require ( diff --git a/go.sum b/go.sum index e22d4a5..97ed339 100644 --- a/go.sum +++ b/go.sum @@ -4,7 +4,7 @@ github.com/blevesearch/bleve_index_api v1.2.1 h1:IuXwLvmyp7I7+e0FOA68gcHHLfzSQ4A github.com/blevesearch/bleve_index_api v1.2.1/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI= -github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k= +github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w= +github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a h1:dGzPydgVsqGcTRVwiLJ1jVbufYwmzD3LfVPLKsKg+0k= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/reader.go b/reader.go index 96822a1..ab3cce2 100644 --- a/reader.go +++ b/reader.go @@ -21,9 +21,9 @@ import ( "sort" "strings" - "github.com/blevesearch/vellum" - index "github.com/blevesearch/bleve_index_api" + "github.com/blevesearch/vellum" + vellev "github.com/blevesearch/vellum/levenshtein" velreg "github.com/blevesearch/vellum/regexp" ) @@ -150,29 +150,41 @@ func automatonMatch(la vellum.Automaton, termStr string) bool { } func (r *Reader) FieldDictRegexp(field, regexStr string) (index.FieldDict, error) { + fd, _, err := r.fieldDictRegexp(field, regexStr) + return fd, err +} + +func (r *Reader) FieldDictRegexpAutomaton(field, regexStr string) ( + index.FieldDict, index.RegexAutomaton, error) { + return r.fieldDictRegexp(field, regexStr) +} + +func (r *Reader) fieldDictRegexp(field, regexStr string) ( + index.FieldDict, index.RegexAutomaton, error) { regex, cached := r.velregCache[regexStr] if !cached { var err error regex, err = velreg.New(regexStr) if err != nil { - return nil, fmt.Errorf("error compiling regexp: %v", err) + return nil, nil, fmt.Errorf("error compiling regexp: %v", err) } r.velregCache[regexStr] = regex } if r.s.doc == nil { - return fieldDictEmpty, nil + return fieldDictEmpty, regex, nil } fieldSortedTerms, err := r.s.doc.SortedTermsForField(field) if err != nil { // only error is field doesn't exist in doc - return fieldDictEmpty, nil + return fieldDictEmpty, regex, nil } return NewFieldDictWithTerms(fieldSortedTerms, func(s string) bool { return automatonMatch(regex, s) - }), nil + }), regex, nil } -func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string) (index.FieldDict, error) { +func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string) ( + index.FieldDict, error) { if r.s.doc == nil { return fieldDictEmpty, nil } @@ -184,7 +196,8 @@ func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string return NewFieldDictWithTerms(fieldSortedTerms, func(indexTerm string) bool { var dist int var exceeded bool - dist, exceeded, r.levSlice = LevenshteinDistanceMaxReuseSlice(term, indexTerm, fuzziness, r.levSlice) + dist, exceeded, r.levSlice = levenshteinDistanceMaxReuseSlice( + term, indexTerm, fuzziness, r.levSlice) if dist <= fuzziness && !exceeded { return true } @@ -192,6 +205,21 @@ func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string }), nil } +func (r *Reader) FieldDictFuzzyAutomaton(field, term string, fuzziness int, prefix string) ( + index.FieldDict, index.FuzzyAutomaton, error) { + a, err := getLevAutomaton(term, uint8(fuzziness)) + if err != nil { + return nil, nil, err + } + var fa index.FuzzyAutomaton + if vfa, ok := a.(vellum.FuzzyAutomaton); ok { + fa = vfa + } + + fd, err := r.FieldDictFuzzy(field, term, fuzziness, prefix) + return fd, fa, err +} + func (r *Reader) FieldDictContains(field string) (index.FieldDictContains, error) { if r.s.doc == nil { return fieldDictContainsEmpty, nil @@ -253,3 +281,30 @@ func (r *Reader) InternalID(id string) (index.IndexInternalID, error) { func (r *Reader) Close() error { return nil } + +// ----------------------------------------------------------------------------- + +// re usable, threadsafe levenshtein builders +var lb1, lb2 *vellev.LevenshteinAutomatonBuilder + +func init() { + var err error + lb1, err = vellev.NewLevenshteinAutomatonBuilder(1, true) + if err != nil { + panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) + } + lb2, err = vellev.NewLevenshteinAutomatonBuilder(2, true) + if err != nil { + panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) + } +} + +// https://github.com/blevesearch/bleve/blob/77458c4/index/scorch/snapshot_index.go#L291 +func getLevAutomaton(term string, fuzziness uint8) (vellum.Automaton, error) { + if fuzziness == 1 { + return lb1.BuildDfa(term, fuzziness) + } else if fuzziness == 2 { + return lb2.BuildDfa(term, fuzziness) + } + return nil, fmt.Errorf("fuzziness exceeds the max limit") +}