diff --git a/docs/lib/search.ts.html b/docs/lib/search.ts.html index ded38c3..0729ced 100644 --- a/docs/lib/search.ts.html +++ b/docs/lib/search.ts.html @@ -17,105 +17,151 @@

./lib/search.ts annotated source


Basic principles


libsearch is the core text search algorithm that I've polished and +reused over the years across many of my personal +projects for fast and simple full-text +search, packaged into a small single-purpose JavaScript library.


For how to import and use in your own project, and for canonical +documentation, check out the GitHub repository +page.


TODO: Explain stuff...


Basic principles


libsearch uses two tricks to return full-text search results that are +reasonably good: (1) index-free, regular expression-based string search and +(2) TF-IDF ranking based on those RegExp matches:

  1. Rather than using a pre-built index that maps tokens to documents, which +requires maintenance to be kept up-to-date every time the underlying +corpus changes, libsearch transforms the search query into regular +expressions that progressively filter the corpus. This makes every search +O(n) in the size of the corpus, but in practice, for small enough n (MBs of text), this is fast +enough.
+
  2. The conventional TF-IDF formula requires knowing the number of tokens in +every document. Obtaining that count either requires a pre-built index or is +computationally expensive, so instead we approximate it using the +character count of the document. Using JavaScript's RegExp#exec with a +global regular expression lets us quickly count the number of matches of +a keyword in a document. Using these tricks, libsearch uses the formula:
+

# matching tokens / doc.length * log(# docs / # matching docs)




To turn every potential query into a regular expression, we need to be able -to escape key characters.

//> Escapes every character in `text` that has special meaning in a regular
// expression, so that the result can be embedded in a `RegExp` source string
// and will match `text` literally.
function escapeForRegExp(text: string): string {
    // `$&` inserts the matched character itself. `$1` must not be used here:
    // the pattern has no capture group, so `$1` would be emitted literally
    // and every special character would turn into the text `\$1`.
    return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$&');
}
+to escape characters that are significant in RegExp.

//> Returns a copy of `text` in which every character that is significant in
// RegExp syntax is preceded by a backslash, so the result matches `text`
// literally when used as (part of) a `RegExp` source string.
function escapeForRegExp(text: string): string {
    // The replacement must be `\` followed by `$&` (the whole match). The
    // pattern has no capture group, so `$1` would be inserted verbatim and
    // each special character would be replaced by the text `\$1`.
    return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$&');
}

Utility function for sorting an array by some predicate, rather than a -comparator function.

//> Sorts `items` in place, in descending order of the key that `by` returns
// for each item, and returns the same array. `by` is invoked for both
// operands on every comparison.
function sortBy<T>(items: T[], by: (_it: T) => any): T[] {
    const descending = (left: T, right: T): number => {
        const leftKey = by(left);
        const rightKey = by(right);
        if (leftKey < rightKey) {
            // The larger key sorts first, so `right` goes ahead of `left`.
            return 1;
        }
        return rightKey < leftKey ? -1 : 0;
    };
    items.sort(descending);
    return items;
}

The main search function takes:

+comparator function. This implementation assumes by(it) is very cheap.

//> Orders `items` from the largest to the smallest value of `by(item)`. The
// array is sorted in place and also returned. `by` is called for both
// operands of every comparison, so it should be cheap.
function sortBy<T>(items: T[], by: (_it: T) => any): T[] {
    return items.sort((first, second) => {
        const firstKey = by(first);
        const secondKey = by(second);
        // Positive puts `second` ahead of `first`; 0 means neither key is
        // smaller than the other.
        return firstKey < secondKey ? 1 : secondKey < firstKey ? -1 : 0;
    });
}

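The search function takes:

  - items, the list of items to search
  - query, the search query text
  - by, which is a predicate function that takes an item from the items list and returns the string that should be matched with the query
  - options, a dictionary of options

Options include:

  - caseSensitive, which determines whether matching distinguishes upper- and lowercase letters (false by default)
  - mode, which is 'word', 'prefix', or 'autocomplete' ('autocomplete' by default), determining the way in which partial matches are processed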

//> Filters `items` down to those whose `by(item)` text matches every
// space-separated word of `query`, and returns them ranked by an accumulated
// TF-IDF-style score, highest first. An empty query returns `items` as-is.
//
// Options:
// - `caseSensitive`: when false (the default) the 'i' flag is added.
// - `mode`: in 'prefix' (the default) no word needs to be complete; in
//   'word' every word except the last must be followed by a non-word
//   character or a line end.
export function search<T>(items: T[], query: string, by: (_it: T) => any = x => x, {
    caseSensitive = false,
    mode = 'prefix',
}: {
    caseSensitive?: boolean;
    mode?: 'word' | 'prefix';
} = {}) {
    // Counts matches by calling `exec` until it returns null.
    const occurrences = (text: string, matcher: RegExp): number => {
        let total = 0;
        while (matcher.exec(text) !== null) {
            total += 1;
        }
        return total;
    };

    const terms = query
        .trim()
        .split(' ')
        .filter(term => term !== '');
    if (terms.length === 0) {
        return items;
    }

    // The "u" (Unicode) flag is deliberately left out: it used to be set,
    // but was removed because it was slower across the board and caused
    // unpredictable performance cliffs in at least Chrome.
    const flags = caseSensitive ? 'mg' : 'img';

    const weights = new Map<T, number>();
    let remaining: T[] = items;
    for (const [idx, term] of terms.entries()) {
        const openEnded = idx + 1 === terms.length || mode === 'prefix';
        const matcher = new RegExp(
            '(^|\\W)' + escapeForRegExp(term) + (openEnded ? '' : '($|\\W)'),
            flags
        );
        // `pool` is the candidate list before this term's filter is applied;
        // its size is what the IDF factor below divides by.
        const pool = remaining;
        remaining = pool.filter(candidate => {
            const text = by(candidate);
            const hits = occurrences(text, matcher);
            if (hits === 0) {
                return false;
            }

            //> TF-IDF weighting per-term
            const soFar = weights.get(candidate) || 0;
            weights.set(
                candidate,
                soFar + (hits / text.length * Math.log(items.length / pool.length))
            );
            return true;
        });
    }

    return sortBy(remaining, candidate => weights.get(candidate));
}
//> Filters `items` down to those whose `by(item)` text matches every word of
// `query`, and returns them ranked by an accumulated TF-IDF-style score,
// highest first.
//
// Options:
// - `caseSensitive`: when false (the default) the 'i' flag is added.
// - `mode`: 'prefix' lets every query word be incomplete, 'autocomplete'
//   (the default) lets only the last query word be incomplete, and 'word'
//   requires every word to be followed by a non-word character or line end.
export function search<T>(items: T[], query: string, by: (_it: T) => any = x => x, {
    caseSensitive = false,
    mode = 'autocomplete',
}: {
    caseSensitive?: boolean;
    mode?: 'word' | 'prefix' | 'autocomplete';
} = {}) {
    //> `tally` counts how many times `pattern` matches in `haystack` by
    // calling `exec` until it returns null. The count feeds the ranking:
    // documents that mention a keyword more often relative to their length
    // (measured as character count) score higher.
    const tally = (haystack: string, pattern: RegExp): number => {
        let hits = 0;
        for (let m = pattern.exec(haystack); m !== null; m = pattern.exec(haystack)) {
            hits += 1;
        }
        return hits;
    };

    //> Split the query on runs of whitespace into "terms", each of which
    // becomes one regular expression filter.
    const terms = query
        .trim()
        .split(/\s+/)
        .filter(term => term !== '');

    //> An empty query returns the original list untouched. This is a
    // sensible default because in most apps it corresponds to the "home
    // view" of the list, where no search has been performed.
    if (terms.length === 0) {
        return items;
    }

    //> The 'u' flag for Unicode is deliberately left out: it used to be set,
    // but was removed because it was slower across the board and caused at
    // least Chrome to hit unpredictable performance cliffs where certain
    // RegExp operations would take 10s of ms.
    const flags = caseSensitive ? 'mg' : 'img';

    //> Accumulated score per surviving document, summed over all terms and
    // used for the final sort.
    const scores = new Map<T, number>();

    //> Apply one filter per term, so that `candidates` shrinks to just the
    // documents that match every term.
    let candidates: T[] = items;
    terms.forEach((term, index) => {
        const isFinalTerm = index === terms.length - 1;
        const openEnded = mode === 'prefix' || (mode === 'autocomplete' && isFinalTerm);
        const pattern = new RegExp(
            '(^|\\W)' + escapeForRegExp(term) + (openEnded ? '' : '($|\\W)'),
            flags
        );

        // `pool` is the candidate list before this term's filter runs; the
        // IDF factor divides by its size.
        const pool = candidates;
        const idf = Math.log(items.length / pool.length);
        candidates = pool.filter(doc => {
            const text = by(doc);
            const count = tally(text, pattern);
            if (count === 0) {
                return false;
            }

            //> Add this term's contribution to the document's score so far.
            const previous = scores.get(doc) || 0;
            scores.set(doc, previous + (count / text.length * idf));
            return true;
        });
    });

    //> Sort the results list by our ranking metric, TF-IDF
    return sortBy(candidates, doc => scores.get(doc));
}
diff --git a/docs/test/search.js.html b/docs/test/search.js.html index abf7ba0..6f1b6db 100644 --- a/docs/test/search.js.html +++ b/docs/test/search.js.html @@ -20,135 +20,157 @@

./test/search.js annotated source

1import {strict as assert} from 'node:assert';
2import {search} from '../dist/search.js';
4function item(name) {
5    return {name};

Most of the tests work on this pre-set list of items to search

9const ITEMS = [
10    item('Linus Lee'),
11    item('@thesephist'),
12    item('@geohot'),
13    item('linuslee'),
14    item('linus is a person'),
15    item('@dlwlrma'),
18describe('basic search', () => {
19    it('search empty array', () => {
20        assert.deepEqual(search([], 'query', x => x.name), []);
21    });
23    it('search with empty query', () => {
24        assert.deepEqual(search(ITEMS, '', x => x.name), ITEMS);
25    });
27    it('search with 1 letter returns correct result', () => {
28        assert.deepEqual(search(ITEMS, 'l', x => x.name), [
29            item('Linus Lee'),
30            item('linuslee'),
31            item('linus is a person'),
32        ]);
33    });
35    it('multi-word search returns correct result', () => {
36        assert.deepEqual(search(ITEMS, 'linus lee', x => x.name), [
37            item('Linus Lee'),
38        ]);
39    });
41    it('searching words out of order returns correct result', () => {
42        assert.deepEqual(search(ITEMS, 'lee linus', x => x.name), [
43            item('Linus Lee'),
44        ]);
45    });
47    it('search works even if the last query word is incomplete', () => {
48        assert.deepEqual(search(ITEMS, 'linus le', x => x.name), [
49            item('Linus Lee'),
50        ]);
51    });
53    it('correctly implements TF-IDF ranking', () => {
4const item = name => ({name});

Most of the tests operate on this pre-set list of items to search

7const ITEMS = [
8    item('Linus Lee'),
9    item('@thesephist'),
10    item('@geohot'),
11    item('linuslee'),
12    item('linus is a person'),
13    item('@dlwlrma'),
16describe('basic search', () => {
17    it('search empty array', () => {
18        assert.deepEqual(search([], 'query', x => x.name), []);
19    });
21    it('search with empty query', () => {
22        assert.deepEqual(search(ITEMS, '', x => x.name), ITEMS);
23    });
25    it('search with 1 letter returns correct result', () => {
26        assert.deepEqual(search(ITEMS, 'l', x => x.name), [
27            item('Linus Lee'),
28            item('linuslee'),
29            item('linus is a person'),
30        ]);
31    });
33    it('search does not match from middle of words', () => {
34        assert.deepEqual(search(ITEMS, 'w', x => x.name), []);
35    });
37    it('multi-word search returns correct result', () => {
38        assert.deepEqual(search(ITEMS, 'linus lee', x => x.name), [
39            item('Linus Lee'),
40        ]);
41    });
43    it('searching words out of order returns correct result', () => {
44        assert.deepEqual(search(ITEMS, 'lee linus', x => x.name), [
45            item('Linus Lee'),
46        ]);
47    });
49    it('search works even if the last query word is incomplete', () => {
50        assert.deepEqual(search(ITEMS, 'linus le', x => x.name), [
51            item('Linus Lee'),
52        ]);
53    });
55    it('search query may contain newlines, tabs, and multiple consecutive spaces', () => {
56        assert.deepEqual(search(ITEMS, '  linus\t is\nperson\t', x => x.name), [
57            item('linus is a person'),
58        ]);
59    });
61    it('correctly implements TF-IDF ranking', () => {

In this example, "mango" has much higher IDF (is a higher-signal word) in the corpus than "apple", which appears in nearly every document. Therefore, documents in which "mango" makes up a larger share of the text should be ranked higher.

57        assert.deepEqual(
58            search([
59                // matches
60                item('mango mango mango apple'),
61                item('mango apple mango apple'),
62                item('apple mango apple mango apple mango apple mango'),
63                item('apple apple apple apple apple apple apple apple mango'),
64                // rejects
65                item('apple apple apple'),
66                item('mango mango mango'),
67                item('applemango'),
68                item('mangoapple'),
69                item('apple 1'),
70                item('apple 2'),
71                item('apple 3'),
72                item('apple 4'),
73                item('apple 5'),
74                item('apple 6'),
75                item('apple 7'),
76                item('apple 8'),
77                item('apple 9'),
78            ], 'apple mango', x => x.name),
79            [
80                item('mango mango mango apple'),
81                item('mango apple mango apple'),
82                item('apple mango apple mango apple mango apple mango'),
83                item('apple apple apple apple apple apple apple apple mango'),
84            ]
85        );
86    });
89describe('custom search-by predicates', () => {
90    it('default predicate is provided as x => x', () => {
91        assert.deepEqual(
92            search([
93                'university',
94                'uni of california',
95                'university of california',
96            ], 'uni of cali'),
97            [
98                'uni of california',
99                'university of california',
100            ]
101        );
102    });
104    it('accepts and uses a custom predicate', () => {
105        assert.deepEqual(search(ITEMS, 'sunil ee', x => x.name.split('').reverse().join('')), [
106            item('Linus Lee'),
107        ]);
108    });
65        assert.deepEqual(
66            search([
67                // matches
68                item('mango mango mango apple'),
69                item('mango apple mango apple'),
70                item('apple mango apple mango apple mango apple mango'),
71                item('apple apple apple apple apple apple apple apple mango'),
72                // rejects
73                item('apple apple apple'),
74                item('mango mango mango'),
75                item('applemango'),
76                item('mangoapple'),
77                item('apple 1'),
78                item('apple 2'),
79                item('apple 3'),
80                item('apple 4'),
81                item('apple 5'),
82                item('apple 6'),
83                item('apple 7'),
84                item('apple 8'),
85                item('apple 9'),
86            ], 'apple mango', x => x.name),
87            [
88                item('mango mango mango apple'),
89                item('mango apple mango apple'),
90                item('apple mango apple mango apple mango apple mango'),
91                item('apple apple apple apple apple apple apple apple mango'),
92            ]
93        );
94    });
97describe('custom search-by predicates', () => {
98    it('default predicate is provided as x => x', () => {
99        assert.deepEqual(
100            search([
101                'university',
102                'uni of california',
103                'university of california',
104            ], 'uni of cali'),
105            [
106                'uni of california',
107            ]
108        );
109    });
111describe('search modes', () => {
112    it('in mode: prefix, every query word can be incomplete', () => {
113        assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'prefix'}), [
114            item('Linus Lee'),
115        ])
116    });
111    it('accepts and uses a custom predicate', () => {
112        assert.deepEqual(search(ITEMS, 'sunil ee', x => x.name.split('').reverse().join('')), [
113            item('Linus Lee'),
114        ]);
115    });
118    it('in mode: word, search does not match if non-last words are incomplete', () => {
119        assert.deepEqual(search(ITEMS, 'linu lee', x => x.name, {mode: 'word'}), []);
120    });
118describe('search modes', () => {
119    it('in mode: word, search does not match if any words are incomplete', () => {
120        assert.deepEqual(search(ITEMS, 'linu lee', x => x.name, {mode: 'word'}), []);
121    });
123describe('case sensitivity', () => {
124    it('caseSensitive: true omits non-matching results', () => {
125        assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: true}), [
126            item('linuslee'),
127            item('linus is a person'),
128        ]);
129    });
123    it('in mode: prefix, every query word may be incomplete', () => {
124        assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'prefix'}), [
125            item('Linus Lee'),
126        ]);
127    });
129    it('in mode: autocomplete, only the last query word may be incomplete', () => {
130        assert.deepEqual(search(ITEMS, 'linus le', x => x.name, {mode: 'autocomplete'}), [
131            item('Linus Lee'),
132        ]);
133        assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'autocomplete'}), []);
134    });
137describe('case sensitivity', () => {
138    it('caseSensitive: true omits non-matching results', () => {
139        assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: true}), [
140            item('linuslee'),
141            item('linus is a person'),
142        ]);
143    });
145    it('caseSensitive: false includes case-insensitive results', () => {
146        assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: false}), [
147            item('Linus Lee'),
148            item('linuslee'),
149            item('linus is a person'),
150        ]);
151    });
diff --git a/lib/search.ts b/lib/search.ts index fe61f8b..e2faf6e 100644 --- a/lib/search.ts +++ b/lib/search.ts @@ -1,17 +1,43 @@ -//> ## Basic principles +//> **libsearch** is the core text search algorithm that I've polished and +// reused over the years across [many of my personal +// projects](https://thesephist/projects) for fast and simple full-text +// search, packaged into a small single-purpose JavaScript library. +// +// For how to import and use in your own project, and for canonical +// documentation, check out the [GitHub repository +// page](https://github.com/thesephist/libsearch). -//> TODO: Explain stuff... +//> ## Basic principles +// +// libsearch uses two tricks to return full-text search results that are +// reasonably good: (1) index-free, regular expression-based string search and +// (2) TF-IDF ranking based on those RegExp matches: +// +// 1. Rather than using a pre-built index that maps tokens to documents, which +// requires maintenance to be kept up-to-date every time the underlying +// corpus changes, libsearch transforms the search query into regular +// expressions that progressively filter the corpus. In theory, this is +// O(n), but in practice, for small enough n (MBs of text), this is good +// enough. +// 2. The conventional TF-IDF formula requires knowing the number of tokens in +// every document. This requires either a pre-built index, or is +// computationally expensive, so instead we approximate this using the +// character count of the document. Using JavaScript's RegExp#exec with a +// global regular expression lets us quickly count the number of matches of +// a keyword in a document. Using these tricks, libsearch uses the formula: +// +// # tokens / doc.length * log(# docs / # matching docs) //> ## Implementation //> To turn every potential query into a regular expression, we need to be able -// to escape key characters. +// to escape characters that are significant in RegExp. 
function escapeForRegExp(text: string): string { return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$1'); } //> Utility function for sorting an array by some predicate, rather than a -// comparator function. +// comparator function. This implementation assumes `by(it)` is very cheap. function sortBy(items: T[], by: (_it: T) => any): T[] { return items.sort((a, b) => { const aby = by(a); @@ -26,21 +52,28 @@ function sortBy(items: T[], by: (_it: T) => any): T[] { }); } -//> The main search function takes: +//> The search function takes: // - `items`, the list of items to search // - `query`, the search query text -// - `by`, which is a predicate (string, number, or function) that takes an item from the items list and returns the string that should be matched with the query +// - `by`, which is a predicate function that takes an item from the items +// list and returns the string that should be matched with the query +// - `options`, a dictionary of options: // // Options include // - `caseSensitive`, which is self-explanatory -// - `mode`: which is 'word' or 'prefix' ('prefix' by default) +// - `mode`: which is 'word', 'prefix', or 'autocomplete' ('autocomplete' by +// default), determining the way in which partial matches are processed export function search(items: T[], query: string, by: (_it: T) => any = x => x, { caseSensitive = false, - mode = 'prefix', + mode = 'autocomplete', }: { caseSensitive?: boolean; - mode?: 'word' | 'prefix'; + mode?: 'word' | 'prefix' | 'autocomplete'; } = {}) { + //> `countMatches` counts the number of times `regexp` occurs in the string + // `s`. We need this information for ranking, where documents that mention + // the keyword more times (relative to the total word count of the + // document) are ranked higher. 
function countMatches(s: string, regexp: RegExp): number { let i = 0; while (regexp.exec(s) !== null) { @@ -49,44 +82,59 @@ export function search(items: T[], query: string, by: (_it: T) => any = x => return i; } + //> We chunk up the query string into a list of "words", each of which will + // become a regular expression filter. const words = query .trim() - .split(' ') + .split(/\s+/) .filter(s => s !== ''); + //> Short-circuit if the search query is empty -- return the original list. + // This is a sensible default because in most apps this corresponds to the + // "home view" of the list, where a search has not been performed. if (words.length === 0) { return items; } + //> For every word in the search query, we're going to keep track of every + // document's TF-IDF value in this map, and aggregate them together by the + // end for sorting. const tfidf = new Map(); - const suggestions = words.reduce((suggestions, word, i) => { + + //> Iterate through every word in the query and progressively filter down + // `items` to just the documents that match every query word. + const results = words.reduce((results, word, i) => { const isLastWord = i + 1 === words.length; const regexp = new RegExp( - '(^|\\W)' + escapeForRegExp(word) + (isLastWord || mode === 'prefix' ? '' : '($|\\W)'), - // the "u" flag for Unicode used to be used here, but was removed - // because it was (1) across-the-board too slow, and removing it - // made a statistically significant speed improvement, and (2) - // caused at least Chrome to have strange performance cliffs in - // unpredictable ways where certain regexp operations would take - // 10s of ms. + '(^|\\W)' + + escapeForRegExp(word) + + ((mode === 'autocomplete' && isLastWord) || mode === 'prefix' ? 
'' : '($|\\W)'), + //> The 'u' flag for Unicode used to be used here, but was removed + // because it was (1) across-the-board too slow, and removing it + // made a statistically significant speed improvement, and (2) + // caused at least Chrome to have strange performance cliffs in + // unpredictable ways where certain RegExp operations would take + // 10s of ms. caseSensitive ? 'mg' : 'img' ); - return suggestions.filter(sugg => { - const text = by(sugg); + return results.filter(result => { + const text = by(result); const count = countMatches(text, regexp); if (count === 0) { return false; } - //> TF-IDF weighting per-term + //> Compute the TF-IDF value for this `word`, and add it to this + // result's TF-IDF value so far. tfidf.set( - sugg, - (tfidf.get(sugg) || 0) - + (count / text.length * Math.log(items.length / suggestions.length)) + result, + (tfidf.get(result) || 0) + + (count / text.length * Math.log(items.length / results.length)) ); return true; }) }, items); - return sortBy(suggestions, sugg => tfidf.get(sugg)); + //> Sort the results list by our ranking metric, TF-IDF + return sortBy(results, result => tfidf.get(result)); }