From 2e8d275e1148fe3e1629461378735fab76059c71 Mon Sep 17 00:00:00 2001 From: Linus Lee Date: Thu, 21 Jul 2022 10:45:20 -0400 Subject: [PATCH] docs: Add code annotations --- docs/lib/search.ts.html | 216 ++++++++++++++++++------------- docs/test/search.js.html | 268 +++++++++++++++++++++------------------ lib/search.ts | 98 ++++++++++---- 3 files changed, 349 insertions(+), 233 deletions(-) diff --git a/docs/lib/search.ts.html b/docs/lib/search.ts.html index ded38c3..0729ced 100644 --- a/docs/lib/search.ts.html +++ b/docs/lib/search.ts.html @@ -17,105 +17,151 @@

./lib/search.ts annotated source


         
-        

Basic principles

-
2
+

libsearch is the core text search algorithm that I've polished and +reused over the years across many of my personal +projects for fast and simple full-text +search, packaged into a small single-purpose JavaScript library.

+

For how to import and use in your own project, and for canonical +documentation, check out the GitHub repository +page.

+
9
-

TODO: Explain stuff...

-
4
+

Basic principles

+

libsearch uses two tricks to return full-text search results that are +reasonably good: (1) index-free, regular expression-based string search and +(2) TF-IDF ranking based on those RegExp matches:

+
    +
  1. Rather than using a pre-built index that maps tokens to documents, which +requires maintenance to be kept up-to-date every time the underlying +corpus changes, libsearch transforms the search query into regular +expressions that progressively filter the corpus. In theory, this is +O(n), but in practice, for small enough n (MBs of text), this is good +enough.
    +
  2. The conventional TF-IDF formula requires knowing the number of tokens in +every document. This requires either a pre-built index, or is +computationally expensive, so instead we approximate this using the +character count of the document. Using JavaScript's RegExp#exec with a +global regular expression lets us quickly count the number of matches of +a keyword in a document. Using these tricks, libsearch uses the formula:
    +
+

# tokens / doc.length * log(# docs / # matching docs)

+
30

Implementation

-
6
+
32

To turn every potential query into a regular expression, we need to be able -to escape key characters.

-
9function escapeForRegExp(text: string): string {
-
10    return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$1');
-
11}
-
12
+to escape characters that are significant in RegExp.

+
35function escapeForRegExp(text: string): string {
+
36    return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$1');
+
37}
+
38

Utility function for sorting an array by some predicate, rather than a -comparator function.

-
15function sortBy<T>(items: T[], by: (_it: T) => any): T[] {
-
16    return items.sort((a, b) => {
-
17        const aby = by(a);
-
18        const bby = by(b);
-
19        if (aby < bby) {
-
20            return 1;
-
21        }
-
22        if (bby < aby) {
-
23            return -1;
-
24        }
-
25        return 0;
-
26    });
-
27}
-
28
-

The main search function takes:

+comparator function. This implementation assumes by(it) is very cheap.

+
41function sortBy<T>(items: T[], by: (_it: T) => any): T[] {
+
42    return items.sort((a, b) => {
+
43        const aby = by(a);
+
44        const bby = by(b);
+
45        if (aby < bby) {
+
46            return 1;
+
47        }
+
48        if (bby < aby) {
+
49            return -1;
+
50        }
+
51        return 0;
+
52    });
+
53}
+
54
+

The search function takes:

  • items, the list of items to search
  • query, the search query text
  • -
  • by, which is a predicate (string, number, or function) that takes an item from the items list and returns the string that should be matched with the query
  • +
  • by, which is a predicate function that takes an item from the items +list and returns the string that should be matched with the query
  • +
  • options, a dictionary of options:

Options include

  • caseSensitive, which is self-explanatory
  • -
  • mode: which is 'word' or 'prefix' ('prefix' by default)
  • +
  • mode: which is 'word', 'prefix', or 'autocomplete' ('autocomplete' by +default), determining the way in which partial matches are processed
-
37export function search<T>(items: T[], query: string, by: (_it: T) => any = x => x, {
-
38    caseSensitive = false,
-
39    mode = 'prefix',
-
40}: {
-
41    caseSensitive?: boolean;
-
42    mode?: 'word' | 'prefix';
-
43} = {}) {
-
44    function countMatches(s: string, regexp: RegExp): number {
-
45        let i = 0;
-
46        while (regexp.exec(s) !== null) {
-
47            i ++;
-
48        }
-
49        return i;
-
50    }
-
51
-
52    const words = query
-
53        .trim()
-
54        .split(' ')
-
55        .filter(s => s !== '');
-
56
-
57    if (words.length === 0) {
-
58        return items;
-
59    }
-
60
-
61    const tfidf = new Map<T, number>();
-
62    const suggestions = words.reduce((suggestions, word, i) => {
-
63        const isLastWord = i + 1 === words.length;
-
64        const regexp = new RegExp(
-
65            '(^|\\W)' + escapeForRegExp(word) + (isLastWord || mode === 'prefix' ? '' : '($|\\W)'),
-
66            // the "u" flag for Unicode used to be used here, but was removed
-
67            // because it was (1) across-the-board too slow, and removing it
-
68            // made a statistically significant speed improvement, and (2)
-
69            // caused at least Chrome to have strange performance cliffs in
-
70            // unpredictable ways where certain regexp operations would take
-
71            // 10s of ms.
-
72            caseSensitive ? 'mg' : 'img'
-
73        );
-
74        return suggestions.filter(sugg => {
-
75            const text = by(sugg);
-
76            const count = countMatches(text, regexp);
-
77            if (count === 0) {
-
78                return false;
-
79            }
-

TF-IDF weighting per-term

-
81            tfidf.set(
-
82                sugg,
-
83                (tfidf.get(sugg) || 0)
-
84                    + (count / text.length * Math.log(items.length / suggestions.length))
-
85            );
-
86            return true;
-
87        })
-
88    }, items);
-
89
-
90    return sortBy(suggestions, sugg => tfidf.get(sugg));
-
91}
-
92
-
93
+
66export function search<T>(items: T[], query: string, by: (_it: T) => any = x => x, {
+
67    caseSensitive = false,
+
68    mode = 'autocomplete',
+
69}: {
+
70    caseSensitive?: boolean;
+
71    mode?: 'word' | 'prefix' | 'autocomplete';
+
72} = {}) {
+

countMatches counts the number of times regexp occurs in the string +s. We need this information for ranking, where documents that mention +the keyword more times (relative to the total word count of the +document) are ranked higher.

+
77    function countMatches(s: string, regexp: RegExp): number {
+
78        let i = 0;
+
79        while (regexp.exec(s) !== null) {
+
80            i ++;
+
81        }
+
82        return i;
+
83    }
+
84
+

We chunk up the query string into a list of "words", each of which will +become a regular expression filter.

+
87    const words = query
+
88        .trim()
+
89        .split(/\s+/)
+
90        .filter(s => s !== '');
+
91
+

Short-circuit if the search query is empty -- return the original list. +This is a sensible default because in most apps this corresponds to the +"home view" of the list, where a search has not been performed.

+
95    if (words.length === 0) {
+
96        return items;
+
97    }
+
98
+

For every word in the search query, we're going to keep track of every +document's TF-IDF value in this map, and aggregate them together by the +end for sorting.

+
102    const tfidf = new Map<T, number>();
+
103
+

Iterate through every word in the query and progressively filter down +items to just the documents that match every query word.

+
106    const results = words.reduce((results, word, i) => {
+
107        const isLastWord = i + 1 === words.length;
+
108        const regexp = new RegExp(
+
109            '(^|\\W)'
+
110                + escapeForRegExp(word)
+
111                + ((mode === 'autocomplete' && isLastWord) || mode === 'prefix' ? '' : '($|\\W)'),
+

The 'u' flag for Unicode used to be used here, but was removed +because it was (1) across-the-board too slow, and removing it +made a statistically significant speed improvement, and (2) +caused at least Chrome to have strange performance cliffs in +unpredictable ways where certain RegExp operations would take +10s of ms.

+
118            caseSensitive ? 'mg' : 'img'
+
119        );
+
120        return results.filter(result => {
+
121            const text = by(result);
+
122            const count = countMatches(text, regexp);
+
123            if (count === 0) {
+
124                return false;
+
125            }
+

Compute the TF-IDF value for this word, and add it to this +result's TF-IDF value so far.

+
128            tfidf.set(
+
129                result,
+
130                (tfidf.get(result) || 0)
+
131                    + (count / text.length * Math.log(items.length / results.length))
+
132            );
+
133            return true;
+
134        })
+
135    }, items);
+
136
+

Sort the results list by our ranking metric, TF-IDF

+
138    return sortBy(results, result => tfidf.get(result));
+
139}
+
140
+
141
diff --git a/docs/test/search.js.html b/docs/test/search.js.html index abf7ba0..6f1b6db 100644 --- a/docs/test/search.js.html +++ b/docs/test/search.js.html @@ -20,135 +20,157 @@

./test/search.js annotated source

1import {strict as assert} from 'node:assert';
2import {search} from '../dist/search.js';
3
-
4function item(name) {
-
5    return {name};
-
6}
-
7
-

Most of the tests work on this pre-set list of items to search

-
9const ITEMS = [
-
10    item('Linus Lee'),
-
11    item('@thesephist'),
-
12    item('@geohot'),
-
13    item('linuslee'),
-
14    item('linus is a person'),
-
15    item('@dlwlrma'),
-
16];
-
17
-
18describe('basic search', () => {
-
19    it('search empty array', () => {
-
20        assert.deepEqual(search([], 'query', x => x.name), []);
-
21    });
-
22
-
23    it('search with empty query', () => {
-
24        assert.deepEqual(search(ITEMS, '', x => x.name), ITEMS);
-
25    });
-
26
-
27    it('search with 1 letter returns correct result', () => {
-
28        assert.deepEqual(search(ITEMS, 'l', x => x.name), [
-
29            item('Linus Lee'),
-
30            item('linuslee'),
-
31            item('linus is a person'),
-
32        ]);
-
33    });
-
34
-
35    it('multi-word search returns correct result', () => {
-
36        assert.deepEqual(search(ITEMS, 'linus lee', x => x.name), [
-
37            item('Linus Lee'),
-
38        ]);
-
39    });
-
40
-
41    it('searching words out of order returns correct result', () => {
-
42        assert.deepEqual(search(ITEMS, 'lee linus', x => x.name), [
-
43            item('Linus Lee'),
-
44        ]);
-
45    });
-
46
-
47    it('search works even if the last query word is incomplete', () => {
-
48        assert.deepEqual(search(ITEMS, 'linus le', x => x.name), [
-
49            item('Linus Lee'),
-
50        ]);
-
51    });
-
52
-
53    it('correctly implements TF-IDF ranking', () => {
+
4const item = name => ({name});
+
5
+

Most of the tests operate on this pre-set list of items to search

+
7const ITEMS = [
+
8    item('Linus Lee'),
+
9    item('@thesephist'),
+
10    item('@geohot'),
+
11    item('linuslee'),
+
12    item('linus is a person'),
+
13    item('@dlwlrma'),
+
14];
+
15
+
16describe('basic search', () => {
+
17    it('search empty array', () => {
+
18        assert.deepEqual(search([], 'query', x => x.name), []);
+
19    });
+
20
+
21    it('search with empty query', () => {
+
22        assert.deepEqual(search(ITEMS, '', x => x.name), ITEMS);
+
23    });
+
24
+
25    it('search with 1 letter returns correct result', () => {
+
26        assert.deepEqual(search(ITEMS, 'l', x => x.name), [
+
27            item('Linus Lee'),
+
28            item('linuslee'),
+
29            item('linus is a person'),
+
30        ]);
+
31    });
+
32
+
33    it('search does not match from middle of words', () => {
+
34        assert.deepEqual(search(ITEMS, 'w', x => x.name), []);
+
35    });
+
36
+
37    it('multi-word search returns correct result', () => {
+
38        assert.deepEqual(search(ITEMS, 'linus lee', x => x.name), [
+
39            item('Linus Lee'),
+
40        ]);
+
41    });
+
42
+
43    it('searching words out of order returns correct result', () => {
+
44        assert.deepEqual(search(ITEMS, 'lee linus', x => x.name), [
+
45            item('Linus Lee'),
+
46        ]);
+
47    });
+
48
+
49    it('search works even if the last query word is incomplete', () => {
+
50        assert.deepEqual(search(ITEMS, 'linus le', x => x.name), [
+
51            item('Linus Lee'),
+
52        ]);
+
53    });
+
54
+
55    it('search query may contain newlines, tabs, and multiple consecutive spaces', () => {
+
56        assert.deepEqual(search(ITEMS, '  linus\t is\nperson\t', x => x.name), [
+
57            item('linus is a person'),
+
58        ]);
+
59    });
+
60
+
61    it('correctly implements TF-IDF ranking', () => {

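In this example, "mango" has much higher IDF (is a higher-signal word) in the corpus than "apple", which appears in nearly every document. Therefore, the matching documents are expected to be ranked by how densely "mango" occurs in them relative to their length.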

-
57        assert.deepEqual(
-
58            search([
-
59                // matches
-
60                item('mango mango mango apple'),
-
61                item('mango apple mango apple'),
-
62                item('apple mango apple mango apple mango apple mango'),
-
63                item('apple apple apple apple apple apple apple apple mango'),
-
64                // rejects
-
65                item('apple apple apple'),
-
66                item('mango mango mango'),
-
67                item('applemango'),
-
68                item('mangoapple'),
-
69                item('apple 1'),
-
70                item('apple 2'),
-
71                item('apple 3'),
-
72                item('apple 4'),
-
73                item('apple 5'),
-
74                item('apple 6'),
-
75                item('apple 7'),
-
76                item('apple 8'),
-
77                item('apple 9'),
-
78            ], 'apple mango', x => x.name),
-
79            [
-
80                item('mango mango mango apple'),
-
81                item('mango apple mango apple'),
-
82                item('apple mango apple mango apple mango apple mango'),
-
83                item('apple apple apple apple apple apple apple apple mango'),
-
84            ]
-
85        );
-
86    });
-
87});
-
88
-
89describe('custom search-by predicates', () => {
-
90    it('default predicate is provided as x => x', () => {
-
91        assert.deepEqual(
-
92            search([
-
93                'university',
-
94                'uni of california',
-
95                'university of california',
-
96            ], 'uni of cali'),
-
97            [
-
98                'uni of california',
-
99                'university of california',
-
100            ]
-
101        );
-
102    });
-
103
-
104    it('accepts and uses a custom predicate', () => {
-
105        assert.deepEqual(search(ITEMS, 'sunil ee', x => x.name.split('').reverse().join('')), [
-
106            item('Linus Lee'),
-
107        ]);
-
108    });
-
109});
+
65        assert.deepEqual(
+
66            search([
+
67                // matches
+
68                item('mango mango mango apple'),
+
69                item('mango apple mango apple'),
+
70                item('apple mango apple mango apple mango apple mango'),
+
71                item('apple apple apple apple apple apple apple apple mango'),
+
72                // rejects
+
73                item('apple apple apple'),
+
74                item('mango mango mango'),
+
75                item('applemango'),
+
76                item('mangoapple'),
+
77                item('apple 1'),
+
78                item('apple 2'),
+
79                item('apple 3'),
+
80                item('apple 4'),
+
81                item('apple 5'),
+
82                item('apple 6'),
+
83                item('apple 7'),
+
84                item('apple 8'),
+
85                item('apple 9'),
+
86            ], 'apple mango', x => x.name),
+
87            [
+
88                item('mango mango mango apple'),
+
89                item('mango apple mango apple'),
+
90                item('apple mango apple mango apple mango apple mango'),
+
91                item('apple apple apple apple apple apple apple apple mango'),
+
92            ]
+
93        );
+
94    });
+
95});
+
96
+
97describe('custom search-by predicates', () => {
+
98    it('default predicate is provided as x => x', () => {
+
99        assert.deepEqual(
+
100            search([
+
101                'university',
+
102                'uni of california',
+
103                'university of california',
+
104            ], 'uni of cali'),
+
105            [
+
106                'uni of california',
+
107            ]
+
108        );
+
109    });
110
-
111describe('search modes', () => {
-
112    it('in mode: prefix, every query word can be incomplete', () => {
-
113        assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'prefix'}), [
-
114            item('Linus Lee'),
-
115        ])
-
116    });
+
111    it('accepts and uses a custom predicate', () => {
+
112        assert.deepEqual(search(ITEMS, 'sunil ee', x => x.name.split('').reverse().join('')), [
+
113            item('Linus Lee'),
+
114        ]);
+
115    });
+
116});
117
-
118    it('in mode: word, search does not match if non-last words are incomplete', () => {
-
119        assert.deepEqual(search(ITEMS, 'linu lee', x => x.name, {mode: 'word'}), []);
-
120    });
-
121});
+
118describe('search modes', () => {
+
119    it('in mode: word, search does not match if any words are incomplete', () => {
+
120        assert.deepEqual(search(ITEMS, 'linu lee', x => x.name, {mode: 'word'}), []);
+
121    });
122
-
123describe('case sensitivity', () => {
-
124    it('caseSensitive: true omits non-matching results', () => {
-
125        assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: true}), [
-
126            item('linuslee'),
-
127            item('linus is a person'),
-
128        ]);
-
129    });
-
130});
-
131
-
132
+
123    it('in mode: prefix, every query word may be incomplete', () => {
+
124        assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'prefix'}), [
+
125            item('Linus Lee'),
+
126        ]);
+
127    });
+
128
+
129    it('in mode: autocomplete, only the last query word may be incomplete', () => {
+
130        assert.deepEqual(search(ITEMS, 'linus le', x => x.name, {mode: 'autocomplete'}), [
+
131            item('Linus Lee'),
+
132        ]);
+
133        assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'autocomplete'}), []);
+
134    });
+
135});
+
136
+
137describe('case sensitivity', () => {
+
138    it('caseSensitive: true omits non-matching results', () => {
+
139        assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: true}), [
+
140            item('linuslee'),
+
141            item('linus is a person'),
+
142        ]);
+
143    });
+
144
+
145    it('caseSensitive: false includes case-insensitive results', () => {
+
146        assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: false}), [
+
147            item('Linus Lee'),
+
148            item('linuslee'),
+
149            item('linus is a person'),
+
150        ]);
+
151    });
+
152});
+
153
+
154
diff --git a/lib/search.ts b/lib/search.ts index fe61f8b..e2faf6e 100644 --- a/lib/search.ts +++ b/lib/search.ts @@ -1,17 +1,43 @@ -//> ## Basic principles +//> **libsearch** is the core text search algorithm that I've polished and +// reused over the years across [many of my personal +// projects](https://thesephist/projects) for fast and simple full-text +// search, packaged into a small single-purpose JavaScript library. +// +// For how to import and use in your own project, and for canonical +// documentation, check out the [GitHub repository +// page](https://github.com/thesephist/libsearch). -//> TODO: Explain stuff... +//> ## Basic principles +// +// libsearch uses two tricks to return full-text search results that are +// reasonably good: (1) index-free, regular expression-based string search and +// (2) TF-IDF ranking based on those RegExp matches: +// +// 1. Rather than using a pre-built index that maps tokens to documents, which +// requires maintenance to be kept up-to-date every time the underlying +// corpus changes, libsearch transforms the search query into regular +// expressions that progressively filter the corpus. In theory, this is +// O(n), but in practice, for small enough n (MBs of text), this is good +// enough. +// 2. The conventional TF-IDF formula requires knowing the number of tokens in +// every document. This requires either a pre-built index, or is +// computationally expensive, so instead we approximate this using the +// character count of the document. Using JavaScript's RegExp#exec with a +// global regular expression lets us quickly count the number of matches of +// a keyword in a document. Using these tricks, libsearch uses the formula: +// +// # tokens / doc.length * log(# docs / # matching docs) //> ## Implementation //> To turn every potential query into a regular expression, we need to be able -// to escape key characters. +// to escape characters that are significant in RegExp. 
function escapeForRegExp(text: string): string { return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$1'); } //> Utility function for sorting an array by some predicate, rather than a -// comparator function. +// comparator function. This implementation assumes `by(it)` is very cheap. function sortBy(items: T[], by: (_it: T) => any): T[] { return items.sort((a, b) => { const aby = by(a); @@ -26,21 +52,28 @@ function sortBy(items: T[], by: (_it: T) => any): T[] { }); } -//> The main search function takes: +//> The search function takes: // - `items`, the list of items to search // - `query`, the search query text -// - `by`, which is a predicate (string, number, or function) that takes an item from the items list and returns the string that should be matched with the query +// - `by`, which is a predicate function that takes an item from the items +// list and returns the string that should be matched with the query +// - `options`, a dictionary of options: // // Options include // - `caseSensitive`, which is self-explanatory -// - `mode`: which is 'word' or 'prefix' ('prefix' by default) +// - `mode`: which is 'word', 'prefix', or 'autocomplete' ('autocomplete' by +// default), determining the way in which partial matches are processed export function search(items: T[], query: string, by: (_it: T) => any = x => x, { caseSensitive = false, - mode = 'prefix', + mode = 'autocomplete', }: { caseSensitive?: boolean; - mode?: 'word' | 'prefix'; + mode?: 'word' | 'prefix' | 'autocomplete'; } = {}) { + //> `countMatches` counts the number of times `regexp` occurs in the string + // `s`. We need this information for ranking, where documents that mention + // the keyword more times (relative to the total word count of the + // document) are ranked higher. 
function countMatches(s: string, regexp: RegExp): number { let i = 0; while (regexp.exec(s) !== null) { @@ -49,44 +82,59 @@ export function search(items: T[], query: string, by: (_it: T) => any = x => return i; } + //> We chunk up the query string into a list of "words", each of which will + // become a regular expression filter. const words = query .trim() - .split(' ') + .split(/\s+/) .filter(s => s !== ''); + //> Short-circuit if the search query is empty -- return the original list. + // This is a sensible default because in most apps this corresponds to the + // "home view" of the list, where a search has not been performed. if (words.length === 0) { return items; } + //> For every word in the search query, we're going to keep track of every + // document's TF-IDF value in this map, and aggregate them together by the + // end for sorting. const tfidf = new Map(); - const suggestions = words.reduce((suggestions, word, i) => { + + //> Iterate through every word in the query and progressively filter down + // `items` to just the documents that match every query word. + const results = words.reduce((results, word, i) => { const isLastWord = i + 1 === words.length; const regexp = new RegExp( - '(^|\\W)' + escapeForRegExp(word) + (isLastWord || mode === 'prefix' ? '' : '($|\\W)'), - // the "u" flag for Unicode used to be used here, but was removed - // because it was (1) across-the-board too slow, and removing it - // made a statistically significant speed improvement, and (2) - // caused at least Chrome to have strange performance cliffs in - // unpredictable ways where certain regexp operations would take - // 10s of ms. + '(^|\\W)' + + escapeForRegExp(word) + + ((mode === 'autocomplete' && isLastWord) || mode === 'prefix' ? 
'' : '($|\\W)'), + //> The 'u' flag for Unicode used to be used here, but was removed + // because it was (1) across-the-board too slow, and removing it + // made a statistically significant speed improvement, and (2) + // caused at least Chrome to have strange performance cliffs in + // unpredictable ways where certain RegExp operations would take + // 10s of ms. caseSensitive ? 'mg' : 'img' ); - return suggestions.filter(sugg => { - const text = by(sugg); + return results.filter(result => { + const text = by(result); const count = countMatches(text, regexp); if (count === 0) { return false; } - //> TF-IDF weighting per-term + //> Compute the TF-IDF value for this `word`, and add it to this + // result's TF-IDF value so far. tfidf.set( - sugg, - (tfidf.get(sugg) || 0) - + (count / text.length * Math.log(items.length / suggestions.length)) + result, + (tfidf.get(result) || 0) + + (count / text.length * Math.log(items.length / results.length)) ); return true; }) }, items); - return sortBy(suggestions, sugg => tfidf.get(sugg)); + //> Sort the results list by our ranking metric, TF-IDF + return sortBy(results, result => tfidf.get(result)); }