From 2e8d275e1148fe3e1629461378735fab76059c71 Mon Sep 17 00:00:00 2001
From: Linus Lee libsearch is the core text search algorithm that I've polished and
+reused over the years across many of my personal
+projects for fast and simple full-text
+search, packaged into a small single-purpose JavaScript library. For how to import and use in your own project, and for canonical
+documentation, check out the GitHub repository
+page. TODO: Explain stuff... libsearch uses two tricks to return full-text search results that are
+reasonably good: (1) index-free, regular expression-based string search and
+(2) TF-IDF ranking based on those RegExp matches: To turn every potential query into a regular expression, we need to be able
-to escape key characters. ./lib/search.ts annotated source
- Basic principles
-2
9
4
Basic principles
+
+
+TF-IDF = (# matching tokens / doc.length) * log(# docs / # matching docs)
+30
Implementation
-6
32
9function escapeForRegExp(text: string): string {
10 return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$1');
11}
12
/**
 * Escape every RegExp metacharacter in `text` so the result can be embedded
 * in a `new RegExp(...)` pattern and match `text` literally.
 *
 * @param text - Arbitrary user-provided text (e.g. one word of a query).
 * @returns `text` with each of `. * + ? ^ $ { } [ ] ( ) | \` prefixed by a
 *          backslash.
 */
function escapeForRegExp(text: string): string {
    // `$&` inserts the matched character itself. The pattern has no capture
    // groups, so `$1` would be emitted as the literal two characters "$1"
    // and corrupt the pattern instead of escaping it.
    return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$&');
}
38
Utility function for sorting an array by some predicate, rather than a -comparator function.
-15function sortBy<T>(items: T[], by: (_it: T) => any): T[] {
16 return items.sort((a, b) => {
17 const aby = by(a);
18 const bby = by(b);
19 if (aby < bby) {
20 return 1;
21 }
22 if (bby < aby) {
23 return -1;
24 }
25 return 0;
26 });
27}
28
The main search function takes:
+comparator function. This implementation assumes by(it)
is very cheap.
/**
 * Sort `items` in descending order of the key returned by `by`, without
 * modifying the array that was passed in.
 *
 * `by` is invoked on both operands of every comparison, so it should be
 * cheap to call.
 *
 * @param items - The items to order; left untouched.
 * @param by - Maps an item to a value comparable with `<`.
 * @returns A new array holding the same items, largest key first. Items
 *          whose keys compare equal are treated as ties (comparator
 *          returns 0).
 */
function sortBy<T>(items: T[], by: (_it: T) => any): T[] {
    // Copy first: Array.prototype.sort works in place and would otherwise
    // reorder the caller's array as a side effect.
    return [...items].sort((a, b) => {
        const aby = by(a);
        const bby = by(b);
        if (aby < bby) {
            return 1;
        }
        if (bby < aby) {
            return -1;
        }
        return 0;
    });
}
54
The search function takes:
items
, the list of items to search
query
, the search query text
by
, which is a predicate (string, number, or function) that takes an item from the items list and returns the string that should be matched with the query
by
, which is a predicate function that takes an item from the items
+list and returns the string that should be matched with the query
options
, a dictionary of options:
Options include
caseSensitive
, which is self-explanatory
mode
: which is 'word' or 'prefix' ('prefix' by default)
mode
: which is 'word', 'prefix', or 'autocomplete' ('autocomplete' by
+default), determining the way in which partial matches are processed37export function search<T>(items: T[], query: string, by: (_it: T) => any = x => x, {
38 caseSensitive = false,
39 mode = 'prefix',
40}: {
41 caseSensitive?: boolean;
42 mode?: 'word' | 'prefix';
43} = {}) {
44 function countMatches(s: string, regexp: RegExp): number {
45 let i = 0;
46 while (regexp.exec(s) !== null) {
47 i ++;
48 }
49 return i;
50 }
51
52 const words = query
53 .trim()
54 .split(' ')
55 .filter(s => s !== '');
56
57 if (words.length === 0) {
58 return items;
59 }
60
61 const tfidf = new Map<T, number>();
62 const suggestions = words.reduce((suggestions, word, i) => {
63 const isLastWord = i + 1 === words.length;
64 const regexp = new RegExp(
65 '(^|\\W)' + escapeForRegExp(word) + (isLastWord || mode === 'prefix' ? '' : '($|\\W)'),
66 // the "u" flag for Unicode used to be used here, but was removed
67 // because it was (1) across-the-board too slow, and removing it
68 // made a statistically significant speed improvement, and (2)
69 // caused at least Chrome to have strange performance cliffs in
70 // unpredictable ways where certain regexp operations would take
71 // 10s of ms.
72 caseSensitive ? 'mg' : 'img'
73 );
74 return suggestions.filter(sugg => {
75 const text = by(sugg);
76 const count = countMatches(text, regexp);
77 if (count === 0) {
78 return false;
79 }
TF-IDF weighting per-term
-81 tfidf.set(
82 sugg,
83 (tfidf.get(sugg) || 0)
84 + (count / text.length * Math.log(items.length / suggestions.length))
85 );
86 return true;
87 })
88 }, items);
89
90 return sortBy(suggestions, sugg => tfidf.get(sugg));
91}
92
93
/**
 * Full-text search over `items`, ranked by a TF-IDF-style score.
 *
 * @param items - The list of items to search.
 * @param query - The search query text; it is split on whitespace into words.
 * @param by - Predicate that takes an item and returns the string that
 *             should be matched against the query. Defaults to the identity
 *             function.
 * @param options - `caseSensitive` (default `false`) and `mode`:
 *   - `'word'`: every query word must match a whole word;
 *   - `'prefix'`: every query word may be an incomplete word prefix;
 *   - `'autocomplete'` (default): only the last query word may be incomplete.
 * @returns `items` itself when the query contains no words; otherwise a new
 *          array of the items matching every query word, highest score first.
 */
export function search<T>(items: T[], query: string, by: (_it: T) => any = x => x, {
    caseSensitive = false,
    mode = 'autocomplete',
}: {
    caseSensitive?: boolean;
    mode?: 'word' | 'prefix' | 'autocomplete';
} = {}): T[] {
    // countMatches counts the number of times regexp occurs in the string
    // s. We need this information for ranking, where documents that mention
    // the keyword more times (relative to the length of the document) are
    // ranked higher. The regexp must carry the "g" flag so that successive
    // exec() calls advance through the string.
    function countMatches(s: string, regexp: RegExp): number {
        let i = 0;
        while (regexp.exec(s) !== null) {
            i ++;
        }
        return i;
    }

    // We chunk up the query string into a list of "words", each of which
    // will become a regular expression filter.
    const words = query
        .trim()
        .split(/\s+/)
        .filter(s => s !== '');

    // Short-circuit if the search query is empty -- return the original
    // list. This is a sensible default because in most apps this corresponds
    // to the "home view" of the list, where a search has not been performed.
    if (words.length === 0) {
        return items;
    }

    // For every word in the search query, we're going to keep track of every
    // document's TF-IDF value in this map, and aggregate them together by
    // the end for sorting.
    const tfidf = new Map<T, number>();

    // Iterate through every word in the query and progressively filter down
    // items to just the documents that match every query word.
    const results = words.reduce((results, word, i) => {
        const isLastWord = i + 1 === words.length;
        const allowPartial = mode === 'prefix' || (mode === 'autocomplete' && isLastWord);
        const regexp = new RegExp(
            '(^|\\W)'
                + escapeForRegExp(word)
                // The trailing word boundary is a lookahead so that it does
                // not consume the separator. Consuming it would make the
                // next adjacent occurrence ("b b b") fail its leading
                // boundary and be left out of the count.
                + (allowPartial ? '' : '(?=$|\\W)'),
            // The 'u' flag for Unicode used to be used here, but was removed
            // because it was (1) across-the-board too slow, and removing it
            // made a statistically significant speed improvement, and (2)
            // caused at least Chrome to have strange performance cliffs in
            // unpredictable ways where certain RegExp operations would take
            // 10s of ms.
            caseSensitive ? 'mg' : 'img'
        );
        return results.filter(result => {
            const text = by(result);
            const count = countMatches(text, regexp);
            if (count === 0) {
                return false;
            }
            // Compute the TF-IDF value for this word, and add it to this
            // result's TF-IDF value so far. The log term divides the total
            // number of items by the number of candidates still remaining
            // before this word's filter is applied.
            tfidf.set(
                result,
                (tfidf.get(result) || 0)
                    + (count / text.length * Math.log(items.length / results.length))
            );
            return true;
        });
    }, items);

    // Sort the results list by our ranking metric, TF-IDF
    return sortBy(results, result => tfidf.get(result));
}
140
141
1import {strict as assert} from 'node:assert';
2import {search} from '../dist/search.js';
3
4function item(name) {
5 return {name};
6}
7
Most of the tests work on this pre-set list of items to search
-9const ITEMS = [
10 item('Linus Lee'),
11 item('@thesephist'),
12 item('@geohot'),
13 item('linuslee'),
14 item('linus is a person'),
15 item('@dlwlrma'),
16];
17
18describe('basic search', () => {
19 it('search empty array', () => {
20 assert.deepEqual(search([], 'query', x => x.name), []);
21 });
22
23 it('search with empty query', () => {
24 assert.deepEqual(search(ITEMS, '', x => x.name), ITEMS);
25 });
26
27 it('search with 1 letter returns correct result', () => {
28 assert.deepEqual(search(ITEMS, 'l', x => x.name), [
29 item('Linus Lee'),
30 item('linuslee'),
31 item('linus is a person'),
32 ]);
33 });
34
35 it('multi-word search returns correct result', () => {
36 assert.deepEqual(search(ITEMS, 'linus lee', x => x.name), [
37 item('Linus Lee'),
38 ]);
39 });
40
41 it('searching words out of order returns correct result', () => {
42 assert.deepEqual(search(ITEMS, 'lee linus', x => x.name), [
43 item('Linus Lee'),
44 ]);
45 });
46
47 it('search works even if the last query word is incomplete', () => {
48 assert.deepEqual(search(ITEMS, 'linus le', x => x.name), [
49 item('Linus Lee'),
50 ]);
51 });
52
53 it('correctly implements TF-IDF ranking', () => {
4const item = name => ({name});
5
Most of the tests operate on this pre-set list of items to search
+7const ITEMS = [
8 item('Linus Lee'),
9 item('@thesephist'),
10 item('@geohot'),
11 item('linuslee'),
12 item('linus is a person'),
13 item('@dlwlrma'),
14];
15
16describe('basic search', () => {
17 it('search empty array', () => {
18 assert.deepEqual(search([], 'query', x => x.name), []);
19 });
20
21 it('search with empty query', () => {
22 assert.deepEqual(search(ITEMS, '', x => x.name), ITEMS);
23 });
24
25 it('search with 1 letter returns correct result', () => {
26 assert.deepEqual(search(ITEMS, 'l', x => x.name), [
27 item('Linus Lee'),
28 item('linuslee'),
29 item('linus is a person'),
30 ]);
31 });
32
33 it('search does not match from middle of words', () => {
34 assert.deepEqual(search(ITEMS, 'w', x => x.name), []);
35 });
36
37 it('multi-word search returns correct result', () => {
38 assert.deepEqual(search(ITEMS, 'linus lee', x => x.name), [
39 item('Linus Lee'),
40 ]);
41 });
42
43 it('searching words out of order returns correct result', () => {
44 assert.deepEqual(search(ITEMS, 'lee linus', x => x.name), [
45 item('Linus Lee'),
46 ]);
47 });
48
49 it('search works even if the last query word is incomplete', () => {
50 assert.deepEqual(search(ITEMS, 'linus le', x => x.name), [
51 item('Linus Lee'),
52 ]);
53 });
54
55 it('search query may contain newlines, tabs, and multiple consecutive spaces', () => {
56 assert.deepEqual(search(ITEMS, ' linus\t is\nperson\t', x => x.name), [
57 item('linus is a person'),
58 ]);
59 });
60
61 it('correctly implements TF-IDF ranking', () => {
In this example, "mango" has much higher IDF (is a higher-signal word) in the corpus than "apple", which appears in nearly every document. Therefore,
-57 assert.deepEqual(
58 search([
59 // matches
60 item('mango mango mango apple'),
61 item('mango apple mango apple'),
62 item('apple mango apple mango apple mango apple mango'),
63 item('apple apple apple apple apple apple apple apple mango'),
64 // rejects
65 item('apple apple apple'),
66 item('mango mango mango'),
67 item('applemango'),
68 item('mangoapple'),
69 item('apple 1'),
70 item('apple 2'),
71 item('apple 3'),
72 item('apple 4'),
73 item('apple 5'),
74 item('apple 6'),
75 item('apple 7'),
76 item('apple 8'),
77 item('apple 9'),
78 ], 'apple mango', x => x.name),
79 [
80 item('mango mango mango apple'),
81 item('mango apple mango apple'),
82 item('apple mango apple mango apple mango apple mango'),
83 item('apple apple apple apple apple apple apple apple mango'),
84 ]
85 );
86 });
87});
88
89describe('custom search-by predicates', () => {
90 it('default predicate is provided as x => x', () => {
91 assert.deepEqual(
92 search([
93 'university',
94 'uni of california',
95 'university of california',
96 ], 'uni of cali'),
97 [
98 'uni of california',
99 'university of california',
100 ]
101 );
102 });
103
104 it('accepts and uses a custom predicate', () => {
105 assert.deepEqual(search(ITEMS, 'sunil ee', x => x.name.split('').reverse().join('')), [
106 item('Linus Lee'),
107 ]);
108 });
109});
65 assert.deepEqual(+
66 search([
67 // matches
68 item('mango mango mango apple'),
69 item('mango apple mango apple'),
70 item('apple mango apple mango apple mango apple mango'),
71 item('apple apple apple apple apple apple apple apple mango'),
72 // rejects
73 item('apple apple apple'),
74 item('mango mango mango'),
75 item('applemango'),
76 item('mangoapple'),
77 item('apple 1'),
78 item('apple 2'),
79 item('apple 3'),
80 item('apple 4'),
81 item('apple 5'),
82 item('apple 6'),
83 item('apple 7'),
84 item('apple 8'),
85 item('apple 9'),
86 ], 'apple mango', x => x.name),
87 [
88 item('mango mango mango apple'),
89 item('mango apple mango apple'),
90 item('apple mango apple mango apple mango apple mango'),
91 item('apple apple apple apple apple apple apple apple mango'),
92 ]
93 );
94 });
95});
96
97describe('custom search-by predicates', () => {
98 it('default predicate is provided as x => x', () => {
99 assert.deepEqual(
100 search([
101 'university',
102 'uni of california',
103 'university of california',
104 ], 'uni of cali'),
105 [
106 'uni of california',
107 ]
108 );
109 });
110
111describe('search modes', () => {
112 it('in mode: prefix, every query word can be incomplete', () => {
113 assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'prefix'}), [
114 item('Linus Lee'),
115 ])
116 });
111 it('accepts and uses a custom predicate', () => {
112 assert.deepEqual(search(ITEMS, 'sunil ee', x => x.name.split('').reverse().join('')), [
113 item('Linus Lee'),
114 ]);
115 });
116});
117
118 it('in mode: word, search does not match if non-last words are incomplete', () => {
119 assert.deepEqual(search(ITEMS, 'linu lee', x => x.name, {mode: 'word'}), []);
120 });
121});
118describe('search modes', () => {
119 it('in mode: word, search does not match if any words are incomplete', () => {
120 assert.deepEqual(search(ITEMS, 'linu lee', x => x.name, {mode: 'word'}), []);
121 });
122
123describe('case sensitivity', () => {
124 it('caseSensitive: true omits non-matching results', () => {
125 assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: true}), [
126 item('linuslee'),
127 item('linus is a person'),
128 ]);
129 });
130});
131
132
123 it('in mode: prefix, every query word may be incomplete', () => {
124 assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'prefix'}), [
125 item('Linus Lee'),
126 ]);
127 });
128
129 it('in mode: autocomplete, only the last query word may be incomplete', () => {
130 assert.deepEqual(search(ITEMS, 'linus le', x => x.name, {mode: 'autocomplete'}), [
131 item('Linus Lee'),
132 ]);
133 assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'autocomplete'}), []);
134 });
135});
136
137describe('case sensitivity', () => {
138 it('caseSensitive: true omits non-matching results', () => {
139 assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: true}), [
140 item('linuslee'),
141 item('linus is a person'),
142 ]);
143 });
144
145 it('caseSensitive: false includes case-insensitive results', () => {
146 assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: false}), [
147 item('Linus Lee'),
148 item('linuslee'),
149 item('linus is a person'),
150 ]);
151 });
152});
153
154