-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVSM.js
190 lines (190 loc) · 7.79 KB
/
VSM.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.VectorOperations = exports.SimilaritySchemas = exports.WeighingSchemas = exports.WordMappings = exports.VectorSpaceModel = exports.VectorizedDocument = void 0;
const lodash_1 = require("lodash");
/**
 * A document lifted into vector space. Rather than an ordered tuple of
 * components (dimension order is arbitrary), the vector is a
 * Map<string, number> keyed by term, with the number holding that term's
 * weight (e.g. its tf or tf-idf score).
 * @field vector the vector representation of the document.
 * @field content the text of the original document.
 * @field meta arbitrary user-attached metadata, useful for relating the
 *        document back to the rest of the collection.
 * @field score the document's relevance metric; initialized to NaN.
 */
class VectorizedDocument {
    /** Builds a vectorized document.
     * @param vector iterable of [term, weight] pairs (defensively copied).
     * @param content original text of the document.
     * @param score optional relevance score; falsy values become NaN.
     * @param meta optional metadata map (deep-cloned to avoid aliasing).
     */
    constructor(vector, content, score, meta) {
        this._vector = new Map(vector);
        this._meta = meta ? lodash_1.cloneDeep(meta) : new Map();
        this._content = content;
        this._score = score ? score : NaN;
    }
    /** @returns vectorized form of the Document */
    get vector() {
        return this._vector;
    }
    /** @param vector vectorized form of the document (copied, not aliased). */
    set vector(vector) {
        this._vector = new Map(vector);
    }
    /** @returns metadata of the Document */
    get meta() {
        return this._meta;
    }
    /** @param meta metadata of the document (deep-cloned; falsy clears it). */
    set meta(meta) {
        this._meta = meta ? lodash_1.cloneDeep(meta) : new Map();
    }
    /** @returns content of the Document */
    get content() {
        return this._content;
    }
    /** @param content content of the document. */
    set content(content) {
        this._content = content;
    }
    /** @returns score of the Document */
    get score() {
        return this._score;
    }
    /** @param score score of the document. */
    set score(score) {
        this._score = score;
    }
}
exports.VectorizedDocument = VectorizedDocument;
/**
 * @description A class that models collections of documents into vector space models.
 */
class VectorSpaceModel {
    /**
     * @param similaritySchema Function determining the metric of relevance between two vectorized documents. See SimilaritySchemas library for examples.
     * @param weighingSchema Function determining the weight of each term component of each document. If not specified (unadvisable), the document vectors will be
     * prepopulated with term frequencies. See WeighingSchemas library for examples.
     * @param wordMappingSchema Function that maps variations of words into a specific word. See WordMappings library for examples.
     */
    constructor(similaritySchema, weighingSchema, wordMappingSchema) {
        this._similaritySchema = similaritySchema;
        this._wordMappingSchema = wordMappingSchema;
        this._weighingSchema = weighingSchema;
    }
    /**
     * @description Performs a query on a collection of documents, returning the top k results.
     * @param query Query to perform on the collection.
     * @param collection Collection of documents to construct a vector space of. The query should not be included.
     * @param k Positive integer representing the top k number of results to return.
     * @returns The k most relevant documents, sorted by descending score; [] for an empty collection.
     * @throws Error when the query is empty, k is not a positive integer, or k exceeds the collection size.
     */
    query(query, collection, k) {
        if (query === "") {
            throw Error("Empty query not allowed.");
        }
        if (k <= 0 || k !== Math.floor(k)) {
            throw Error("Parameter k must be a positive integer.");
        }
        // Fix: handle the empty collection BEFORE comparing k against the
        // collection size. Previously `k > collection.length` threw first
        // (k >= 1 > 0), making this intended early return unreachable.
        if (collection.length === 0) {
            return [];
        }
        if (k > collection.length) {
            throw Error("Parameter k cannot be greater than collection size.");
        }
        // Each term is a dimension
        const dimensions = new Set();
        // Mapping documents and the query to vectors of raw term frequencies.
        let vectors = collection
            .concat([{ content: query }])
            .map((document) => {
            const vector = new Map();
            const rawWords = (document.content || "").match(/\w+/g);
            rawWords &&
                rawWords.forEach((rawWord) => {
                    // Normalize the word (e.g. case folding) when a mapping is configured.
                    const word = this._wordMappingSchema
                        ? this._wordMappingSchema(rawWord)
                        : rawWord;
                    dimensions.add(word);
                    const wordCount = vector.get(word) || 0;
                    vector.set(word, 1 + wordCount);
                });
            return new VectorizedDocument(vector, document.content, undefined, document.meta);
        });
        // Assigning weights of the vectors based on a bag-of-words function
        vectors = this._weighingSchema
            ? this._weighingSchema(vectors, dimensions)
            : vectors;
        // The query was appended last; separate it from the documents.
        const queryVector = vectors.pop();
        // Score each document vector against the query.
        vectors.forEach((documentVector, index) => {
            vectors[index].score = queryVector ? this._similaritySchema(queryVector, documentVector) : 0;
        });
        // Sort by descending score.
        vectors.sort((a, b) => {
            return a.score === b.score ? 0 : a.score > b.score ? -1 : 1;
        });
        return vectors.slice(0, k);
    }
}
exports.VectorSpaceModel = VectorSpaceModel;
exports.WordMappings = {
caseInsensitive: (word) => word.toLowerCase(),
noPunctuation: (word) => {
return word.replace(/[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]/g, "");
},
};
exports.WeighingSchemas = {
/** A weighing function that utilizes term frequency and inverse document frequency. */
tfidf: (vectorSpace, dimensions) => {
const normalizedVectorSpace = lodash_1.cloneDeep(vectorSpace);
const dictionary = dimensions;
dictionary.forEach((dimension) => {
// Number of documents in the collection that contain a term t
let documentFrequency = 0;
vectorSpace.forEach((document) => {
documentFrequency += document.vector.has(dimension) ? 1 : 0;
});
const idf = Math.log(normalizedVectorSpace.length / documentFrequency);
normalizedVectorSpace.forEach((vector, index) => {
// The frequency of a term in a document
const tf = vector.vector.get(dimension) || 0;
const idftf = idf * tf;
normalizedVectorSpace[index].vector.set(dimension, idftf);
});
});
return normalizedVectorSpace;
},
};
exports.SimilaritySchemas = {
cosineSimilarity: (a, b) => {
return (exports.VectorOperations.dotProduct(a, b) /
(exports.VectorOperations.euclideanLength(a) *
exports.VectorOperations.euclideanLength(b)));
},
};
/** A collection of vector operations that may be helpful when constructing similarity schemas. */
exports.VectorOperations = {
/**
* @returns The euclidiean length of a document.
* @param vector Vectorized document to find length of.
*/
euclideanLength: (vector) => {
let length = 0;
Array.from(vector.vector.values()).forEach((component) => {
length += Math.pow(component, 2);
});
return Math.sqrt(length);
},
/**
* @returns Calculates the dot product between two documents.
* @param a First vector.
* @param b Second vector.
*/
dotProduct: (a, b) => {
let dp = 0;
Array.from(a.vector.entries()).forEach((aNthDimension) => {
const bNthDimension = b.vector.get(aNthDimension[0]);
if (bNthDimension) {
dp += aNthDimension[1] * bNthDimension;
}
});
return dp;
},
};