Map terms to int IDs to save memory

Each Document has a termMap, which maps a string term to a float64 weight. That stores a lot of redundant data: if a string term appears in a thousand documents, that string is stored as a hashed key a thousand times. Instead, this commit adds a global `termIDs` cache that maps string terms to integer IDs, with a `termID` function that looks up the term in that cache and creates a new entry if the term is not there yet. Primitive benchmarking suggests this reduces peak memory usage by about 15-20% without increasing processing time. That's not huge, but not bad.

The "benchmarking" used to figure this out:

    $ sudo apt install time
    $ make && /usr/bin/time -v ./docsim "foo bar baz" ~/documents 2>&1 | grep "Maximum resident set size"
    Maximum resident set size (kbytes): 633248
hrs · Jun 28, 2023 · ab22d17 · ab22d17
1 parent 03eaeef
commit ab22d17
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 46 deletions.
diff --git a/corpus/document.go b/corpus/document.go
@@ -14,8 +14,6 @@ import (
 	"unicode"
 )
 
-type termMap map[string]float64
-
 type Document struct {
 	path     string
 	termFreq termMap
@@ -83,7 +81,6 @@ func parseTokens(rd io.Reader) ([]string, error) {
 				tokens = append(tokens, word)
 			}
 		}
-
 	}
 
 	if err := scanner.Err(); err != nil {
@@ -103,16 +100,15 @@ func NewDocument(rd io.Reader, config *Config) (*Document, error) {
 	}
 
 	// Loop over the tokens, stem them if configured, pass them through the
-	// stoplist if configured, and, for each that "should" "count", increment it
-	// in the term map.
+	// stoplist if configured, and, for each that "should" "count", increment its
+	// ID in the term map.
 	for _, token := range tokens {
 		if config.NoStoplist || !config.Stoplist.include(token) {
-			if config.NoStemming {
-				termCount[token]++
-			} else {
-				termCount[stem(token)]++
+			if !config.NoStemming {
+				token = stem(token)
 			}
 
+			termCount[termID(token)]++
 			totalTermCount++
 		}
 	}

diff --git a/corpus/document_test.go b/corpus/document_test.go
@@ -42,11 +42,11 @@ func TestNewDocument(t *testing.T) {
 
 	tests := []struct {
 		config Config
-		expMap termMap
+		expMap map[string]float64
 	}{
 		{
 			Config{Stoplist: DefaultStoplist},
-			termMap{
+			map[string]float64{
 				"bodi":   0.1250,
 				"magic":  0.2500,
 				"metal":  0.1250,
@@ -68,7 +68,7 @@ func TestNewDocument(t *testing.T) {
 					},
 				),
 			},
-			termMap{
+			map[string]float64{
 				"bodi":   0.0769,
 				"had":    0.0769,
 				"it":     0.0769,
@@ -87,7 +87,7 @@ func TestNewDocument(t *testing.T) {
 			Config{
 				NoStoplist: true,
 			},
-			termMap{
+			map[string]float64{
 				"and":    0.1000,
 				"bodi":   0.0500,
 				"had":    0.0500,
@@ -112,7 +112,7 @@ func TestNewDocument(t *testing.T) {
 				NoStemming: true,
 				Stoplist:   DefaultStoplist,
 			},
-			termMap{
+			map[string]float64{
 				"body":      0.1250,
 				"magic":     0.2500,
 				"metal":     0.1250,
@@ -130,15 +130,8 @@ func TestNewDocument(t *testing.T) {
 			t.Errorf("got unexpected error %v", err)
 		}
 
-		for gotTerm := range got.termFreq {
-			_, expKey := tc.expMap[gotTerm]
-			if !expKey {
-				t.Errorf("parsed unexpected term '%s'", gotTerm)
-			}
-		}
-
 		for expTerm, expFreq := range tc.expMap {
-			gotFreq, ok := got.termFreq[expTerm]
+			gotFreq, ok := got.termFreq[termID(expTerm)]
 			if !ok {
 				t.Errorf("found unexpected term '%s' in termFreq", expTerm)
 			}
@@ -147,15 +140,15 @@ func TestNewDocument(t *testing.T) {
 				t.Errorf("for term '%s' got %.4f, wanted %.4f", expTerm, gotFreq, expFreq)
 			}
 		}
+
+		if len(got.termFreq) > len(tc.expMap) {
+			t.Errorf("parsed more terms than expected")
+		}
 	}
 }
 
 func TestNormalizeTfIdf(t *testing.T) {
-	tm := termMap{
-		"foo": 2.0,
-		"bar": 3.0,
-		"baz": 4.0,
-	}
+	tm := termMap{0: 2.0, 1: 3.0, 2: 4.0}
 
 	tests := []struct {
 		doc    Document
@@ -174,20 +167,12 @@ func TestNormalizeTfIdf(t *testing.T) {
 		},
 		{
 			Document{
-				termFreq: termMap{
-					"foo": 3.0,
-					"bar": 4.0,
-					"baz": 5.0,
-				},
+				termFreq: termMap{0: 3.0, 1: 4.0, 2: 5.0},
 			},
 			tm,
 			Document{
-				tfIdf: termMap{
-					"foo": 6.0,
-					"bar": 12.0,
-					"baz": 20.0,
-				},
-				norm: 24.0832,
+				tfIdf: termMap{0: 6.0, 1: 12.0, 2: 20.0},
+				norm:  24.0832,
 			},
 		},
 	}
@@ -221,11 +206,7 @@ func TestCalcNorm(t *testing.T) {
 		},
 		{
 			Document{
-				tfIdf: termMap{
-					"foo": 2.0,
-					"bar": 3.0,
-					"baz": 4.0,
-				},
+				tfIdf: termMap{0: 2.0, 1: 3.0, 2: 4.0},
 			},
 			5.3852,
 		},

diff --git a/corpus/similarity_test.go b/corpus/similarity_test.go
@@ -18,13 +18,13 @@ func TestCosineSimilarity(t *testing.T) {
 		sim    float64
 	}
 
-	docA := Document{tfIdf: termMap{"foo": 0.3013, "bar": 0.2628}}
+	docA := Document{tfIdf: termMap{0: 0.3013, 1: 0.2628}}
 	docA.norm = docA.calcNorm()
 
-	docB := Document{tfIdf: termMap{"baz": 0.1577, "quux": 0.7796, "xyzzy": 0.1577}}
+	docB := Document{tfIdf: termMap{2: 0.1577, 3: 0.7796, 4: 0.1577}}
 	docB.norm = docB.calcNorm()
 
-	docC := Document{tfIdf: termMap{"foo": 0.2260, "quux": 0.6496}}
+	docC := Document{tfIdf: termMap{0: 0.2260, 3: 0.6496}}
 	docC.norm = docC.calcNorm()
 
 	cosTests := []cosTest{

diff --git a/corpus/term_map.go b/corpus/term_map.go
@@ -0,0 +1,17 @@
+package corpus
+
+// termMap maps a term's integer ID (as returned by termID) to a float64 value.
+type termMap map[int]float64
+
+// termIDs caches the integer ID assigned to each term string.
+var termIDs = make(map[string]int)
+
+// termID returns term's cached ID, assigning the next unused ID on first use.
+func termID(term string) int {
+	if id, seen := termIDs[term]; seen {
+		return id
+	}
+	next := len(termIDs) // the cache size doubles as the next free ID
+	termIDs[term] = next
+	return next
+}
diff --git a/corpus/term_map_test.go b/corpus/term_map_test.go
@@ -0,0 +1,30 @@
+package corpus
+
+import (
+	"testing"
+)
+
+// TestTermID checks that termID assigns the next unused int ID to each new
+// term and returns the already-cached ID for a term it has seen before.
+func TestTermID(t *testing.T) {
+	// Start from an empty cache so the IDs asserted below are deterministic.
+	termIDs = make(map[string]int)
+
+	// The repeated "foo" entry exercises the cache-hit path.
+	tests := []struct {
+		term string
+		id   int
+	}{
+		{"foo", 0},
+		{"bar", 1},
+		{"foo", 0},
+		{"baz", 2},
+	}
+
+	for _, tc := range tests {
+		// Report the observed ID as "got" and the table value as "wanted".
+		if id := termID(tc.term); id != tc.id {
+			t.Errorf("termID(%q): got %d, wanted %d", tc.term, id, tc.id)
+		}
+	}
+}