-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Each Document has a termMap, which maps a string term to a float64 weight. There's a lot of redundant storage there: if a string term appears in a thousand documents, that string will be stored as a hashed key a thousand times. Instead, this commit adds a global `termIDs` cache that maps string terms to integer IDs, with a `termID` function that looks up the term in that cache and creates a new entry if one does not exist yet. Primitive benchmarking suggests this reduces peak memory usage by about 15-20% without increasing processing time. That's not huge, but not bad. The "benchmarking" used to figure this out was to first run `$ sudo apt install time`, then run `$ make && /usr/bin/time -v ./docsim "foo bar baz" ~/documents 2>&1 | grep "Maximum resident set size"`, which printed `Maximum resident set size (kbytes): 633248`.
- Loading branch information
Showing
5 changed files
with
70 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package corpus | ||
|
||
// termMap maps a term's integer ID (as assigned by termID) to a float64
// weight. Keying by int instead of by the term string avoids storing the same
// string key once per map.
type termMap map[int]float64

// termIDs is the package-wide cache that maps each distinct string term to
// the integer ID assigned to it by termID.
//
// NOTE(review): this is a plain map with no locking, so termID must not be
// called from multiple goroutines concurrently — confirm against callers.
var termIDs = make(map[string]int)

// termID returns the integer ID for term. If the term has been seen before,
// its cached ID is returned; otherwise the next unused ID is assigned to the
// term, stored in termIDs, and returned.
func termID(term string) int {
	if id, ok := termIDs[term]; ok {
		return id
	}

	// IDs are assigned densely in first-seen order: the current size of the
	// cache is always the smallest ID that has not been handed out yet.
	id := len(termIDs)
	termIDs[term] = id
	return id
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package corpus | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestTermID(t *testing.T) { | ||
// Clear out the existing cache mapping terms to IDs | ||
termIDs = make(map[string]int) | ||
|
||
// Check that calling termID maps the next available int ID to the term and | ||
// fetches the ID associated with already-cached terms | ||
tests := []struct { | ||
term string | ||
id int | ||
}{ | ||
{"foo", 0}, | ||
{"bar", 1}, | ||
{"foo", 0}, | ||
{"baz", 2}, | ||
} | ||
|
||
for _, tc := range tests { | ||
id := termID(tc.term) | ||
|
||
if id != tc.id { | ||
t.Errorf("got %d, wanted %d", tc.id, id) | ||
} | ||
} | ||
} |