Skip to content

Commit

Permalink
Map terms to int IDs to save memory
Browse files Browse the repository at this point in the history
Each Document has a termMap, which maps a string term to a float64 weight.
There's a lot of redundant storage there; if a string term appears in a thousand
documents, that string will be stored as a hashed key a thousand times.

Instead, this commit adds a global `termIDs` cache that maps string terms to
integer IDs, with a `termID` function that looks up the term in that cache
and/or creates a new entry.

Primitive benchmarking suggests this reduces peak memory usage by about 15-20%
without increasing processing time. That's not huge, but not bad.

The "benchmarking" used to figure this out:

    $ sudo apt install time
    $ make && /usr/bin/time -v ./docsim "foo bar baz" ~/documents 2>&1 | grep "Maximum resident set size"
    Maximum resident set size (kbytes): 633248
  • Loading branch information
hrs committed Jun 28, 2023
1 parent 03eaeef commit ab22d17
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 46 deletions.
14 changes: 5 additions & 9 deletions corpus/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ import (
"unicode"
)

type termMap map[string]float64

type Document struct {
path string
termFreq termMap
Expand Down Expand Up @@ -83,7 +81,6 @@ func parseTokens(rd io.Reader) ([]string, error) {
tokens = append(tokens, word)
}
}

}

if err := scanner.Err(); err != nil {
Expand All @@ -103,16 +100,15 @@ func NewDocument(rd io.Reader, config *Config) (*Document, error) {
}

// Loop over the tokens, stem them if configured, pass them through the
// stoplist if configured, and, for each that "should" "count", increment it
// in the term map.
// stoplist if configured, and, for each that "should" "count", increment its
// ID in the term map.
for _, token := range tokens {
if config.NoStoplist || !config.Stoplist.include(token) {
if config.NoStemming {
termCount[token]++
} else {
termCount[stem(token)]++
if !config.NoStemming {
token = stem(token)
}

termCount[termID(token)]++
totalTermCount++
}
}
Expand Down
49 changes: 15 additions & 34 deletions corpus/document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ func TestNewDocument(t *testing.T) {

tests := []struct {
config Config
expMap termMap
expMap map[string]float64
}{
{
Config{Stoplist: DefaultStoplist},
termMap{
map[string]float64{
"bodi": 0.1250,
"magic": 0.2500,
"metal": 0.1250,
Expand All @@ -68,7 +68,7 @@ func TestNewDocument(t *testing.T) {
},
),
},
termMap{
map[string]float64{
"bodi": 0.0769,
"had": 0.0769,
"it": 0.0769,
Expand All @@ -87,7 +87,7 @@ func TestNewDocument(t *testing.T) {
Config{
NoStoplist: true,
},
termMap{
map[string]float64{
"and": 0.1000,
"bodi": 0.0500,
"had": 0.0500,
Expand All @@ -112,7 +112,7 @@ func TestNewDocument(t *testing.T) {
NoStemming: true,
Stoplist: DefaultStoplist,
},
termMap{
map[string]float64{
"body": 0.1250,
"magic": 0.2500,
"metal": 0.1250,
Expand All @@ -130,15 +130,8 @@ func TestNewDocument(t *testing.T) {
t.Errorf("got unexpected error %v", err)
}

for gotTerm := range got.termFreq {
_, expKey := tc.expMap[gotTerm]
if !expKey {
t.Errorf("parsed unexpected term '%s'", gotTerm)
}
}

for expTerm, expFreq := range tc.expMap {
gotFreq, ok := got.termFreq[expTerm]
gotFreq, ok := got.termFreq[termID(expTerm)]
if !ok {
t.Errorf("found unexpected term '%s' in termFreq", expTerm)
}
Expand All @@ -147,15 +140,15 @@ func TestNewDocument(t *testing.T) {
t.Errorf("for term '%s' got %.4f, wanted %.4f", expTerm, gotFreq, expFreq)
}
}

if len(got.termFreq) > len(tc.expMap) {
t.Errorf("parsed more terms than expected")
}
}
}

func TestNormalizeTfIdf(t *testing.T) {
tm := termMap{
"foo": 2.0,
"bar": 3.0,
"baz": 4.0,
}
tm := termMap{0: 2.0, 1: 3.0, 2: 4.0}

tests := []struct {
doc Document
Expand All @@ -174,20 +167,12 @@ func TestNormalizeTfIdf(t *testing.T) {
},
{
Document{
termFreq: termMap{
"foo": 3.0,
"bar": 4.0,
"baz": 5.0,
},
termFreq: termMap{0: 3.0, 1: 4.0, 2: 5.0},
},
tm,
Document{
tfIdf: termMap{
"foo": 6.0,
"bar": 12.0,
"baz": 20.0,
},
norm: 24.0832,
tfIdf: termMap{0: 6.0, 1: 12.0, 2: 20.0},
norm: 24.0832,
},
},
}
Expand Down Expand Up @@ -221,11 +206,7 @@ func TestCalcNorm(t *testing.T) {
},
{
Document{
tfIdf: termMap{
"foo": 2.0,
"bar": 3.0,
"baz": 4.0,
},
tfIdf: termMap{0: 2.0, 1: 3.0, 2: 4.0},
},
5.3852,
},
Expand Down
6 changes: 3 additions & 3 deletions corpus/similarity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ func TestCosineSimilarity(t *testing.T) {
sim float64
}

docA := Document{tfIdf: termMap{"foo": 0.3013, "bar": 0.2628}}
docA := Document{tfIdf: termMap{0: 0.3013, 1: 0.2628}}
docA.norm = docA.calcNorm()

docB := Document{tfIdf: termMap{"baz": 0.1577, "quux": 0.7796, "xyzzy": 0.1577}}
docB := Document{tfIdf: termMap{2: 0.1577, 3: 0.7796, 4: 0.1577}}
docB.norm = docB.calcNorm()

docC := Document{tfIdf: termMap{"foo": 0.2260, "quux": 0.6496}}
docC := Document{tfIdf: termMap{0: 0.2260, 3: 0.6496}}
docC.norm = docC.calcNorm()

cosTests := []cosTest{
Expand Down
17 changes: 17 additions & 0 deletions corpus/term_map.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package corpus

// termMap associates an integer term ID with a float64 value.
type termMap map[int]float64

// termIDs is the package-level cache that records which integer ID has been
// handed out for each term string.
var termIDs = make(map[string]int)

// termID returns the integer ID associated with term. A term that is already
// in the termIDs cache gets its existing ID back; an unseen term is assigned
// the current size of the cache as its ID and is stored before returning.
func termID(term string) int {
	if known, found := termIDs[term]; found {
		return known
	}

	// The cache only grows through this function, so its current length is the
	// next unused ID.
	next := len(termIDs)
	termIDs[term] = next

	return next
}
30 changes: 30 additions & 0 deletions corpus/term_map_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package corpus

import (
"testing"
)

// TestTermID checks that termID assigns the next available integer ID to a
// term it has not seen before and returns the previously assigned ID for a
// term that is already cached.
func TestTermID(t *testing.T) {
	// Clear out the existing cache mapping terms to IDs so the expected IDs
	// below start from zero
	termIDs = make(map[string]int)

	// Check that calling termID maps the next available int ID to the term and
	// fetches the ID associated with already-cached terms
	tests := []struct {
		term string
		id   int
	}{
		{"foo", 0},
		{"bar", 1},
		{"foo", 0},
		{"baz", 2},
	}

	for _, tc := range tests {
		id := termID(tc.term)

		if id != tc.id {
			// The actual value goes first and the expectation second, matching
			// the "got ..., wanted ..." wording of the message.
			t.Errorf("got %d, wanted %d", id, tc.id)
		}
	}
}

0 comments on commit ab22d17

Please sign in to comment.