-
Notifications
You must be signed in to change notification settings - Fork 92
Generic string interning package to help memory & perf. #96
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package(default_visibility = ["//visibility:public"]) | ||
|
||
load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library", "go_test") | ||
|
||
go_library( | ||
name = "go_default_library", | ||
srcs = [ | ||
"intern.go", | ||
], | ||
) | ||
|
||
go_test( | ||
name = "intern_test", | ||
srcs = [ "intern_test.go" ], | ||
size = "small", | ||
library = ":go_default_library", | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
// Copyright 2016 Google Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package intern | ||
|
||
import ( | ||
"sync" | ||
) | ||
|
||
// Pool is a general purpose string interning package to reduce memory | ||
// consumption and improve processor cache efficiency. | ||
type Pool struct { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should probably document Pool, as it is exported. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. We really need to get lint checking working in the build... |
||
sync.RWMutex | ||
strings map[string]string | ||
currentSize int | ||
maxSize int | ||
} | ||
|
||
const ( | ||
// just a guess of course... | ||
averageStringLength = 10 | ||
) | ||
|
||
// NewPool allocates a new interning pool ready for use. | ||
// | ||
// Go doesn't currently have sophisticated GC primitives, such as weak pointers. | ||
// As a result, a simple string interning solution can easily become subject to | ||
// memory bloating. Strings are only ever added and never removed, leading to | ||
// an eventual OOM condition. Can easily be leveraged to DDoS a server for example. | ||
// | ||
// The code here uses a simple approach to work around this problem. If the table | ||
// holding the interned strings ever holds more than than maxSize's worth of strings, | ||
// the table is completely dropped on the floor and a new table is allocated. This | ||
// allows any stale strings pointed to by the old table to be reclaimed by the GC. | ||
// This effectively puts a cap on the memory held by any single pool. The cost of | ||
// this approach of course is that interning will be less efficient. | ||
func NewPool(maxSize int) *Pool { | ||
return &Pool{strings: make(map[string]string, maxSize/averageStringLength), maxSize: maxSize} | ||
} | ||
|
||
// Intern returns a sharable version of the string, allowing the | ||
// parameter's storage to be garbage collected. | ||
func (p *Pool) Intern(s string) string { | ||
// quick try if its already in the table | ||
p.RLock() | ||
result, found := p.strings[s] | ||
p.RUnlock() | ||
|
||
if !found { | ||
// look again under a serializing r/w lock | ||
p.Lock() | ||
if result, found = p.strings[s]; !found { | ||
if len(s) > p.maxSize-p.currentSize { | ||
p.strings = make(map[string]string, p.maxSize/averageStringLength) | ||
p.currentSize = 0 | ||
} | ||
|
||
p.strings[s] = s | ||
p.currentSize += len(s) | ||
result = s | ||
} | ||
p.Unlock() | ||
} | ||
|
||
return result | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
// Copyright 2016 Google Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package intern | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestIntern(t *testing.T) { | ||
p := NewPool(4) | ||
|
||
strings := []string{ | ||
"a", | ||
"b", | ||
"c", | ||
"d", | ||
"e", | ||
"f", | ||
"g", | ||
"h", | ||
} | ||
|
||
// We're only testing the semantics here, we're not actually | ||
// verifying that interning has taken place. That's too hard to | ||
// do in Go... | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can actually write a test that checks for the purpose. mixologist/cp/whitelist/whitelist_test.go#67 import (
g "github.com/onsi/gomega"
)
func TestWhiteListUnload(t *testing.T) {
g.RegisterTestingT(t)
wl, err := build("")
g.Expect(err).To(g.BeNil())
var finalized atomic.Value
finalized.Store(false)
// check adapter is eventually unloaded
// This test ensures that after unload is called
// the adapter is removed from memory
runtime.SetFinalizer(wl, func(ff interface{}) {
finalized.Store(true)
})
wl.Unload()
runtime.GC()
g.Eventually(func() bool {
runtime.GC()
return finalized.Load().(bool)
}).Should(g.BeTrue(), "Adapter was not fully unloaded")
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I played with that stuff and it unfortunately doesn't work for strings. The point here would be to know whether a string gets collected or not. Unfortunately, calling SetFinalize on a string gives a runtime error because a string is a value types and not a pointer. I'd really need to be passing the underlying uintptr which I don't have access to. Anyway, the library code is so simple, I'm pretty sure it's behaving as it should. |
||
|
||
for _, s := range strings { | ||
r := p.Intern(s) | ||
if r != s { | ||
t.Errorf("Got mismatch %v != %v", r, s) | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this seems fine. I guess we could also use sync.Pool (a la the example from the other issue), and not worry about the mutex'ing here. Thoughts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The solution mentioned has the unfortunate effect of zapping the table when the GC decides it needs more memory, rather than on a specific size limit. This means the table can potentially get very big indeed. When multiple components use a similar technique in a process, it means we don't get to decide what feature needs more memory compared to another.
So I like my approach better...