Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

fix: Make fewer copies when building a string array #5503

Merged
merged 2 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions array/array_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ func TestString(t *testing.T) {
b.Append("abcdefghij")
}
},
bsz: 256, // 64 bytes nulls + 192 bytes data.
sz: 64, // The minimum size of a buffer is 64 bytes
bsz: 64, // 64 bytes data.
sz: 64, // The minimum size of a buffer is 64 bytes
want: []interface{}{
"abcdefghij",
"abcdefghij",
Expand Down
174 changes: 143 additions & 31 deletions array/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,53 +2,100 @@ package array

import (
"bytes"
"sync/atomic"
"unsafe"

"github.com/apache/arrow/go/v7/arrow"
"github.com/apache/arrow/go/v7/arrow/array"
"github.com/apache/arrow/go/v7/arrow/bitutil"
"github.com/apache/arrow/go/v7/arrow/memory"
)

type StringBuilder struct {
mem memory.Allocator
builder *array.BinaryBuilder
constant bool
mem memory.Allocator
len int
cap int
reserveData int
buffer *memory.Buffer
builder *array.BinaryBuilder
refCount int64
}

func NewStringBuilder(mem memory.Allocator) *StringBuilder {
return &StringBuilder{
mem: mem,
builder: array.NewBinaryBuilder(mem, StringType),
constant: true,
mem: mem,
len: 0,
cap: 0,
reserveData: 0,
buffer: nil,
builder: nil,
refCount: 1,
}
}

func (b *StringBuilder) Retain() {
b.builder.Retain()
atomic.AddInt64(&b.refCount, 1)
}
func (b *StringBuilder) Release() {
b.builder.Release()
if atomic.AddInt64(&b.refCount, -1) == 0 {
if b.buffer != nil {
b.buffer.Release()
}
if b.builder != nil {
b.builder.Release()
}
}
}
func (b *StringBuilder) Len() int {
return b.builder.Len()
if b.builder != nil {
return b.builder.Len()
}
return b.len
}
func (b *StringBuilder) Cap() int {
return b.builder.Cap()
if b.builder != nil {
return b.builder.Cap()
}
if b.cap > b.len {
return b.cap
}
return b.len
}
func (b *StringBuilder) NullN() int {
return b.builder.NullN()
if b.builder != nil {
return b.builder.NullN()
}
return 0
}

func (b *StringBuilder) AppendBytes(buf []byte) {
if b.builder.Len() > 0 {
b.constant = b.constant && bytes.Equal(buf, b.builder.Value(0))
if b.builder != nil {
b.builder.Append(buf)
return
}
if b.len == 0 {
b.buffer = memory.NewResizableBuffer(b.mem)
b.buffer.Resize(len(buf))
copy(b.buffer.Bytes(), buf)
b.len = 1
return
}
b.builder.Append(buf)
if bytes.Equal(b.buffer.Bytes(), buf) {
b.len++
return
}
b.makeBuilder(buf)

}

// Append appends a string to the array being built. The input string
// will always be copied.
// Append appends a string to the array being built. A reference
// to the input string will not be retained by the builder. The
// string will be copied, if necessary.
func (b *StringBuilder) Append(v string) {
b.AppendBytes([]byte(v))
// Avoid copying the input string as AppendBytes
// will never keep a reference or modify the input.
bytes := unsafe.Slice(unsafe.StringData(v), len(v))
b.AppendBytes(bytes)
}

func (b *StringBuilder) AppendValues(v []string, valid []bool) {
Expand All @@ -61,38 +108,66 @@ func (b *StringBuilder) AppendValues(v []string, valid []bool) {
}
}
func (b *StringBuilder) AppendNull() {
b.constant = false
if b.builder == nil {
b.makeBuilder(nil)
}
b.builder.AppendNull()
}

func (b *StringBuilder) UnsafeAppendBoolToBitmap(isValid bool) {
b.builder.UnsafeAppendBoolToBitmap(isValid)
}

func (b *StringBuilder) Reserve(n int) {
b.builder.Reserve(n)
if b.builder != nil {
b.builder.Reserve(n)
return
}
if b.len+n > b.cap {
b.cap = b.len + n
}
}

func (b *StringBuilder) ReserveData(n int) {
b.builder.ReserveData(n)
if b.builder != nil {
b.builder.ReserveData(n)
return
}
b.reserveData = n
}

func (b *StringBuilder) Resize(n int) {
b.builder.Resize(n)
if b.builder != nil {
b.builder.Resize(n)
}
b.cap = n
if b.len > n {
b.len = n
}
}

func (b *StringBuilder) NewArray() Array {
return b.NewStringArray()
}

func (b *StringBuilder) NewStringArray() *String {
arr := b.builder.NewBinaryArray()
if !b.constant || arr.Len() < 1 {
b.constant = true
return &String{arr}
if b.builder != nil {
arr := &String{b.builder.NewBinaryArray()}
b.builder.Release()
b.builder = nil
return arr
}
if b.buffer != nil {
arr := &String{&repeatedBinary{
len: b.len,
buf: b.buffer,
}}
b.buffer = nil
b.len = 0
b.cap = 0
return arr
}
defer arr.Release()
return StringRepeat(arr.ValueString(0), arr.Len(), b.mem)
// getting this far means we have an empty array.
arr := StringRepeat("", b.len, b.mem)
b.len = 0
b.cap = 0
return arr
}

func (b *StringBuilder) CopyValidValues(values *String, nullCheckArray Array) {
Expand All @@ -110,6 +185,43 @@ func (b *StringBuilder) CopyValidValues(values *String, nullCheckArray Array) {
}
}

func (b *StringBuilder) makeBuilder(value []byte) {
bufferLen := 0
if b.buffer != nil {
bufferLen = b.buffer.Len()
}
size := b.len
if b.cap > b.len {
size = b.cap
}
dataSize := b.len * bufferLen
if value != nil {
if b.cap <= b.len {
size++
}
dataSize += len(value)
}
if b.reserveData > dataSize {
dataSize = b.reserveData
}
b.builder = array.NewBinaryBuilder(b.mem, arrow.BinaryTypes.String)
b.builder.Resize(size)
b.builder.ReserveData(dataSize)
for i := 0; i < b.len; i++ {
b.builder.Append(b.buffer.Bytes())
}
if value != nil {
b.builder.Append(value)
}
if b.buffer != nil {
b.buffer.Release()
b.buffer = nil
}
b.len = 0
b.cap = 0
b.reserveData = 0
}

// Copy of Array.IsValid from arrow, allowing the IsValid check to be done without going through an interface
func isValid(nullBitmapBytes []byte, offset int, i int) bool {
return len(nullBitmapBytes) == 0 || bitutil.BitIsSet(nullBitmapBytes, offset+i)
Expand Down