Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

fix: Make fewer copies when building a string array #5503

Merged
merged 2 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions array/array_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ func TestString(t *testing.T) {
b.Append("abcdefghij")
}
},
bsz: 256, // 64 bytes nulls + 192 bytes data.
sz: 64, // The minimum size of a buffer is 64 bytes
bsz: 64, // 64 bytes data.
sz: 64, // The minimum size of a buffer is 64 bytes
want: []interface{}{
"abcdefghij",
"abcdefghij",
Expand Down
169 changes: 140 additions & 29 deletions array/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,53 +2,99 @@ package array

import (
"bytes"
"sync/atomic"
"unsafe"

"github.com/apache/arrow/go/v7/arrow"
"github.com/apache/arrow/go/v7/arrow/array"
"github.com/apache/arrow/go/v7/arrow/bitutil"
"github.com/apache/arrow/go/v7/arrow/memory"
)

type StringBuilder struct {
mem memory.Allocator
builder *array.BinaryBuilder
constant bool
mem memory.Allocator
len int
cap int
reserveData int
buffer *memory.Buffer
builder *array.BinaryBuilder
refCount int64
}

func NewStringBuilder(mem memory.Allocator) *StringBuilder {
return &StringBuilder{
mem: mem,
builder: array.NewBinaryBuilder(mem, StringType),
constant: true,
mem: mem,
len: 0,
cap: 0,
reserveData: 0,
buffer: nil,
builder: nil,
refCount: 1,
}
}

func (b *StringBuilder) Retain() {
b.builder.Retain()
atomic.AddInt64(&b.refCount, 1)
}
func (b *StringBuilder) Release() {
b.builder.Release()
if atomic.AddInt64(&b.refCount, -1) == 0 {
if b.buffer != nil {
b.buffer.Release()
}
if b.builder != nil {
b.builder.Release()
}
}
}
func (b *StringBuilder) Len() int {
return b.builder.Len()
if b.builder != nil {
return b.builder.Len()
}
return b.len
}
func (b *StringBuilder) Cap() int {
return b.builder.Cap()
if b.builder != nil {
return b.builder.Cap()
}
if b.cap > b.len {
return b.cap
}
return b.len
}
func (b *StringBuilder) NullN() int {
return b.builder.NullN()
if b.builder != nil {
return b.builder.NullN()
}
return 0
}

func (b *StringBuilder) AppendBytes(buf []byte) {
if b.builder.Len() > 0 {
b.constant = b.constant && bytes.Equal(buf, b.builder.Value(0))
if b.builder != nil {
b.builder.Append(buf)
return
}
if b.len == 0 {
b.buffer = memory.NewResizableBuffer(b.mem)
b.buffer.Resize(len(buf))
copy(b.buffer.Bytes(), buf)
b.len = 1
return
}
b.builder.Append(buf)
if bytes.Equal(b.buffer.Bytes(), buf) {
b.len++
return
}
b.makeBuilder(buf)

}

// Append appends a string to the array being built. The input string
// will always be copied.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor point: the second half of this comment was true prior to this change, but now isn't

func (b *StringBuilder) Append(v string) {
b.AppendBytes([]byte(v))
// Avoid copying the input string as AppendBytes
// will never keep a reference or modify the input.
bytes := unsafe.Slice(unsafe.StringData(v), len(v))
b.AppendBytes(bytes)
}

func (b *StringBuilder) AppendValues(v []string, valid []bool) {
Expand All @@ -61,38 +107,66 @@ func (b *StringBuilder) AppendValues(v []string, valid []bool) {
}
}
func (b *StringBuilder) AppendNull() {
b.constant = false
if b.builder == nil {
b.makeBuilder(nil)
}
b.builder.AppendNull()
}

func (b *StringBuilder) UnsafeAppendBoolToBitmap(isValid bool) {
b.builder.UnsafeAppendBoolToBitmap(isValid)
}

func (b *StringBuilder) Reserve(n int) {
b.builder.Reserve(n)
if b.builder != nil {
b.builder.Reserve(n)
return
}
if b.len+n > b.cap {
b.cap = b.len + n
}
}

func (b *StringBuilder) ReserveData(n int) {
b.builder.ReserveData(n)
if b.builder != nil {
b.builder.ReserveData(n)
return
}
b.reserveData = n
}

func (b *StringBuilder) Resize(n int) {
b.builder.Resize(n)
if b.builder != nil {
b.builder.Resize(n)
}
b.cap = n
if b.len > n {
b.len = n
}
}

func (b *StringBuilder) NewArray() Array {
return b.NewStringArray()
}

func (b *StringBuilder) NewStringArray() *String {
arr := b.builder.NewBinaryArray()
if !b.constant || arr.Len() < 1 {
b.constant = true
return &String{arr}
if b.builder != nil {
arr := &String{b.builder.NewBinaryArray()}
b.builder.Release()
b.builder = nil
return arr
}
if b.buffer != nil {
arr := &String{&repeatedBinary{
len: b.len,
buf: b.buffer,
}}
b.buffer = nil
b.len = 0
b.cap = 0
return arr
}
defer arr.Release()
return StringRepeat(arr.ValueString(0), arr.Len(), b.mem)
// getting this far means we have an empty array.
arr := StringRepeat("", b.len, b.mem)
b.len = 0
b.cap = 0
return arr
}

func (b *StringBuilder) CopyValidValues(values *String, nullCheckArray Array) {
Expand All @@ -110,6 +184,43 @@ func (b *StringBuilder) CopyValidValues(values *String, nullCheckArray Array) {
}
}

func (b *StringBuilder) makeBuilder(value []byte) {
bufferLen := 0
if b.buffer != nil {
bufferLen = b.buffer.Len()
}
size := b.len
if b.cap > b.len {
size = b.cap
}
dataSize := b.len * bufferLen
if value != nil {
if b.cap <= b.len {
size++
}
dataSize += len(value)
}
if b.reserveData > dataSize {
dataSize = b.reserveData
}
b.builder = array.NewBinaryBuilder(b.mem, arrow.BinaryTypes.String)
b.builder.Resize(size)
b.builder.ReserveData(dataSize)
for i := 0; i < b.len; i++ {
b.builder.Append(b.buffer.Bytes())
}
if value != nil {
b.builder.Append(value)
}
if b.buffer != nil {
b.buffer.Release()
b.buffer = nil
}
b.len = 0
b.cap = 0
b.reserveData = 0
}

// Copy of Array.IsValid from arrow, allowing the IsValid check to be done without going through an interface
func isValid(nullBitmapBytes []byte, offset int, i int) bool {
return len(nullBitmapBytes) == 0 || bitutil.BitIsSet(nullBitmapBytes, offset+i)
Expand Down