Fast and efficient Golang package for splitting large CSV files into smaller chunks by size in bytes.
- Super-fast splitting: a 700MB+ file is split in less than 1 second!
- Allocates minimal memory regardless of file size.
- Also accepts io.Reader as input.
- Supports multiline cells and headers; the CSV should follow the basic rules (https://en.wikipedia.org/wiki/Comma-separated_values). A sample with a multiline cell follows this list.
- Configurable destination folder.
- Enabling/disabling copying of the header into chunk files.
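For example, standard CSV quoting allows a cell to contain embedded line breaks; a row like the one in this hand-written sample is kept intact and never split across two chunk files:

id,comment
1,"first line
second line"
2,plain value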
Install:
go get -u github.com/tolik505/split-csv
Import:
import splitCsv "github.com/tolik505/split-csv"
func ExampleSplitCsv() {
	splitter := splitCsv.New()
	splitter.Separator = ";"           // "," by default
	splitter.FileChunkSize = 100000000 // in bytes (100MB)
	result, _ := splitter.Split("testdata/test.csv", "testdata/")
	fmt.Println(result)
	// Output: [testdata/test_1.csv testdata/test_2.csv testdata/test_3.csv]
}
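The example above discards the error returned by Split for brevity; in real code you will likely want to handle it. A minimal sketch, assuming the standard fmt and log packages are imported alongside the splitCsv import shown earlier:

func main() {
	splitter := splitCsv.New()
	result, err := splitter.Split("testdata/test.csv", "testdata/")
	if err != nil {
		log.Fatal(err) // e.g. the input file is missing or unreadable
	}
	fmt.Println(result)
}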
If copying the header into each chunk is not needed:

func ExampleSplitCsv_withoutHeader() {
	splitter := splitCsv.New()
	splitter.Separator = ";"          // "," by default
	splitter.FileChunkSize = 20000000 // in bytes (20MB)
	splitter.WithHeader = false       // copying of the header into chunks is disabled
	result, _ := splitter.Split("testdata/test.csv", "testdata/")
	fmt.Println(result)
	// Output: [testdata/test_1.csv testdata/test_2.csv testdata/test_3.csv]
}
Or, if you want to pass an io.Reader instead of a file path:

// First, implement the io.Reader interface with logic appropriate for your use case.
type testReader struct {
	dataCh chan []byte
	buf    []byte
}

// Read listens on the data channel and fills p accordingly.
// When p is full, the remaining data goes into the buffer to be used in the next Read call.
func (r *testReader) Read(p []byte) (n int, err error) {
	pLen := len(p)
	i := 0
	// Drain bytes buffered by the previous Read call first.
	for j, char := range r.buf {
		if i == pLen {
			r.buf = r.buf[j:] // p is already full; keep the rest buffered
			return pLen, nil
		}
		p[i] = char
		i++
	}
	r.buf = nil
	if i == pLen {
		return pLen, nil
	}
	// Then consume the channel until p is full or the channel is closed.
	for chunk := range r.dataCh {
		for j, char := range chunk {
			p[i] = char
			i++
			if i == pLen {
				r.buf = chunk[j+1:] // keep the remainder for the next Read call
				return pLen, nil
			}
		}
	}
	return i, io.EOF
}
// In this example, data is sent to the channel and consumed by the custom reader.
// This way we can stream data into the splitter.
func ExampleSplitCsv_reader() {
	dataCh := make(chan []byte)
	reader := &testReader{dataCh: dataCh}
	data := []string{
		"Test header 1; Test header 2; Test header 3; Test header 4; Test header 5\n",
		"1; test value 1st; test value 1st; test value 1st; test value 1st\n",
		"2; test value 2nd; test value 2nd; test value 2nd; test value 2nd\n",
		"3; test value 3rd; test value 3rd; test value 3rd; test value 3rd\n",
	}
	go func() {
		defer close(dataCh)
		for _, v := range data {
			dataCh <- []byte(v)
		}
	}()
	splitter := splitCsv.New()
	splitter.Separator = ";"           // "," by default
	splitter.FileChunkSize = 100000000 // in bytes (100MB)
	result, _ := splitter.SplitReader(reader, "output/dir", "output_file_prefix")
	fmt.Println(result)
	// Output: [output/dir/output_file_prefix_1.csv]
}
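If your data source writes to an io.Writer, an alternative to a hand-rolled reader is io.Pipe from the standard library. A minimal sketch, assuming SplitReader accepts any io.Reader as above; the output directory and file prefix are illustrative:

func ExampleSplitCsv_pipe() {
	pr, pw := io.Pipe()
	go func() {
		defer pw.Close()
		// Write CSV rows into the pipe; the splitter reads them from the other end.
		pw.Write([]byte("header 1;header 2\n"))
		pw.Write([]byte("1;value 1\n"))
		pw.Write([]byte("2;value 2\n"))
	}()
	splitter := splitCsv.New()
	splitter.Separator = ";"
	result, _ := splitter.SplitReader(pr, "output/dir", "output_file_prefix")
	fmt.Println(result)
}

io.Pipe's Write blocks until the reader consumes the bytes, so memory use stays flat no matter how much data you stream.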