From c8815f5ab978990972e6f9e09715f7ce12a3aa4c Mon Sep 17 00:00:00 2001 From: Mike Farah Date: Mon, 1 Aug 2022 10:28:34 +1000 Subject: [PATCH] Csv decoder (#1290) * WIP: adding CSV decoder * Adding CSV decoder * Added CSV roundtrip * Fixing from review --- acceptance_tests/inputs-format.sh | 47 +++ acceptance_tests/output-format.sh | 42 +++ cmd/utils.go | 4 + examples/sample_objects.csv | 3 + pkg/yqlib/csv_test.go | 273 ++++++++++++++++++ pkg/yqlib/decoder.go | 6 + pkg/yqlib/decoder_csv_object.go | 77 +++++ pkg/yqlib/doc/operators/encode-decode.md | 46 ++- .../doc/operators/headers/encode-decode.md | 8 +- pkg/yqlib/doc/usage/csv-tsv.md | 214 ++++++++++++++ pkg/yqlib/doc/usage/headers/csv-tsv.md | 30 ++ pkg/yqlib/doc/usage/headers/properties.md | 2 +- pkg/yqlib/doc/usage/properties.md | 2 +- pkg/yqlib/encoder_csv.go | 78 ++++- pkg/yqlib/encoder_csv_test.go | 60 ---- pkg/yqlib/lexer_participle.go | 3 + pkg/yqlib/lexer_participle_test.go | 1 - pkg/yqlib/lib.go | 6 +- pkg/yqlib/operator_encoder_decoder.go | 4 + pkg/yqlib/operator_encoder_decoder_test.go | 18 +- 20 files changed, 837 insertions(+), 87 deletions(-) create mode 100644 examples/sample_objects.csv create mode 100644 pkg/yqlib/csv_test.go create mode 100644 pkg/yqlib/decoder_csv_object.go create mode 100644 pkg/yqlib/doc/usage/csv-tsv.md create mode 100644 pkg/yqlib/doc/usage/headers/csv-tsv.md delete mode 100644 pkg/yqlib/encoder_csv_test.go diff --git a/acceptance_tests/inputs-format.sh b/acceptance_tests/inputs-format.sh index acb1c3d3b4..31598f9f64 100755 --- a/acceptance_tests/inputs-format.sh +++ b/acceptance_tests/inputs-format.sh @@ -3,6 +3,8 @@ setUp() { rm test*.yml 2>/dev/null || true rm test*.properties 2>/dev/null || true + rm test*.csv 2>/dev/null || true + rm test*.tsv 2>/dev/null || true rm test*.xml 2>/dev/null || true } @@ -40,6 +42,51 @@ EOM assertEquals "$expected" "$X" } +testInputCSV() { + cat >test.csv <test.tsv <test.yml <BiBi diff --git a/acceptance_tests/output-format.sh 
b/acceptance_tests/output-format.sh index 995f0fcf3b..a238dc0bc6 100755 --- a/acceptance_tests/output-format.sh +++ b/acceptance_tests/output-format.sh @@ -102,6 +102,48 @@ EOM assertEquals "$expected" "$X" } +testOutputCSV() { + cat >test.yml <test.yml <test.yml < 0 { + log.Debugf("Adding contentRow: %v", contentRow) + rootArray.Content = append(rootArray.Content, dec.createObject(headerRow, contentRow)) + contentRow, err = dec.reader.Read() + log.Debugf("Read next contentRow: %v, %v", contentRow, err) + } + if !errors.Is(err, io.EOF) { + return err + } + + log.Debugf("finished, contentRow%v", contentRow) + log.Debugf("err: %v", err) + + rootYamlNode.Kind = yaml.DocumentNode + rootYamlNode.Content = []*yaml.Node{rootArray} + return nil +} diff --git a/pkg/yqlib/doc/operators/encode-decode.md b/pkg/yqlib/doc/operators/encode-decode.md index 5332649fcf..b66b1dd464 100644 --- a/pkg/yqlib/doc/operators/encode-decode.md +++ b/pkg/yqlib/doc/operators/encode-decode.md @@ -11,14 +11,14 @@ These operators are useful to process yaml documents that have stringified embed | --- | -- | --| | Yaml | from_yaml | to_yaml(i)/@yaml | | JSON | from_json | to_json(i)/@json | -| Properties | from_props | to_props/@props | -| CSV | | to_csv/@csv | -| TSV | | to_tsv/@tsv | +| Properties | from_props/@propsd | to_props/@props | +| CSV | from_csv/@csvd | to_csv/@csv | +| TSV | from_tsv/@tsvd | to_tsv/@tsv | | XML | from_xml | to_xml(i)/@xml | | Base64 | @base64d | @base64 | -CSV and TSV format both accept either a single array or scalars (representing a single row), or an array of array of scalars (representing multiple rows). +See CSV and TSV [documentation](https://mikefarah.gitbook.io/yq/usage/csv-tsv) for accepted formats. XML uses the `--xml-attribute-prefix` and `xml-content-name` flags to identify attributes and content fields. 
@@ -132,7 +132,7 @@ a: |- ``` then ```bash -yq '.a |= from_props' sample.yml +yq '.a |= @propsd' sample.yml ``` will output ```yaml @@ -141,6 +141,42 @@ a: dogs: cool as well ``` +## Decode csv encoded string +Given a sample.yml file of: +```yaml +a: |- + cats,dogs + great,cool as well +``` +then +```bash +yq '.a |= @csvd' sample.yml +``` +will output +```yaml +a: + - cats: great + dogs: cool as well +``` + +## Decode tsv encoded string +Given a sample.yml file of: +```yaml +a: |- + cats dogs + great cool as well +``` +then +```bash +yq '.a |= @tsvd' sample.yml +``` +will output +```yaml +a: + - cats: great + dogs: cool as well +``` + ## Encode value as yaml string Indent defaults to 2 diff --git a/pkg/yqlib/doc/operators/headers/encode-decode.md b/pkg/yqlib/doc/operators/headers/encode-decode.md index 07aa47eec4..8ba6982d3d 100644 --- a/pkg/yqlib/doc/operators/headers/encode-decode.md +++ b/pkg/yqlib/doc/operators/headers/encode-decode.md @@ -11,14 +11,14 @@ These operators are useful to process yaml documents that have stringified embed | --- | -- | --| | Yaml | from_yaml | to_yaml(i)/@yaml | | JSON | from_json | to_json(i)/@json | -| Properties | from_props | to_props/@props | -| CSV | | to_csv/@csv | -| TSV | | to_tsv/@tsv | +| Properties | from_props/@propsd | to_props/@props | +| CSV | from_csv/@csvd | to_csv/@csv | +| TSV | from_tsv/@tsvd | to_tsv/@tsv | | XML | from_xml | to_xml(i)/@xml | | Base64 | @base64d | @base64 | -CSV and TSV format both accept either a single array or scalars (representing a single row), or an array of array of scalars (representing multiple rows). +See CSV and TSV [documentation](https://mikefarah.gitbook.io/yq/usage/csv-tsv) for accepted formats. XML uses the `--xml-attribute-prefix` and `xml-content-name` flags to identify attributes and content fields. 
diff --git a/pkg/yqlib/doc/usage/csv-tsv.md b/pkg/yqlib/doc/usage/csv-tsv.md new file mode 100644 index 0000000000..03749ab636 --- /dev/null +++ b/pkg/yqlib/doc/usage/csv-tsv.md @@ -0,0 +1,214 @@ +# CSV +Encode/Decode/Roundtrip CSV and TSV files. + +## Encode +Currently supports arrays of homogenous flat objects, that is: no nesting and it assumes the _first_ object has all the keys required: + +```yaml +- name: Bobo + type: dog +- name: Fifi + type: cat +``` + +As well as arrays of arrays of scalars (strings/numbers/booleans): + +```yaml +- [Bobo, dog] +- [Fifi, cat] +``` + +## Decode +Decode assumes the first CSV/TSV row is the header row, and all rows beneath are the entries. +The data will be coded into an array of objects, using the header rows as keys. + +```csv +name,type +Bobo,dog +Fifi,cat +``` + + +{% hint style="warning" %} +Note that versions prior to 4.18 require the 'eval/e' command to be specified. + +`yq e ` +{% endhint %} + +## Encode CSV simple +Given a sample.yml file of: +```yaml +- [i, like, csv] +- [because, excel, is, cool] +``` +then +```bash +yq -o=csv sample.yml +``` +will output +```csv +i,like,csv +because,excel,is,cool +``` + +## Encode TSV simple +Given a sample.yml file of: +```yaml +- [i, like, csv] +- [because, excel, is, cool] +``` +then +```bash +yq -o=tsv sample.yml +``` +will output +```tsv +i like csv +because excel is cool +``` + +## Encode array of objects to csv +Given a sample.yml file of: +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 + +``` +then +```bash +yq -o=csv sample.yml +``` +will output +```csv +name,numberOfCats,likesApples,height +Gary,1,true,168.8 +Samantha's Rabbit,2,false,-188.8 +``` + +## Encode array of objects to custom csv format +Add the header row manually, then we convert each object into an array of values - resulting in an array of arrays. 
Pick the columns and call the header whatever you like. + +Given a sample.yml file of: +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 + +``` +then +```bash +yq -o=csv '[["Name", "Number of Cats"]] + [.[] | [.name, .numberOfCats ]]' sample.yml +``` +will output +```csv +Name,Number of Cats +Gary,1 +Samantha's Rabbit,2 +``` + +## Encode array of objects to csv - missing fields behaviour +First entry is used to determine the headers, and it is missing 'likesApples', so it is not included in the csv. Second entry does not have 'numberOfCats' so that is blank + +Given a sample.yml file of: +```yaml +- name: Gary + numberOfCats: 1 + height: 168.8 +- name: Samantha's Rabbit + height: -188.8 + likesApples: false + +``` +then +```bash +yq -o=csv sample.yml +``` +will output +```csv +name,numberOfCats,height +Gary,1,168.8 +Samantha's Rabbit,,-188.8 +``` + +## Parse CSV into an array of objects +First row is assumed to be the header row. + +Given a sample.csv file of: +```csv +name,numberOfCats,likesApples,height +Gary,1,true,168.8 +Samantha's Rabbit,2,false,-188.8 + +``` +then +```bash +yq -p=csv sample.csv +``` +will output +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 +``` + +## Parse TSV into an array of objects +First row is assumed to be the header row. 
+ +Given a sample.tsv file of: +```tsv +name numberOfCats likesApples height +Gary 1 true 168.8 +Samantha's Rabbit 2 false -188.8 + +``` +then +```bash +yq -p=tsv sample.tsv +``` +will output +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 +``` + +## Round trip +Given a sample.csv file of: +```csv +name,numberOfCats,likesApples,height +Gary,1,true,168.8 +Samantha's Rabbit,2,false,-188.8 + +``` +then +```bash +yq -p=csv -o=csv '(.[] | select(.name == "Gary") | .numberOfCats) = 3' sample.csv +``` +will output +```csv +name,numberOfCats,likesApples,height +Gary,3,true,168.8 +Samantha's Rabbit,2,false,-188.8 +``` + diff --git a/pkg/yqlib/doc/usage/headers/csv-tsv.md b/pkg/yqlib/doc/usage/headers/csv-tsv.md new file mode 100644 index 0000000000..0a7c7cac47 --- /dev/null +++ b/pkg/yqlib/doc/usage/headers/csv-tsv.md @@ -0,0 +1,30 @@ +# CSV +Encode/Decode/Roundtrip CSV and TSV files. + +## Encode +Currently supports arrays of homogenous flat objects, that is: no nesting and it assumes the _first_ object has all the keys required: + +```yaml +- name: Bobo + type: dog +- name: Fifi + type: cat +``` + +As well as arrays of arrays of scalars (strings/numbers/booleans): + +```yaml +- [Bobo, dog] +- [Fifi, cat] +``` + +## Decode +Decode assumes the first CSV/TSV row is the header row, and all rows beneath are the entries. +The data will be coded into an array of objects, using the header rows as keys. + +```csv +name,type +Bobo,dog +Fifi,cat +``` + diff --git a/pkg/yqlib/doc/usage/headers/properties.md b/pkg/yqlib/doc/usage/headers/properties.md index c0c1910ec0..90164a6dd4 100644 --- a/pkg/yqlib/doc/usage/headers/properties.md +++ b/pkg/yqlib/doc/usage/headers/properties.md @@ -1,5 +1,5 @@ # Properties -Encode to a property file (decode not yet supported). Line comments on value nodes will be copied across. +Encode/Decode/Roundtrip to/from a property file. 
Line comments on value nodes will be copied across. By default, empty maps and arrays are not encoded - see below for an example on how to encode a value for these. diff --git a/pkg/yqlib/doc/usage/properties.md b/pkg/yqlib/doc/usage/properties.md index 1b0f39048c..f80bba35d2 100644 --- a/pkg/yqlib/doc/usage/properties.md +++ b/pkg/yqlib/doc/usage/properties.md @@ -1,6 +1,6 @@ # Properties -Encode to a property file (decode not yet supported). Line comments on value nodes will be copied across. +Encode/Decode/Roundtrip to/from a property file. Line comments on value nodes will be copied across. By default, empty maps and arrays are not encoded - see below for an example on how to encode a value for these. diff --git a/pkg/yqlib/encoder_csv.go b/pkg/yqlib/encoder_csv.go index 2e321fa5cf..88e2b0dec8 100644 --- a/pkg/yqlib/encoder_csv.go +++ b/pkg/yqlib/encoder_csv.go @@ -13,7 +13,7 @@ type csvEncoder struct { } func NewCsvEncoder(separator rune) Encoder { - return &csvEncoder{separator} + return &csvEncoder{separator: separator} } func (e *csvEncoder) CanHandleAliases() bool { @@ -41,6 +41,67 @@ func (e *csvEncoder) encodeRow(csvWriter *csv.Writer, contents []*yaml.Node) err return csvWriter.Write(stringValues) } +func (e *csvEncoder) encodeArrays(csvWriter *csv.Writer, content []*yaml.Node) error { + for i, child := range content { + + if child.Kind != yaml.SequenceNode { + return fmt.Errorf("csv encoding only works for arrays of scalars (string/numbers/booleans), child[%v] is a %v", i, child.Tag) + } + err := e.encodeRow(csvWriter, child.Content) + if err != nil { + return err + } + } + return nil +} + +func (e *csvEncoder) extractHeader(child *yaml.Node) ([]*yaml.Node, error) { + if child.Kind != yaml.MappingNode { + return nil, fmt.Errorf("csv object encoding only works for arrays of flat objects (string key => string/numbers/boolean value), child[0] is a %v", child.Tag) + } + mapKeys := getMapKeys(child) + return mapKeys.Content, nil +} + +func (e *csvEncoder) 
createChildRow(child *yaml.Node, headers []*yaml.Node) []*yaml.Node { + childRow := make([]*yaml.Node, 0) + for _, header := range headers { + keyIndex := findKeyInMap(child, header) + value := createScalarNode(nil, "") + if keyIndex != -1 { + value = child.Content[keyIndex+1] + } + childRow = append(childRow, value) + } + return childRow + +} + +func (e *csvEncoder) encodeObjects(csvWriter *csv.Writer, content []*yaml.Node) error { + headers, err := e.extractHeader(content[0]) + if err != nil { + return nil + } + + err = e.encodeRow(csvWriter, headers) + if err != nil { + return nil + } + + for i, child := range content { + if child.Kind != yaml.MappingNode { + return fmt.Errorf("csv object encoding only works for arrays of flat objects (string key => string/numbers/boolean value), child[%v] is a %v", i, child.Tag) + } + row := e.createChildRow(child, headers) + err = e.encodeRow(csvWriter, row) + if err != nil { + return err + } + + } + return nil +} + func (e *csvEncoder) Encode(writer io.Writer, originalNode *yaml.Node) error { csvWriter := csv.NewWriter(writer) csvWriter.Comma = e.separator @@ -56,15 +117,10 @@ func (e *csvEncoder) Encode(writer io.Writer, originalNode *yaml.Node) error { return e.encodeRow(csvWriter, node.Content) } - for i, child := range node.Content { - - if child.Kind != yaml.SequenceNode { - return fmt.Errorf("csv encoding only works for arrays of scalars (string/numbers/booleans), child[%v] is a %v", i, child.Tag) - } - err := e.encodeRow(csvWriter, child.Content) - if err != nil { - return err - } + if node.Content[0].Kind == yaml.MappingNode { + return e.encodeObjects(csvWriter, node.Content) } - return nil + + return e.encodeArrays(csvWriter, node.Content) + } diff --git a/pkg/yqlib/encoder_csv_test.go b/pkg/yqlib/encoder_csv_test.go deleted file mode 100644 index 9e636c983a..0000000000 --- a/pkg/yqlib/encoder_csv_test.go +++ /dev/null @@ -1,60 +0,0 @@ -package yqlib - -import ( - "bufio" - "bytes" - "strings" - "testing" - - 
"github.com/mikefarah/yq/v4/test" -) - -func yamlToCsv(sampleYaml string, separator rune) string { - var output bytes.Buffer - writer := bufio.NewWriter(&output) - - var jsonEncoder = NewCsvEncoder(separator) - inputs, err := readDocuments(strings.NewReader(sampleYaml), "sample.yml", 0, NewYamlDecoder()) - if err != nil { - panic(err) - } - node := inputs.Front().Value.(*CandidateNode).Node - err = jsonEncoder.Encode(writer, node) - if err != nil { - panic(err) - } - writer.Flush() - - return strings.TrimSuffix(output.String(), "\n") -} - -var sampleYaml = `["apple", apple2, "comma, in, value", "new -line", 3, 3.40, true, "tab here"]` - -var sampleYamlArray = "[" + sampleYaml + ", [bob, cat, meow, puss]]" - -func TestCsvEncoderEmptyArray(t *testing.T) { - var actualCsv = yamlToCsv(`[]`, ',') - test.AssertResult(t, "", actualCsv) -} - -func TestCsvEncoder(t *testing.T) { - var expectedCsv = `apple,apple2,"comma, in, value",new line,3,3.40,true,tab here` - - var actualCsv = yamlToCsv(sampleYaml, ',') - test.AssertResult(t, expectedCsv, actualCsv) -} - -func TestCsvEncoderArrayOfArrays(t *testing.T) { - var actualCsv = yamlToCsv(sampleYamlArray, ',') - var expectedCsv = "apple,apple2,\"comma, in, value\",new line,3,3.40,true,tab here\nbob,cat,meow,puss" - test.AssertResult(t, expectedCsv, actualCsv) -} - -func TestTsvEncoder(t *testing.T) { - - var expectedCsv = `apple apple2 comma, in, value new line 3 3.40 true "tab here"` - - var actualCsv = yamlToCsv(sampleYaml, '\t') - test.AssertResult(t, expectedCsv, actualCsv) -} diff --git a/pkg/yqlib/lexer_participle.go b/pkg/yqlib/lexer_participle.go index 09e13844e1..7a5ba1387e 100644 --- a/pkg/yqlib/lexer_participle.go +++ b/pkg/yqlib/lexer_participle.go @@ -67,7 +67,10 @@ var participleYqRules = []*participleYqRule{ {"XMLEncode", `to_?xml`, encodeWithIndent(XMLOutputFormat, 2), 0}, {"XMLEncodeNoIndent", `@xml`, encodeWithIndent(XMLOutputFormat, 0), 0}, + {"CSVDecode", `from_?csv|@csvd`, decodeOp(CSVObjectInputFormat), 
0}, {"CSVEncode", `to_?csv|@csv`, encodeWithIndent(CSVOutputFormat, 0), 0}, + + {"TSVDecode", `from_?tsv|@tsvd`, decodeOp(TSVObjectInputFormat), 0}, {"TSVEncode", `to_?tsv|@tsv`, encodeWithIndent(TSVOutputFormat, 0), 0}, {"Base64d", `@base64d`, decodeOp(Base64InputFormat), 0}, diff --git a/pkg/yqlib/lexer_participle_test.go b/pkg/yqlib/lexer_participle_test.go index 7dfa0f60b1..3a8739ed81 100644 --- a/pkg/yqlib/lexer_participle_test.go +++ b/pkg/yqlib/lexer_participle_test.go @@ -523,7 +523,6 @@ var participleLexerScenarios = []participleLexerScenario{ } func TestParticipleLexer(t *testing.T) { - log.Errorf("TestParticiple") lexer := newParticipleLexer() for _, scenario := range participleLexerScenarios { diff --git a/pkg/yqlib/lib.go b/pkg/yqlib/lib.go index 5ec53a76eb..bd586ac2e1 100644 --- a/pkg/yqlib/lib.go +++ b/pkg/yqlib/lib.go @@ -205,10 +205,10 @@ func findInArray(array *yaml.Node, item *yaml.Node) int { return -1 } -func findKeyInMap(array *yaml.Node, item *yaml.Node) int { +func findKeyInMap(dataMap *yaml.Node, item *yaml.Node) int { - for index := 0; index < len(array.Content); index = index + 2 { - if recursiveNodeEqual(array.Content[index], item) { + for index := 0; index < len(dataMap.Content); index = index + 2 { + if recursiveNodeEqual(dataMap.Content[index], item) { return index } } diff --git a/pkg/yqlib/operator_encoder_decoder.go b/pkg/yqlib/operator_encoder_decoder.go index 4751a27c1d..afb321ef9a 100644 --- a/pkg/yqlib/operator_encoder_decoder.go +++ b/pkg/yqlib/operator_encoder_decoder.go @@ -114,6 +114,10 @@ func decodeOperator(d *dataTreeNavigator, context Context, expressionNode *Expre decoder = NewBase64Decoder() case PropertiesInputFormat: decoder = NewPropertiesDecoder() + case CSVObjectInputFormat: + decoder = NewCSVObjectDecoder(',') + case TSVObjectInputFormat: + decoder = NewCSVObjectDecoder('\t') } var results = list.New() diff --git a/pkg/yqlib/operator_encoder_decoder_test.go b/pkg/yqlib/operator_encoder_decoder_test.go index 
2c131f0ebd..5b5726f025 100644 --- a/pkg/yqlib/operator_encoder_decoder_test.go +++ b/pkg/yqlib/operator_encoder_decoder_test.go @@ -66,11 +66,27 @@ var encoderDecoderOperatorScenarios = []expressionScenario{ { description: "Decode props encoded string", document: `a: "cats=great\ndogs=cool as well"`, - expression: `.a |= from_props`, + expression: `.a |= @propsd`, expected: []string{ "D0, P[], (doc)::a:\n cats: great\n dogs: cool as well\n", }, }, + { + description: "Decode csv encoded string", + document: `a: "cats,dogs\ngreat,cool as well"`, + expression: `.a |= @csvd`, + expected: []string{ + "D0, P[], (doc)::a:\n - cats: great\n dogs: cool as well\n", + }, + }, + { + description: "Decode tsv encoded string", + document: `a: "cats dogs\ngreat cool as well"`, + expression: `.a |= @tsvd`, + expected: []string{ + "D0, P[], (doc)::a:\n - cats: great\n dogs: cool as well\n", + }, + }, { skipDoc: true, document: "a:\n cool:\n bob: dylan",