-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstrum.go
363 lines (317 loc) · 10.8 KB
/
strum.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
// Copyright 2021 by David A. Golden. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
// Package strum provides a string unmarshaler to tokenize line-oriented text
// (such as from stdin) and convert tokens into simple Go types.
//
// Tokenization defaults to whitespace-separated fields, but strum supports
// using delimiters, regular expressions, or a custom tokenizer.
//
// A line with a single token can be unmarshaled into a single variable of any
// supported type.
//
// A line with multiple tokens can be unmarshaled into a slice or a struct of
// supported types. It can also be unmarshaled into a single string, in which
// case tokenization is skipped.
//
// Trying to unmarshal multiple tokens into a single variable or too many tokens
// for the number of fields in a struct will result in an error. Having too few
// tokens for the fields in a struct is allowed; remaining fields will be
// zeroed. When unmarshaling to a slice, decoded values are appended; existing
// values are untouched.
//
// strum supports the following types:
//
// - strings
// - booleans (like strconv.ParseBool but case insensitive)
// - integers (signed and unsigned, all widths)
// - floats (32-bit and 64-bit)
//
// Additionally, there is special support for certain types:
//
// - time.Duration
// - time.Time
// - any type implementing encoding.TextUnmarshaler
// - pointers to supported types (which will auto-instantiate)
//
// For numeric types, all Go literal formats are supported, including base
// prefixes (`0xff`) and underscores (`1_000_000`) for integers.
//
// For time.Time, strum detects and parses a wide variety of formats using the
// github.com/araddon/dateparse library. By default, it favors United States
// interpretation of MM/DD/YYYY and has time zone semantics equivalent to
// `time.Parse`. strum allows specifying a custom parser instead.
//
// strum provides `DecodeAll` to unmarshal all lines of input at once.
package strum
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"reflect"
"regexp"
"strings"
"time"
"github.com/araddon/dateparse"
)
// A Tokenizer is a function that breaks an input string into tokens.
// A Tokenizer may return an error (e.g. when a line fails to match a
// required pattern); in that case the token slice is ignored.
type Tokenizer func(s string) ([]string, error)
// A DateParser parses a string into a time.Time value. It is invoked
// whenever a token is decoded into a time.Time destination.
type DateParser func(s string) (time.Time, error)
// A Decoder converts an input stream into Go types.
type Decoder struct {
	s *bufio.Scanner // line-oriented source of input
	t Tokenizer // splits each input line into tokens
	dp DateParser // parses tokens destined for time.Time values
}
// NewDecoder returns a Decoder that reads from r. The default Decoder
// tokenizes with the `strings.Fields` function and parses dates with
// github.com/araddon/dateparse.ParseAny.
func NewDecoder(r io.Reader) *Decoder {
	d := &Decoder{s: bufio.NewScanner(r)}
	// Default tokenizer: whitespace-separated fields, never errors.
	d.t = func(line string) ([]string, error) {
		return strings.Fields(line), nil
	}
	// Default date parser; the closure adapts ParseAny's variadic signature.
	d.dp = func(datestr string) (time.Time, error) {
		return dateparse.ParseAny(datestr)
	}
	return d
}
// WithDateParser modifies a Decoder to use a custom date parsing function.
// It returns the Decoder so configuration calls can be chained.
func (d *Decoder) WithDateParser(dp DateParser) *Decoder {
	d.dp = dp
	return d
}
// WithTokenizer modifies a Decoder to use a custom tokenizing function.
// It returns the Decoder so configuration calls can be chained.
func (d *Decoder) WithTokenizer(t Tokenizer) *Decoder {
	d.t = t
	return d
}
// WithTokenRegexp modifies a Decoder to use a regular expression to extract
// tokens. The regular expression is called with `FindStringSubmatch` for
// each line of input, so it must encompass an entire line of input. If the
// line fails to match or if the regular expression has no subexpressions, an
// error is returned.
func (d *Decoder) WithTokenRegexp(re *regexp.Regexp) *Decoder {
	return d.WithTokenizer(
		func(s string) ([]string, error) {
			xs := re.FindStringSubmatch(s)
			// Return nil (not an empty slice) with errors: the token
			// slice is meaningless when the error is non-nil.
			if xs == nil {
				return nil, errors.New("regexp failed to match line " + s)
			}
			// A regexp without capture expressions is an error.
			if len(xs) == 1 {
				return nil, errors.New("regexp has no subexpressions")
			}
			// Drop the full match and return only submatches.
			return xs[1:], nil
		},
	)
}
// WithSplitOn modifies a Decoder to split fields on a separator string.
func (d *Decoder) WithSplitOn(sep string) *Decoder {
	tokenize := func(line string) ([]string, error) {
		return strings.Split(line, sep), nil
	}
	return d.WithTokenizer(tokenize)
}
// Tokens consumes a line of input and returns all strings generated by the
// tokenizer. It is used internally by `Decode`, but is also available for
// testing or for skipping over a line of input that should not be decoded.
func (d *Decoder) Tokens() ([]string, error) {
	line, err := d.readline()
	if err != nil {
		return nil, err
	}
	return d.t(line)
}
// readline returns the next line of input, or io.EOF when input is
// exhausted. Scanner errors take precedence over reporting EOF.
func (d *Decoder) readline() (string, error) {
	if d.s.Scan() {
		return d.s.Text(), nil
	}
	if err := d.s.Err(); err != nil {
		return "", err
	}
	return "", io.EOF
}
// Decode reads the next line of input and stores it in the value pointed to
// by `v`. It returns `io.EOF` when no more data is available.
func (d *Decoder) Decode(v interface{}) error {
	dest, err := extractDestValue(v)
	if err != nil {
		return fmt.Errorf("Decode: %w", err)
	}
	return d.decode(dest)
}
// decode puts a single line of input into a destination. It dispatches on
// the destination's type (and then its kind) to decide whether the line must
// contain exactly one token, should be consumed whole, or should be split
// into multiple tokens for a slice or struct. Pointers are dereferenced
// recursively (auto-instantiating as needed) so that pointers to structs,
// slices, or text unmarshalers decode correctly.
func (d *Decoder) decode(destValue reflect.Value) error {
	// Certain types get special handling rather than being decoded by
	// their underlying data kind.
	switch destValue.Type() {
	case durationType, timeType, timePtrType:
		return d.decodeSingleToken(destValue)
	}
	// Types implementing encoding.TextUnmarshaler consume one token.
	if isTextUnmarshaler(destValue) {
		return d.decodeSingleToken(destValue)
	}
	switch destValue.Kind() {
	case reflect.String:
		// Strings take the whole line; tokenization is skipped.
		return d.decodeLine(destValue)
	case reflect.Bool,
		reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64,
		reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64,
		reflect.Float32, reflect.Float64:
		return d.decodeSingleToken(destValue)
	case reflect.Struct:
		return d.decodeStruct(destValue)
	case reflect.Slice:
		return d.decodeSlice(destValue)
	case reflect.Ptr:
		maybeInstantiatePtr(destValue)
		return d.decode(destValue.Elem())
	default:
		return fmt.Errorf("cannot decode into type %s", destValue.Type())
	}
}
// decodeStruct tokenizes a line and decodes each token into the
// corresponding exported field of the struct, in declaration order. Extra
// tokens beyond the field count are an error; extra fields are left zeroed.
func (d *Decoder) decodeStruct(destValue reflect.Value) error {
	tokens, err := d.Tokens()
	if err != nil {
		return err
	}
	structType := destValue.Type()
	// Reset the struct to its zero value so stale field data cannot survive.
	destValue.Set(reflect.New(structType).Elem())
	fieldCount := destValue.NumField()
	for i, token := range tokens {
		if i >= fieldCount {
			return fmt.Errorf("too many tokens for struct %s", destValue.Type())
		}
		field := structType.Field(i)
		name := structType.Name() + "." + field.Name
		// An empty PkgPath marks an exported field; see reflect.StructField.
		// (In Go 1.17, this is available as `IsExported`.)
		if field.PkgPath != "" {
			return fmt.Errorf("cannot decode to unexported field %s", name)
		}
		if err := d.decodeToValue(name, destValue.Field(i), token); err != nil {
			return err
		}
	}
	return nil
}
// decodeSlice tokenizes a line and appends one decoded element per token to
// the destination slice. Existing elements are left untouched.
func (d *Decoder) decodeSlice(sliceValue reflect.Value) error {
	elemType := sliceValue.Type().Elem()
	// Verify up front that the element type is decodable at all.
	if !isDecodableValue(reflect.New(elemType).Elem()) {
		return fmt.Errorf("decoding to this slice type not supported: %s", sliceValue.Type())
	}
	tokens, err := d.Tokens()
	if err != nil {
		return err
	}
	for i, token := range tokens {
		elem := reflect.New(elemType).Elem()
		if err := d.decodeToValue(fmt.Sprintf("element %d", i), elem, token); err != nil {
			return err
		}
		sliceValue.Set(reflect.Append(sliceValue, elem))
	}
	return nil
}
// decodeSingleToken tokenizes a line, requires exactly one token, and
// decodes that token into the destination.
func (d *Decoder) decodeSingleToken(destValue reflect.Value) error {
	tokens, err := d.Tokens()
	if err != nil {
		return err
	}
	if n := len(tokens); n != 1 {
		return fmt.Errorf("decoding %s: expected 1 token, but found %d", destValue.Type(), n)
	}
	return d.decodeToValue(destValue.Type().String(), destValue, tokens[0])
}
// decodeLine consumes a whole line of input, without tokenization, and
// decodes it into the destination.
func (d *Decoder) decodeLine(destValue reflect.Value) error {
	raw, err := d.readline()
	if err != nil {
		return err
	}
	typeName := destValue.Type().String()
	return d.decodeToValue(typeName, destValue, raw)
}
// DecodeAll reads the remaining lines of input into `v`, where `v` must be a
// pointer to a slice of a type that would be valid for Decode. It works as
// if `Decode` were called for every line and each resulting value were
// appended to the slice. If `v` points to an uninitialized slice, the slice
// is created. DecodeAll returns `nil` when EOF is reached.
func (d *Decoder) DecodeAll(v interface{}) error {
	dest, err := extractDestSlice(v)
	if err != nil {
		return fmt.Errorf("DecodeAll: %w", err)
	}
	return d.decodeAll(dest)
}
// decodeAll repeatedly decodes lines into new elements appended to the slice
// held by sliceValue, stopping with nil at end of input. Any other decoding
// error is returned immediately.
func (d *Decoder) decodeAll(sliceValue reflect.Value) error {
	sliceType := sliceValue.Type()
	// Make a zero-length slice if it starts uninitialized.
	if sliceValue.IsNil() {
		sliceValue.Set(reflect.MakeSlice(sliceType, 0, 1))
	}
	// Decode every line into the slice.
	for {
		rv := reflect.New(sliceType.Elem()).Elem()
		err := d.decode(rv)
		if err != nil {
			// Use errors.Is rather than ==, so an io.EOF wrapped by a
			// custom tokenizer or reader still terminates cleanly.
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}
		sliceValue.Set(reflect.Append(sliceValue, rv))
	}
}
// Unmarshal parses the input data as newline delimited strings and appends
// the results to the value pointed to by `v`, where `v` must be a pointer to
// a slice of a type that would be valid for Decode. If `v` points to an
// uninitialized slice, the slice is created.
func Unmarshal(data []byte, v interface{}) error {
	dest, err := extractDestSlice(v)
	if err != nil {
		return fmt.Errorf("Unmarshal: %w", err)
	}
	// A bytes.Reader suffices here: the decoder only ever reads.
	return NewDecoder(bytes.NewReader(data)).decodeAll(dest)
}
// extractDestValue validates that v is a non-nil pointer and returns the
// reflect.Value it points to.
func extractDestValue(v interface{}) (reflect.Value, error) {
	rv := reflect.ValueOf(v)
	switch {
	case v == nil:
		return reflect.Value{}, errors.New("argument must be a non-nil pointer")
	case rv.Kind() != reflect.Ptr:
		return reflect.Value{}, fmt.Errorf("argument must be a pointer, not %s", rv.Kind())
	case rv.IsNil():
		return reflect.Value{}, errors.New("argument must be a non-nil pointer")
	}
	return rv.Elem(), nil
}
// extractDestSlice validates that v is a non-nil pointer to a slice and
// returns the slice value it points to.
func extractDestSlice(v interface{}) (reflect.Value, error) {
	rv, err := extractDestValue(v)
	if err != nil {
		return reflect.Value{}, err
	}
	if k := rv.Kind(); k != reflect.Slice {
		return reflect.Value{}, fmt.Errorf("argument must be a pointer to slice, not %s", k)
	}
	return rv, nil
}