@@ -45,6 +45,17 @@ pub struct ListingTableUrl {
45
45
impl ListingTableUrl {
46
46
/// Parse a provided string as a `ListingTableUrl`
47
47
///
48
+ /// A URL can either refer to a single object, or a collection of objects with a
49
+ /// common prefix, with the presence of a trailing `/` indicating a collection.
50
+ ///
51
+ /// For example, `file:///foo.txt` refers to the file at `/foo.txt`, whereas
52
+ /// `file:///foo/` refers to all the files under the directory `/foo` and its
53
+ /// subdirectories.
54
+ ///
55
+ /// Similarly `s3://BUCKET/blob.csv` refers to `blob.csv` in the S3 bucket `BUCKET`,
56
+ /// whereas `s3://BUCKET/foo/` refers to all objects with the prefix `foo/` in the
57
+ /// S3 bucket `BUCKET`.
58
+ ///
48
59
/// # URL Encoding
49
60
///
50
61
/// URL paths are expected to be URL-encoded. That is, the URL for a file named `bar%2Efoo`
@@ -58,29 +69,29 @@ impl ListingTableUrl {
58
69
/// # Paths without a Scheme
59
70
///
60
71
/// If no scheme is provided, or the string is an absolute filesystem path
61
- /// as determined [`std::path::Path::is_absolute`], the string will be
72
+ /// as determined by [`std::path::Path::is_absolute`], the string will be
62
73
/// interpreted as a path on the local filesystem using the operating
63
74
/// system's standard path delimiter, i.e. `\` on Windows, `/` on Unix.
64
75
///
65
76
/// If the path contains any of `'?', '*', '['`, it will be considered
66
77
/// a glob expression and resolved as described in the section below.
67
78
///
68
- /// Otherwise, the path will be resolved to an absolute path, returning
69
- /// an error if it does not exist , and converted to a [file URI]
79
+ /// Otherwise, the path will be resolved to an absolute path based on the current
80
+ /// working directory, and converted to a [file URI].
70
81
///
71
- /// If you wish to specify a path that does not exist on the local
72
- /// machine you must provide it as a fully-qualified [file URI]
73
- /// e.g. `file:///myfile.txt`
82
+ /// If the path already exists in the local filesystem this will be used to determine if this
83
+ /// [`ListingTableUrl`] refers to a collection or a single object, otherwise the presence
84
+ /// of a trailing path delimiter will be used to indicate a directory. For the avoidance
85
+ /// of ambiguity it is recommended users always include trailing `/` when intending to
86
+ /// refer to a directory.
74
87
///
75
88
/// ## Glob File Paths
76
89
///
77
90
/// If no scheme is provided, and the path contains a glob expression, it will
78
91
/// be resolved as follows.
79
92
///
80
93
/// The string up to the first path segment containing a glob expression will be extracted,
81
- /// and resolved in the same manner as a normal scheme-less path. That is, resolved to
82
- /// an absolute path on the local filesystem, returning an error if it does not exist,
83
- /// and converted to a [file URI]
94
+ /// and resolved in the same manner as a normal scheme-less path above.
84
95
///
85
96
/// The remaining string will be interpreted as a [`glob::Pattern`] and used as a
86
97
/// filter when listing files from object storage
@@ -130,7 +141,7 @@ impl ListingTableUrl {
130
141
131
142
/// Creates a new [`ListingTableUrl`] interpreting `s` as a filesystem path
132
143
fn parse_path ( s : & str ) -> Result < Self > {
133
- let ( prefix , glob) = match split_glob_expression ( s) {
144
+ let ( path , glob) = match split_glob_expression ( s) {
134
145
Some ( ( prefix, glob) ) => {
135
146
let glob = Pattern :: new ( glob)
136
147
. map_err ( |e| DataFusionError :: External ( Box :: new ( e) ) ) ?;
@@ -139,15 +150,12 @@ impl ListingTableUrl {
139
150
None => ( s, None ) ,
140
151
} ;
141
152
142
- let path = std:: path:: Path :: new ( prefix) . canonicalize ( ) ?;
143
- let url = if path. is_dir ( ) {
144
- Url :: from_directory_path ( path)
145
- } else {
146
- Url :: from_file_path ( path)
147
- }
148
- . map_err ( |_| DataFusionError :: Internal ( format ! ( "Can not open path: {s}" ) ) ) ?;
149
- // TODO: Currently we do not have an IO-related error variant that accepts ()
150
- // or a string. Once we have such a variant, change the error type above.
153
+ let url = url_from_filesystem_path ( path) . ok_or_else ( || {
154
+ DataFusionError :: External (
155
+ format ! ( "Failed to convert path to URL: {path}" ) . into ( ) ,
156
+ )
157
+ } ) ?;
158
+
151
159
Self :: try_new ( url, glob)
152
160
}
153
161
@@ -162,7 +170,10 @@ impl ListingTableUrl {
162
170
self . url . scheme ( )
163
171
}
164
172
165
- /// Return the prefix from which to list files
173
+ /// Return the URL path not excluding any glob expression
174
+ ///
175
+ /// If [`Self::is_collection`], this is the listing prefix
176
+ /// Otherwise, this is the path to the object
166
177
pub fn prefix ( & self ) -> & Path {
167
178
& self . prefix
168
179
}
@@ -249,6 +260,34 @@ impl ListingTableUrl {
249
260
}
250
261
}
251
262
263
+ /// Creates a file URL from a potentially relative filesystem path
264
+ fn url_from_filesystem_path ( s : & str ) -> Option < Url > {
265
+ let path = std:: path:: Path :: new ( s) ;
266
+ let is_dir = match path. exists ( ) {
267
+ true => path. is_dir ( ) ,
268
+ // Fallback to inferring from trailing separator
269
+ false => std:: path:: is_separator ( s. chars ( ) . last ( ) ?) ,
270
+ } ;
271
+
272
+ let from_absolute_path = |p| {
273
+ let first = match is_dir {
274
+ true => Url :: from_directory_path ( p) . ok ( ) ,
275
+ false => Url :: from_file_path ( p) . ok ( ) ,
276
+ } ?;
277
+
278
+ // By default from_*_path preserve relative path segments
279
+ // We therefore parse the URL again to resolve these
280
+ Url :: parse ( first. as_str ( ) ) . ok ( )
281
+ } ;
282
+
283
+ if path. is_absolute ( ) {
284
+ return from_absolute_path ( path) ;
285
+ }
286
+
287
+ let absolute = std:: env:: current_dir ( ) . ok ( ) ?. join ( path) ;
288
+ from_absolute_path ( & absolute)
289
+ }
290
+
252
291
impl AsRef < str > for ListingTableUrl {
253
292
fn as_ref ( & self ) -> & str {
254
293
self . url . as_ref ( )
@@ -349,6 +388,37 @@ mod tests {
349
388
350
389
let url = ListingTableUrl :: parse ( path. to_str ( ) . unwrap ( ) ) . unwrap ( ) ;
351
390
assert ! ( url. prefix. as_ref( ) . ends_with( "bar%2Ffoo" ) , "{}" , url. prefix) ;
391
+
392
+ let url = ListingTableUrl :: parse ( "file:///foo/../a%252Fb.txt" ) . unwrap ( ) ;
393
+ assert_eq ! ( url. prefix. as_ref( ) , "a%2Fb.txt" ) ;
394
+
395
+ let url =
396
+ ListingTableUrl :: parse ( "file:///foo/./bar/../../baz/./test.txt" ) . unwrap ( ) ;
397
+ assert_eq ! ( url. prefix. as_ref( ) , "baz/test.txt" ) ;
398
+
399
+ let workdir = std:: env:: current_dir ( ) . unwrap ( ) ;
400
+ let t = workdir. join ( "non-existent" ) ;
401
+ let a = ListingTableUrl :: parse ( t. to_str ( ) . unwrap ( ) ) . unwrap ( ) ;
402
+ let b = ListingTableUrl :: parse ( "non-existent" ) . unwrap ( ) ;
403
+ assert_eq ! ( a, b) ;
404
+ assert ! ( a. prefix. as_ref( ) . ends_with( "non-existent" ) ) ;
405
+
406
+ let t = workdir. parent ( ) . unwrap ( ) ;
407
+ let a = ListingTableUrl :: parse ( t. to_str ( ) . unwrap ( ) ) . unwrap ( ) ;
408
+ let b = ListingTableUrl :: parse ( ".." ) . unwrap ( ) ;
409
+ assert_eq ! ( a, b) ;
410
+
411
+ let t = t. join ( "bar" ) ;
412
+ let a = ListingTableUrl :: parse ( t. to_str ( ) . unwrap ( ) ) . unwrap ( ) ;
413
+ let b = ListingTableUrl :: parse ( "../bar" ) . unwrap ( ) ;
414
+ assert_eq ! ( a, b) ;
415
+ assert ! ( a. prefix. as_ref( ) . ends_with( "bar" ) ) ;
416
+
417
+ let t = t. join ( "." ) . join ( "foo" ) . join ( ".." ) . join ( "baz" ) ;
418
+ let a = ListingTableUrl :: parse ( t. to_str ( ) . unwrap ( ) ) . unwrap ( ) ;
419
+ let b = ListingTableUrl :: parse ( "../bar/./foo/../baz" ) . unwrap ( ) ;
420
+ assert_eq ! ( a, b) ;
421
+ assert ! ( a. prefix. as_ref( ) . ends_with( "bar/baz" ) ) ;
352
422
}
353
423
354
424
#[ test]
0 commit comments