Skip to content

Commit 53d5df2

Browse files
authored
Don't canonicalize ListingTableUrl (#8014)
1 parent 2156dde commit 53d5df2

File tree

2 files changed

+91
-29
lines changed
  • datafusion/core/src/datasource/listing
  • datafusion-cli/src

2 files changed

+91
-29
lines changed

datafusion-cli/src/exec.rs

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -388,15 +388,7 @@ mod tests {
388388
// Ensure that local files are also registered
389389
let sql =
390390
format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'");
391-
let err = create_external_table_test(location, &sql)
392-
.await
393-
.unwrap_err();
394-
395-
if let DataFusionError::IoError(e) = err {
396-
assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
397-
} else {
398-
return Err(err);
399-
}
391+
create_external_table_test(location, &sql).await.unwrap();
400392

401393
Ok(())
402394
}

datafusion/core/src/datasource/listing/url.rs

Lines changed: 90 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,17 @@ pub struct ListingTableUrl {
4545
impl ListingTableUrl {
4646
/// Parse a provided string as a `ListingTableUrl`
4747
///
48+
/// A URL can either refer to a single object, or a collection of objects with a
49+
/// common prefix, with the presence of a trailing `/` indicating a collection.
50+
///
51+
/// For example, `file:///foo.txt` refers to the file at `/foo.txt`, whereas
52+
/// `file:///foo/` refers to all the files under the directory `/foo` and its
53+
/// subdirectories.
54+
///
55+
/// Similarly `s3://BUCKET/blob.csv` refers to `blob.csv` in the S3 bucket `BUCKET`,
56+
/// wherease `s3://BUCKET/foo/` refers to all objects with the prefix `foo/` in the
57+
/// S3 bucket `BUCKET`
58+
///
4859
/// # URL Encoding
4960
///
5061
/// URL paths are expected to be URL-encoded. That is, the URL for a file named `bar%2Efoo`
@@ -58,29 +69,29 @@ impl ListingTableUrl {
5869
/// # Paths without a Scheme
5970
///
6071
/// If no scheme is provided, or the string is an absolute filesystem path
61-
/// as determined [`std::path::Path::is_absolute`], the string will be
72+
/// as determined by [`std::path::Path::is_absolute`], the string will be
6273
/// interpreted as a path on the local filesystem using the operating
6374
/// system's standard path delimiter, i.e. `\` on Windows, `/` on Unix.
6475
///
6576
/// If the path contains any of `'?', '*', '['`, it will be considered
6677
/// a glob expression and resolved as described in the section below.
6778
///
68-
/// Otherwise, the path will be resolved to an absolute path, returning
69-
/// an error if it does not exist, and converted to a [file URI]
79+
/// Otherwise, the path will be resolved to an absolute path based on the current
80+
/// working directory, and converted to a [file URI].
7081
///
71-
/// If you wish to specify a path that does not exist on the local
72-
/// machine you must provide it as a fully-qualified [file URI]
73-
/// e.g. `file:///myfile.txt`
82+
/// If the path already exists in the local filesystem this will be used to determine if this
83+
/// [`ListingTableUrl`] refers to a collection or a single object, otherwise the presence
84+
/// of a trailing path delimiter will be used to indicate a directory. For the avoidance
85+
/// of ambiguity it is recommended users always include trailing `/` when intending to
86+
/// refer to a directory.
7487
///
7588
/// ## Glob File Paths
7689
///
7790
/// If no scheme is provided, and the path contains a glob expression, it will
7891
/// be resolved as follows.
7992
///
8093
/// The string up to the first path segment containing a glob expression will be extracted,
81-
/// and resolved in the same manner as a normal scheme-less path. That is, resolved to
82-
/// an absolute path on the local filesystem, returning an error if it does not exist,
83-
/// and converted to a [file URI]
94+
/// and resolved in the same manner as a normal scheme-less path above.
8495
///
8596
/// The remaining string will be interpreted as a [`glob::Pattern`] and used as a
8697
/// filter when listing files from object storage
@@ -130,7 +141,7 @@ impl ListingTableUrl {
130141

131142
/// Creates a new [`ListingTableUrl`] interpreting `s` as a filesystem path
132143
fn parse_path(s: &str) -> Result<Self> {
133-
let (prefix, glob) = match split_glob_expression(s) {
144+
let (path, glob) = match split_glob_expression(s) {
134145
Some((prefix, glob)) => {
135146
let glob = Pattern::new(glob)
136147
.map_err(|e| DataFusionError::External(Box::new(e)))?;
@@ -139,15 +150,12 @@ impl ListingTableUrl {
139150
None => (s, None),
140151
};
141152

142-
let path = std::path::Path::new(prefix).canonicalize()?;
143-
let url = if path.is_dir() {
144-
Url::from_directory_path(path)
145-
} else {
146-
Url::from_file_path(path)
147-
}
148-
.map_err(|_| DataFusionError::Internal(format!("Can not open path: {s}")))?;
149-
// TODO: Currently we do not have an IO-related error variant that accepts ()
150-
// or a string. Once we have such a variant, change the error type above.
153+
let url = url_from_filesystem_path(path).ok_or_else(|| {
154+
DataFusionError::External(
155+
format!("Failed to convert path to URL: {path}").into(),
156+
)
157+
})?;
158+
151159
Self::try_new(url, glob)
152160
}
153161

@@ -162,7 +170,10 @@ impl ListingTableUrl {
162170
self.url.scheme()
163171
}
164172

165-
/// Return the prefix from which to list files
173+
/// Return the URL path not excluding any glob expression
174+
///
175+
/// If [`Self::is_collection`], this is the listing prefix
176+
/// Otherwise, this is the path to the object
166177
pub fn prefix(&self) -> &Path {
167178
&self.prefix
168179
}
@@ -249,6 +260,34 @@ impl ListingTableUrl {
249260
}
250261
}
251262

263+
/// Creates a file URL from a potentially relative filesystem path
264+
fn url_from_filesystem_path(s: &str) -> Option<Url> {
265+
let path = std::path::Path::new(s);
266+
let is_dir = match path.exists() {
267+
true => path.is_dir(),
268+
// Fallback to inferring from trailing separator
269+
false => std::path::is_separator(s.chars().last()?),
270+
};
271+
272+
let from_absolute_path = |p| {
273+
let first = match is_dir {
274+
true => Url::from_directory_path(p).ok(),
275+
false => Url::from_file_path(p).ok(),
276+
}?;
277+
278+
// By default from_*_path preserve relative path segments
279+
// We therefore parse the URL again to resolve these
280+
Url::parse(first.as_str()).ok()
281+
};
282+
283+
if path.is_absolute() {
284+
return from_absolute_path(path);
285+
}
286+
287+
let absolute = std::env::current_dir().ok()?.join(path);
288+
from_absolute_path(&absolute)
289+
}
290+
252291
impl AsRef<str> for ListingTableUrl {
253292
fn as_ref(&self) -> &str {
254293
self.url.as_ref()
@@ -349,6 +388,37 @@ mod tests {
349388

350389
let url = ListingTableUrl::parse(path.to_str().unwrap()).unwrap();
351390
assert!(url.prefix.as_ref().ends_with("bar%2Ffoo"), "{}", url.prefix);
391+
392+
let url = ListingTableUrl::parse("file:///foo/../a%252Fb.txt").unwrap();
393+
assert_eq!(url.prefix.as_ref(), "a%2Fb.txt");
394+
395+
let url =
396+
ListingTableUrl::parse("file:///foo/./bar/../../baz/./test.txt").unwrap();
397+
assert_eq!(url.prefix.as_ref(), "baz/test.txt");
398+
399+
let workdir = std::env::current_dir().unwrap();
400+
let t = workdir.join("non-existent");
401+
let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
402+
let b = ListingTableUrl::parse("non-existent").unwrap();
403+
assert_eq!(a, b);
404+
assert!(a.prefix.as_ref().ends_with("non-existent"));
405+
406+
let t = workdir.parent().unwrap();
407+
let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
408+
let b = ListingTableUrl::parse("..").unwrap();
409+
assert_eq!(a, b);
410+
411+
let t = t.join("bar");
412+
let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
413+
let b = ListingTableUrl::parse("../bar").unwrap();
414+
assert_eq!(a, b);
415+
assert!(a.prefix.as_ref().ends_with("bar"));
416+
417+
let t = t.join(".").join("foo").join("..").join("baz");
418+
let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
419+
let b = ListingTableUrl::parse("../bar/./foo/../baz").unwrap();
420+
assert_eq!(a, b);
421+
assert!(a.prefix.as_ref().ends_with("bar/baz"));
352422
}
353423

354424
#[test]

0 commit comments

Comments
 (0)