From 49c91ad1e317d7684980e60358e1b73bdcfa15da Mon Sep 17 00:00:00 2001 From: Daniel Takacs Date: Wed, 22 Nov 2023 21:55:09 +0100 Subject: [PATCH 1/3] Handle binary open mode in S3Path. --- visidata/loaders/s3.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/visidata/loaders/s3.py b/visidata/loaders/s3.py index 7f181cc40..3507e6ec0 100644 --- a/visidata/loaders/s3.py +++ b/visidata/loaders/s3.py @@ -61,13 +61,10 @@ def fs(self): def fs(self, val): self._fs = val - def open(self, *args, **kwargs): + def open(self, mode='r', **kwargs): """Open the current S3 path, decompressing along the way if needed.""" - # Default to text mode unless we have a compressed file - mode = "rb" if self.compression else "r" - - fp = self.fs.open(self.given, mode=mode, version_id=self.version_id) + fp = self.fs.open(self.given, mode="rb" if self.compression else mode, version_id=self.version_id) # Workaround for https://github.com/ajkerrigan/visidata-plugins/issues/12 if hasattr(fp, "cache") and fp.cache.size != fp.size: @@ -79,17 +76,17 @@ def open(self, *args, **kwargs): if self.compression == "gz": import gzip - return gzip.open(fp, *args, **kwargs) + return gzip.open(fp, mode, **kwargs) if self.compression == "bz2": import bz2 - return bz2.open(fp, *args, **kwargs) + return bz2.open(fp, mode, **kwargs) if self.compression == "xz": import lzma - return lzma.open(fp, *args, **kwargs) + return lzma.open(fp, mode, **kwargs) return fp From d561d24100b9e0401fab76ac4a8bf6c471c95841 Mon Sep 17 00:00:00 2001 From: Daniel Takacs Date: Wed, 22 Nov 2023 21:57:14 +0100 Subject: [PATCH 2/3] Don't create RepeatFile for virtual paths in binary mode. --- visidata/path.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/visidata/path.py b/visidata/path.py index 74b18bb1c..66703d3bb 100644 --- a/visidata/path.py +++ b/visidata/path.py @@ -238,9 +238,10 @@ def open(self, mode='rt', encoding=None, encoding_errors=None, newline=None): return self.rfile.reopen() if self.fp: - self.fptext = codecs.iterdecode(self.fp, - encoding=encoding or vd.options.encoding, - errors=encoding_errors or vd.options.encoding_errors) + if 'b' not in mode: + self.fptext = codecs.iterdecode(self.fp, + encoding=encoding or vd.options.encoding, + errors=encoding_errors or vd.options.encoding_errors) if self.fptext: self.rfile = RepeatFile(self.fptext) From a48c9d2da7dc7cdad59d840d15dba32761db0541 Mon Sep 17 00:00:00 2001 From: Daniel Takacs Date: Wed, 22 Nov 2023 21:58:20 +0100 Subject: [PATCH 3/3] Use Path.open for parquet loading. --- visidata/loaders/parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/visidata/loaders/parquet.py b/visidata/loaders/parquet.py index 9ee624c90..b9fcbbfc5 100644 --- a/visidata/loaders/parquet.py +++ b/visidata/loaders/parquet.py @@ -18,7 +18,8 @@ def iterload(self): pq = vd.importExternal("pyarrow.parquet", "pyarrow") from visidata.loaders.arrow import arrow_to_vdtype - self.tbl = pq.read_table(str(self.source)) + with self.source.open('rb') as f: + self.tbl = pq.read_table(f) self.columns = [] for colname, col in zip(self.tbl.column_names, self.tbl.columns): c = ParquetColumn(colname, type=arrow_to_vdtype(col.type), source=col)