Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parquet reading from zip or s3 #2133

Merged
merged 3 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion visidata/loaders/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def iterload(self):
pq = vd.importExternal("pyarrow.parquet", "pyarrow")
from visidata.loaders.arrow import arrow_to_vdtype

self.tbl = pq.read_table(str(self.source))
with self.source.open('rb') as f:
self.tbl = pq.read_table(f)
self.columns = []
for colname, col in zip(self.tbl.column_names, self.tbl.columns):
c = ParquetColumn(colname, type=arrow_to_vdtype(col.type), source=col)
Expand Down
13 changes: 5 additions & 8 deletions visidata/loaders/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,10 @@ def fs(self):
def fs(self, val):
self._fs = val

def open(self, *args, **kwargs):
def open(self, mode='r', **kwargs):
"""Open the current S3 path, decompressing along the way if needed."""

# Default to text mode unless we have a compressed file
mode = "rb" if self.compression else "r"

fp = self.fs.open(self.given, mode=mode, version_id=self.version_id)
fp = self.fs.open(self.given, mode="rb" if self.compression else mode, version_id=self.version_id)

# Workaround for https://github.com/ajkerrigan/visidata-plugins/issues/12
if hasattr(fp, "cache") and fp.cache.size != fp.size:
Expand All @@ -79,17 +76,17 @@ def open(self, *args, **kwargs):
if self.compression == "gz":
import gzip

return gzip.open(fp, *args, **kwargs)
return gzip.open(fp, mode, **kwargs)

if self.compression == "bz2":
import bz2

return bz2.open(fp, *args, **kwargs)
return bz2.open(fp, mode, **kwargs)

if self.compression == "xz":
import lzma

return lzma.open(fp, *args, **kwargs)
return lzma.open(fp, mode, **kwargs)

return fp

Expand Down
7 changes: 4 additions & 3 deletions visidata/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,10 @@ def open(self, mode='rt', encoding=None, encoding_errors=None, newline=None):
return self.rfile.reopen()

if self.fp:
self.fptext = codecs.iterdecode(self.fp,
encoding=encoding or vd.options.encoding,
errors=encoding_errors or vd.options.encoding_errors)
if 'b' not in mode:
self.fptext = codecs.iterdecode(self.fp,
encoding=encoding or vd.options.encoding,
errors=encoding_errors or vd.options.encoding_errors)

if self.fptext:
self.rfile = RepeatFile(self.fptext)
Expand Down
Loading