Skip to content

Commit

Permalink
Add an encoding argument to tabby loader
Browse files Browse the repository at this point in the history
When an encoding is explicitly specified, it will be used.

Otherwise, the default encoding used by Path.open will be tried first, and
charset_normalizer will be used to guess the encoding if that fails.
  • Loading branch information
mslw committed Nov 21, 2023
1 parent 8d4b6e1 commit 070937a
Showing 1 changed file with 26 additions and 15 deletions.
41 changes: 26 additions & 15 deletions datalad_tabby/io/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,19 @@ def __init__(
self._jsonld = jsonld
self._recursive = recursive

def __call__(self, src: Path, *, single: bool = True):
def __call__(self, src: Path, *, single: bool = True, encoding: str | None = None):
return (self._load_single if single else self._load_many)(
src=src,
trace=[],
encoding=encoding,
)

def _load_single(
self,
*,
src: Path,
trace: List,
encoding: str | None = None,
) -> Dict:
jfpath = self._get_corresponding_jsondata_fpath(src)
obj = json.load(jfpath.open()) if jfpath.exists() else {}
Expand All @@ -97,13 +99,16 @@ def _load_single(
trace=trace,
)

try:
tsv_obj = self._parse_tsv_single(src)
except UnicodeDecodeError:
# by default Path.open() uses locale.getencoding()
# that didn't work, try guessing
encoding = cs_from_path(src).best().encoding
if encoding is not None:
tsv_obj = self._parse_tsv_single(src, encoding=encoding)
else:
try:
tsv_obj = self._parse_tsv_single(src)
except UnicodeDecodeError:
# by default Path.open() uses locale.getencoding()
# that didn't work, try guessing
encoding = cs_from_path(src).best().encoding
tsv_obj = self._parse_tsv_single(src, encoding=encoding)

obj.update(tsv_obj)

Expand Down Expand Up @@ -140,6 +145,7 @@ def _load_many(
*,
src: Path,
trace: List,
encoding: str | None = None,
) -> List[Dict]:
obj_tmpl = {}
array = list()
Expand All @@ -160,17 +166,22 @@ def _load_many(
# the table field/column names have purposefully _nothing_
# to do with any possibly loaded JSON data

try:
tsv_array = self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None
)
except UnicodeDecodeError:
# by default Path.open() uses locale.getencoding()
# that didn't work, try guessing
encoding = cs_from_path(src).best().encoding
if encoding is not None:
tsv_array = self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
)
else:
try:
tsv_array = self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None
)
except UnicodeDecodeError:
# by default Path.open() uses locale.getencoding()
# that didn't work, try guessing
encoding = cs_from_path(src).best().encoding
tsv_array = self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
)

array.extend(tsv_array)

Expand Down

0 comments on commit 070937a

Please sign in to comment.