From 2df0059fa5d697091f1b91271c07278677fd2200 Mon Sep 17 00:00:00 2001
From: Angela
Date: Thu, 25 Jul 2024 16:45:10 -0700
Subject: [PATCH 1/2] support for uploading .jsonl files

---
 .../dataset_source/filepath_dataset_source.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py b/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py
index a2174e69..3a837e71 100644
--- a/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py
+++ b/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py
@@ -1,11 +1,12 @@
 import mimetypes
+import os
 import pathlib
 from typing import Any, Optional
-import os
 
-from .dataset_source import DatasetSource
 from cleanlab_studio.errors import InvalidDatasetError, InvalidFilepathError
 
+from .dataset_source import DatasetSource
+
 
 class FilepathDatasetSource(DatasetSource):
     def __init__(
@@ -21,7 +22,12 @@ def __init__(
 
         self.dataset_name = dataset_name if dataset_name is not None else filepath.name
         self.file_size = filepath.stat().st_size
-        maybe_file_type = mimetypes.guess_type(filepath)[0]
+
+        if filepath.suffix.lower() == ".jsonl":
+            maybe_file_type = "application/json"
+        else:
+            maybe_file_type = mimetypes.guess_type(filepath)[0]
+
         if maybe_file_type is None:
             raise InvalidDatasetError(
                 f"Could not identify type of file at {filepath}. Make sure file name has valid extension"

From 8e01c25e0bfdf8f0cb33f57502c54a49264a25fd Mon Sep 17 00:00:00 2001
From: Angela
Date: Thu, 25 Jul 2024 17:26:18 -0700
Subject: [PATCH 2/2] mypy

---
 .../internal/dataset_source/filepath_dataset_source.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py b/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py
index 3a837e71..5bb7f32b 100644
--- a/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py
+++ b/cleanlab_studio/internal/dataset_source/filepath_dataset_source.py
@@ -23,6 +23,7 @@ def __init__(
         self.dataset_name = dataset_name if dataset_name is not None else filepath.name
         self.file_size = filepath.stat().st_size
 
+        maybe_file_type: Optional[str]
         if filepath.suffix.lower() == ".jsonl":
             maybe_file_type = "application/json"
         else: