diff --git a/tests/artifacts/testdata/__init__.py b/tests/artifacts/testdata/__init__.py
index ac9d487bd..8b6a7ea43 100644
--- a/tests/artifacts/testdata/__init__.py
+++ b/tests/artifacts/testdata/__init__.py
@@ -19,20 +19,30 @@
 ### Constants used for data
 DATA_DIR = os.path.join(os.path.dirname(__file__))
+PARQUET_DATA_DIR = os.path.join(os.path.dirname(__file__), "parquet")
 TWITTER_COMPLAINTS_DATA_JSON = os.path.join(DATA_DIR, "twitter_complaints_small.json")
 TWITTER_COMPLAINTS_DATA_JSONL = os.path.join(DATA_DIR, "twitter_complaints_small.jsonl")
+TWITTER_COMPLAINTS_DATA_PARQUET = os.path.join(
+    PARQUET_DATA_DIR, "twitter_complaints_small.parquet"
+)
 TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON = os.path.join(
     DATA_DIR, "twitter_complaints_input_output.json"
 )
 TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL = os.path.join(
     DATA_DIR, "twitter_complaints_input_output.jsonl"
 )
+TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET = os.path.join(
+    PARQUET_DATA_DIR, "twitter_complaints_input_output.parquet"
+)
 TWITTER_COMPLAINTS_TOKENIZED_JSON = os.path.join(
     DATA_DIR, "twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json"
 )
 TWITTER_COMPLAINTS_TOKENIZED_JSONL = os.path.join(
     DATA_DIR, "twitter_complaints_tokenized_with_maykeye_tinyllama_v0.jsonl"
 )
+TWITTER_COMPLAINTS_TOKENIZED_PARQUET = os.path.join(
+    PARQUET_DATA_DIR, "twitter_complaints_tokenized_with_maykeye_tinyllama_v0.parquet"
+)
 EMPTY_DATA = os.path.join(DATA_DIR, "empty_data.json")
 MALFORMATTED_DATA = os.path.join(DATA_DIR, "malformatted_data.json")
 MODEL_NAME = "Maykeye/TinyLLama-v0"
diff --git a/tests/artifacts/testdata/parquet/twitter_complaints_input_output.parquet b/tests/artifacts/testdata/parquet/twitter_complaints_input_output.parquet
new file mode 100644
index 000000000..e684a6035
Binary files /dev/null and b/tests/artifacts/testdata/parquet/twitter_complaints_input_output.parquet differ
diff --git a/tests/artifacts/testdata/parquet/twitter_complaints_small.parquet b/tests/artifacts/testdata/parquet/twitter_complaints_small.parquet
new file mode 100644
index 000000000..278dbb638
Binary files /dev/null and b/tests/artifacts/testdata/parquet/twitter_complaints_small.parquet differ
diff --git a/tests/artifacts/testdata/parquet/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.parquet b/tests/artifacts/testdata/parquet/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.parquet
new file mode 100644
index 000000000..f1cba75bf
Binary files /dev/null and b/tests/artifacts/testdata/parquet/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.parquet differ
diff --git a/tests/data/test_data_preprocessing_utils.py b/tests/data/test_data_preprocessing_utils.py
index 02308b2f5..a4ec5dbf7 100644
--- a/tests/data/test_data_preprocessing_utils.py
+++ b/tests/data/test_data_preprocessing_utils.py
@@ -34,10 +34,13 @@
     MODEL_NAME,
     TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
     TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
+    TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
     TWITTER_COMPLAINTS_DATA_JSON,
     TWITTER_COMPLAINTS_DATA_JSONL,
+    TWITTER_COMPLAINTS_DATA_PARQUET,
     TWITTER_COMPLAINTS_TOKENIZED_JSON,
     TWITTER_COMPLAINTS_TOKENIZED_JSONL,
+    TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
 )
 
 # Local
@@ -59,6 +62,10 @@
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
             set(["ID", "Label", "input", "output"]),
         ),
+        (
+            TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+            set(["ID", "Label", "input", "output"]),
+        ),
         (
             TWITTER_COMPLAINTS_TOKENIZED_JSONL,
             set(
@@ -73,10 +80,28 @@
                 ]
             ),
         ),
+        (
+            TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+            set(
+                [
+                    "Tweet text",
+                    "ID",
+                    "Label",
"text_label", + "output", + "input_ids", + "labels", + ] + ), + ), ( TWITTER_COMPLAINTS_DATA_JSONL, set(["Tweet text", "ID", "Label", "text_label", "output"]), ), + ( + TWITTER_COMPLAINTS_DATA_PARQUET, + set(["Tweet text", "ID", "Label", "text_label", "output"]), + ), ], ) def test_load_dataset_with_datafile(datafile, column_names): @@ -98,6 +123,11 @@ def test_load_dataset_with_datafile(datafile, column_names): set(["ID", "Label", "input", "output"]), "text_dataset_input_output_masking", ), + ( + TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET, + set(["ID", "Label", "input", "output"]), + "text_dataset_input_output_masking", + ), ( TWITTER_COMPLAINTS_TOKENIZED_JSONL, set( @@ -113,11 +143,31 @@ def test_load_dataset_with_datafile(datafile, column_names): ), "pretokenized_dataset", ), + ( + TWITTER_COMPLAINTS_TOKENIZED_PARQUET, + set( + [ + "Tweet text", + "ID", + "Label", + "text_label", + "output", + "input_ids", + "labels", + ] + ), + "pretokenized_dataset", + ), ( TWITTER_COMPLAINTS_DATA_JSONL, set(["Tweet text", "ID", "Label", "text_label", "output"]), "apply_custom_data_template", ), + ( + TWITTER_COMPLAINTS_DATA_PARQUET, + set(["Tweet text", "ID", "Label", "text_label", "output"]), + "apply_custom_data_template", + ), ], ) def test_load_dataset_with_datasetconfig(datafile, column_names, datasetconfigname): @@ -139,8 +189,14 @@ def test_load_dataset_with_datasetconfig(datafile, column_names, datasetconfigna TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL, "text_dataset_input_output_masking", ), + ( + TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET, + "text_dataset_input_output_masking", + ), (TWITTER_COMPLAINTS_TOKENIZED_JSONL, "pretokenized_dataset"), + (TWITTER_COMPLAINTS_TOKENIZED_PARQUET, "pretokenized_dataset"), (TWITTER_COMPLAINTS_DATA_JSONL, "apply_custom_data_template"), + (TWITTER_COMPLAINTS_DATA_PARQUET, "apply_custom_data_template"), ], ) def test_load_dataset_with_dataconfig_and_datafile(datafile, datasetconfigname): @@ -339,8 +395,10 @@ def test_process_data_args_throws_error_where_needed(data_args, packing): [ (APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSON), (APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSONL), + (APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_PARQUET), (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSON), (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSONL), + (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_PARQUET), ( TOKENIZE_AND_APPLY_INPUT_MASKING_YAML, TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON, @@ -349,6 +407,10 @@ def test_process_data_args_throws_error_where_needed(data_args, packing): TOKENIZE_AND_APPLY_INPUT_MASKING_YAML, TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL, ), + ( + TOKENIZE_AND_APPLY_INPUT_MASKING_YAML, + TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET, + ), ], ) def test_process_dataconfig_file(data_config_path, data_path): @@ -414,6 +476,15 @@ def test_process_dataconfig_file(data_config_path, data_path): response_template="\n### Label:", ) ), + # single sequence PARQUET and response template + ( + configs.DataArguments( + training_data_path=TWITTER_COMPLAINTS_DATA_PARQUET, + validation_data_path=TWITTER_COMPLAINTS_DATA_PARQUET, + dataset_text_field="output", + response_template="\n### Label:", + ) + ), # data formatter template with input/output JSON ( configs.DataArguments( @@ -432,6 +503,15 @@ def test_process_dataconfig_file(data_config_path, data_path): response_template="\n### Label:", ) ), + # data formatter template with input/output PARQUET + ( + configs.DataArguments( + 
+                training_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+                validation_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+                data_formatter_template="### Text:{{input}} \n\n### Label: {{output}}",
+                response_template="\n### Label:",
+            )
+        ),
         # input/output JSON with masking on input
         (
             configs.DataArguments(
@@ -446,6 +526,13 @@ def test_process_dataconfig_file(data_config_path, data_path):
                 validation_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
             )
         ),
+        # input/output PARQUET with masking on input
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+                validation_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+            )
+        ),
     ],
 )
 def test_process_dataargs(data_args):
@@ -487,6 +574,13 @@ def test_process_dataargs(data_args):
                 validation_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
             )
         ),
+        # PARQUET pretokenized train and validation datasets
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+                validation_data_path=TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+            )
+        ),
         # JSON pretokenized train datasets
         (
             configs.DataArguments(
@@ -499,6 +593,12 @@ def test_process_dataargs(data_args):
                 training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
             )
         ),
+        # PARQUET pretokenized train datasets
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+            )
+        ),
     ],
 )
 def test_process_dataargs_pretokenized(data_args):
diff --git a/tuning/data/data_processors.py b/tuning/data/data_processors.py
index f6f3b0ec9..c3f38e3f1 100644
--- a/tuning/data/data_processors.py
+++ b/tuning/data/data_processors.py
@@ -105,7 +105,7 @@ def _process_dataset_configs(
             # In future the streaming etc go as kwargs of this function
             raw_dataset = self.load_dataset(d, splitName)
 
-            logging.info("Loaded raw dataset : {raw_datasets}")
+            logging.info("Loaded raw dataset : %s", str(raw_dataset))
 
             raw_datasets = DatasetDict()
diff --git a/tuning/utils/utils.py b/tuning/utils/utils.py
index 9def53df9..585011ae9 100644
--- a/tuning/utils/utils.py
+++ b/tuning/utils/utils.py
@@ -31,6 +31,8 @@ def get_loader_for_filepath(file_path: str) -> str:
         return "text"
     if ext in (".json", ".jsonl"):
         return "json"
+    if ext == ".parquet":
+        return "parquet"
     return ext
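Reviewer note: the sketch below is hypothetical and not part of the patch. It shows one way the new Parquet fixtures could be regenerated from the existing JSONL files, and how the loader name returned by get_loader_for_filepath feeds the Hugging Face datasets API. The regeneration step is an assumption (the patch checks in the binary files directly), and pandas/pyarrow are assumed to be installed.

# Hypothetical sketch, not part of the patch.
import pandas as pd
from datasets import load_dataset

from tuning.utils.utils import get_loader_for_filepath

SRC = "tests/artifacts/testdata/twitter_complaints_small.jsonl"
DST = "tests/artifacts/testdata/parquet/twitter_complaints_small.parquet"

# Assumed fixture-regeneration step: pandas reads line-delimited JSON
# and writes Parquet via pyarrow.
pd.read_json(SRC, lines=True).to_parquet(DST)

# The helper maps the file extension to a datasets builder name
# ("parquet"), which load_dataset accepts alongside data_files.
loader = get_loader_for_filepath(DST)
assert loader == "parquet"
dataset = load_dataset(loader, data_files={"train": DST})["train"]
print(dataset.column_names)  # e.g. ['Tweet text', 'ID', 'Label', ...]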