diff --git a/.pylintrc b/.pylintrc index 222bdf6cb..f54599d18 100644 --- a/.pylintrc +++ b/.pylintrc @@ -333,7 +333,7 @@ indent-string=' ' max-line-length=100 # Maximum number of lines in a module. -max-module-lines=1200 +max-module-lines=1400 # Allow the body of a class to be on the same line as the declaration if body # contains single statement. diff --git a/tests/artifacts/testdata/jsonl/twitter_complaints_small_2.jsonl b/tests/artifacts/testdata/jsonl/twitter_complaints_small_2.jsonl new file mode 100644 index 000000000..0837217c3 --- /dev/null +++ b/tests/artifacts/testdata/jsonl/twitter_complaints_small_2.jsonl @@ -0,0 +1,10 @@ +{"Tweet text":"@NortonSupport Thanks much.","ID":10,"Label":2,"text_label":"no complaint","output":"### Text: @NortonSupport Thanks much.\n\n### Label: no complaint"} +{"Tweet text":"@VerizonSupport all of a sudden I can't connect to my primary wireless network but guest one works","ID":11,"Label":2,"text_label":"no complaint","output":"### Text: @VerizonSupport all of a sudden I can't connect to my primary wireless network but guest one works\n\n### Label: no complaint"} +{"Tweet text":"Aaaahhhhh!!!! My @Razer @PlayOverwatch d.va meka headset came in!!! I didn't even know it had shipped!!! So excited\u2026 https:\/\/t.co\/4gXy9xED8d","ID":12,"Label":2,"text_label":"no complaint","output":"### Text: Aaaahhhhh!!!! My @Razer @PlayOverwatch d.va meka headset came in!!! I didn't even know it had shipped!!! So excited\u2026 https:\/\/t.co\/4gXy9xED8d\n\n### Label: no complaint"} +{"Tweet text":"@Lin_Manuel @jmessinaphoto @VAMNit Omg a little squish!!!!! Enjoy and congrats!!!! I miss mine being so young! \ufffd\ufffd\ufffd\ufffd\ufffd\ufffd","ID":13,"Label":2,"text_label":"no complaint","output":"### Text: @Lin_Manuel @jmessinaphoto @VAMNit Omg a little squish!!!!! Enjoy and congrats!!!! I miss mine being so young! 
\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\n\n### Label: no complaint"} +{"Tweet text":"@IanJamesPoulter What's your secret to poaching eggs? Mine NEVER look that good.","ID":14,"Label":2,"text_label":"no complaint","output":"### Text: @IanJamesPoulter What's your secret to poaching eggs? Mine NEVER look that good.\n\n### Label: no complaint"} +{"Tweet text":"@AWSSupport When will be able Kinesis Firehose compatible with Elasticsearch 6.0? Thank you!","ID":15,"Label":2,"text_label":"no complaint","output":"### Text: @AWSSupport When will be able Kinesis Firehose compatible with Elasticsearch 6.0? Thank you!\n\n### Label: no complaint"} +{"Tweet text":"@NCIS_CBS https:\/\/t.co\/eeVL9Eu3bE","ID":16,"Label":2,"text_label":"no complaint","output":"### Text: @NCIS_CBS https:\/\/t.co\/eeVL9Eu3bE\n\n### Label: no complaint"} +{"Tweet text":"@msetchell Via the settings? That\u2019s how I do it on master T\u2019s","ID":17,"Label":2,"text_label":"no complaint","output":"### Text: @msetchell Via the settings? That\u2019s how I do it on master T\u2019s\n\n### Label: no complaint"} +{"Tweet text":"Today at work there was a low flying duck heading toward a crowd of people, and I yelled \"watch out! and I'm very disappointed with myself.","ID":18,"Label":2,"text_label":"no complaint","output":"### Text: Today at work there was a low flying duck heading toward a crowd of people, and I yelled \"watch out! 
and I'm very disappointed with myself.\n\n### Label: no complaint"}
+{"Tweet text":"@NortonSupport @NortonOnline What the hell is a dm 5-10 days to get money back bank account now overdrawn thanks guys","ID":19,"Label":1,"text_label":"complaint","output":"### Text: @NortonSupport @NortonOnline What the hell is a dm 5-10 days to get money back bank account now overdrawn thanks guys\n\n### Label: complaint"}
diff --git a/tests/data/test_data_preprocessing_utils.py b/tests/data/test_data_preprocessing_utils.py
index 5559ac8ec..801f75bee 100644
--- a/tests/data/test_data_preprocessing_utils.py
+++ b/tests/data/test_data_preprocessing_utils.py
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 # Standard
+import glob
 import json
+import os
 import tempfile
 
 # Third Party
@@ -613,6 +615,61 @@ def test_process_dataconfig_multiple_files(data_config_path, data_path_list):
     assert formatted_dataset_field in set(train_set.column_names)
 
 
+@pytest.mark.parametrize(
+    "data_config_path, data_path",
+    [
+        (
+            DATA_CONFIG_APPLY_CUSTOM_TEMPLATE_YAML,
+            os.path.join(
+                os.path.dirname(TWITTER_COMPLAINTS_DATA_JSONL), "*small*.jsonl"
+            ),
+        ),
+    ],
+)
+def test_process_dataconfig_multiple_files_with_globbing(data_config_path, data_path):
+    """Ensure dataset files matching a glob pattern are formatted and validated.
+
+    The first dataset's data path in the config is replaced with the glob pattern
+    passed as data_path, and the processed train set must hold one row per line
+    across every JSONL file the pattern matches.
+    """
+    with open(data_config_path, "r", encoding="utf-8") as f:
+        yaml_content = yaml.safe_load(f)
+
+    # data_path is already a glob pattern, so it is used as-is
+    yaml_content["datasets"][0]["data_paths"][0] = data_path
+
+    # Modify dataset_text_field and template according to dataset
+    formatted_dataset_field = "formatted_data_field"
+    template = "### Input: {{Tweet text}} \n\n ### Response: {{text_label}}"
+    yaml_content["datasets"][0]["data_handlers"][0]["arguments"]["fn_kwargs"] = {
+        "dataset_text_field": formatted_dataset_field,
+        "template": template,
+    }
+
+    with tempfile.NamedTemporaryFile(
+        "w", delete=False, suffix=".yaml"
+    ) as temp_yaml_file:
+        yaml.dump(yaml_content, temp_yaml_file)
+        temp_yaml_file_path = temp_yaml_file.name
+        data_args = configs.DataArguments(data_config_path=temp_yaml_file_path)
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    (train_set, _, _) = _process_dataconfig_file(data_args, tokenizer)
+    assert isinstance(train_set, Dataset)
+    assert formatted_dataset_field in set(train_set.column_names)
+
+    matched_files = glob.glob(data_path)
+    assert len(matched_files) > 1, "glob pattern should match multiple files"
+
+    # Count lines in each JSONL file, closing every handle after use
+    data_len = 0
+    for file in matched_files:
+        with open(file, "r", encoding="utf-8") as jsonl_file:
+            data_len += sum(1 for _ in jsonl_file)
+    assert len(train_set) == data_len
+
+
 @pytest.mark.parametrize(
     "datafiles, datasetconfigname",
     [
diff --git a/tuning/data/data_config.py b/tuning/data/data_config.py
index 4da83d720..ed8af3f8c 100644
--- a/tuning/data/data_config.py
+++ b/tuning/data/data_config.py
@@ -79,7 +79,6 @@ def _validate_dataset_config(dataset_config) -> DataSetConfig:
         c.data_paths = []
         for p in data_paths:
             assert isinstance(p, str), f"path {p} should be of the type string"
-            assert os.path.exists(p), f"data_paths {p} does not exist"
             if not os.path.isabs(p):
                 _p = os.path.abspath(p)
                 logging.warning(
diff --git a/tuning/utils/utils.py b/tuning/utils/utils.py
index 6eef6b2cf..b6c6a38b0 100644
--- a/tuning/utils/utils.py
+++ b/tuning/utils/utils.py
@@ -31,9 +31,9 @@ def get_loader_for_filepath(file_path: str) -> str:
         return "text"
     if ext in (".json", ".jsonl"):
         return "json"
-    if ext in (".arrow"):
+    if ext in (".arrow",):
         return "arrow"
-    if ext in (".parquet"):
+    if ext in (".parquet",):
         return "parquet"
     return ext
 