diff --git a/llm/tools/preprocess/README.md b/llm/tools/preprocess/README.md index a44cd8884aab..e51c25969805 100644 --- a/llm/tools/preprocess/README.md +++ b/llm/tools/preprocess/README.md @@ -126,12 +126,9 @@ head -1 baike_sample.jsonl ``` optional arguments: -h, --help show this help message and exit - --model_name MODEL_NAME + --model_name_or_path MODEL_NAME_OR_PATH What model to use. 必须设置,如:idea-ccnl/ziya-llama-13b-v1, 可以参考已有的模型名称 https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm - --tokenizer_name {LlamaTokenizer} - What type of tokenizer to use. - 模型对应的tokenizer, Llama模型需使用LlamaTokenizer data input/output: --input_path INPUT_PATH Path to input JSON files. @@ -183,8 +180,7 @@ common config: * 针对 llama 模型 ```shell python -u create_pretraining_data.py \ - --model_name "idea-ccnl/ziya-llama-13b-v1" \ - --tokenizer_name "LlamaTokenizer" \ + --model_name_or_path "idea-ccnl/ziya-llama-13b-v1" \ --input_path "baike_sample.jsonl" \ --output_prefix "baike_sample" \ --data_format "JSON" \ @@ -199,8 +195,7 @@ python -u create_pretraining_data.py \ * 针对 ernie 模型 ```shell python -u create_pretraining_data.py \ - --model_name "ernie-3.0-base-zh" \ - --tokenizer_name "ErnieTokenizer" \ + --model_name_or_path "ernie-3.0-base-zh" \ --input_path "baike_sample.jsonl" \ --output_prefix "baike_sample" \ --data_format "JSON" \ diff --git a/llm/tools/preprocess/create_pretraining_data.py b/llm/tools/preprocess/create_pretraining_data.py index 31daa796753d..4d795f97f5f2 100644 --- a/llm/tools/preprocess/create_pretraining_data.py +++ b/llm/tools/preprocess/create_pretraining_data.py @@ -23,8 +23,8 @@ import numpy as np from tqdm import tqdm -import paddlenlp.transformers as tfs from paddlenlp.data import indexed_dataset +from paddlenlp.transformers import AutoTokenizer from paddlenlp.utils.log import logger try: @@ -44,23 +44,7 @@ def print_datetime(string): def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True, help="What model to use.") - parser.add_argument( - "--tokenizer_name", - type=str, - required=True, - choices=[ - "ErnieTokenizer", - "BertTokenizer", - "GPTTokenizer", - "GPTChineseTokenizer", - "LlamaTokenizer", - "ElectraTokenizer", - "T5Tokenizer", - "Qwen2Tokenizer" - ], - help="What type of tokenizer to use.", - ) + parser.add_argument("--model_name_or_path", type=str, required=True, help="What model to use.") group = parser.add_argument_group(title="data input/output") group.add_argument("--input_path", type=str, required=True, help="Path to input JSON files.") group.add_argument("--output_prefix", type=str, required=True, help="Output prefix to store output file.") @@ -227,7 +211,7 @@ def __init__(self, args): self.args = args def initializer(self): - Converter.tokenizer = getattr(tfs, self.args.tokenizer_name).from_pretrained(self.args.model_name) + Converter.tokenizer = AutoTokenizer.from_pretrained(self.args.model_name_or_path) if self.args.cn_whole_word_segment: # Extend chinese char vocab for ErnieTokinzer Converter.tokenizer.extend_chinese_char() @@ -333,7 +317,7 @@ def main(): convert = Converter(args) # Try tokenizer is availiable - sample_tokenizer = getattr(tfs, args.tokenizer_name).from_pretrained(args.model_name) + sample_tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) if sample_tokenizer.vocab_size < 2**16 - 1: save_dtype = np.uint16 else: