diff --git a/llm/predict/export_model.py b/llm/predict/export_model.py
index 774b69091c7a..e68edbf95d3b 100644
--- a/llm/predict/export_model.py
+++ b/llm/predict/export_model.py
@@ -58,7 +58,7 @@ def main():
         tensor_parallel_rank = hcg.get_model_parallel_rank()
 
     # set predictor type
-    predictor = create_predictor(predictor_args, model_args, tensor_parallel_degree, tensor_parallel_rank)
+    predictor = create_predictor(predictor_args, model_args)
     predictor.model.eval()
 
     predictor.model.to_static(
diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py
index 8e8770f138d8..0eba44559a92 100644
--- a/llm/predict/predictor.py
+++ b/llm/predict/predictor.py
@@ -41,7 +41,7 @@
     ChatGLMv2Tokenizer,
     Llama3Tokenizer,
     LlamaTokenizer,
-    PretrainedModel,
+    PretrainedConfig,
     PretrainedTokenizer,
 )
 from paddlenlp.trl import llm_utils
@@ -247,11 +247,9 @@ def predict(self, input_texts: str | list[str], return_tokens=False):
 
 
 class DygraphPredictor(BasePredictor):
-    def __init__(
-        self, config: PredictorArgument, model: PretrainedModel = None, tokenizer: PretrainedTokenizer = None
-    ):
+    def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
         super().__init__(config, tokenizer)
-        self.model = model
+        self.model = kwargs.get("model", None)
         if config.lora_path is not None:
             lora_config = LoRAConfig.from_pretrained(config.lora_path)
             dtype = lora_config.dtype
@@ -328,7 +326,7 @@ def stream_predict(self, inputs: dict[str, paddle.Tensor]):
 
 
 class StaticGraphPredictor(BasePredictor):
-    def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None):
+    def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
         super().__init__(config, tokenizer)
 
         inference_config = paddle.inference.Config(self.config.model_name_or_path, self.config.model_prefix)
@@ -625,14 +623,16 @@ def _preprocess(self, source):
         return inputs
 
 
-class StaticInferencePredictor(InferencePredictorMixin):
+class StaticGraphInferencePredictor(InferencePredictorMixin):
     def __init__(
         self,
         config: PredictorArgument,
-        cache_kvs_shape: list[list[int]],
         tokenizer: PretrainedTokenizer = None,
+        **kwargs,
     ):
-        self.cache_kvs_shape = cache_kvs_shape
+        self.cache_kvs_shape = kwargs.get("cache_kvs_shape", None)
+        if self.cache_kvs_shape is None:
+            raise ValueError("cache_kvs_shape should be provided for StaticGraphInferencePredictor")
         InferencePredictorMixin.__init__(self, config, tokenizer)
 
         self.predictor = self._create_predictor(config)
@@ -715,9 +715,12 @@ class DygraphInferencePredictor(InferencePredictorMixin):
     def __init__(
         self,
         config: PredictorArgument,
-        model: PretrainedModel = None,
         tokenizer: PretrainedTokenizer = None,
+        **kwargs,
     ):
+        model = kwargs.get("model", None)
+        if model is None:
+            raise ValueError("model should be provided for DygraphInferencePredictor")
         self.cache_kvs_shape = model.get_cache_kvs_shape(model.config, config.batch_size, config.total_max_length)
         InferencePredictorMixin.__init__(self, config, tokenizer)
         self.model = model
@@ -991,12 +994,10 @@ def _preprocess(self, input_text: list[str]):
 
 
 class DygraphBlockInferencePredictor(BlockInferencePredictorMixin):
-    def __init__(
-        self,
-        config: PredictorArgument,
-        model: PretrainedModel = None,
-        tokenizer: PretrainedTokenizer = None,
-    ):
+    def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
+        model = kwargs.get("model", None)
+        if model is None:
+            raise ValueError("model should be provided for DygraphBlockInferencePredictor")
         self.cache_kvs_shape = model.get_cache_kvs_shape(model.config, config.batch_size)
         BlockInferencePredictorMixin.__init__(self, config, tokenizer)
 
@@ -1088,14 +1089,16 @@ def predict(self, input_texts: list[str], return_tokens=False):
         return outputs
 
 
-class StaticBlockInferencePredictor(BlockInferencePredictorMixin):
+class StaticGraphBlockInferencePredictor(BlockInferencePredictorMixin):
     def __init__(
         self,
         config: PredictorArgument,
-        cache_kvs_shape: list[list[int]],
         tokenizer: PretrainedTokenizer = None,
+        **kwargs,
     ):
-        self.cache_kvs_shape = cache_kvs_shape
+        self.cache_kvs_shape = kwargs.get("cache_kvs_shape", None)
+        if self.cache_kvs_shape is None:
+            raise ValueError("cache_kvs_shape should be provided for StaticGraphBlockInferencePredictor")
         BlockInferencePredictorMixin.__init__(self, config, tokenizer)
 
         self._create_predictor(config)
@@ -1245,21 +1248,71 @@ def predict(self, input_texts: list[str], return_tokens=False):
         return outputs
 
 
-def get_ptq_multicards_num(directory):
-    count = 0
-    if os.path.exists(directory):
-        prefix = "act_scales_"
-        for filename in os.listdir(directory):
-            if filename.startswith(prefix):
-                count += 1
-    return count
+class AutoPredictor:
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.create_predictor(predictor_args, config, model_args)` method."
+        )
+
+    @classmethod
+    def create_predictor(
+        cls,
+        predictor_args: PredictorArgument,
+        config: PretrainedConfig,
+        model_args: ModelArgument,
+        tokenizer: PretrainedTokenizer = None,
+        **kwargs
+    ):
+        """
+        Create a predictor.
+
+        Args:
+            predictor_args (PredictorArgument): The predictor arguments.
+            config (PretrainedConfig): The model configuration.
+            model_args (ModelArgument): The model arguments.
+            tokenizer (PretrainedTokenizer): The tokenizer.
+            **kwargs: Additional keyword arguments.
+        Returns:
+            Predictor: The predictor.
+ """ + model = kwargs.pop("model", None) + cache_kvs_shape = None + + # static or dynamic + execute_mode = "Dygraph" if predictor_args.mode == "dynamic" else "StaticGraph" + + # infer/ no infer + if predictor_args.inference_model: + # block/no block + if predictor_args.block_attn: + attn_type = "Block" + else: + attn_type = "" + inference_mode = f"{attn_type}Inference" + + if predictor_args.mode == "static": + cache_kvs_shape = model.get_cache_kvs_shape( + config, predictor_args.batch_size, predictor_args.total_max_length + ) + else: + inference_mode = "" + + predictor_class_name = execute_mode + inference_mode + "Predictor" + + import_class = sys.modules[__name__] + + # import class + predictor_class = getattr(import_class, predictor_class_name) + + # instance + predictor = predictor_class(predictor_args, tokenizer=tokenizer, model=model, cache_kvs_shape=cache_kvs_shape) + return predictor def create_predictor( predictor_args: PredictorArgument, model_args: ModelArgument, - tensor_parallel_degree: int = 1, - tensor_parallel_rank: int = 0, ): tokenizer = AutoTokenizer.from_pretrained( predictor_args.model_name_or_path, @@ -1293,9 +1346,23 @@ def create_predictor( predictor_args.temperature = 1.0 tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env() - if not predictor_args.inference_model: - tokenizer.padding_side = "left" + + model = None + + # model loading + if predictor_args.inference_model: + model = AutoInferenceModelForCausalLM.from_pretrained( + predictor_args.model_name_or_path, + config=config, + predictor_args=predictor_args, + model_args=model_args, + dtype=predictor_args.dtype, + tensor_parallel_degree=tensor_parallel_degree, + tensor_parallel_rank=tensor_parallel_rank, + ) + else: if predictor_args.mode == "dynamic": + # model import (gpt-3,ernie) or AutoModel if model_args.model_type == "gpt-3": sys.path.append("./gpt-3") from modeling import GPTForCausalLM @@ -1330,47 +1397,7 @@ def create_predictor( tensor_parallel_output=False, ) - predictor = DygraphPredictor(predictor_args, model=model, tokenizer=tokenizer) - elif predictor_args.mode == "static": - predictor = StaticGraphPredictor(predictor_args, tokenizer=tokenizer) - else: - raise ValueError("the `mode` should be one of [dynamic, static]") - else: - if predictor_args.mode == "dynamic": - model = AutoInferenceModelForCausalLM.from_pretrained( - predictor_args.model_name_or_path, - config=config, - predictor_args=predictor_args, - model_args=model_args, - dtype=predictor_args.dtype, - tensor_parallel_degree=tensor_parallel_degree, - tensor_parallel_rank=tensor_parallel_rank, - ) - model.eval() - if predictor_args.block_attn: - predictor = DygraphBlockInferencePredictor(predictor_args, model=model, tokenizer=tokenizer) - else: - predictor = DygraphInferencePredictor(predictor_args, model=model, tokenizer=tokenizer) - - elif predictor_args.mode == "static": - model = AutoInferenceModelForCausalLM.from_pretrained( - predictor_args.model_name_or_path, - config=config, - predictor_args=predictor_args, - model_args=model_args, - dtype=predictor_args.dtype, - tensor_parallel_degree=tensor_parallel_degree, - tensor_parallel_rank=tensor_parallel_rank, - ) - cache_kvs_shape = model.get_cache_kvs_shape( - config, predictor_args.batch_size, predictor_args.total_max_length - ) - if predictor_args.block_attn: - predictor = StaticBlockInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer) - else: - predictor = StaticInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer) - else: - 
raise ValueError("the `mode` should be one of [dynamic, static]") + predictor = AutoPredictor.create_predictor(predictor_args, config, model_args, tokenizer, model=model) return predictor diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index 3649a09199b9..4ce49ddf7847 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -858,7 +858,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) if predictor_args.mode == "dynamic": - return model_class.from_pretrained(predictor_args.model_name_or_path, config=config, dtype=dtype) + model = model_class.from_pretrained(predictor_args.model_name_or_path, config=config, dtype=dtype) + model.eval() + return model return model_class