
[INFER][LLM] Add the AutoPredictor for inference #9445

Merged: 9 commits, Dec 3, 2024
llm/predict/export_model.py (2 changes: 1 addition & 1 deletion)
@@ -58,7 +58,7 @@ def main():
tensor_parallel_rank = hcg.get_model_parallel_rank()

# set predictor type
predictor = create_predictor(predictor_args, model_args, tensor_parallel_degree, tensor_parallel_rank)
predictor = create_predictor(predictor_args, model_args)
predictor.model.eval()

predictor.model.to_static(
llm/predict/predictor.py (171 changes: 99 additions & 72 deletions)
@@ -41,7 +41,7 @@
ChatGLMv2Tokenizer,
Llama3Tokenizer,
LlamaTokenizer,
PretrainedModel,
PretrainedConfig,
PretrainedTokenizer,
)
from paddlenlp.trl import llm_utils
@@ -247,11 +247,9 @@ def predict(self, input_texts: str | list[str], return_tokens=False):


class DygraphPredictor(BasePredictor):
def __init__(
self, config: PredictorArgument, model: PretrainedModel = None, tokenizer: PretrainedTokenizer = None
):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
super().__init__(config, tokenizer)
self.model = model
self.model = kwargs.get("model", None)
if config.lora_path is not None:
lora_config = LoRAConfig.from_pretrained(config.lora_path)
dtype = lora_config.dtype
@@ -328,7 +326,7 @@ def stream_predict(self, inputs: dict[str, paddle.Tensor]):


class StaticGraphPredictor(BasePredictor):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
super().__init__(config, tokenizer)

inference_config = paddle.inference.Config(self.config.model_name_or_path, self.config.model_prefix)
@@ -625,14 +623,16 @@ def _preprocess(self, source):
return inputs


class StaticInferencePredictor(InferencePredictorMixin):
class StaticGraphInferencePredictor(InferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
cache_kvs_shape: list[list[int]],
tokenizer: PretrainedTokenizer = None,
**kwargs,
):
self.cache_kvs_shape = cache_kvs_shape
self.cache_kvs_shape = kwargs.get("cache_kvs_shape", None)
if self.cache_kvs_shape is None:
raise ValueError("cache_kvs_shape should be provided for StaticGraphInferencePredictor")
InferencePredictorMixin.__init__(self, config, tokenizer)

self.predictor = self._create_predictor(config)
@@ -715,9 +715,12 @@ class DygraphInferencePredictor(InferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
model: PretrainedModel = None,
tokenizer: PretrainedTokenizer = None,
**kwargs,
):
model = kwargs.get("model", None)
if model is None:
raise ValueError("model should be provided for DygraphInferencePredictor")
self.cache_kvs_shape = model.get_cache_kvs_shape(model.config, config.batch_size, config.total_max_length)
InferencePredictorMixin.__init__(self, config, tokenizer)
self.model = model
@@ -991,12 +994,10 @@ def _preprocess(self, input_text: list[str]):


class DygraphBlockInferencePredictor(BlockInferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
model: PretrainedModel = None,
tokenizer: PretrainedTokenizer = None,
):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
model = kwargs.get("model", None)
if model is None:
raise ValueError("model should be provided for DygraphBlockInferencePredictor")
self.cache_kvs_shape = model.get_cache_kvs_shape(model.config, config.batch_size)
BlockInferencePredictorMixin.__init__(self, config, tokenizer)

@@ -1088,14 +1089,16 @@ def predict(self, input_texts: list[str], return_tokens=False):
return outputs


class StaticBlockInferencePredictor(BlockInferencePredictorMixin):
class StaticGraphBlockInferencePredictor(BlockInferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
cache_kvs_shape: list[list[int]],
tokenizer: PretrainedTokenizer = None,
**kwargs,
):
self.cache_kvs_shape = cache_kvs_shape
self.cache_kvs_shape = kwargs.get("cache_kvs_shape", None)
if self.cache_kvs_shape is None:
raise ValueError("cache_kvs_shape should be provided for StaticGraphBlockInferencePredictor")
BlockInferencePredictorMixin.__init__(self, config, tokenizer)

self._create_predictor(config)
@@ -1245,21 +1248,71 @@ def predict(self, input_texts: list[str], return_tokens=False):
return outputs


def get_ptq_multicards_num(directory):
count = 0
if os.path.exists(directory):
prefix = "act_scales_"
for filename in os.listdir(directory):
if filename.startswith(prefix):
count += 1
return count
class AutoPredictor:
def __init__(self, *args, **kwargs):
raise EnvironmentError(
f"{self.__class__.__name__} is designed to be instantiated "
f"using the `{self.__class__.__name__}.create_predictor(...)` classmethod."
)

@classmethod
def create_predictor(
cls,
predictor_args: PredictorArgument,
config: PretrainedConfig,
model_args: ModelArgument,
tokenizer: PretrainedTokenizer = None,
**kwargs
):
"""
Create and return a predictor that matches the given arguments.

Args:
predictor_args (PredictorArgument): The predictor arguments.
config (PretrainedConfig): The model configuration.
model_args (ModelArgument): The model arguments.
tokenizer (PretrainedTokenizer): The tokenizer.
**kwargs: Additional keyword arguments, e.g. a preloaded `model`.

Returns:
Predictor: The created predictor instance.
"""
model = kwargs.pop("model", None)
cache_kvs_shape = None

# execution mode: dygraph (dynamic) or static graph
execute_mode = "Dygraph" if predictor_args.mode == "dynamic" else "StaticGraph"

# with or without the high-performance inference model
if predictor_args.inference_model:
# with or without block attention
if predictor_args.block_attn:
attn_type = "Block"
else:
attn_type = ""
inference_mode = f"{attn_type}Inference"

if predictor_args.mode == "static":
cache_kvs_shape = model.get_cache_kvs_shape(
config, predictor_args.batch_size, predictor_args.total_max_length
)
else:
inference_mode = ""

predictor_class_name = execute_mode + inference_mode + "Predictor"

import_class = sys.modules[__name__]

# resolve the predictor class by name from this module
predictor_class = getattr(import_class, predictor_class_name)

# instantiate the selected predictor
predictor = predictor_class(predictor_args, tokenizer=tokenizer, model=model, cache_kvs_shape=cache_kvs_shape)
return predictor
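For orientation, here is a minimal, self-contained sketch of the name-based dispatch used by AutoPredictor.create_predictor above. It is illustrative only and not part of the patch; the helper name `_predictor_class_name` is invented for this example.

# Hypothetical helper mirroring the dispatch logic above (not in the patch).
def _predictor_class_name(mode: str, inference_model: bool, block_attn: bool) -> str:
    execute_mode = "Dygraph" if mode == "dynamic" else "StaticGraph"
    inference_mode = ""
    if inference_model:
        inference_mode = ("Block" if block_attn else "") + "Inference"
    return execute_mode + inference_mode + "Predictor"

# The assembled names match the classes defined in this file.
assert _predictor_class_name("dynamic", True, True) == "DygraphBlockInferencePredictor"
assert _predictor_class_name("static", True, False) == "StaticGraphInferencePredictor"
assert _predictor_class_name("dynamic", False, False) == "DygraphPredictor"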


def create_predictor(
predictor_args: PredictorArgument,
model_args: ModelArgument,
tensor_parallel_degree: int = 1,
tensor_parallel_rank: int = 0,
):
tokenizer = AutoTokenizer.from_pretrained(
predictor_args.model_name_or_path,
@@ -1293,9 +1346,23 @@ def create_predictor(
predictor_args.temperature = 1.0

tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env()
if not predictor_args.inference_model:
tokenizer.padding_side = "left"

model = None

# model loading
if predictor_args.inference_model:
model = AutoInferenceModelForCausalLM.from_pretrained(
predictor_args.model_name_or_path,
config=config,
predictor_args=predictor_args,
model_args=model_args,
dtype=predictor_args.dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
)
else:
if predictor_args.mode == "dynamic":
# import a custom modeling implementation (gpt-3, ernie) or use AutoModel
if model_args.model_type == "gpt-3":
sys.path.append("./gpt-3")
from modeling import GPTForCausalLM
@@ -1330,47 +1397,7 @@ def create_predictor(
tensor_parallel_output=False,
)

predictor = DygraphPredictor(predictor_args, model=model, tokenizer=tokenizer)
elif predictor_args.mode == "static":
predictor = StaticGraphPredictor(predictor_args, tokenizer=tokenizer)
else:
raise ValueError("the `mode` should be one of [dynamic, static]")
else:
if predictor_args.mode == "dynamic":
model = AutoInferenceModelForCausalLM.from_pretrained(
predictor_args.model_name_or_path,
config=config,
predictor_args=predictor_args,
model_args=model_args,
dtype=predictor_args.dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
)
model.eval()
if predictor_args.block_attn:
predictor = DygraphBlockInferencePredictor(predictor_args, model=model, tokenizer=tokenizer)
else:
predictor = DygraphInferencePredictor(predictor_args, model=model, tokenizer=tokenizer)

elif predictor_args.mode == "static":
model = AutoInferenceModelForCausalLM.from_pretrained(
predictor_args.model_name_or_path,
config=config,
predictor_args=predictor_args,
model_args=model_args,
dtype=predictor_args.dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
)
cache_kvs_shape = model.get_cache_kvs_shape(
config, predictor_args.batch_size, predictor_args.total_max_length
)
if predictor_args.block_attn:
predictor = StaticBlockInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
else:
predictor = StaticInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
else:
raise ValueError("the `mode` should be one of [dynamic, static]")
predictor = AutoPredictor.create_predictor(predictor_args, config, model_args, tokenizer, model=model)

return predictor
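To show how the refactored entry point is meant to be consumed, here is a hedged usage sketch. It is not code from this PR; argument parsing is assumed to mirror the existing llm/predict scripts, and the snippet assumes it runs from the llm/predict directory so that `predictor` is importable.

from paddlenlp.trainer import PdArgumentParser
from predictor import ModelArgument, PredictorArgument, create_predictor

# Parse PredictorArgument/ModelArgument from the command line, as the
# existing llm/predict entry points do.
parser = PdArgumentParser((PredictorArgument, ModelArgument))
predictor_args, model_args = parser.parse_args_into_dataclasses()

# create_predictor keeps its public signature; the branching over
# dynamic/static, inference_model and block_attn now happens in AutoPredictor.
predictor = create_predictor(predictor_args, model_args)
outputs = predictor.predict(["Hello, how are you?"])
print(outputs)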

paddlenlp/transformers/auto/modeling.py (4 changes: 3 additions & 1 deletion)
@@ -858,7 +858,9 @@
)

if predictor_args.mode == "dynamic":
return model_class.from_pretrained(predictor_args.model_name_or_path, config=config, dtype=dtype)
model = model_class.from_pretrained(predictor_args.model_name_or_path, config=config, dtype=dtype)
model.eval()
return model

Codecov (codecov/patch) warning on paddlenlp/transformers/auto/modeling.py#L861-L863: the added lines are not covered by tests.

return model_class
