diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
index 1631f7e8..d2d10b9d 100644
--- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
+++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -143,6 +143,7 @@
 
 LLM_METADATA_KEY = "_llm"
 RESERVED_METADATA_KEYS = [LLM_METADATA_KEY, CONVERTED_FROM_ARTIFACT_LIKE_KEY]
+VLLM_MODEL_WEIGHTS_FOLDER = "model_files"
 
 INFERENCE_FRAMEWORK_REPOSITORY: Dict[LLMInferenceFramework, str] = {
     LLMInferenceFramework.DEEPSPEED: "instant-llm",
@@ -2792,6 +2793,10 @@ async def execute(
 
         validate_endpoint_supports_openai_completion(model_endpoint, endpoint_content)
 
+        # if inference framework is VLLM, we need to set the model to use the weights folder
+        if endpoint_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_COMPLETION_PATH,
@@ -2894,6 +2899,10 @@ async def execute(
 
         validate_endpoint_supports_openai_completion(model_endpoint, model_content)
 
+        # if inference framework is VLLM, we need to set the model to use the weights folder
+        if model_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_COMPLETION_PATH,
@@ -3051,6 +3060,10 @@ async def execute(
 
         validate_endpoint_supports_chat_completion(model_endpoint, endpoint_content)
 
+        # if inference framework is VLLM, we need to set the model to use the weights folder
+        if endpoint_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_CHAT_COMPLETION_PATH,
@@ -3152,6 +3165,10 @@ async def execute(
         )
         validate_endpoint_supports_chat_completion(model_endpoint, model_content)
 
+        # if inference framework is VLLM, we need to set the model to use the weights folder
+        if model_content.inference_framework == LLMInferenceFramework.VLLM:
+            request.model = VLLM_MODEL_WEIGHTS_FOLDER
+
         inference_request = SyncEndpointPredictV1Request(
             args=request.model_dump(exclude_none=True),
             destination_path=OPENAI_CHAT_COMPLETION_PATH,
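
A minimal, self-contained sketch of the repeated override, assuming only the identifiers introduced in the diff above (VLLM_MODEL_WEIGHTS_FOLDER, LLMInferenceFramework.VLLM). The helper name maybe_override_model_for_vllm and the stand-in enum/request types are hypothetical and not part of this change, which instead inlines the check at each of the four call sites before building the SyncEndpointPredictV1Request:

    # Hypothetical sketch, not part of the PR: the same check the diff repeats
    # before each completion/chat request is forwarded to the endpoint.
    from dataclasses import dataclass
    from enum import Enum

    VLLM_MODEL_WEIGHTS_FOLDER = "model_files"

    class LLMInferenceFramework(str, Enum):  # trimmed stand-in for the real enum
        DEEPSPEED = "deepspeed"
        VLLM = "vllm"

    @dataclass
    class _Request:  # stand-in for the OpenAI-style completion/chat request
        model: str

    def maybe_override_model_for_vllm(request: _Request, framework: LLMInferenceFramework) -> None:
        # vLLM endpoints serve weights from a fixed local folder, so the
        # user-supplied model name is replaced before the request is forwarded.
        if framework == LLMInferenceFramework.VLLM:
            request.model = VLLM_MODEL_WEIGHTS_FOLDER

    # Usage: any user-supplied model name is rewritten for vLLM endpoints.
    req = _Request(model="llama-3-8b-instruct")
    maybe_override_model_for_vllm(req, LLMInferenceFramework.VLLM)
    assert req.model == VLLM_MODEL_WEIGHTS_FOLDER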