
Commit 2748c74

khluu authored and EC2 Default User committed

[3/n][CI] Load Quantization test models with S3 (vllm-project#13570)

Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
Signed-off-by: Linkun Chen <github@lkchen.net>

1 parent 1c52654 commit 2748c74

File tree

2 files changed, +53 -2 lines changed

tests/conftest.py

+51

@@ -57,6 +57,57 @@
     "ArthurZ/Ilama-3.2-1B",
     "llava-hf/llava-1.5-7b-hf",
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "JackFram/llama-160m",
+    "ai21labs/Jamba-tiny-random",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Phi-3-mini-128k-instruct-FP8",
+    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+    "AMead10/Llama-3.2-1B-Instruct-AWQ",
+    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
+    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
+    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
+    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
+    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
 ]

 MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

vllm/model_executor/model_loader/weight_utils.py

+2 -2

@@ -27,8 +27,6 @@
 from vllm.platforms import current_platform
 from vllm.utils import PlaceholderModule

-logger = init_logger(__name__)
-
 try:
     from runai_model_streamer import SafetensorsStreamer
 except (ImportError, OSError):
@@ -39,6 +37,8 @@
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(
         "SafetensorsStreamer")

+logger = init_logger(__name__)
+
 # use system-level temp directory for file locks, so that multiple users
 # can share the same lock without error.
 # lock files in the temp directory will be automatically deleted when the
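The second file's change is a pure reorder: the logger creation moves below the guarded optional import of runai_model_streamer, so the behavior is unchanged. For reference, here is a self-contained sketch of the resulting pattern, assuming vLLM's PlaceholderModule semantics (attribute access on the placeholder fails only when it is actually used); it is not an exact copy of the file.

# Sketch of the import-guard pattern after this commit (assumed semantics,
# not a verbatim excerpt of weight_utils.py).
from vllm.logger import init_logger
from vllm.utils import PlaceholderModule

try:
    # Optional dependency: present only when runai-model-streamer is installed.
    from runai_model_streamer import SafetensorsStreamer
except (ImportError, OSError):
    # Stand-in module: touching SafetensorsStreamer later raises a clear
    # missing-dependency error instead of crashing at import time.
    runai_model_streamer = PlaceholderModule("runai_model_streamer")
    SafetensorsStreamer = runai_model_streamer.placeholder_attr(
        "SafetensorsStreamer")

# The logger is now created after the guarded import block, keeping the
# module header as imports first, module-level state second.
logger = init_logger(__name__)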
