From 8ed14c65473baab379ef1c6ee8108eb2e49f6ba7 Mon Sep 17 00:00:00 2001
From: Giuseppe Franco
Date: Tue, 20 Aug 2024 14:47:34 +0100
Subject: [PATCH] Updated README

---
 src/brevitas_examples/llm/README.md          | 97 +++++++++++++++----
 .../stable_diffusion/README.md               | 33 +++----
 2 files changed, 94 insertions(+), 36 deletions(-)

diff --git a/src/brevitas_examples/llm/README.md b/src/brevitas_examples/llm/README.md
index 315e3b674..cdf708d17 100644
--- a/src/brevitas_examples/llm/README.md
+++ b/src/brevitas_examples/llm/README.md
@@ -14,50 +14,113 @@ Set the env variable BREVITAS_JIT=1 to speed up the quantization process. Currently unsupported whenever export is also toggled or with MSE based scales/zero-points.
 ```bash
-usage: main.py [-h] [--model MODEL] [--seed SEED] [--nsamples NSAMPLES] [--seqlen SEQLEN] [--eval] [--weight-bit-width WEIGHT_BIT_WIDTH] [--weight-param-method {stats,mse}]
-               [--weight-scale-type {float32,po2}] [--weight-quant-type {sym,asym}] [--weight-quant-granularity {per_channel,per_tensor,per_group}]
-               [--weight-group-size WEIGHT_GROUP_SIZE] [--quantize-weight-zero-point] [--input-bit-width INPUT_BIT_WIDTH] [--input-param-method {stats,mse}]
-               [--input-scale-type {float32,po2}] [--input-quant-type {sym,asym}] [--input-quant-granularity {per_tensor}] [--quantize-input-zero-point] [--gptq]
-               [--act-calibration] [--bias-corr] [--act-equalization]
+usage: main.py [-h] [--model MODEL] [--seed SEED] [--nsamples NSAMPLES]
+               [--seqlen SEQLEN] [--eval] [--dataset {wikitext2,c4}]
+               [--weight-bit-width WEIGHT_BIT_WIDTH]
+               [--weight-param-method {stats,mse}]
+               [--weight-scale-precision {float_scale,po2_scale}]
+               [--weight-quant-type {sym,asym}]
+               [--weight-quant-format WEIGHT_QUANT_FORMAT]
+               [--weight-quant-granularity {per_channel,per_tensor,per_group}]
+               [--weight-group-size WEIGHT_GROUP_SIZE]
+               [--quantize-weight-zero-point]
+               [--input-bit-width INPUT_BIT_WIDTH]
+               [--input-quant-format INPUT_QUANT_FORMAT]
+               [--input-param-method {stats,mse}]
+               [--input-scale-precision {float_scale,po2_scale}]
+               [--input-scale-type {static,dynamic,no_scale}]
+               [--input-quant-type {sym,asym}]
+               [--input-quant-granularity {per_tensor,per_row,per_group}]
+               [--input-group-size INPUT_GROUP_SIZE]
+               [--quantize-input-zero-point] [--quantize-last-layer] [--gptq]
+               [--act-calibration] [--bias-corr] [--ln-affine-merge]
+               [--no-quantize] [--no-float16] [--replace-mha]
+               [--weight-equalization]
+               [--act-equalization {None,layerwise,fx}] [--load-awq LOAD_AWQ]
+               [--export-target {None,onnx_qcdq,torch_qcdq,sharded_torchmlir_group_weight,sharded_packed_torchmlir_group_weight}]
+               [--checkpoint-name CHECKPOINT_NAME]
 
-optional arguments:
+options:
   -h, --help            show this help message and exit
   --model MODEL         HF model name. Default: facebook/opt-125m.
   --seed SEED           Seed for sampling the calibration data. Default: 0.
   --nsamples NSAMPLES   Number of calibration data samples. Default: 128.
   --seqlen SEQLEN       Sequence length. Default: 2048.
-  --eval                Eval model PPL on C4.
+  --eval                Eval model PPL on the chosen Dataset.
+  --dataset {wikitext2,c4}
+                        Dataset to use for quantization (default: wikitext2)
   --weight-bit-width WEIGHT_BIT_WIDTH
                         Weight bit width. Default: 8.
   --weight-param-method {stats,mse}
                         How scales/zero-point are determined. Default: stats.
-  --weight-scale-type {float32,po2}
+  --weight-scale-precision {float_scale,po2_scale}
                         Whether scale is a float value or a po2. Default: po2.
   --weight-quant-type {sym,asym}
                         Weight quantization type. Default: asym.
+  --weight-quant-format WEIGHT_QUANT_FORMAT
+                        Weight quantization type. Either int or eXmY, with
+                        X+Y==weight_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --weight-quant-granularity {per_channel,per_tensor,per_group}
-                        Granularity for scales/zero-point of weights. Default: per_group.
+                        Granularity for scales/zero-point of weights. Default:
+                        per_group.
   --weight-group-size WEIGHT_GROUP_SIZE
-                        Group size for per_group weight quantization. Default: 128.
+                        Group size for per_group weight quantization. Default:
+                        128.
   --quantize-weight-zero-point
                         Quantize weight zero-point.
   --input-bit-width INPUT_BIT_WIDTH
-                        Input bit width. Default: None (disables input quantization).
+                        Input bit width. Default: None (disables input
+                        quantization).
+  --input-quant-format INPUT_QUANT_FORMAT
+                        Input quantization type. Either int or eXmY, with
+                        X+Y==weight_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --input-param-method {stats,mse}
-                        How scales/zero-point are determined. Default: stats.
-  --input-scale-type {float32,po2}
-                        Whether input scale is a float value or a po2. Default: float32.
+                        How scales/zero-point are determined. Default: stats
+                        (percentile for static, absmax or minmax for dynamic).
+  --input-scale-precision {float_scale,po2_scale}
+                        Whether input scale is a float value or a po2.
+                        Default: float.
+  --input-scale-type {static,dynamic,no_scale}
+                        Whether input scale is a static value or a dynamic
+                        value.
   --input-quant-type {sym,asym}
                         Input quantization type. Default: asym.
-  --input-quant-granularity {per_tensor}
-                        Granularity for scales/zero-point of inputs. Default: per_tensor.
+  --input-quant-granularity {per_tensor,per_row,per_group}
+                        Granularity for scales/zero-point of inputs. Default:
+                        per_tensor.
+  --input-group-size INPUT_GROUP_SIZE
+                        Group size for per_group input quantization. Default:
+                        64.
   --quantize-input-zero-point
                         Quantize input zero-point.
+  --quantize-last-layer
+                        Quantize last nn.Linear layer.
   --gptq                Apply GPTQ.
   --act-calibration     Apply activation calibration.
   --bias-corr           Apply bias correction.
-  --act-equalization    Apply activation equalization (SmoothQuant).
+  --ln-affine-merge     Merge LN affine params.
+  --no-quantize         Disable quantization.
+  --no-float16          Disable float16 as base datatype and switch to
+                        float32.
+  --replace-mha         Replace HuggingFace Attention with a quantizable
+                        version
+  --weight-equalization
+                        Apply weight equalization. Relevant to ReLU based
+                        models (e.g. OPT).
+  --act-equalization {None,layerwise,fx}
+                        Apply activation equalization (SmoothQuant). Layerwise
+                        introduces standalone mul nodes, while fx merges them
+                        whenever possible into previous tensors, which is
+                        possible on ReLU based models (e.g. OPT).
+  --load-awq LOAD_AWQ   Load the awq search results.
   --export-target {None,onnx_qcdq,torch_qcdq,sharded_torchmlir_group_weight,sharded_packed_torchmlir_group_weight}
                         Model export.
+  --checkpoint-name CHECKPOINT_NAME
+                        Filename to save checkpoint.
+                        If `None`, no checkpoint is saved (default: None)
+
 ```
diff --git a/src/brevitas_examples/stable_diffusion/README.md b/src/brevitas_examples/stable_diffusion/README.md
index a51a06df5..1cc5374e8 100644
--- a/src/brevitas_examples/stable_diffusion/README.md
+++ b/src/brevitas_examples/stable_diffusion/README.md
@@ -97,17 +97,15 @@ usage: main.py [-h] [-m MODEL] [-d DEVICE] [-b BATCH_SIZE] [--prompt PROMPT]
                [--quantize-input-zero-point | --no-quantize-input-zero-point]
                [--export-cpu-float32 | --no-export-cpu-float32]
                [--use-mlperf-inference | --no-use-mlperf-inference]
-               [--use-ocp | --no-use-ocp] [--use-fnuz | --no-use-fnuz]
                [--use-negative-prompts | --no-use-negative-prompts]
-               [--dry-run | --no-dry-run]
-               [--quantize-sdp-1 | --no-quantize-sdp-1]
-               [--quantize-sdp-2 | --no-quantize-sdp-2]
+               [--dry-run | --no-dry-run] [--quantize-sdp | --no-quantize-sdp]
                [--override-conv-quant-config | --no-override-conv-quant-config]
                [--vae-fp16-fix | --no-vae-fp16-fix]
+               [--share-qkv-quant | --no-share-qkv-quant]
 
 Stable Diffusion quantization
 
-optional arguments:
+options:
   -h, --help            show this help message and exit
   -m MODEL, --model MODEL
                         Path or name of the model.
@@ -203,10 +201,14 @@ optional arguments:
                         Input quantization type. Default: asym.
   --weight-quant-format WEIGHT_QUANT_FORMAT
                         Weight quantization type. Either int or eXmY, with
-                        X+Y==weight_bit_width-1. Default: int.
+                        X+Y==weight_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --input-quant-format INPUT_QUANT_FORMAT
                         Input quantization type. Either int or eXmY, with
-                        X+Y==input_bit_width-1. Default: int.
+                        X+Y==input_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --weight-quant-granularity {per_channel,per_tensor,per_group}
                         Granularity for scales/zero-point of weights. Default:
                         per_channel.
@@ -242,14 +244,6 @@ optional arguments:
   --no-use-mlperf-inference
                         Disable Evaluate FID score with MLPerf pipeline.
                         Default: False
-  --use-ocp             Enable Use OCP format for float quantization. Default:
-                        True
-  --no-use-ocp          Disable Use OCP format for float quantization.
-                        Default: True
-  --use-fnuz            Enable Use FNUZ format for float quantization.
-                        Default: True
-  --no-use-fnuz         Disable Use FNUZ format for float quantization.
-                        Default: True
   --use-negative-prompts
                         Enable Use negative prompts during
                         generation/calibration. Default: Enabled
@@ -260,10 +254,8 @@ optional arguments:
                         calibration. Default: Disabled
   --no-dry-run          Disable Generate a quantized model without any
                         calibration. Default: Disabled
-  --quantize-sdp-1      Enable Quantize SDP. Default: Disabled
-  --no-quantize-sdp-1   Disable Quantize SDP. Default: Disabled
-  --quantize-sdp-2      Enable Quantize SDP. Default: Disabled
-  --no-quantize-sdp-2   Disable Quantize SDP. Default: Disabled
+  --quantize-sdp        Enable Quantize SDP. Default: Disabled
+  --no-quantize-sdp     Disable Quantize SDP. Default: Disabled
   --override-conv-quant-config
                         Enable Quantize Convolutions in the same way as SDP
                         (i.e., FP8). Default: Disabled
@@ -274,4 +266,7 @@ optional arguments:
                         Default: Disabled
   --no-vae-fp16-fix     Disable Rescale the VAE to not go NaN with FP16.
                         Default: Disabled
+  --share-qkv-quant     Enable Share QKV/KV quantization. Default: Disabled
+  --no-share-qkv-quant  Disable Share QKV/KV quantization. Default: Disabled
+
 ```
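Since the hunks above only show the argparse help, a minimal sketch of how the documented LLM flags might be combined is given below. It is illustrative only: the flag names come from the help text in this patch, but the chosen values (4-bit weights, group size 128, ONNX QCDQ export) and the working directory (`src/brevitas_examples/llm/`) are assumptions, not settings prescribed by the patch.

```bash
# Illustrative sketch, assuming it is run from src/brevitas_examples/llm/.
# Flag names are taken from the help above; the specific values are assumptions.
python main.py \
    --model facebook/opt-125m \
    --dataset wikitext2 \
    --weight-bit-width 4 \
    --weight-quant-granularity per_group \
    --weight-group-size 128 \
    --gptq \
    --act-calibration \
    --bias-corr \
    --eval \
    --export-target onnx_qcdq
```

Similarly, the renamed and newly added Stable Diffusion toggles might be exercised as follows; the model path and the particular flag combination are hypothetical.

```bash
# Hypothetical dry run, assuming src/brevitas_examples/stable_diffusion/ as the
# working directory: quantize SDP, share QKV/KV quantization, apply the VAE
# FP16 fix, and skip calibration (--dry-run, per the help above).
python main.py \
    -m /path/to/stable-diffusion-checkpoint \
    --quantize-sdp \
    --share-qkv-quant \
    --vae-fp16-fix \
    --dry-run
```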