From 8ed14c65473baab379ef1c6ee8108eb2e49f6ba7 Mon Sep 17 00:00:00 2001
From: Giuseppe Franco
Date: Tue, 20 Aug 2024 14:47:34 +0100
Subject: [PATCH] Updated README

---
 src/brevitas_examples/llm/README.md          | 97 +++++++++++++++----
 .../stable_diffusion/README.md               | 33 +++----
 2 files changed, 94 insertions(+), 36 deletions(-)

diff --git a/src/brevitas_examples/llm/README.md b/src/brevitas_examples/llm/README.md
index 315e3b674..cdf708d17 100644
--- a/src/brevitas_examples/llm/README.md
+++ b/src/brevitas_examples/llm/README.md
@@ -14,50 +14,113 @@ Set the env variable BREVITAS_JIT=1 to speed up the quantization process. Currently unsupported whenever export is also toggled or with MSE based scales/zero-points.
 ```bash
-usage: main.py [-h] [--model MODEL] [--seed SEED] [--nsamples NSAMPLES] [--seqlen SEQLEN] [--eval] [--weight-bit-width WEIGHT_BIT_WIDTH] [--weight-param-method {stats,mse}]
-               [--weight-scale-type {float32,po2}] [--weight-quant-type {sym,asym}] [--weight-quant-granularity {per_channel,per_tensor,per_group}]
-               [--weight-group-size WEIGHT_GROUP_SIZE] [--quantize-weight-zero-point] [--input-bit-width INPUT_BIT_WIDTH] [--input-param-method {stats,mse}]
-               [--input-scale-type {float32,po2}] [--input-quant-type {sym,asym}] [--input-quant-granularity {per_tensor}] [--quantize-input-zero-point] [--gptq]
-               [--act-calibration] [--bias-corr] [--act-equalization]
+usage: main.py [-h] [--model MODEL] [--seed SEED] [--nsamples NSAMPLES]
+               [--seqlen SEQLEN] [--eval] [--dataset {wikitext2,c4}]
+               [--weight-bit-width WEIGHT_BIT_WIDTH]
+               [--weight-param-method {stats,mse}]
+               [--weight-scale-precision {float_scale,po2_scale}]
+               [--weight-quant-type {sym,asym}]
+               [--weight-quant-format WEIGHT_QUANT_FORMAT]
+               [--weight-quant-granularity {per_channel,per_tensor,per_group}]
+               [--weight-group-size WEIGHT_GROUP_SIZE]
+               [--quantize-weight-zero-point]
+               [--input-bit-width INPUT_BIT_WIDTH]
+               [--input-quant-format INPUT_QUANT_FORMAT]
+               [--input-param-method {stats,mse}]
+               [--input-scale-precision {float_scale,po2_scale}]
+               [--input-scale-type {static,dynamic,no_scale}]
+               [--input-quant-type {sym,asym}]
+               [--input-quant-granularity {per_tensor,per_row,per_group}]
+               [--input-group-size INPUT_GROUP_SIZE]
+               [--quantize-input-zero-point] [--quantize-last-layer] [--gptq]
+               [--act-calibration] [--bias-corr] [--ln-affine-merge]
+               [--no-quantize] [--no-float16] [--replace-mha]
+               [--weight-equalization]
+               [--act-equalization {None,layerwise,fx}] [--load-awq LOAD_AWQ]
+               [--export-target {None,onnx_qcdq,torch_qcdq,sharded_torchmlir_group_weight,sharded_packed_torchmlir_group_weight}]
+               [--checkpoint-name CHECKPOINT_NAME]
 
-optional arguments:
+options:
   -h, --help            show this help message and exit
   --model MODEL         HF model name. Default: facebook/opt-125m.
   --seed SEED           Seed for sampling the calibration data. Default: 0.
   --nsamples NSAMPLES   Number of calibration data samples. Default: 128.
   --seqlen SEQLEN       Sequence length. Default: 2048.
-  --eval                Eval model PPL on C4.
+  --eval                Eval model PPL on the chosen Dataset.
+  --dataset {wikitext2,c4}
+                        Dataset to use for quantization (default: wikitext2)
   --weight-bit-width WEIGHT_BIT_WIDTH
                         Weight bit width. Default: 8.
   --weight-param-method {stats,mse}
                         How scales/zero-point are determined. Default: stats.
-  --weight-scale-type {float32,po2}
+  --weight-scale-precision {float_scale,po2_scale}
                         Whether scale is a float value or a po2. Default: po2.
   --weight-quant-type {sym,asym}
                         Weight quantization type. Default: asym.
+  --weight-quant-format WEIGHT_QUANT_FORMAT
+                        Weight quantization type. Either int or eXmY, with
+                        X+Y==weight_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --weight-quant-granularity {per_channel,per_tensor,per_group}
-                        Granularity for scales/zero-point of weights. Default: per_group.
+                        Granularity for scales/zero-point of weights. Default:
+                        per_group.
   --weight-group-size WEIGHT_GROUP_SIZE
-                        Group size for per_group weight quantization. Default: 128.
+                        Group size for per_group weight quantization. Default:
+                        128.
   --quantize-weight-zero-point
                         Quantize weight zero-point.
   --input-bit-width INPUT_BIT_WIDTH
-                        Input bit width. Default: None (disables input quantization).
+                        Input bit width. Default: None (disables input
+                        quantization).
+  --input-quant-format INPUT_QUANT_FORMAT
+                        Input quantization type. Either int or eXmY, with
+                        X+Y==weight_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --input-param-method {stats,mse}
-                        How scales/zero-point are determined. Default: stats.
-  --input-scale-type {float32,po2}
-                        Whether input scale is a float value or a po2. Default: float32.
+                        How scales/zero-point are determined. Default: stats
+                        (percentile for static, absmax or minmax for dynamic).
+  --input-scale-precision {float_scale,po2_scale}
+                        Whether input scale is a float value or a po2.
+                        Default: float.
+  --input-scale-type {static,dynamic,no_scale}
+                        Whether input scale is a static value or a dynamic
+                        value.
   --input-quant-type {sym,asym}
                         Input quantization type. Default: asym.
-  --input-quant-granularity {per_tensor}
-                        Granularity for scales/zero-point of inputs. Default: per_tensor.
+  --input-quant-granularity {per_tensor,per_row,per_group}
+                        Granularity for scales/zero-point of inputs. Default:
+                        per_tensor.
+  --input-group-size INPUT_GROUP_SIZE
+                        Group size for per_group input quantization. Default:
+                        64.
   --quantize-input-zero-point
                         Quantize input zero-point.
+  --quantize-last-layer
+                        Quantize last nn.Linear layer.
   --gptq                Apply GPTQ.
   --act-calibration     Apply activation calibration.
   --bias-corr           Apply bias correction.
-  --act-equalization    Apply activation equalization (SmoothQuant).
+  --ln-affine-merge     Merge LN affine params.
+  --no-quantize         Disable quantization.
+  --no-float16          Disable float16 as base datatype and switch to
+                        float32.
+  --replace-mha         Replace HuggingFace Attention with a quantizable
+                        version
+  --weight-equalization
+                        Apply weight equalization. Relevant to ReLU based
+                        models (e.g. OPT).
+  --act-equalization {None,layerwise,fx}
+                        Apply activation equalization (SmoothQuant). Layerwise
+                        introduces standalone mul nodes, while fx merges them
+                        whenever possible into previous tensors, which is
+                        possible on ReLU based models (e.g. OPT).
+  --load-awq LOAD_AWQ   Load the awq search results.
   --export-target {None,onnx_qcdq,torch_qcdq,sharded_torchmlir_group_weight,sharded_packed_torchmlir_group_weight}
                         Model export.
+  --checkpoint-name CHECKPOINT_NAME
+                        Filename to save checkpoint.
+                        If `None`, no checkpoint is saved (default: None)
+
 ```
diff --git a/src/brevitas_examples/stable_diffusion/README.md b/src/brevitas_examples/stable_diffusion/README.md
index a51a06df5..1cc5374e8 100644
--- a/src/brevitas_examples/stable_diffusion/README.md
+++ b/src/brevitas_examples/stable_diffusion/README.md
@@ -97,17 +97,15 @@ usage: main.py [-h] [-m MODEL] [-d DEVICE] [-b BATCH_SIZE] [--prompt PROMPT]
                [--quantize-input-zero-point | --no-quantize-input-zero-point]
                [--export-cpu-float32 | --no-export-cpu-float32]
                [--use-mlperf-inference | --no-use-mlperf-inference]
-               [--use-ocp | --no-use-ocp] [--use-fnuz | --no-use-fnuz]
                [--use-negative-prompts | --no-use-negative-prompts]
-               [--dry-run | --no-dry-run]
-               [--quantize-sdp-1 | --no-quantize-sdp-1]
-               [--quantize-sdp-2 | --no-quantize-sdp-2]
+               [--dry-run | --no-dry-run] [--quantize-sdp | --no-quantize-sdp]
                [--override-conv-quant-config | --no-override-conv-quant-config]
                [--vae-fp16-fix | --no-vae-fp16-fix]
+               [--share-qkv-quant | --no-share-qkv-quant]
 
 Stable Diffusion quantization
 
-optional arguments:
+options:
   -h, --help            show this help message and exit
   -m MODEL, --model MODEL
                         Path or name of the model.
@@ -203,10 +201,14 @@ optional arguments:
                         Input quantization type. Default: asym.
   --weight-quant-format WEIGHT_QUANT_FORMAT
                         Weight quantization type. Either int or eXmY, with
-                        X+Y==weight_bit_width-1. Default: int.
+                        X+Y==weight_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --input-quant-format INPUT_QUANT_FORMAT
                         Input quantization type. Either int or eXmY, with
-                        X+Y==input_bit_width-1. Default: int.
+                        X+Y==input_bit_width-1. It's possible to add
+                        float_ocp_ or float_fnuz_ before the exponent/mantissa
+                        bitwidth. Default: int.
   --weight-quant-granularity {per_channel,per_tensor,per_group}
                         Granularity for scales/zero-point of weights. Default:
                         per_channel.
@@ -242,14 +244,6 @@ optional arguments:
   --no-use-mlperf-inference
                         Disable Evaluate FID score with MLPerf pipeline.
                         Default: False
-  --use-ocp             Enable Use OCP format for float quantization. Default:
-                        True
-  --no-use-ocp          Disable Use OCP format for float quantization.
-                        Default: True
-  --use-fnuz            Enable Use FNUZ format for float quantization.
-                        Default: True
-  --no-use-fnuz         Disable Use FNUZ format for float quantization.
-                        Default: True
   --use-negative-prompts
                         Enable Use negative prompts during
                         generation/calibration. Default: Enabled
@@ -260,10 +254,8 @@ optional arguments:
                         calibration. Default: Disabled
   --no-dry-run          Disable Generate a quantized model without any
                         calibration. Default: Disabled
-  --quantize-sdp-1      Enable Quantize SDP. Default: Disabled
-  --no-quantize-sdp-1   Disable Quantize SDP. Default: Disabled
-  --quantize-sdp-2      Enable Quantize SDP. Default: Disabled
-  --no-quantize-sdp-2   Disable Quantize SDP. Default: Disabled
+  --quantize-sdp        Enable Quantize SDP. Default: Disabled
+  --no-quantize-sdp     Disable Quantize SDP. Default: Disabled
   --override-conv-quant-config
                         Enable Quantize Convolutions in the same way as SDP
                         (i.e., FP8). Default: Disabled
@@ -274,4 +266,7 @@ optional arguments:
                         Default: Disabled
   --no-vae-fp16-fix     Disable Rescale the VAE to not go NaN with FP16.
                         Default: Disabled
+  --share-qkv-quant     Enable Share QKV/KV quantization. Default: Disabled
+  --no-share-qkv-quant  Disable Share QKV/KV quantization. Default: Disabled
+
 ```
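Since the hunks above only show the argparse help, a minimal sketch of how the documented LLM flags might be combined is given below. It is illustrative only: the flag names come from the help text in this patch, but the chosen values (4-bit weights, group size 128, ONNX QCDQ export) and the working directory (`src/brevitas_examples/llm/`) are assumptions, not settings prescribed by the patch.

```bash
# Illustrative sketch, assuming it is run from src/brevitas_examples/llm/.
# Flag names are taken from the help above; the specific values are assumptions.
python main.py \
    --model facebook/opt-125m \
    --dataset wikitext2 \
    --weight-bit-width 4 \
    --weight-quant-granularity per_group \
    --weight-group-size 128 \
    --gptq \
    --act-calibration \
    --bias-corr \
    --eval \
    --export-target onnx_qcdq
```

Similarly, the renamed and newly added Stable Diffusion toggles might be exercised as follows; the model path and the particular flag combination are hypothetical.

```bash
# Hypothetical dry run, assuming src/brevitas_examples/stable_diffusion/ as the
# working directory: quantize SDP, share QKV/KV quantization, apply the VAE
# FP16 fix, and skip calibration (--dry-run, per the help above).
python main.py \
    -m /path/to/stable-diffusion-checkpoint \
    --quantize-sdp \
    --share-qkv-quant \
    --vae-fp16-fix \
    --dry-run
```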