Skip to content

Commit 1d19d0a

Browse files
committed
Add I1/I3/I4 format
1 parent ea81882 commit 1d19d0a

File tree

4 files changed: +20 additions, −3 deletions

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
## News
99

10+
- 08/06/2024 🚀: Support 1/2/3/4-bit quantized Llama models in GPTQ format. Test it using the pretrained models released by [EfficientQAT](https://github.com/OpenGVLab/EfficientQAT).
11+
1012
- 07/27/2024 ✨: We've noted that T-MAC is even faster than the NPU in token generation speed on the latest Snapdragon X Elite chipset! Check [Compared to NPU](#compared-to-npu) for more details.
1113

1214
- 07/23/2024 🚀🚀: We've enabled the execution of any 2-bit quantized Llama model in GPTQ format via T-MAC! Test it using the pretrained models released by [EfficientQAT](https://github.com/OpenGVLab/EfficientQAT).
@@ -263,6 +265,8 @@ python tools/run_pipeline.py -o ${model_dir} -m llama-3-8b-2bit
263265

264266
> Use `-p` or `-s` argument to select the steps you want to run. And use `-u` argument to use our prebuilt kernels for ARM.
265267
268+
> Use `--zero_point` for asymmetric quantization, which is required for *most* EfficientQAT models (only verified with Llama-3-8b-instruct-w4-g128/Llama-3-8b-instruct-w2-g128).
269+
266270
An example output:
267271

268272
```
@@ -294,7 +298,7 @@ Check logs/2024-07-15-17-10-11.log for inference output
294298

295299
We will soon:
296300

297-
- [ ] Add `I4` format to simplify the deployment of 4-bit models.
301+
- [x] Add `I4` format to simplify the deployment of 4-bit models.
298302
- [ ] Embed T-MAC GEMM kernels into llama.cpp to accelerate prefill/prompt.
299303
- [ ] Optimize for ARMv9 CPU with SME2 through LUTI4
300304

deploy/compile.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@
4040
[2, 4096, 14336, 1, -1],
4141
[2, 1024, 4096, 1, -1],
4242
],
43+
"llama-3-8b-4bit": [
44+
[4, 4096, 4096, 1, -1],
45+
[4, 14336, 4096, 1, -1],
46+
[4, 4096, 14336, 1, -1],
47+
[4, 1024, 4096, 1, -1],
48+
],
4349
"hf-bitnet-3b": [
4450
[2, 3200, 8640, 1, 1],
4551
[2, 8640, 3200, 1, 1],

tools/run_pipeline.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ def run_inference():
186186
"llama-2-7b-2bit",
187187
"llama-2-13b-2bit",
188188
"llama-3-8b-2bit",
189+
"llama-3-8b-4bit",
189190
"hf-bitnet-3b",
190191
"test",
191192
]
@@ -211,11 +212,15 @@ def parse_args():
211212
parser.add_argument("-gs", "--group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
212213
parser.add_argument("-ags", "--act_group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
213214
parser.add_argument("-ld", "--logs_dir", type=str, default="logs")
214-
parser.add_argument("-q", "--quant_type", type=str, choices=["i2"], default="i2")
215+
parser.add_argument("-q", "--quant_type", type=str, choices=["in", "i1", "i2", "i3", "i4"], default="in")
216+
parser.add_argument("-zp", "--zero_point", action="store_true", help="Enforce enable zero_point. Required by EfficientQAT models.")
217+
parser.add_argument("-nzp", "--no_zero_point", action="store_false", help="Enforce disable zero_point. Don't set this argument if you don't know its meaning.")
215218

216219
parser.add_argument("-v", "--verbose", action="store_true")
217220
parser.add_argument("-r", "--reuse_tuned", action="store_true")
218221
parser.add_argument("-u", "--use_prebuilt", action="store_true")
222+
223+
parser.set_defaults(zero_point=None)
219224
return parser.parse_args()
220225

221226

@@ -231,6 +236,8 @@ def get_quant_args():
231236
zero_point = True
232237
group_size = FLAGS.group_size or group_size
233238
act_group_size = FLAGS.act_group_size or act_group_size
239+
if FLAGS.zero_point is not None:
240+
zero_point = FLAGS.zero_point
234241
return {"group_size": group_size, "act_group_size": act_group_size, "zero_point": zero_point}
235242

236243

Commit comments (0)