Skip to content

Commit 1d19d0a

Browse files
committed
Add I1/I3/I4 format
1 parent ea81882 commit 1d19d0a

File tree

4 files changed: +20 additions, −3 deletions

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
## News
99

10+
- 08/06/2024 🚀: Support 1/2/3/4-bit quantized Llama models in GPTQ format. Test it using the pretrained models released by [EfficientQAT](https://github.com/OpenGVLab/EfficientQAT).
11+
1012
- 07/27/2024 ✨: We've noted that T-MAC is even faster than the NPU in token generation speed on the latest Snapdragon X Elite chipset! Check [Compared to NPU](#compared-to-npu) for more details.
1113

1214
- 07/23/2024 🚀🚀: We've enabled the execution of any 2-bit quantized Llama model in GPTQ format via T-MAC! Test it using the pretrained models released by [EfficientQAT](https://github.com/OpenGVLab/EfficientQAT).
@@ -263,6 +265,8 @@ python tools/run_pipeline.py -o ${model_dir} -m llama-3-8b-2bit
263265

264266
> Use `-p` or `-s` argument to select the steps you want to run. And use `-u` argument to use our prebuilt kernels for ARM.
265267
268+
> Use `--zero_point` for asymmetric quantization, which is required for *most* EfficientQAT models (only verified with Llama-3-8b-instruct-w4-g128/Llama-3-8b-instruct-w2-g128).
269+
266270
An example output:
267271

268272
```
@@ -294,7 +298,7 @@ Check logs/2024-07-15-17-10-11.log for inference output
294298

295299
We will soon:
296300

297-
- [ ] Add `I4` format to simplify the deployment of 4-bit models.
301+
- [x] Add `I4` format to simplify the deployment of 4-bit models.
298302
- [ ] Embed T-MAC GEMM kernels into llama.cpp to accelerate prefill/prompt.
299303
- [ ] Optimize for ARMv9 CPU with SME2 through LUTI4
300304

deploy/compile.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@
4040
[2, 4096, 14336, 1, -1],
4141
[2, 1024, 4096, 1, -1],
4242
],
43+
"llama-3-8b-4bit": [
44+
[4, 4096, 4096, 1, -1],
45+
[4, 14336, 4096, 1, -1],
46+
[4, 4096, 14336, 1, -1],
47+
[4, 1024, 4096, 1, -1],
48+
],
4349
"hf-bitnet-3b": [
4450
[2, 3200, 8640, 1, 1],
4551
[2, 8640, 3200, 1, 1],

tools/run_pipeline.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ def run_inference():
186186
"llama-2-7b-2bit",
187187
"llama-2-13b-2bit",
188188
"llama-3-8b-2bit",
189+
"llama-3-8b-4bit",
189190
"hf-bitnet-3b",
190191
"test",
191192
]
@@ -211,11 +212,15 @@ def parse_args():
211212
parser.add_argument("-gs", "--group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
212213
parser.add_argument("-ags", "--act_group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
213214
parser.add_argument("-ld", "--logs_dir", type=str, default="logs")
214-
parser.add_argument("-q", "--quant_type", type=str, choices=["i2"], default="i2")
215+
parser.add_argument("-q", "--quant_type", type=str, choices=["in", "i1", "i2", "i3", "i4"], default="in")
216+
parser.add_argument("-zp", "--zero_point", action="store_true", help="Enforce enable zero_point. Required by EfficientQAT models.")
217+
parser.add_argument("-nzp", "--no_zero_point", action="store_false", help="Enforce disable zero_point. Don't set this argument if you don't know its meaning.")
215218

216219
parser.add_argument("-v", "--verbose", action="store_true")
217220
parser.add_argument("-r", "--reuse_tuned", action="store_true")
218221
parser.add_argument("-u", "--use_prebuilt", action="store_true")
222+
223+
parser.set_defaults(zero_point=None)
219224
return parser.parse_args()
220225

221226

@@ -231,6 +236,8 @@ def get_quant_args():
231236
zero_point = True
232237
group_size = FLAGS.group_size or group_size
233238
act_group_size = FLAGS.act_group_size or act_group_size
239+
if FLAGS.zero_point is not None:
240+
zero_point = FLAGS.zero_point
234241
return {"group_size": group_size, "act_group_size": act_group_size, "zero_point": zero_point}
235242

236243

Commit comments (0)