Hi
I tested BitBLAS models with the https://github.com/ModelCloud/GPTQModel repo.
I found that the output is correct. However, BitBLAS achieves roughly the same token generation speed for the low-bit (2-bit and 4-bit) models as for the FP16 model. Detailed results are as follows:
The corresponding test code is:
import torch
from transformers import AutoTokenizer
from gptqmodel import GPTQModel, QuantizeConfig, get_backend
import time
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=None, type=str, help="directory of the saved quantized model")
    parser.add_argument("--wbits", type=int, default=4, help="quantization bits")
    parser.add_argument("--group_size", type=int, default=128, help="quantization group size")
    parser.add_argument("--test_speed", action="store_true")
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False, legacy=False)
    model = GPTQModel.from_quantized(args.model, device_map='auto', torch_dtype=torch.float16,
                                     backend=get_backend('BITBLAS'))
    model.cuda()
    print(f"memory footprint after loading quantized model: {torch.cuda.max_memory_allocated('cuda') / 1024**3:.2f}GiB")

    if args.test_speed:
        prompt = "Write a poem about large language model:"
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
        start_time = time.time()
        output = model.generate(inputs=input_ids, do_sample=True, top_k=10, max_new_tokens=256)
        end_time = time.time()
        # Tokens per second over the full output sequence (prompt + generated tokens).
        speed = len(output[0]) / (end_time - start_time)
        print(tokenizer.decode(output[0]))
        print(f"generation speed: {speed} token/s")


if __name__ == '__main__':
    main()
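For reference, a more controlled way to time generation would look roughly like the sketch below (this is not part of my original run; timed_generate is a hypothetical helper that reuses the model and input_ids objects from the script above). It adds warm-up generations so that any one-time setup or kernel compilation cost is excluded, synchronizes CUDA around the timed region, and counts only the newly generated tokens rather than the full sequence:

import time
import torch

def timed_generate(model, input_ids, max_new_tokens=256, warmup=1, runs=3):
    # Warm-up passes: the first generate call may include one-time setup cost.
    for _ in range(warmup):
        model.generate(inputs=input_ids, do_sample=False, max_new_tokens=16)

    speeds = []
    for _ in range(runs):
        torch.cuda.synchronize()
        start = time.time()
        output = model.generate(inputs=input_ids, do_sample=False,
                                max_new_tokens=max_new_tokens)
        torch.cuda.synchronize()
        elapsed = time.time() - start
        # Count only the tokens produced after the prompt.
        new_tokens = output.shape[1] - input_ids.shape[1]
        speeds.append(new_tokens / elapsed)
    return sum(speeds) / len(speeds)

print(f"controlled generation speed: {timed_generate(model, input_ids):.2f} token/s")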
Do you know what the potential problem might be that hinders the speedup? Thank you.