From dfb41d8a23e7390305f75190e243fd69c3df2281 Mon Sep 17 00:00:00 2001
From: kalineid
Date: Sun, 15 Sep 2024 23:22:28 +0800
Subject: [PATCH 1/6] [WIP] Merge latest llama.cpp

---
 3rdparty/llama.cpp    |  2 +-
 tools/run_pipeline.py | 17 ++++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 70c312d..6cc1fee 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 70c312d654539860b4839e7851432b75813edaa1
+Subproject commit 6cc1feea007dcfb90e44b81e0f69a83f616ff4ba
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index 07205cb..6d6f516 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -140,10 +140,9 @@ def cmake_llamacpp():
     cmake_prefix_path = os.path.join(ROOT_DIR, "install", "lib", "cmake", "t-mac")
     command = [
         'cmake', '..',
-        '-DLLAMA_TMAC=ON',
+        '-DGGML_TMAC=ON',
         f'-DCMAKE_PREFIX_PATH={cmake_prefix_path}',
         '-DCMAKE_BUILD_TYPE=Release',
-        '-DLLAMA_LLAMAFILE_DEFAULT=OFF',
     ]
     if FLAGS.device == "android":
         try:
@@ -154,8 +153,8 @@ def cmake_llamacpp():
         command.append("-DANDROID_ABI=arm64-v8a")
         command.append("-DANDROID_PLATFORM=android-23")
         command.append("-DCMAKE_C_FLAGS=-march=armv8.2a+dotprod+fp16")
-        command.append("-DLLAMA_METAL=OFF")
-        command.append("-DLLAMA_ACCELERATE=OFF")
+        command.append("-DGGML_METAL=OFF")
+        command.append("-DGGML_ACCELERATE=OFF")
         command.append("-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH")
         command.append("-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH")
     elif is_win():
@@ -176,7 +175,7 @@ def cmake_llamacpp():
 
 def build_llamacpp():
     build_dir = get_llamacpp_build_dir()
-    command = ['cmake', '--build', '.', '--target', 'main', 'llama-bench', '--config', 'Release']
+    command = ['cmake', '--build', '.', '--target', 'llama-cli', 'llama-bench', '--config', 'Release']
     run_command(command, build_dir)
 
 
@@ -184,18 +183,18 @@ def run_inference():
     build_dir = get_llamacpp_build_dir()
     out_path = os.path.join(FLAGS.model_dir, f"ggml-model.{FLAGS.quant_type}.gguf")
     if is_win():
-        main_path = os.path.join(build_dir, "bin", "Release", "main.exe")
+        main_path = os.path.join(build_dir, "bin", "Release", "llama-cli.exe")
         if not os.path.exists(main_path):
-            main_path = os.path.join(build_dir, "bin", "main")
+            main_path = os.path.join(build_dir, "bin", "llama-cli")
     else:
-        main_path = os.path.join(build_dir, "bin", "main")
+        main_path = os.path.join(build_dir, "bin", "llama-cli")
     prompt = "Microsoft Corporation is an American multinational corporation and technology company headquartered in Redmond, Washington."
     if FLAGS.device == "android":
         remote_bin_path = os.path.join(FLAGS.remote_dir, "bin")
         # TODO: verify in Windows
         command = ['push', os.path.join(build_dir, "bin"), FLAGS.remote_dir]
         run_adb_command(command, build_dir)
-        remote_main_path = os.path.join(remote_bin_path, "main")
+        remote_main_path = os.path.join(remote_bin_path, "llama-cli")
         command = ['shell', 'chmod', '-R', '+x', remote_bin_path]
         run_adb_command(command, build_dir)
         remote_out_path = os.path.join(

From aaf29631604671cb6b61f35935440124f7f4d63d Mon Sep 17 00:00:00 2001
From: kalineid
Date: Wed, 18 Sep 2024 18:03:42 +0800
Subject: [PATCH 2/6] Adapt scripts to latest llama.cpp

---
 3rdparty/llama.cpp          |  2 +-
 python/t_mac/model_utils.py | 39 +++++++++++++++++++++++++++++++++++--
 tools/run_pipeline.py       | 11 +++++++----
 3 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 6cc1fee..b2fa0d8 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 6cc1feea007dcfb90e44b81e0f69a83f616ff4ba
+Subproject commit b2fa0d86fcffb99e1bdda4080efc1fab70aed62f
diff --git a/python/t_mac/model_utils.py b/python/t_mac/model_utils.py
index 64af728..4c0ce82 100644
--- a/python/t_mac/model_utils.py
+++ b/python/t_mac/model_utils.py
@@ -5,9 +5,13 @@ import os
 import logging
 import json
+import configparser
 
 import numpy as np
 
 
+from t_mac.weights import preprocess_weights
+
+
 logger = logging.getLogger("model_utils")
 
 
@@ -113,7 +117,7 @@ def __init__(self, dir_model: Path):
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = _Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-    
+
     @staticmethod
     def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> List[str]:
         part_names: list[str] = []
@@ -170,7 +174,7 @@ def extract_kernel_shapes(self):
             raise RuntimeError("Models in {} not in GPTQ format".format(self.dir_model))
         return ks
 
-    
+
     @staticmethod
     def load_hparams(dir_model):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
@@ -208,3 +212,34 @@ def get_quantization_config(model_dir: Optional[str] = None) -> dict:
         "sym": sym,
         "quant_method": quant_method,
     }
+
+
+def preprocess_for_t_mac(
+    kcfg_file: str,
+    w: np.ndarray,
+    scales: np.ndarray,
+    zeros: Optional[np.ndarray] = None,
+    bits: int = 2,
+    g: int = 4,
+) -> np.ndarray:
+
+    M, K = w.shape
+    cf = configparser.ConfigParser()
+    cf.read(kcfg_file)
+    secs = cf.sections()
+    found = False
+    for sec in secs:
+        sec_splits = str(sec).split('_')
+        if sec_splits[-4] == "m" + str(M * bits) and sec_splits[-3] == "k" + str(K):
+            bm = int(cf.get(sec, 'bm'))
+            kfactor = int(cf.get(sec, 'kfactor'))
+            simd_n_in = int(cf.get(sec, 'simd_n_in'))
+            simd_n_out = int(cf.get(sec, 'simd_n_out'))
+            found = True
+            break
+
+    if not found:
+        raise KeyError("GEMM of shape ({}, {}) is not found in {}. Please compile the kernels using T-MAC first.".format(M, K, args.kcfg))
+
+    w, scales = preprocess_weights(w, scales, zeros, bits=bits, g=g, bm=bm, kfactor=kfactor, simd_n_in=simd_n_in, simd_n_out=simd_n_out)
+    return np.concatenate([w.flatten(), scales.astype(np.float32).view(np.uint8).flatten()])
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index 6d6f516..41bdce0 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -83,6 +83,8 @@ def compile_kernels():
 
 
 def _clean_cmake(build_dir):
+    command = ['cmake', '--build', '.', '--target', 'clean']
+    run_command(command, build_dir)
     shutil.rmtree(os.path.join(build_dir, "CMakeFiles"), ignore_errors=True)
     shutil.rmtree(os.path.join(build_dir, "CMakeCache.txt"), ignore_errors=True)
 
@@ -125,12 +127,13 @@ def convert_models():
     llamacpp_dir = os.path.join(ROOT_DIR, "3rdparty", "llama.cpp")
     command = [
         'python',
-        'convert-hf-to-gguf-t-mac.py',
+        'convert_hf_to_gguf.py',
         f'{model_dir}',
-        '--outtype',
-        f'{FLAGS.quant_type}',
+        '--outtype', f'{FLAGS.quant_type}',
         '--outfile', f'{out_path}',
         '--kcfg', f'{kcfg_path}',
+        '--enable-t-mac',
+        '--verbose',
     ]
     run_command(command, llamacpp_dir)
 
@@ -275,7 +278,7 @@ def parse_args():
     parser.add_argument("-gs", "--group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
     parser.add_argument("-ags", "--act_group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
     parser.add_argument("-ld", "--logs_dir", type=str, default="logs")
-    parser.add_argument("-q", "--quant_type", type=str, choices=["in", "i1", "i2", "i3", "i4"], default="in")
+    parser.add_argument("-q", "--quant_type", type=str, choices=["int_n", "f16", "f32"], default="int_n")
     parser.add_argument("-zp", "--zero_point", action="store_true", help="Enforce enable zero_point. Required by EfficientQAT models.")
     parser.add_argument("-nzp", "--no_zero_point", action="store_false", help="Enforce disable zero_point. Don't set this argument if you don't know its meaning.")
 
From 0afa2e7c2fab0540073e4976ef6625bc919c0ccf Mon Sep 17 00:00:00 2001
From: kalineid
Date: Wed, 25 Sep 2024 22:18:16 +0800
Subject: [PATCH 3/6] Fix run_pipe.py cmake error

---
 tools/run_pipeline.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index 41bdce0..c71c5f0 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger("run_pipeline")
 
 
-def run_command(command, pwd):
+def run_command(command, pwd, ignore_errors=False):
     print(f"  Running command in {pwd}:")
     print(f"  {' '.join(command)}")
     os.makedirs(FLAGS.logs_dir, exist_ok=True)
@@ -21,8 +21,9 @@ def run_command(command, pwd):
     try:
         subprocess.check_call(command, cwd=pwd, stdout=fp, stderr=fp)
     except subprocess.CalledProcessError as err:
-        print(RED + f"Please check {log_file} for what's wrong" + RESET)
-        exit(-1)
+        if not ignore_errors:
+            print(RED + f"Please check {log_file} for what's wrong" + RESET)
+            exit(-1)
     return log_file
 
 
@@ -84,7 +85,7 @@ def compile_kernels():
 
 def _clean_cmake(build_dir):
     command = ['cmake', '--build', '.', '--target', 'clean']
-    run_command(command, build_dir)
+    run_command(command, build_dir, ignore_errors=True)
     shutil.rmtree(os.path.join(build_dir, "CMakeFiles"), ignore_errors=True)
     shutil.rmtree(os.path.join(build_dir, "CMakeCache.txt"), ignore_errors=True)
 

From ac3591f35d09a67c2181bfe1dce91e14a3f297d4 Mon Sep 17 00:00:00 2001
From: Jianyu Wei
Date: Sun, 29 Sep 2024 18:18:02 +0800
Subject: [PATCH 4/6] Attempt to optimize performance on arm cpus

---
 3rdparty/llama.cpp          | 2 +-
 python/t_mac/model_utils.py | 2 +-
 tools/run_pipeline.py       | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index b2fa0d8..b5b3394 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit b2fa0d86fcffb99e1bdda4080efc1fab70aed62f
+Subproject commit b5b33949aa49791d8e3252f01fdd9f37de01d3cc
diff --git a/python/t_mac/model_utils.py b/python/t_mac/model_utils.py
index 4c0ce82..464ecdb 100644
--- a/python/t_mac/model_utils.py
+++ b/python/t_mac/model_utils.py
@@ -239,7 +239,7 @@ def preprocess_for_t_mac(
             break
 
     if not found:
-        raise KeyError("GEMM of shape ({}, {}) is not found in {}. Please compile the kernels using T-MAC first.".format(M, K, args.kcfg))
+        raise KeyError("GEMM of shape ({}, {}) is not found in {}. Please compile the kernels using T-MAC first.".format(M, K, kcfg_file))
 
     w, scales = preprocess_weights(w, scales, zeros, bits=bits, g=g, bm=bm, kfactor=kfactor, simd_n_in=simd_n_in, simd_n_out=simd_n_out)
     return np.concatenate([w.flatten(), scales.astype(np.float32).view(np.uint8).flatten()])
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index c71c5f0..bc71647 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -147,6 +147,7 @@ def cmake_llamacpp():
         '-DGGML_TMAC=ON',
         f'-DCMAKE_PREFIX_PATH={cmake_prefix_path}',
         '-DCMAKE_BUILD_TYPE=Release',
+        '-DGGML_OPENMP=OFF',
     ]
     if FLAGS.device == "android":
         try:

From 4331d7579bb08a6d90ea5b76800afd72bf096249 Mon Sep 17 00:00:00 2001
From: Jianyu Wei
Date: Mon, 7 Oct 2024 23:12:21 +0800
Subject: [PATCH 5/6] Support armv8.7a+ cpus

---
 3rdparty/llama.cpp    | 2 +-
 tools/run_pipeline.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index b5b3394..9cb855e 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit b5b33949aa49791d8e3252f01fdd9f37de01d3cc
+Subproject commit 9cb855ea1bc415c4951e61a78f48d572eb1ac0a7
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index bc71647..8a3c753 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -164,9 +164,7 @@ def cmake_llamacpp():
         command.append("-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH")
     elif is_win():
         if is_arm():
-            command.append("-DCMAKE_C_COMPILER=clang")
-            command.append("-DCMAKE_CXX_COMPILER=clang++")
-            command.append("-G Ninja")
+            command.append("--preset arm64-windows-llvm-release")
         else:
             command.append("-T ClangCL")
     else:

From 76327215a6f6e30efc7cd867927fd7b11aca4ee3 Mon Sep 17 00:00:00 2001
From: Jianyu Wei
Date: Thu, 10 Oct 2024 20:45:49 +0800
Subject: [PATCH 6/6] Finish merging and rebasing llama.cpp

---
 .gitmodules                 | 1 +
 3rdparty/llama.cpp          | 2 +-
 README.md                   | 4 ++++
 python/t_mac/model_utils.py | 4 ++++
 python/t_mac/version.py     | 2 +-
 5 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 5ff98bd..f20d6c4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,4 @@
 [submodule "3rdparty/llama.cpp"]
 	path = 3rdparty/llama.cpp
 	url = https://github.com/kaleid-liner/llama.cpp
+	branch = master-rebased
diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 9cb855e..481426c 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 9cb855ea1bc415c4951e61a78f48d572eb1ac0a7
+Subproject commit 481426c8b61bf93c49a883045b0ad7f2198f3836
diff --git a/README.md b/README.md
index e07138f..011e5ad 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,8 @@
 
 ## News
 
+- 10/10/2024 🚀🚀: By updating and rebasing our llama.cpp version, T-MAC now supports more models (e.g., qwen2) and the end-to-end performance is further improved by 10~15%! Try qwen2 using [the Official GPTQ model](https://huggingface.co/Qwen/Qwen2-7B-Instruct-GPTQ-Int4).
+
 - 08/21/2024 🎉🎉: T-MAC paper is accepted by EuroSys 2025.
 
 - 08/17/2024 🚀: T-MAC now supports 1/2/4-bit quantized models of (almost) any architecture in GPTQ format.
@@ -32,6 +34,8 @@ T-MAC achieves a token generation throughput of 20 tokens/sec with a single core
 
 ## End-2-End Speedup
 
+> All of the following data is profiled based on llama.cpp b2794 (May 2024). The latest T-MAC and baseline, after updating the llama.cpp version, are further optimized by 10~15%.
+
 We evaluate the token generation performance of different models on five different devices: Surface Laptop 7, Apple M2-Ultra, Jetson AGX Orin, Raspberry Pi 5 and Surface Book 3. Check [datasheet](docs/profiling_data.md) for more details.
 
 > We evaluate BitNet-3B and Llama-2-7B (W2) with T-MAC 2-bit and llama.cpp Q2_K, and evaluate Llama-2-7B (W4) with T-MAC 4-bit and llama.cpp Q4_0.
diff --git a/python/t_mac/model_utils.py b/python/t_mac/model_utils.py
index 464ecdb..a4bf880 100644
--- a/python/t_mac/model_utils.py
+++ b/python/t_mac/model_utils.py
@@ -196,6 +196,7 @@ def extract_kernel_shapes(model_arch: Optional[str] = "gptq-auto", model_dir: Op
 
 def get_quantization_config(model_dir: Optional[str] = None) -> dict:
     hparams = _Model.load_hparams(Path(model_dir))
+    # GPTQ
     quantization_config = hparams.get("quantization_config", {})
     desc_act = quantization_config.get("desc_act", False)
     assert not desc_act, "desc_act=True currently unsupported by T-MAC"
@@ -204,6 +205,8 @@ def get_quantization_config(model_dir: Optional[str] = None) -> dict:
     bits = quantization_config.get("bits", 0)
     sym = quantization_config.get("sym", False)
     quant_method = quantization_config.get("quant_method", "")
+    # BitNet
+    weight_bits = hparams.get("weight_bits", 0)
 
     return {
         "quantizer": quantizer,
@@ -211,6 +214,7 @@ def get_quantization_config(model_dir: Optional[str] = None) -> dict:
         "bits": bits,
         "sym": sym,
         "quant_method": quant_method,
+        "weight_bits": weight_bits,
     }
 
 
diff --git a/python/t_mac/version.py b/python/t_mac/version.py
index 8d70ed6..6ba69d9 100644
--- a/python/t_mac/version.py
+++ b/python/t_mac/version.py
@@ -1 +1 @@
-__version__ = "1.0.0a3"
+__version__ = "1.0.0a4"
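
A minimal usage sketch of the extended get_quantization_config from PATCH 6/6 (not part of the series; the model directory path is a placeholder and only needs to contain a config.json readable by _Model.load_hparams):

    from t_mac.model_utils import get_quantization_config

    # For GPTQ checkpoints the fields come from "quantization_config" in
    # config.json; for BitNet-style checkpoints the new "weight_bits" field
    # is read from the top-level hparams instead.
    cfg = get_quantization_config("/path/to/model-dir")
    print(cfg["quant_method"], cfg["bits"], cfg["weight_bits"])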