From dfb41d8a23e7390305f75190e243fd69c3df2281 Mon Sep 17 00:00:00 2001
From: kalineid
Date: Sun, 15 Sep 2024 23:22:28 +0800
Subject: [PATCH 1/6] [WIP] Merge latest llama.cpp

---
 3rdparty/llama.cpp    |  2 +-
 tools/run_pipeline.py | 17 ++++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 70c312d..6cc1fee 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 70c312d654539860b4839e7851432b75813edaa1
+Subproject commit 6cc1feea007dcfb90e44b81e0f69a83f616ff4ba
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index 07205cb..6d6f516 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -140,10 +140,9 @@ def cmake_llamacpp():
     cmake_prefix_path = os.path.join(ROOT_DIR, "install", "lib", "cmake", "t-mac")
     command = [
         'cmake', '..',
-        '-DLLAMA_TMAC=ON',
+        '-DGGML_TMAC=ON',
         f'-DCMAKE_PREFIX_PATH={cmake_prefix_path}',
         '-DCMAKE_BUILD_TYPE=Release',
-        '-DLLAMA_LLAMAFILE_DEFAULT=OFF',
     ]
     if FLAGS.device == "android":
         try:
@@ -154,8 +153,8 @@ def cmake_llamacpp():
         command.append("-DANDROID_ABI=arm64-v8a")
         command.append("-DANDROID_PLATFORM=android-23")
         command.append("-DCMAKE_C_FLAGS=-march=armv8.2a+dotprod+fp16")
-        command.append("-DLLAMA_METAL=OFF")
-        command.append("-DLLAMA_ACCELERATE=OFF")
+        command.append("-DGGML_METAL=OFF")
+        command.append("-DGGML_ACCELERATE=OFF")
         command.append("-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH")
         command.append("-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH")
     elif is_win():
@@ -176,7 +175,7 @@ def cmake_llamacpp():
 
 def build_llamacpp():
     build_dir = get_llamacpp_build_dir()
-    command = ['cmake', '--build', '.', '--target', 'main', 'llama-bench', '--config', 'Release']
+    command = ['cmake', '--build', '.', '--target', 'llama-cli', 'llama-bench', '--config', 'Release']
     run_command(command, build_dir)
 
 
@@ -184,18 +183,18 @@ def run_inference():
     build_dir = get_llamacpp_build_dir()
     out_path = os.path.join(FLAGS.model_dir, f"ggml-model.{FLAGS.quant_type}.gguf")
     if is_win():
-        main_path = os.path.join(build_dir, "bin", "Release", "main.exe")
+        main_path = os.path.join(build_dir, "bin", "Release", "llama-cli.exe")
         if not os.path.exists(main_path):
-            main_path = os.path.join(build_dir, "bin", "main")
+            main_path = os.path.join(build_dir, "bin", "llama-cli")
     else:
-        main_path = os.path.join(build_dir, "bin", "main")
+        main_path = os.path.join(build_dir, "bin", "llama-cli")
     prompt = "Microsoft Corporation is an American multinational corporation and technology company headquartered in Redmond, Washington."
     if FLAGS.device == "android":
         remote_bin_path = os.path.join(FLAGS.remote_dir, "bin")
         # TODO: verify in Windows
         command = ['push', os.path.join(build_dir, "bin"), FLAGS.remote_dir]
         run_adb_command(command, build_dir)
-        remote_main_path = os.path.join(remote_bin_path, "main")
+        remote_main_path = os.path.join(remote_bin_path, "llama-cli")
         command = ['shell', 'chmod', '-R', '+x', remote_bin_path]
         run_adb_command(command, build_dir)
         remote_out_path = os.path.join(

From aaf29631604671cb6b61f35935440124f7f4d63d Mon Sep 17 00:00:00 2001
From: kalineid
Date: Wed, 18 Sep 2024 18:03:42 +0800
Subject: [PATCH 2/6] Adapt scripts to latest llama.cpp

---
 3rdparty/llama.cpp          |  2 +-
 python/t_mac/model_utils.py | 39 +++++++++++++++++++++++++++++++++++--
 tools/run_pipeline.py       | 11 +++++++----
 3 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 6cc1fee..b2fa0d8 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 6cc1feea007dcfb90e44b81e0f69a83f616ff4ba
+Subproject commit b2fa0d86fcffb99e1bdda4080efc1fab70aed62f
diff --git a/python/t_mac/model_utils.py b/python/t_mac/model_utils.py
index 64af728..4c0ce82 100644
--- a/python/t_mac/model_utils.py
+++ b/python/t_mac/model_utils.py
@@ -5,9 +5,13 @@ import os
 import logging
 import json
+import configparser
 
 import numpy as np
 
 
+from t_mac.weights import preprocess_weights
+
+
 logger = logging.getLogger("model_utils")
 
 
@@ -113,7 +117,7 @@ def __init__(self, dir_model: Path):
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
             self.part_names = _Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
-    
+
     @staticmethod
     def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> List[str]:
         part_names: list[str] = []
@@ -170,7 +174,7 @@ def extract_kernel_shapes(self):
             raise RuntimeError("Models in {} not in GPTQ format".format(self.dir_model))
         return ks
 
-    
+
     @staticmethod
     def load_hparams(dir_model):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
@@ -208,3 +212,34 @@ def get_quantization_config(model_dir: Optional[str] = None) -> dict:
         "sym": sym,
         "quant_method": quant_method,
     }
+
+
+def preprocess_for_t_mac(
+    kcfg_file: str,
+    w: np.ndarray,
+    scales: np.ndarray,
+    zeros: Optional[np.ndarray] = None,
+    bits: int = 2,
+    g: int = 4,
+) -> np.ndarray:
+
+    M, K = w.shape
+    cf = configparser.ConfigParser()
+    cf.read(kcfg_file)
+    secs = cf.sections()
+    found = False
+    for sec in secs:
+        sec_splits = str(sec).split('_')
+        if sec_splits[-4] == "m" + str(M * bits) and sec_splits[-3] == "k" + str(K):
+            bm = int(cf.get(sec, 'bm'))
+            kfactor = int(cf.get(sec, 'kfactor'))
+            simd_n_in = int(cf.get(sec, 'simd_n_in'))
+            simd_n_out = int(cf.get(sec, 'simd_n_out'))
+            found = True
+            break
+
+    if not found:
+        raise KeyError("GEMM of shape ({}, {}) is not found in {}. Please compile the kernels using T-MAC first.".format(M, K, args.kcfg))
+
+    w, scales = preprocess_weights(w, scales, zeros, bits=bits, g=g, bm=bm, kfactor=kfactor, simd_n_in=simd_n_in, simd_n_out=simd_n_out)
+    return np.concatenate([w.flatten(), scales.astype(np.float32).view(np.uint8).flatten()])
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index 6d6f516..41bdce0 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -83,6 +83,8 @@ def compile_kernels():
 
 
 def _clean_cmake(build_dir):
+    command = ['cmake', '--build', '.', '--target', 'clean']
+    run_command(command, build_dir)
     shutil.rmtree(os.path.join(build_dir, "CMakeFiles"), ignore_errors=True)
     shutil.rmtree(os.path.join(build_dir, "CMakeCache.txt"), ignore_errors=True)
 
@@ -125,12 +127,13 @@ def convert_models():
     llamacpp_dir = os.path.join(ROOT_DIR, "3rdparty", "llama.cpp")
     command = [
         'python',
-        'convert-hf-to-gguf-t-mac.py',
+        'convert_hf_to_gguf.py',
         f'{model_dir}',
-        '--outtype',
-        f'{FLAGS.quant_type}',
+        '--outtype', f'{FLAGS.quant_type}',
         '--outfile', f'{out_path}',
         '--kcfg', f'{kcfg_path}',
+        '--enable-t-mac',
+        '--verbose',
     ]
     run_command(command, llamacpp_dir)
 
@@ -275,7 +278,7 @@ def parse_args():
     parser.add_argument("-gs", "--group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
     parser.add_argument("-ags", "--act_group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
     parser.add_argument("-ld", "--logs_dir", type=str, default="logs")
-    parser.add_argument("-q", "--quant_type", type=str, choices=["in", "i1", "i2", "i3", "i4"], default="in")
+    parser.add_argument("-q", "--quant_type", type=str, choices=["int_n", "f16", "f32"], default="int_n")
     parser.add_argument("-zp", "--zero_point", action="store_true", help="Enforce enable zero_point. Required by EfficientQAT models.")
     parser.add_argument("-nzp", "--no_zero_point", action="store_false", help="Enforce disable zero_point. Don't set this argument if you don't know its meaning.")
 
From 0afa2e7c2fab0540073e4976ef6625bc919c0ccf Mon Sep 17 00:00:00 2001
From: kalineid
Date: Wed, 25 Sep 2024 22:18:16 +0800
Subject: [PATCH 3/6] Fix run_pipe.py cmake error

---
 tools/run_pipeline.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index 41bdce0..c71c5f0 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger("run_pipeline")
 
 
-def run_command(command, pwd):
+def run_command(command, pwd, ignore_errors=False):
     print(f"  Running command in {pwd}:")
     print(f"  {' '.join(command)}")
     os.makedirs(FLAGS.logs_dir, exist_ok=True)
@@ -21,8 +21,9 @@ def run_command(command, pwd):
     try:
         subprocess.check_call(command, cwd=pwd, stdout=fp, stderr=fp)
     except subprocess.CalledProcessError as err:
-        print(RED + f"Please check {log_file} for what's wrong" + RESET)
-        exit(-1)
+        if not ignore_errors:
+            print(RED + f"Please check {log_file} for what's wrong" + RESET)
+            exit(-1)
     return log_file
 
 
@@ -84,7 +85,7 @@ def compile_kernels():
 
 def _clean_cmake(build_dir):
     command = ['cmake', '--build', '.', '--target', 'clean']
-    run_command(command, build_dir)
+    run_command(command, build_dir, ignore_errors=True)
     shutil.rmtree(os.path.join(build_dir, "CMakeFiles"), ignore_errors=True)
     shutil.rmtree(os.path.join(build_dir, "CMakeCache.txt"), ignore_errors=True)
 

From ac3591f35d09a67c2181bfe1dce91e14a3f297d4 Mon Sep 17 00:00:00 2001
From: Jianyu Wei
Date: Sun, 29 Sep 2024 18:18:02 +0800
Subject: [PATCH 4/6] Attempt to optimize performance on arm cpus

---
 3rdparty/llama.cpp          | 2 +-
 python/t_mac/model_utils.py | 2 +-
 tools/run_pipeline.py       | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index b2fa0d8..b5b3394 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit b2fa0d86fcffb99e1bdda4080efc1fab70aed62f
+Subproject commit b5b33949aa49791d8e3252f01fdd9f37de01d3cc
diff --git a/python/t_mac/model_utils.py b/python/t_mac/model_utils.py
index 4c0ce82..464ecdb 100644
--- a/python/t_mac/model_utils.py
+++ b/python/t_mac/model_utils.py
@@ -239,7 +239,7 @@ def preprocess_for_t_mac(
             break
 
     if not found:
-        raise KeyError("GEMM of shape ({}, {}) is not found in {}. Please compile the kernels using T-MAC first.".format(M, K, args.kcfg))
+        raise KeyError("GEMM of shape ({}, {}) is not found in {}. Please compile the kernels using T-MAC first.".format(M, K, kcfg_file))
 
     w, scales = preprocess_weights(w, scales, zeros, bits=bits, g=g, bm=bm, kfactor=kfactor, simd_n_in=simd_n_in, simd_n_out=simd_n_out)
     return np.concatenate([w.flatten(), scales.astype(np.float32).view(np.uint8).flatten()])
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index c71c5f0..bc71647 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -147,6 +147,7 @@ def cmake_llamacpp():
         '-DGGML_TMAC=ON',
         f'-DCMAKE_PREFIX_PATH={cmake_prefix_path}',
         '-DCMAKE_BUILD_TYPE=Release',
+        '-DGGML_OPENMP=OFF',
     ]
     if FLAGS.device == "android":
         try:

From 4331d7579bb08a6d90ea5b76800afd72bf096249 Mon Sep 17 00:00:00 2001
From: Jianyu Wei
Date: Mon, 7 Oct 2024 23:12:21 +0800
Subject: [PATCH 5/6] Support armv8.7a+ cpus

---
 3rdparty/llama.cpp    | 2 +-
 tools/run_pipeline.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index b5b3394..9cb855e 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit b5b33949aa49791d8e3252f01fdd9f37de01d3cc
+Subproject commit 9cb855ea1bc415c4951e61a78f48d572eb1ac0a7
diff --git a/tools/run_pipeline.py b/tools/run_pipeline.py
index bc71647..8a3c753 100644
--- a/tools/run_pipeline.py
+++ b/tools/run_pipeline.py
@@ -164,9 +164,7 @@ def cmake_llamacpp():
         command.append("-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH")
     elif is_win():
         if is_arm():
-            command.append("-DCMAKE_C_COMPILER=clang")
-            command.append("-DCMAKE_CXX_COMPILER=clang++")
-            command.append("-G Ninja")
+            command.append("--preset arm64-windows-llvm-release")
         else:
             command.append("-T ClangCL")
     else:

From 76327215a6f6e30efc7cd867927fd7b11aca4ee3 Mon Sep 17 00:00:00 2001
From: Jianyu Wei
Date: Thu, 10 Oct 2024 20:45:49 +0800
Subject: [PATCH 6/6] Finish merging and rebasing llama.cpp

---
 .gitmodules                 | 1 +
 3rdparty/llama.cpp          | 2 +-
 README.md                   | 4 ++++
 python/t_mac/model_utils.py | 4 ++++
 python/t_mac/version.py     | 2 +-
 5 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 5ff98bd..f20d6c4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,4 @@
 [submodule "3rdparty/llama.cpp"]
 	path = 3rdparty/llama.cpp
 	url = https://github.com/kaleid-liner/llama.cpp
+	branch = master-rebased
diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 9cb855e..481426c 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 9cb855ea1bc415c4951e61a78f48d572eb1ac0a7
+Subproject commit 481426c8b61bf93c49a883045b0ad7f2198f3836
diff --git a/README.md b/README.md
index e07138f..011e5ad 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,8 @@
 
 ## News
 
+- 10/10/2024 🚀🚀: By updating and rebasing our llama.cpp version, T-MAC now supports more models (e.g., qwen2) and the end-to-end performance is further improved by 10~15%! Try qwen2 using [the Official GPTQ model](https://huggingface.co/Qwen/Qwen2-7B-Instruct-GPTQ-Int4).
+
 - 08/21/2024 🎉🎉: T-MAC paper is accepted by EuroSys 2025.
 
 - 08/17/2024 🚀: T-MAC now supports 1/2/4-bit quantized models of (almost) any architecture in GPTQ format.
@@ -32,6 +34,8 @@ T-MAC achieves a token generation throughput of 20 tokens/sec with a single core
 
 ## End-2-End Speedup
 
+> All of the following data is profiled based on llama.cpp b2794 (May 2024). The latest T-MAC and baseline, after updating the llama.cpp version, are further optimized by 10~15%.
+
 We evaluate the token generation performance of different models on five different devices: Surface Laptop 7, Apple M2-Ultra, Jetson AGX Orin, Raspberry Pi 5 and Surface Book 3. Check [datasheet](docs/profiling_data.md) for more details.
 
 > We evaluate BitNet-3B and Llama-2-7B (W2) with T-MAC 2-bit and llama.cpp Q2_K, and evaluate Llama-2-7B (W4) with T-MAC 4-bit and llama.cpp Q4_0.
diff --git a/python/t_mac/model_utils.py b/python/t_mac/model_utils.py
index 464ecdb..a4bf880 100644
--- a/python/t_mac/model_utils.py
+++ b/python/t_mac/model_utils.py
@@ -196,6 +196,7 @@ def extract_kernel_shapes(model_arch: Optional[str] = "gptq-auto", model_dir: Op
 
 def get_quantization_config(model_dir: Optional[str] = None) -> dict:
     hparams = _Model.load_hparams(Path(model_dir))
+    # GPTQ
     quantization_config = hparams.get("quantization_config", {})
     desc_act = quantization_config.get("desc_act", False)
     assert not desc_act, "desc_act=True currently unsupported by T-MAC"
@@ -204,6 +205,8 @@ def get_quantization_config(model_dir: Optional[str] = None) -> dict:
     bits = quantization_config.get("bits", 0)
     sym = quantization_config.get("sym", False)
     quant_method = quantization_config.get("quant_method", "")
+    # BitNet
+    weight_bits = hparams.get("weight_bits", 0)
 
     return {
         "quantizer": quantizer,
@@ -211,6 +214,7 @@ def get_quantization_config(model_dir: Optional[str] = None) -> dict:
         "bits": bits,
         "sym": sym,
         "quant_method": quant_method,
+        "weight_bits": weight_bits,
     }
 
 
diff --git a/python/t_mac/version.py b/python/t_mac/version.py
index 8d70ed6..6ba69d9 100644
--- a/python/t_mac/version.py
+++ b/python/t_mac/version.py
@@ -1 +1 @@
-__version__ = "1.0.0a3"
+__version__ = "1.0.0a4"
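
A minimal usage sketch of the extended get_quantization_config from PATCH 6/6 (not part of the series; the model directory path is a placeholder and only needs to contain a config.json readable by _Model.load_hparams):

    from t_mac.model_utils import get_quantization_config

    # For GPTQ checkpoints the fields come from "quantization_config" in
    # config.json; for BitNet-style checkpoints the new "weight_bits" field
    # is read from the top-level hparams instead.
    cfg = get_quantization_config("/path/to/model-dir")
    print(cfg["quant_method"], cfg["bits"], cfg["weight_bits"])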