From 5d42ca60522782a5d7c639a794647ed6f6d56526 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 13 Aug 2025 10:57:47 -0600 Subject: [PATCH 01/15] feat: Add python-side constants and conversion for adapter.lora.invocation_string Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- convert_lora_to_gguf.py | 4 ++++ gguf-py/gguf/constants.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index a67c0536a4128..4dae9bd95711d 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -369,7 +369,11 @@ def set_type(self): self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") def set_gguf_parameters(self): + logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) + if invocation_string := lparams.get("invocation_string"): + logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.LORA_INVOCATION_STRING, invocation_string) + self.gguf_writer.add_string(gguf.Keys.Adapter.LORA_INVOCATION_STRING, invocation_string) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # Never add extra tensors (e.g. rope_freqs) for LoRA adapters diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 61ebe6e5e7750..f619ec7056c70 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -231,8 +231,9 @@ class Tokenizer: MIDDLE_ID = "tokenizer.ggml.middle_token_id" class Adapter: - TYPE = "adapter.type" - LORA_ALPHA = "adapter.lora.alpha" + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + LORA_INVOCATION_STRING = "adapter.lora.invocation_string" class IMatrix: CHUNK_COUNT = "imatrix.chunk_count" From a242ee8c7210d81947c51b935e59a462548d8b99 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 13 Aug 2025 10:58:31 -0600 Subject: [PATCH 02/15] feat: Add c++ side constants for adapter.lora.invocation_string Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- src/llama-arch.cpp | 5 +++-- src/llama-arch.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index c759a9c6d9e05..8a5cd609c2f35 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -233,8 +233,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_LORA_INVOCATION_STRING, "adapter.lora.invocation_string" }, // deprecated { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 7af587e7951bc..eb8ee83c71a32 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -229,6 +229,7 @@ enum llm_kv { LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + LLM_KV_ADAPTER_LORA_INVOCATION_STRING, LLM_KV_POSNET_EMBEDDING_LENGTH, LLM_KV_POSNET_BLOCK_COUNT, From 4c214e42885be62c31d856b1528e1fce69377ac8 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 13 Aug 2025 11:20:20 -0600 Subject: [PATCH 03/15] feat: Parse invocation string for adapters from GGUF Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- src/llama-adapter.cpp | 5 +++++ src/llama-adapter.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/src/llama-adapter.cpp 
b/src/llama-adapter.cpp index 8d94034aed95d..975668d8f76d7 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -190,6 +190,11 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); + + adapter.invocation_string = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_INVOCATION_STRING)); + if (!adapter.invocation_string.empty()) { + LLAMA_LOG_INFO("%s: activated LoRA invocation string: '%s'\n", __func__, adapter.invocation_string.c_str()); + } } int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 65824e972765b..70ed7ad35b508 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -67,6 +67,9 @@ struct llama_adapter_lora { float alpha; + // activated lora (aLoRA) + std::string invocation_string; + llama_adapter_lora() = default; ~llama_adapter_lora() = default; From 636bb2b0914a2213ab0ccdb4196f24bcd198fd61 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 13 Aug 2025 15:09:22 -0600 Subject: [PATCH 04/15] fix(python): Update conversion to alora_invocation_tokens This is the preferred method in PEFT which is the source of ground truth https://github.com/huggingface/peft/pull/2609/files#diff-13380145401d203d5935c5189dd09879f990b81aa63e8e3aaff8ce9110333f0e Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- convert_lora_to_gguf.py | 13 ++++++++++--- gguf-py/gguf/constants.py | 6 +++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 4dae9bd95711d..a5d0ebebae08a 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -26,6 +26,8 @@ # reuse model definitions from convert_hf_to_gguf.py from convert_hf_to_gguf import LazyTorchTensor, ModelBase +from gguf.constants import GGUFValueType + logger = logging.getLogger("lora-to-gguf") @@ -371,9 +373,14 @@ def set_type(self): def set_gguf_parameters(self): logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) - if invocation_string := lparams.get("invocation_string"): - logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.LORA_INVOCATION_STRING, invocation_string) - self.gguf_writer.add_string(gguf.Keys.Adapter.LORA_INVOCATION_STRING, invocation_string) + if alora_invocation_tokens := lparams.get("alora_invocation_tokens"): + logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens) + self.gguf_writer.add_key_value( + gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, + alora_invocation_tokens, + GGUFValueType.ARRAY, + GGUFValueType.UINT32, + ) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # Never add extra tensors (e.g. 
rope_freqs) for LoRA adapters diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f619ec7056c70..696c0c31c0cb8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -231,9 +231,9 @@ class Tokenizer: MIDDLE_ID = "tokenizer.ggml.middle_token_id" class Adapter: - TYPE = "adapter.type" - LORA_ALPHA = "adapter.lora.alpha" - LORA_INVOCATION_STRING = "adapter.lora.invocation_string" + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + ALORA_INVOCATION_TOKENS = "adapter.alora.invocation_tokens" class IMatrix: CHUNK_COUNT = "imatrix.chunk_count" From 215841fc0178de0797d23f1cae8ab8ee1baa8eb9 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 13 Aug 2025 15:09:44 -0600 Subject: [PATCH 05/15] fix(cpp): Update to alora_invocation_tokens on c++ side Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- src/llama-adapter.cpp | 32 +++++++++++++++++++++++++++++--- src/llama-adapter.h | 2 +- src/llama-arch.cpp | 6 +++--- src/llama-arch.h | 2 +- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 975668d8f76d7..11d0462142b31 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -6,6 +6,7 @@ #include #include +#include #include // vec @@ -191,9 +192,34 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); - adapter.invocation_string = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_INVOCATION_STRING)); - if (!adapter.invocation_string.empty()) { - LLAMA_LOG_INFO("%s: activated LoRA invocation string: '%s'\n", __func__, adapter.invocation_string.c_str()); + // parse alora invocation sequence vector + const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS); + const int kid = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (kid >= 0) { + if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) { + throw std::runtime_error("invalid gguf type for " + key); + } + const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid); + if (arr_type != GGUF_TYPE_UINT32) { + throw std::runtime_error("invalid gguf element type for " + key); + } + const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid); + const void * data = gguf_get_arr_data(ctx_gguf.get(), kid); + adapter.alora_invocation_tokens.resize(seq_len); + std::copy( + (const llama_token *)data, + (const llama_token *)data + seq_len, + adapter.alora_invocation_tokens.begin()); + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < adapter.alora_invocation_tokens.size(); ++i) { + ss << adapter.alora_invocation_tokens[i]; + if (i < adapter.alora_invocation_tokens.size() - 1) { + ss << ", "; + } + } + ss << "]"; + LLAMA_LOG_INFO("%s: %s = %s\n", __func__, key.c_str(), ss.str().c_str()); } } diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 70ed7ad35b508..fa4d997952884 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -68,7 +68,7 @@ struct llama_adapter_lora { float alpha; // activated lora (aLoRA) - std::string invocation_string; + std::vector alora_invocation_tokens; llama_adapter_lora() = default; ~llama_adapter_lora() = default; diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 8a5cd609c2f35..e0a251e9b43c0 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -233,9 +233,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_ADAPTER_TYPE, 
"adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, - { LLM_KV_ADAPTER_LORA_INVOCATION_STRING, "adapter.lora.invocation_string" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" }, // deprecated { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index eb8ee83c71a32..8651c8ca77479 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -229,7 +229,7 @@ enum llm_kv { LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, - LLM_KV_ADAPTER_LORA_INVOCATION_STRING, + LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, LLM_KV_POSNET_EMBEDDING_LENGTH, LLM_KV_POSNET_BLOCK_COUNT, From 7212b9de843f948c4ef5cef1a94e15046fbd51d1 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Thu, 14 Aug 2025 13:11:00 -0600 Subject: [PATCH 06/15] feat: Add C APIs to get alora invocation token array from lora Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- include/llama.h | 4 ++++ src/llama-adapter.cpp | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/llama.h b/include/llama.h index c5622cc16b4c2..f2cfc18a3e777 100644 --- a/include/llama.h +++ b/include/llama.h @@ -557,6 +557,10 @@ extern "C" { // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); + // Get the invocation tokens if the current lora is an alora + LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter); + LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter); + // The following functions operate on a llama_context, hence the naming: llama_verb_... // Add a loaded LoRA adapter to given context diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 11d0462142b31..49019b4fc601b 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -417,3 +417,15 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p void llama_adapter_lora_free(llama_adapter_lora * adapter) { delete adapter; } + +uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) { + if (!adapter) { + return 0; + } + return adapter->alora_invocation_tokens.size(); +} + +const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) { + GGML_ASSERT(adapter); + return adapter->alora_invocation_tokens.data(); +} From 45eab0df389b817bd7aa35a649acf02454a1b363 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Thu, 14 Aug 2025 13:12:39 -0600 Subject: [PATCH 07/15] feat: Initial implementation of alora cache logic in server This does not yet do the part to identify the invocation tokens and only apply the lora adapter afterwards, but it does seem to produce correct results if the invocation tokens are the beginning of the uncached input. 
Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- tools/server/server.cpp | 11 ++++++++--- tools/server/utils.hpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6eb5aeb582b3a..e297aacf65ac4 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -116,7 +116,7 @@ struct slot_params { int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit @@ -2282,8 +2282,13 @@ struct server_context { slot.prompt_tokens = std::move(task.prompt_tokens); if (!are_lora_equal(slot.params.lora, slot.lora)) { - // if lora is changed, we cannot reuse cached tokens - slot.cache_tokens.clear(); + // if lora has changed, check to see if the cache should be cleared + if (lora_should_clear_cache(slot.lora, slot.params.lora)) { + SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), slot.params.lora.size()); + slot.cache_tokens.clear(); + } else { + SLT_INF(slot, "keeping cache for alora. %zu target loras\n", slot.params.lora.size()); + } slot.lora = slot.params.lora; } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 036060bb3e9c7..76cc09e480ec5 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -992,6 +992,34 @@ static bool are_lora_equal( return true; } +// if the two sets of loras are different, they require a cache clear unless the +// current lora set is empty and the next lora set is all activated loras. +static bool lora_should_clear_cache( + const std::vector & current, + const std::vector & next) { + + // This should always be called after determining that the two sets are + // _not_ equal. This assert is therefore some slightly wasted work and + // should be safe to remove as long as this method is called correctly. + GGML_ASSERT(!are_lora_equal(current, next)); + + // If the current has _any_ non-disabled loras (activated or otherwise) and + // is not the same as the target set, the cache should be cleared. + for (const auto & lora : current) { + if (lora.scale != 0) { + return true; + } + } + + // If the next has only aloras, the cache should not be cleared + for (const auto & lora : next) { + if (lora.scale != 0 && llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) { + return true; + } + } + return false; +} + // parse lora config from JSON request, returned a copy of lora_base with updated scale static std::vector parse_lora_request( const std::vector & lora_base, From 457c757e7d167376195289f64c77ef98d0f78e3b Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 15 Aug 2025 10:48:48 -0600 Subject: [PATCH 08/15] feat: Identify alora invocation sequences This currently limits to a single enabled alora per slot. 
Multiple aloras with different invocation sequences would be possible, but it would require a more complex integration of the adapter toggling and is not really a well studied case for alora since it's unclear if one alora can reuse cache from previous prefill computed with a different alora. Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- tools/server/server.cpp | 53 +++++++++++++++++++++++++++++++++++++++++ tools/server/utils.hpp | 45 +++++++++++++++++++++------------- 2 files changed, 82 insertions(+), 16 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index e297aacf65ac4..9255a80954068 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1322,6 +1322,7 @@ struct server_slot { common_speculative * spec = nullptr; std::vector lora; + int32_t alora_invocation_start = -1; // the index relative to completion multi-task request size_t index = 0; @@ -1416,6 +1417,9 @@ struct server_slot { // clear speculative decoding stats n_draft_total = 0; n_draft_accepted = 0; + + // clear alora start + alora_invocation_start = -1; } bool need_embd() const { @@ -2292,6 +2296,55 @@ struct server_context { slot.lora = slot.params.lora; } + // if using alora, make sure it's only a single one requested and active + size_t alora_invocation_start = slot.prompt_tokens.size(); + if (lora_all_alora(slot.lora)) { + + const auto & enabled_ids = lora_get_enabled_ids(slot.lora); + // TODO: This will error out if a user requests two aloras, but only + // provides the activation string for one. We could, instead search + // for all requested alora activation strings and then either keep + // only the last one, or reject if multiple are found. + if (enabled_ids.size() != 1) { + send_error(task, "Cannot run multiple aLoRAs in a single request", ERROR_TYPE_INVALID_REQUEST); + return false; + } + const auto & lora = slot.lora[enabled_ids[0]].ptr; + + // get the pointer and count for the invocation tokens + const uint64_t n_invocation_tokens = llama_adapter_get_alora_n_invocation_tokens(lora); + const llama_token * invocation_tokens = llama_adapter_get_alora_invocation_tokens (lora); + + // scan backwards through the prompt tokens to find the last + // occurrence of the invocation sequence + int match_idx = static_cast(n_invocation_tokens) - 1; + for (int i = slot.prompt_tokens.size() - 1; i >= 0; --i) { + // the token in this position matches the next token to find in + // the invocation sequence + if (slot.prompt_tokens[i] == invocation_tokens[match_idx]) { + // if it's a full match, we've found the start + if (match_idx == 0) { + alora_invocation_start = i; + break; + } + // otherwise, check the next token in the sequence + --match_idx; + } else { + // no match in this position, so start looking over again + match_idx = static_cast(n_invocation_tokens) - 1; + } + } + + // if the activation string is not found, disable the alora + if (alora_invocation_start == slot.prompt_tokens.size()) { + SLT_DBG(slot, "alora %zu requested, but not found. 
deactivating\n", enabled_ids[0]); + slot.lora[enabled_ids[0]].scale = 0.0f; + } else { + SLT_DBG(slot, "alora %zu activated starting at %zu\n", enabled_ids[0], alora_invocation_start); + slot.alora_invocation_start = alora_invocation_start; + } + } + if (!slot.prompt_tokens.validate(ctx)) { send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); return false; diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 76cc09e480ec5..d6808f9db437c 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -992,8 +992,33 @@ static bool are_lora_equal( return true; } +// get the ids of all enabled loras +static std::vector lora_get_enabled_ids(const std::vector & loras) { + std::vector enabled_ids; + for (size_t i = 0; i < loras.size(); ++i) { + if (loras[i].scale > 0) { + enabled_ids.push_back(i); + } + } + return enabled_ids; +} + +// check whether the given lora set has only aloras activated (empty => false) +static bool lora_all_alora(const std::vector & loras) { + bool found_alora = false; + for (const auto & lora : loras) { + if (lora.scale != 0) { + if (llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) { + return false; + } + found_alora = true; + } + } + return found_alora; +} + // if the two sets of loras are different, they require a cache clear unless the -// current lora set is empty and the next lora set is all activated loras. +// change is only from aloras to aloras. static bool lora_should_clear_cache( const std::vector & current, const std::vector & next) { @@ -1003,21 +1028,9 @@ static bool lora_should_clear_cache( // should be safe to remove as long as this method is called correctly. GGML_ASSERT(!are_lora_equal(current, next)); - // If the current has _any_ non-disabled loras (activated or otherwise) and - // is not the same as the target set, the cache should be cleared. - for (const auto & lora : current) { - if (lora.scale != 0) { - return true; - } - } - - // If the next has only aloras, the cache should not be cleared - for (const auto & lora : next) { - if (lora.scale != 0 && llama_adapter_get_alora_n_invocation_tokens(lora.ptr) == 0) { - return true; - } - } - return false; + return ( + !(lora_get_enabled_ids(current).empty() or lora_all_alora(current)) or + !lora_all_alora(next)); } // parse lora config from JSON request, returned a copy of lora_base with updated scale From a5a75da1e06dce9abc88ebf3c246e72bd97418e8 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 15 Aug 2025 10:50:11 -0600 Subject: [PATCH 09/15] feat: Only reuse cache for tokens before the alora invocation start This is a bit of an edge case, but theoretically a user could try the same query with the alora disabled (just using the base model), then retry with the alora. The cached tokens from the first pass should be invalid. 
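For orientation, the two pieces introduced in the previous patch (the backwards scan that locates the invocation sequence, and the relaxed cache-clear rule) can be mirrored in a few lines of standalone Python. This is a sketch for reasoning about edge cases like the one described above, not part of the patch; the token IDs are made up, and the (scale, is_alora) tuples stand in for the server's common_adapter_lora_info entries and the llama_adapter_get_alora_n_invocation_tokens() check.

# Sketch only: mirror of the backwards scan added to server.cpp.
# Returns the index of the first token of the LAST occurrence of the
# invocation sequence in the prompt, or len(prompt) if it is not found.
def find_alora_invocation_start(prompt: list[int], invocation: list[int]) -> int:
    match_idx = len(invocation) - 1
    for i in range(len(prompt) - 1, -1, -1):
        if prompt[i] == invocation[match_idx]:
            if match_idx == 0:
                return i                      # full match found
            match_idx -= 1                    # keep matching towards the sequence start
        else:
            match_idx = len(invocation) - 1   # mismatch: restart the search
    return len(prompt)                        # sentinel: not found

# Sketch only: mirror of lora_all_alora / lora_should_clear_cache from utils.hpp.
def lora_all_alora(loras: list[tuple[float, bool]]) -> bool:
    enabled = [is_alora for scale, is_alora in loras if scale != 0.0]
    return bool(enabled) and all(enabled)

def lora_should_clear_cache(current, nxt) -> bool:
    # assumes current != nxt, mirroring the GGML_ASSERT in utils.hpp
    current_enabled = [scale for scale, _ in current if scale != 0.0]
    keep = (not current_enabled or lora_all_alora(current)) and lora_all_alora(nxt)
    return not keep

# made-up token IDs, purely for illustration
assert find_alora_invocation_start([7, 3, 9, 1, 2, 3, 5, 1, 2, 3, 4], [1, 2, 3]) == 7
# base model -> single activated aLoRA keeps the cache; plain LoRA -> aLoRA clears it
assert lora_should_clear_cache([], [(1.0, True)]) is False
assert lora_should_clear_cache([(1.0, False)], [(1.0, True)]) is True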
Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- tools/server/server.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9255a80954068..5023989107379 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3333,6 +3333,12 @@ struct server_context { // reuse any previously computed tokens that are common with the new prompt slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens); + // if there is an alora invoked, don't cache after the invocation start + if (slot.alora_invocation_start >= 0) { + SLT_DBG(slot, "only caching to alora invocation start (n_past=%d, alora_invocation_start=%d)\n", slot.n_past, slot.alora_invocation_start); + slot.n_past = std::min(slot.n_past, slot.alora_invocation_start); + } + // reuse chunks from the cached prompt by shifting their KV cache in the new position if (params_base.n_cache_reuse > 0) { size_t head_c = slot.n_past; // cache From b4a0cecc62d8588526865637580c056c8a9e7caf Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 15 Aug 2025 12:06:48 -0600 Subject: [PATCH 10/15] feat: Handle un-cached tokens that come before the alora activation The solution is to only fill up to the token before the invocation start in the batch if there are any tokens to be prefilled between those pulled from cache and the invocation start. When this is detected, the alora is temporarily disabled with a scale of 0.0, then immediately re-enabled after it has been initialized for the internal graph. Since the batch does not complete the prompt tokens, the remaining prompt tokens are handled in the next task, pulling all of the non-alora tokens from cache and proceeding with prefill for the alora tokens. Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- tools/server/server.cpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 5023989107379..448fb40a69d21 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3213,6 +3213,8 @@ struct server_context { int32_t n_ubatch = llama_n_ubatch(ctx); // next, batch any pending prompts without exceeding n_batch + float alora_scale = -1.0f; + size_t alora_disabled_id = 0; if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one @@ -3511,6 +3513,20 @@ struct server_context { slot.n_prompt_tokens_processed += n_pos; } + // If using an alora, there may be uncached tokens that come + // before the invocation sequence. When this happens, the + // tokens before the invocation sequence need to be + // processed without the adpter in a separate batch, then + // the adapter needs to be enabled for the remaining tokens. 
+ if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.n_past) { + SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_past = %d, alora_invocation_start = %d)\n", slot.n_past, slot.alora_invocation_start); + const auto & enabled_loras = lora_get_enabled_ids(slot.lora); + GGML_ASSERT(enabled_loras.size() == 1); + alora_scale = slot.lora[enabled_loras[0]].scale; + slot.lora[enabled_loras[0]].scale = 0.0f; + alora_disabled_id = enabled_loras[0]; + } + // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { // get next token to process @@ -3519,6 +3535,14 @@ struct server_context { break; // end of text chunk } + // if this is an alora request with pre-invocation + // tokens that are not cached, we need to stop filling + // this batch at those pre-invocation tokens. + if (alora_scale > 0 && slot.n_past == slot.alora_invocation_start - 1) { + SLT_DBG(slot, "stop prompt batch filling at (n_past = %d, alora_invocation_start = %d)\n", slot.n_past, slot.alora_invocation_start); + break; + } + // embedding requires all tokens in the batch to be output const bool need_embd = server_task_type_need_embd(slot.task_type); @@ -3577,6 +3601,13 @@ struct server_context { // apply lora, only need to do it once per batch common_set_adapter_lora(ctx, slot_batched->lora); + // if the lora is temporarily disabled for an alora, re-enable it + // for next time + if (alora_scale > 0.0f) { + SRV_DBG("re-enabling alora with scale %f\n", alora_scale); + slot_batched->lora[alora_disabled_id].scale = alora_scale; + } + llama_set_embeddings(ctx, slot_batched->need_embd()); } From d03d1067368dde2cc29a69259908eb00a22cbb5b Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 15 Aug 2025 12:27:51 -0600 Subject: [PATCH 11/15] fix: Use || instead of 'or' Too much python :facepalm: Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- tools/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index d6808f9db437c..4ebfaf09fb615 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1029,7 +1029,7 @@ static bool lora_should_clear_cache( GGML_ASSERT(!are_lora_equal(current, next)); return ( - !(lora_get_enabled_ids(current).empty() or lora_all_alora(current)) or + !(lora_get_enabled_ids(current).empty() || lora_all_alora(current)) || !lora_all_alora(next)); } From 52b37e13bc8adedf816826a51773e860cefa2d81 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 18 Aug 2025 15:06:56 -0600 Subject: [PATCH 12/15] fix: Fix off-by-one for limiting cached tokens to before alora start This was the cause of the inconsistent results from the dummy test script with and without the turn that runs the prompt without the adapter before running it with the adapter. 
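In other words, the common-prefix reuse is now capped one token before the invocation sequence, so the token at index alora_invocation_start - 1 is always recomputed in the adapter-enabled pass set up two patches earlier instead of being pulled from a cache that may have been filled without the adapter. A tiny illustration with placeholder numbers (not taken from a real run):

# placeholder values, purely to illustrate the corrected clamp
n_common_prefix        = 42   # slot.cache_tokens.get_common_prefix(prompt_tokens)
alora_invocation_start = 37   # index of the first invocation token in the prompt

# before the fix: n_past = min(42, 37) = 37, so the token at index 36 was reused
# after the fix:  reuse stops one token earlier and index 36 is recomputed
n_past = min(n_common_prefix, alora_invocation_start - 1)
assert n_past == 36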
Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- tools/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 448fb40a69d21..a5508811fbe71 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3338,7 +3338,7 @@ struct server_context { // if there is an alora invoked, don't cache after the invocation start if (slot.alora_invocation_start >= 0) { SLT_DBG(slot, "only caching to alora invocation start (n_past=%d, alora_invocation_start=%d)\n", slot.n_past, slot.alora_invocation_start); - slot.n_past = std::min(slot.n_past, slot.alora_invocation_start); + slot.n_past = std::min(slot.n_past, slot.alora_invocation_start - 1); } // reuse chunks from the cached prompt by shifting their KV cache in the new position From da2e8c1e62bbb9f97f0f88b68ec99d5dd5ef3e03 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 18 Aug 2025 15:52:08 -0600 Subject: [PATCH 13/15] fix: Support backwards-compatibility for "invocation_string" in adapter_config.json While this has been replaced in the PEFT PR in favor of alora_invocation_tokens, the existing adapters in the ibm-granite org on HF use "invocation_string," so this will enable backwards compatibility and enable testing now (before PEFT PR changes have percolated everywhere). Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- convert_lora_to_gguf.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index a5d0ebebae08a..befe8ab9cc838 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -12,7 +12,7 @@ from math import prod from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast -from transformers import AutoConfig +from transformers import AutoConfig, AutoTokenizer import torch @@ -373,7 +373,22 @@ def set_type(self): def set_gguf_parameters(self): logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) - if alora_invocation_tokens := lparams.get("alora_invocation_tokens"): + alora_invocation_tokens = lparams.get("alora_invocation_tokens") + invocation_string = lparams.get("invocation_string") + if invocation_string and not alora_invocation_tokens: + logger.debug("Tokenizing invocation_string -> alora_invocation_tokens") + base_model_path_or_id = hparams.get("_name_or_path") + try: + tokenizer = AutoTokenizer.from_pretrained(base_model_path_or_id) + except ValueError: + logger.error("Unable to load tokenizer from %s", base_model_path_or_id) + raise + # NOTE: There's an off-by-one with the older aLoRAs where + # the invocation string includes the "<|start_of_turn|>" + # token, but the adapters themselves were trained to + # activate _after_ that first token, so we drop it here. 
+ alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:] + if alora_invocation_tokens: logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens) self.gguf_writer.add_key_value( gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, From 5958557f0af5a9cee338f8636704ed2c5dcfdc16 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 5 Sep 2025 15:09:29 -0600 Subject: [PATCH 14/15] fix: Remove duplicate logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Gabe Goodhart Co-authored-by: Sigbjørn Skjæret --- src/llama-adapter.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index ad407e32bcae4..d8eef75a7ad70 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -235,16 +235,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ (const llama_token *)data, (const llama_token *)data + seq_len, adapter.alora_invocation_tokens.begin()); - std::stringstream ss; - ss << "["; - for (size_t i = 0; i < adapter.alora_invocation_tokens.size(); ++i) { - ss << adapter.alora_invocation_tokens[i]; - if (i < adapter.alora_invocation_tokens.size() - 1) { - ss << ", "; - } - } - ss << "]"; - LLAMA_LOG_INFO("%s: %s = %s\n", __func__, key.c_str(), ss.str().c_str()); } } From 564767656f9f945a30511f999b9a8ca627fad314 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 5 Sep 2025 15:31:51 -0600 Subject: [PATCH 15/15] feat: Report alora_invocation_string and alora_invocation_tokens from /lora-adapters Branch: gabe-l-hart/alora-support Signed-off-by: Gabe Goodhart --- tools/server/server.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b95f786108c46..73fc43bada543 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5085,13 +5085,26 @@ int main(int argc, char ** argv) { const auto & loras = ctx_server.params_base.lora_adapters; for (size_t i = 0; i < loras.size(); ++i) { auto & lora = loras[i]; - result.push_back({ + json entry = { {"id", i}, {"path", lora.path}, {"scale", lora.scale}, {"task_name", lora.task_name}, {"prompt_prefix", lora.prompt_prefix}, - }); + }; + std::string alora_invocation_string = ""; + const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); + std::vector alora_invocation_tokens; + if (n_alora_tokens) { + const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); + for (uint64_t i = 0; i < n_alora_tokens; ++i) { + alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); + alora_invocation_tokens.push_back(alora_tokens[i]); + } + entry["alora_invocation_string"] = alora_invocation_string; + entry["alora_invocation_tokens"] = alora_invocation_tokens; + } + result.push_back(std::move(entry)); } res_ok(res, result); res.status = 200; // HTTP OK
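As a usage sketch for the new fields (not part of the patch, and assuming a llama-server instance already running with a converted aLoRA adapter loaded via --lora), the /lora-adapters listing can be inspected like this; the localhost:8080 address and the absence of an API key are assumptions:

import json
import urllib.request

# assumption: llama-server is running locally with an aLoRA adapter loaded
with urllib.request.urlopen("http://localhost:8080/lora-adapters") as resp:
    adapters = json.load(resp)

for a in adapters:
    line = f"id={a['id']} path={a['path']} scale={a['scale']}"
    # the alora_* fields are only reported for aLoRA adapters (added in this patch)
    if "alora_invocation_tokens" in a:
        line += f" invocation={a['alora_invocation_string']!r}"
    print(line)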