openvinotoolkit
diff --git a/‎site/docs/concepts/optimization-techniques/kvcache-eviction-algorithm.md‎
Lines changed: 37 additions & 0 deletions b/‎site/docs/concepts/optimization-techniques/kvcache-eviction-algorithm.md‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎src/cpp/include/openvino/genai/cache_eviction.hpp‎
Lines changed: 68 additions & 2 deletions b/‎src/cpp/include/openvino/genai/cache_eviction.hpp‎
Lines changed: 68 additions & 2 deletions
diff --git a/‎src/cpp/src/continuous_batching/cache_eviction.cpp‎
Lines changed: 33 additions & 1 deletion b/‎src/cpp/src/continuous_batching/cache_eviction.cpp‎
Lines changed: 33 additions & 1 deletion
diff --git a/‎src/cpp/src/continuous_batching/cache_eviction.hpp‎
Lines changed: 2 additions & 0 deletions b/‎src/cpp/src/continuous_batching/cache_eviction.hpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/cpp/src/continuous_batching/kvcrush.cpp‎
Lines changed: 176 additions & 0 deletions b/‎src/cpp/src/continuous_batching/kvcrush.cpp‎
Lines changed: 176 additions & 0 deletions
@@ -60,3 +60,40 @@ It can be enabled by setting the `CacheEvictionConfig.apply_rotation` field to `
 * Cache rotation is only targeted for the regular, linear LLaMa-like RoPE application and may degrade accuracy on models that use other RoPE schemes.
 
 * Cache rotation is currently only supported for the models with uniform V embedding sizes across the layers.
+
+## (Optional) KVCrush
+
+KVCrush enhances the standard H2O/SnapKV eviction by retaining a small number of representative blocks from the evictable area, selected using clustering analysis, rather than simply evicting all of the lowest-scoring blocks.
+
+### Algorithm Overview
+
+1. **Indicator Creation**: Generate binary indicators for tokens based on importance scores
+2. **Anchor Point Generation**: Create reference patterns using configurable modes
+3. **Distance Calculation**: Measure Hamming distance between block patterns and the anchor point
+4. **Representative Selection**: Select blocks to best represent context diversity
+
+### Configuration
+Set up the KVCrush config parameters and pass them to `CacheEvictionConfig`. The following sample code allocates a budget of 2 blocks to KVCrush and uses the `MEAN` anchor point mode.
+```cpp
+const ov::genai::CacheEvictionConfig EXAMPLE_CACHE_EVICTION_CONFIG =
+    {32, 32, 192, ov::genai::AggregationMode::NORM_SUM, false, 8, KVCrushConfig(2, KVCrushAnchorPointMode::MEAN)};
+```
+```python
+CacheEvictionConfig(
+        start_size=32, 
+        recent_size=128, 
+        max_cache_size=448, 
+        aggregation_mode=AggregationMode.NORM_SUM,
+        apply_rotation=False,
+        snapkv_window_size=8,
+        kvcrush_config=KVCrushConfig(budget=2, anchor_point_mode=KVCrushAnchorPointMode.MEAN)
+    )
+```
+
+**Anchor Point Modes:**
+- `RANDOM`: Random binary pattern
+- `ZEROS`: All zeros pattern  
+- `ONES`: All ones pattern
+- `MEAN`: Per-position mean of the indicators across blocks, thresholded at 0.5
+- `ALTERNATE`: Alternating 0-1 pattern
+
@@ -19,22 +19,83 @@ enum class AggregationMode {
                 * of a given token in cache */
 };
 
+/**
+ * @brief Represents the mode of how anchor points are formed in the KVCrush cache eviction algorithm
+ */
+enum class KVCrushAnchorPointMode {
+    RANDOM, /**<In this mode the anchor point is a random binary vector of 0s and 1s */
+    ZEROS,  /**<In this mode the anchor point is a vector of 0s */
+    ONES,   /**<In this mode the anchor point is a vector of 1s */
+    MEAN, /**<In this mode the anchor point is a binary vector of 0s and 1s, where each value is 1 only if the mean
+             of the indicators at that position across the blocks is above 0.5 (majority value) */
+    ALTERNATE /**<In this mode the anchor point is a vector of alternating 0s and 1s */
+};
+
+/**
+ * @brief Configuration class for the KVCrush cache eviction algorithm.
+ *
+ * Encapsulates the KVCrush parameters: the cache budget (in blocks), the anchor point mode
+ * and the random seed.
+ */
+class KVCrushConfig {
+public:
+    /** @brief Creates a configuration with a zero budget, the RANDOM anchor point mode and a zero seed. */
+    KVCrushConfig() = default;
+
+    /**
+     * @brief Constructs a KVCrushConfig with the specified parameters.
+     * @param budget_ The cache budget, representing the number of blocks to store.
+     * @param anchor_point_mode_ The anchor point mode for KVCrush (see KVCrushAnchorPointMode).
+     * @param rng_seed_ Optional random seed for reproducibility (default is 0).
+     */
+    KVCrushConfig(size_t budget_, KVCrushAnchorPointMode anchor_point_mode_, size_t rng_seed_ = 0)
+        : budget(budget_), anchor_point_mode(anchor_point_mode_), rng_seed(rng_seed_) {}
+
+    /** @return The KVCrush cache budget, in blocks. */
+    std::size_t get_budget() const {
+        return budget;
+    }
+
+    /** KVCrush cache budget - number of blocks. */
+    std::size_t budget = 0;
+    /** KVCrush anchor point mode. */
+    KVCrushAnchorPointMode anchor_point_mode = KVCrushAnchorPointMode::RANDOM;
+    /** Random seed, intended for reproducibility. */
+    size_t rng_seed = 0;
+};
+
 /**
 * @brief Configuration struct for the cache eviction algorithm.
 */
 class CacheEvictionConfig {
 public:
     CacheEvictionConfig() = default;
 
-    CacheEvictionConfig(size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode_, bool apply_rotation_ = false, size_t snapkv_window_size_ = 8) : aggregation_mode(aggregation_mode_), apply_rotation(apply_rotation_), snapkv_window_size(snapkv_window_size_), m_start_size(start_size), m_recent_size(recent_size), m_max_cache_size(max_cache_size) {
+    /**
+     * @brief Constructs the cache eviction configuration and validates the area sizes.
+     * @param start_size Size of the "start" area of the KV cache; must be non-zero.
+     * @param recent_size Size of the "recent" area of the KV cache; must be non-zero.
+     * @param max_cache_size Maximum cache size; must be non-zero and strictly larger than
+     * start_size + recent_size.
+     * @param aggregation_mode_ Value stored in `aggregation_mode`.
+     * @param apply_rotation_ Value stored in `apply_rotation` (default is false).
+     * @param snapkv_window_size_ Value stored in `snapkv_window_size` (default is 8).
+     * @param kvcrush_config_ KVCrush configuration; the default has a zero budget.
+     */
+    CacheEvictionConfig(size_t start_size,
+                        size_t recent_size,
+                        size_t max_cache_size,
+                        AggregationMode aggregation_mode_,
+                        bool apply_rotation_ = false,
+                        size_t snapkv_window_size_ = 8,
+                        const KVCrushConfig& kvcrush_config_ = KVCrushConfig(0, KVCrushAnchorPointMode::RANDOM))
+        // Initializers follow the member declaration order: the public kvcrush_config field is declared
+        // before the private m_* fields, so it has to be listed before them to avoid a reorder warning.
+        : aggregation_mode(aggregation_mode_),
+          apply_rotation(apply_rotation_),
+          snapkv_window_size(snapkv_window_size_),
+          kvcrush_config(kvcrush_config_),
+          m_start_size(start_size),
+          m_recent_size(recent_size),
+          m_max_cache_size(max_cache_size) {
         OPENVINO_ASSERT(start_size, "CacheEvictionConfig.start_size must be non-zero");
         OPENVINO_ASSERT(recent_size, "CacheEvictionConfig.recent_size must be non-zero");
         OPENVINO_ASSERT(max_cache_size, "CacheEvictionConfig.max_cache_size must be non-zero");
 
         OPENVINO_ASSERT(max_cache_size > (start_size + recent_size),
                         "CacheEvictionConfig.max_cache_size must be larger than CacheEvictionConfig.start_size + CacheEvictionConfig.recent_size");
         m_evictable_size = m_max_cache_size - m_start_size - m_recent_size;
-
     }
 
     /** @return Number of tokens between the "start" and "recent" areas of KV cache that
@@ -76,6 +137,11 @@ class CacheEvictionConfig {
      * score aggregation. **/
     size_t snapkv_window_size = 8;
 
+    /** KVCrush configuration for this cache eviction algorithm.
+     * KVCrush is an additional mechanism that allows to retain some tokens in the cache
+     * even if they are not among the most important ones.*/
+    KVCrushConfig kvcrush_config;
+
 private:
     /** Number of tokens in the *beginning* of KV cache that should be retained
      * in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for
 
@@ -220,7 +220,7 @@ namespace ov::genai {
     CacheEvictionAlgorithm::CacheEvictionAlgorithm(const CacheEvictionConfig &eviction_config, size_t block_size,
                                                    size_t num_decoder_layers, size_t max_pool_window_size) :
             m_eviction_config(eviction_config), m_block_size(block_size), m_num_decoder_layers(num_decoder_layers),
-            m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size, eviction_config.snapkv_window_size)
+            m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size, eviction_config.snapkv_window_size), m_kvcrush_algo(eviction_config.kvcrush_config, block_size)
     {
             OPENVINO_ASSERT(!(m_eviction_config.get_start_size() % m_block_size),
                             "CacheEvictionConfig.start_size in tokens must be a multiple of block size ", m_block_size);
@@ -265,6 +265,38 @@ namespace ov::genai {
             size_t num_blocks_to_evict = get_num_blocks_to_evict(decoder_layer_idx);
             auto evicted_block_indices = get_indices_of_blocks_to_evict(scores_for_all_evictable_blocks, num_blocks_to_evict);
 
+            // KVCrush: start
+            bool should_apply_kvcrush = (m_eviction_config.kvcrush_config.budget > 0) &&
+                                        (evicted_block_indices.size() >= m_eviction_config.kvcrush_config.budget);
+            if (should_apply_kvcrush) {
+                size_t num_tokens_in_evictable_blocks = scores_for_all_evictable_blocks.size() * m_block_size;
+
+                auto kvcrush_retained_block_indices = m_kvcrush_algo.get_indices_of_blocks_to_retain_using_kvcrush(
+                    num_tokens_in_evictable_blocks,
+                    evicted_block_indices,
+                    m_score_manager.get_scores()[decoder_layer_idx]);
+
+                // Remove the indices in kvcrush_retained_block_indices from evicted_block_indices
+                if (!kvcrush_retained_block_indices.empty()) {
+                    // Convert both vectors to sets for efficient operations
+                    std::unordered_set<std::size_t> retained_set(kvcrush_retained_block_indices.begin(),
+                                                                 kvcrush_retained_block_indices.end());
+
+                    // Create a new vector containing only elements not in retained_set
+                    std::vector<std::size_t> filtered_evicted_indices;
+                    filtered_evicted_indices.reserve(evicted_block_indices.size());
+
+                    for (const auto& idx : evicted_block_indices) {
+                        if (retained_set.find(idx) == retained_set.end()) {
+                            filtered_evicted_indices.push_back(idx);
+                        }
+                    }
+                    // Replace the original vector with the filtered one
+                    evicted_block_indices = std::move(filtered_evicted_indices);
+                }
+            }
+            // KVCrush: end
+
             m_num_evicted_tokens += evicted_block_indices.size() * m_block_size;
 
             // No longer need to track the overall "heavy-hitter" attention scores for freshly evicted blocks
 
@@ -11,6 +11,7 @@
 #include "openvino/openvino.hpp"
 #include "continuous_batching/attention_output.hpp"
 #include "openvino/genai/cache_eviction.hpp"
+#include "continuous_batching/kvcrush.hpp"
 
 namespace ov::genai {
 
@@ -215,6 +216,7 @@ class CacheEvictionAlgorithm {
     void remove_scores_of_evicted_blocks(const std::vector<std::size_t>& evicted_block_indices, size_t decoder_layer_idx);
 
     CacheEvictionConfig m_eviction_config;
+    KVCrushAlgorithm m_kvcrush_algo;
     std::size_t m_block_size;
     std::size_t m_num_evicted_tokens = 0;
     std::size_t m_num_decoder_layers;
 
@@ -0,0 +1,176 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "continuous_batching/kvcrush.hpp"
+
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <unordered_set>
+namespace ov::genai {
+
+/**
+ * Creates a KVCrush algorithm instance: stores the given configuration and block size, and seeds the
+ * random number generator with the `rng_seed` taken from the configuration.
+ */
+KVCrushAlgorithm::KVCrushAlgorithm(const KVCrushConfig& kvcrush_config, size_t block_size)
+    : m_kvcrush_config(kvcrush_config), m_block_size(block_size), rng(std::mt19937(kvcrush_config.rng_seed)) {}
+
+// step 1: create_indicators_kvcrush()
+/**
+ * Builds a per-token binary indicator vector for the evictable area.
+ *
+ * The half of the tokens (rounded down) with the highest scores gets indicator 1, all other tokens get 0.
+ *
+ * @param num_tokens_in_evictable_blocks Number of tokens to build indicators for.
+ * @param evicted_block_indices Not used by this step; kept for interface compatibility.
+ * @param layer_scores Per-token scores; must hold at least `num_tokens_in_evictable_blocks` entries.
+ * @return Vector of `num_tokens_in_evictable_blocks` values, each either 0 or 1.
+ */
+std::vector<size_t> KVCrushAlgorithm::create_indicators_kvcrush(size_t num_tokens_in_evictable_blocks,
+                                                                std::vector<size_t>& evicted_block_indices,
+                                                                const std::vector<double>& layer_scores) {
+    // The comparator below reads layer_scores at every token index of the evictable area, so a shorter
+    // score vector would be read out of bounds.
+    OPENVINO_ASSERT(layer_scores.size() >= num_tokens_in_evictable_blocks,
+                    "KVCrush: got ",
+                    layer_scores.size(),
+                    " scores, but at least ",
+                    num_tokens_in_evictable_blocks,
+                    " are required");
+
+    const size_t num_top_tokens = num_tokens_in_evictable_blocks / 2;
+
+    std::vector<size_t> indices(num_tokens_in_evictable_blocks);
+    std::iota(indices.begin(), indices.end(), size_t{0});
+    // Only the membership in the top half matters, so a partial sort (descending by score) is enough.
+    std::partial_sort(indices.begin(), indices.begin() + num_top_tokens, indices.end(), [&](size_t i, size_t j) {
+        return layer_scores[i] > layer_scores[j];
+    });
+
+    std::vector<size_t> indicators(num_tokens_in_evictable_blocks, 0);
+    for (size_t i = 0; i < num_top_tokens; ++i) {
+        indicators[indices[i]] = 1;
+    }
+    return indicators;
+}
+// step 2: create_anchor_point_kvcrush()
+/**
+ * Produces the anchor point: a vector of `m_block_size` values (0 or 1) built according to the
+ * anchor point mode from the KVCrush configuration.
+ *
+ * @param num_tokens_in_evictable_blocks Number of tokens covered by `indicators` (read in MEAN mode only).
+ * @param indicators Per-token binary indicators (read in MEAN mode only).
+ * @return The anchor point vector.
+ * @throws An OpenVINO exception (via OPENVINO_THROW) if the configured mode is not a known enumerator.
+ */
+std::vector<size_t> KVCrushAlgorithm::create_anchor_point_kvcrush(size_t num_tokens_in_evictable_blocks,
+                                                                  std::vector<size_t>& indicators) {
+    const auto mode = m_kvcrush_config.anchor_point_mode;
+    std::vector<size_t> anchor(m_block_size);
+
+    if (mode == KVCrushAnchorPointMode::RANDOM) {
+        // One draw from {0, 1} per position, in position order
+        std::uniform_int_distribution<int> bit_dist(0, 1);
+        for (auto& bit : anchor) {
+            bit = bit_dist(rng);
+        }
+    } else if (mode == KVCrushAnchorPointMode::ZEROS) {
+        anchor.assign(m_block_size, 0);
+    } else if (mode == KVCrushAnchorPointMode::ONES) {
+        anchor.assign(m_block_size, 1);
+    } else if (mode == KVCrushAnchorPointMode::MEAN) {
+        const size_t block_count = num_tokens_in_evictable_blocks / m_block_size;
+        for (size_t offset = 0; offset < m_block_size; ++offset) {
+            // Count the set indicators found at this in-block offset over all blocks
+            size_t ones_at_offset = 0;
+            for (size_t block = 0; block < block_count; ++block) {
+                ones_at_offset += indicators[block * m_block_size + offset];
+            }
+
+            // The anchor bit is set only when the mean indicator value exceeds the 0.5 threshold
+            const double fraction = static_cast<double>(ones_at_offset) / block_count;
+            anchor[offset] = (fraction > 0.5) ? 1 : 0;
+        }
+    } else if (mode == KVCrushAnchorPointMode::ALTERNATE) {
+        for (size_t offset = 0; offset < m_block_size; ++offset) {
+            anchor[offset] = offset % 2;
+        }
+    } else {
+        OPENVINO_THROW("Invalid anchor point type");
+    }
+
+    return anchor;
+}
+
+// step 3: calculate_hamming_distance()
+/**
+ * Computes, for every whole block of the evictable area, the Hamming distance between the block's
+ * indicator pattern and the anchor point.
+ *
+ * @param num_tokens_in_evictable_blocks Number of tokens covered by `indicators`.
+ * @param indicators Per-token binary indicators.
+ * @param anchor_point Anchor pattern; read at positions 0 .. m_block_size - 1.
+ * @return One pair<hamming_distance, block_idx> per block, in ascending block index order.
+ */
+std::vector<std::pair<size_t, size_t>> KVCrushAlgorithm::calculate_hamming_distance_kvcrush(
+    size_t num_tokens_in_evictable_blocks,
+    std::vector<size_t>& indicators,
+    std::vector<size_t>& anchor_point) {
+    const size_t num_blocks = num_tokens_in_evictable_blocks / m_block_size;
+    std::vector<std::pair<size_t, size_t>> block_distances;  // pair<hamming_distance, block_idx>
+    block_distances.reserve(num_blocks);
+
+    for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        // num_blocks is rounded down, so every token index of the block is below
+        // num_tokens_in_evictable_blocks and no extra per-token bounds check is needed.
+        const size_t block_start = block_idx * m_block_size;
+        size_t hamming_distance = 0;
+        for (size_t j = 0; j < m_block_size; ++j) {
+            // Compare as size_t on both sides: no narrowing to int and no signed/unsigned comparison
+            if (indicators[block_start + j] != anchor_point[j]) {
+                ++hamming_distance;
+            }
+        }
+        block_distances.emplace_back(hamming_distance, block_idx);
+    }
+    return block_distances;
+}
+
+// step 4: get_representative_blocks()
+/**
+ * Selects the representative blocks to retain among the blocks eligible for KVCrush.
+ *
+ * Eligible blocks are ordered by their Hamming distance to the anchor point, and at most
+ * `budget` of them are picked at a uniform stride over that ordering.
+ *
+ * @param num_tokens_in_evictable_blocks Not used by this step; kept for interface compatibility.
+ * @param block_distances pair<hamming_distance, block_idx> entries; the comparator below assumes that
+ * entry `i` describes block `i`.
+ * @param blocks_eligible_for_kvcrush Indices of the blocks that may be retained.
+ * @return Indices of the retained blocks; empty when no block is eligible or the budget is zero.
+ */
+std::vector<std::size_t> KVCrushAlgorithm::get_representative_blocks_kvcrush(
+    size_t num_tokens_in_evictable_blocks,
+    std::vector<std::pair<size_t, size_t>>& block_distances,
+    const std::vector<size_t>& blocks_eligible_for_kvcrush) {
+    // A hash set makes each eligibility check O(1) instead of a linear scan per block
+    const std::unordered_set<size_t> eligible_blocks(blocks_eligible_for_kvcrush.begin(),
+                                                     blocks_eligible_for_kvcrush.end());
+
+    std::vector<size_t> filtered_block_indices;
+    filtered_block_indices.reserve(block_distances.size());
+    for (const auto& entry : block_distances) {
+        if (eligible_blocks.count(entry.second) != 0) {
+            filtered_block_indices.push_back(entry.second);
+        }
+    }
+
+    const size_t num_blocks_to_retain = std::min(filtered_block_indices.size(), m_kvcrush_config.get_budget());
+    if (num_blocks_to_retain == 0) {
+        // Nothing to retain; returning here also avoids dividing by zero when computing the stride
+        return {};
+    }
+
+    // Sort filtered_block_indices based on Hamming distance
+    std::sort(filtered_block_indices.begin(), filtered_block_indices.end(), [&](size_t a, size_t b) {
+        return block_distances[a].first < block_distances[b].first;
+    });
+
+    // Select num_blocks_to_retain blocks, uniformly spaced over the sorted order.
+    // i * step is always a valid position because step is rounded down.
+    const size_t step = filtered_block_indices.size() / num_blocks_to_retain;
+    std::vector<std::size_t> kvcrush_retained_block_indices;
+    kvcrush_retained_block_indices.reserve(num_blocks_to_retain);
+    for (size_t i = 0; i < num_blocks_to_retain; ++i) {
+        kvcrush_retained_block_indices.push_back(filtered_block_indices[i * step]);
+    }
+
+    return kvcrush_retained_block_indices;
+}
+
+/**
+ * Runs the four KVCrush steps in sequence and returns the indices of the blocks to retain.
+ *
+ * Only the blocks already chosen for eviction by the score-based eviction (`evicted_block_indices`)
+ * are eligible to be retained.
+ *
+ * @param num_tokens_in_evictable_blocks Number of tokens in the evictable blocks.
+ * @param evicted_block_indices Indices of the blocks selected for eviction by the score-based eviction.
+ * @param layer_scores Per-token scores passed on to the indicator creation step.
+ * @return Indices of the blocks selected by KVCrush to be retained.
+ */
+std::vector<std::size_t> KVCrushAlgorithm::get_indices_of_blocks_to_retain_using_kvcrush(
+    size_t num_tokens_in_evictable_blocks,
+    std::vector<std::size_t>& evicted_block_indices,
+    const std::vector<double>& layer_scores) {
+    // Step 1: binary feature vector (indicators) marking the top half of the tokens by score
+    auto token_indicators =
+        create_indicators_kvcrush(num_tokens_in_evictable_blocks, evicted_block_indices, layer_scores);
+
+    // Step 2: anchor point built according to the configured anchor point mode
+    auto anchor = create_anchor_point_kvcrush(num_tokens_in_evictable_blocks, token_indicators);
+
+    // Step 3: Hamming distance between the anchor point and the indicator pattern of each block
+    auto distances = calculate_hamming_distance_kvcrush(num_tokens_in_evictable_blocks, token_indicators, anchor);
+
+    // Step 4: representative blocks, picked only among the blocks that were selected for eviction
+    return get_representative_blocks_kvcrush(num_tokens_in_evictable_blocks, distances, evicted_block_indices);
+}
+
+}  // namespace ov::genai