getsentry · vgrozdanic · Jul 9, 2025 · Jul 9, 2025 · vgrozdanic · Jul 8, 2025
@@ -20,6 +20,7 @@
 - Add support for playstation data requests. ([#4870](https://github.com/getsentry/relay/pull/4870))
 - Expand the NEL attributes & others. ([#4874](https://github.com/getsentry/relay/pull/4874))
 - Normalize legacy AI agents attributes to OTel compatible names. ([#4916](https://github.com/getsentry/relay/pull/4916))
+- Fix cost calculation for cached and reasoning tokens. ([#4922](https://github.com/getsentry/relay/pull/4922))
 
 ## 25.6.2
 

@@ -2330,8 +2330,8 @@ mod tests {
                         "data": {
                             "gen_ai.usage.input_tokens": 1000,
                             "gen_ai.usage.output_tokens": 2000,
-                            "gen_ai.usage.output_tokens.reasoning": 3000,
-                            "gen_ai.usage.input_tokens.cached": 4000,
+                            "gen_ai.usage.output_tokens.reasoning": 1000,
+                            "gen_ai.usage.input_tokens.cached": 500,
                             "gen_ai.request.model": "claude-2.1"
                         }
                     },
@@ -2382,15 +2382,15 @@ mod tests {
                                 input_per_token: 0.01,
                                 output_per_token: 0.02,
                                 output_reasoning_per_token: 0.03,
-                                input_cached_per_token: 0.0,
+                                input_cached_per_token: 0.04,
                             },
                         ),
                         (
                             "gpt4-21-04".to_owned(),
                             ModelCostV2 {
                                 input_per_token: 0.09,
                                 output_per_token: 0.05,
-                                output_reasoning_per_token: 0.06,
+                                output_reasoning_per_token: 0.0,
                                 input_cached_per_token: 0.0,
                             },
                         ),
@@ -2408,7 +2408,7 @@ mod tests {
             .and_then(|span| span.data.value());
         assert_eq!(
             first_span_data.and_then(|data| data.gen_ai_usage_total_cost.value()),
-            Some(&Value::F64(140.0))
+            Some(&Value::F64(75.0))
         );
         assert_eq!(
             first_span_data.and_then(|data| data.gen_ai_response_tokens_per_second.value()),
@@ -2525,8 +2525,8 @@ mod tests {
                         "data": {
                             "gen_ai.usage.input_tokens": 1000,
                             "gen_ai.usage.output_tokens": 2000,
-                            "gen_ai.usage.output_tokens.reasoning": 3000,
-                            "gen_ai.usage.input_tokens.cached": 4000,
+                            "gen_ai.usage.output_tokens.reasoning": 1000,
+                            "gen_ai.usage.input_tokens.cached": 500,
                             "gen_ai.request.model": "claude-2.1"
                         }
                     },
@@ -2562,8 +2562,8 @@ mod tests {
                             ModelCostV2 {
                                 input_per_token: 0.01,
                                 output_per_token: 0.02,
-                                output_reasoning_per_token: 0.03,
-                                input_cached_per_token: 0.0,
+                                output_reasoning_per_token: 0.0,
+                                input_cached_per_token: 0.04,
                             },
                         ),
                         (
@@ -2589,7 +2589,7 @@ mod tests {
                 .and_then(|span| span.value())
                 .and_then(|span| span.data.value())
                 .and_then(|data| data.gen_ai_usage_total_cost.value()),
-            Some(&Value::F64(140.0))
+            Some(&Value::F64(65.0))
         );
         assert_eq!(
             spans

@@ -32,11 +32,24 @@ fn calculate_ai_model_cost(model_cost: Option<ModelCostV2>, data: &SpanData) ->
 
     let mut result = 0.0;
 
-    result += cost_per_token.input_per_token * input_tokens_used.unwrap_or(0.0);
-    result += cost_per_token.output_per_token * output_tokens_used.unwrap_or(0.0);
-    result +=
-        cost_per_token.output_reasoning_per_token * output_reasoning_tokens_used.unwrap_or(0.0);
+    // Cached tokens are a subset of the input tokens, so subtract them from the
+    // input tokens to avoid billing them twice (they are charged at the cached rate below).
+    result += cost_per_token.input_per_token
+        * (input_tokens_used.unwrap_or(0.0) - input_cached_tokens_used.unwrap_or(0.0));
     result += cost_per_token.input_cached_per_token * input_cached_tokens_used.unwrap_or(0.0);
+    // Reasoning tokens are a subset of the output tokens, so subtract them from the
+    // output tokens; they are billed separately below.
+    result += cost_per_token.output_per_token
+        * (output_tokens_used.unwrap_or(0.0) - output_reasoning_tokens_used.unwrap_or(0.0));
+
+    if cost_per_token.output_reasoning_per_token > 0.0 {
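+        // A dedicated reasoning price is configured, so bill reasoning tokens at that rate.
+        // Most models currently do not differentiate between reasoning and output token
+        // cost; for those, the `else` branch falls back to the regular output price.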
+        result +=
+            cost_per_token.output_reasoning_per_token * output_reasoning_tokens_used.unwrap_or(0.0);
+    } else {
+        result += cost_per_token.output_per_token * output_reasoning_tokens_used.unwrap_or(0.0);
+    }
 
     Some(result)
 }