Skip to content

Commit

Permalink
feat(context): Add Prompt Caching to Code Context (CODY-4807) (#6878)
Browse files Browse the repository at this point in the history
**Problem (why)**
Currently the daily cost and token usage on models is very high. We want
to find some ways to reduce them.

**Solution (context)**
Prompt caching can significantly reduce token costs. Each cache hit
reduces costs by 90%, while each cache miss increases costs by 25%.

After some [initial
analysis](https://docs.google.com/document/d/1y-pOAUgwksyMx-Uq1rutzWYd6qb13iqikTSaAn6UdKM/edit?tab=t.0#heading=h.202nr339v40e),
we decided to start by implementing [prompt
caching](https://linear.app/sourcegraph/issue/CODY-4000/determine-effort-for-prompt-caching-in-sonnet-35-and-haiku-35)
for Claude models.

**Implementation (what)**
- Adding the `cache_control: { type: 'ephemeral' }` field to cached content
blocks, which creates a cache with a 5-minute TTL.
- Server Side Implementation in this
[PR](sourcegraph/sourcegraph#3198)

**Anthropic Docs (context)**
- https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
- https://docs.anthropic.com/en/api/messages

## Test plan
- Tested locally and cache is being added
  • Loading branch information
julialeex authored Feb 1, 2025
1 parent 4ca3877 commit f32989a
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ log:
value: 1; mode=block
- name: strict-transport-security
value: max-age=31536000; includeSubDomains; preload
headersSize: 1436
headersSize: 1296
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -431,15 +431,25 @@ log:
value: null
url: https://sourcegraph.com/.api/graphql?CurrentSiteCodyLlmConfiguration
response:
bodySize: 259
bodySize: 248
content:
encoding: base64
mimeType: application/json
size: 259
text: "[\"H4sIAAAAAAAAAwAAAP//\",\"hM5NDoIwEAXgu8yaaoMQDVu2svMCYztAA3ZIf4yG9O4G\
NhI1cfWSyZsvbwaNAaGawZtASyrWz/O5qdm2posOg2G73nsMDWsaoQLP0SnqHE79Xo0\
YNYnDrhSeraUA2bvb4OPCA1kPVVFKKTNo0Yf6DyV6NEOEj/LGOq6U4ts00rLvF6aJJk\
80CMWanLjnYjSBxBU9wdfvxs5lcUoppRcAAAD//wMAMOH90BoBAAA=\"]"
size: 248
text: "[\"H4sIAAAAAAAAA4TOTQ6CMBAF4LvMmmqDEA1btrLzAmM7QAN2SH+MhvTuBjYSNXH1ksmbL\
28GjQGhmsGbQEsq1s/zuanZtqaLDoNhu957DA1rGqECz9Ep6hxO/V6NGDWJw64Unq2l\
ANm72+DjwgNZD1VRSikzaNGH+g8lejRDhI/yxjqulOLbNNKy7xemiSZPNAjFmpy452I\
0gcQVPcHX78bOZXFKKaUXAAAA//8DADDh/dAaAQAA\"]"
textDecoded:
data:
site:
codyLLMConfiguration:
chatModel: sourcegraph/claude-3.5-sonnet
chatModelMaxTokens: 45000
completionModel: sourcegraph/deepseek-coder-v2-lite-base
completionModelMaxTokens: 2048
fastChatModel: sourcegraph/claude-3-haiku
fastChatModelMaxTokens: 7000
cookies: []
headers:
- name: date
Expand Down Expand Up @@ -469,7 +479,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -574,7 +584,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -642,18 +652,13 @@ log:
value: null
url: https://sourcegraph.com/.api/graphql?CurrentSiteCodyLlmProvider
response:
bodySize: 128
bodySize: 131
content:
encoding: base64
mimeType: application/json
size: 128
text: "[\"H4sIAAAAAAAAA6pWSkksSVSyqlYqzixJBdHJ+SmVPj6+zvl5aZnppUWJJZn5eSDxgqL8s\
syU1CIlK6Xi/NKi5NT0osSCDKXa2tpaAAAAAP//AwAfFAXARQAAAA==\"]"
textDecoded:
data:
site:
codyLLMConfiguration:
provider: sourcegraph
size: 131
text: "[\"H4sIAAAAAAAAA6pWSkksSVSyqlY=\",\"Ks4sSQXRyfkplT4+vs75eWmZ6aVFiSWZ+Xkg\
8YKi/LLMlNQiJSul4vzSouTU9KLEggyl2traWgAAAAD//wMAHxQFwEUAAAA=\"]"
cookies: []
headers:
- name: date
Expand Down Expand Up @@ -683,7 +688,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -814,7 +819,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -933,7 +938,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -1039,7 +1044,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -1105,24 +1110,16 @@ log:
value: null
url: https://sourcegraph.com/.api/graphql?ViewerSettings
response:
bodySize: 280
bodySize: 283
content:
encoding: base64
mimeType: application/json
size: 280
text: "[\"H4sIAAAAAAAAA4zPwUoDQRAE0H/pc75gbyoGAwrikttcOkk529D2LD292cRl/l0WAsGD4\
LXqUVALnTiYuoXOghneI0Is1zX5FGOljpZEuIxw+YIF6xYck6Mm6tbG+KB45e/ro5ZD\
f7Xgy4vkQSUP61KiLnzCJpFh7sF+HD5QJ42taMDrOxv0T1T3u3s3euFjyBm/xEPOjsw\
hxerd1n+Qocxvk4ao2G3yqVgtiptpm0RlhO3s+SRRfD3cGrXWfgAAAP//AwDHP3NmNg\
EAAA==\"]"
textDecoded:
data:
viewerSettings:
final: "{\"experimentalFeatures\":{\"enableLazyBlobSyntaxHighlighting\":true,\"\
newSearchResultFiltersPanel\":true,\"newSearchResultsUI\":tru\
e,\"proactiveSearchResultsAggregations\":true,\"searchResults\
Aggregations\":true,\"showMultilineSearchConsole\":true},\"op\
enInEditor\":{}}"
size: 283
text: "[\"H4sIAAAAAAAAA4zPwUoDQRAE0H8=\",\"6XO+YG8qBgMK4pLbXDpJOdvQ9iw9vdnEZf5d\
FgLBg+C16lFQC504mLqFzoIZ3iNCLNc1+RRjpY6WRLiMcPmCBesWHJOjJurWxvigeOX\
v66OWQ3+14MuL5EElD+tSoi58wiaRYe7Bfhw+UCeNrWjA6zsb9E9U97t7N3rhY8gZv8\
RDzo7MIcXq3dZ/kKHMb5OGqNht8qlYLYqbaZtEZYTt7PkkUXw93Bq11n4AAAD//wMAx\
z9zZjYBAAA=\"]"
cookies: []
headers:
- name: date
Expand Down Expand Up @@ -1152,7 +1149,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down
3 changes: 2 additions & 1 deletion lib/shared/src/chat/chat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,10 @@ export class ChatClient {

// We only want to send up the speaker and prompt text, regardless of whatever other fields
// might be on the messages objects (`file`, `displayText`, `contextFiles`, etc.).
const messagesToSend = augmentedMessages.map(({ speaker, text }) => ({
const messagesToSend = augmentedMessages.map(({ speaker, text, cache_enabled }) => ({
text,
speaker,
cache_enabled,
}))

const completionParams = {
Expand Down
3 changes: 2 additions & 1 deletion lib/shared/src/codebase-context/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ export type ContextItemWithContent = ContextItem & { content: string }
/**
* A system chat message that adds a context item to the conversation.
*/
export interface ContextMessage extends Required<Message> {
export interface ContextMessage extends Required<Omit<Message, 'cache_enabled'>> {
/**
* Context messages are always "from" the human. (In the future, this could be from "system" for
* LLMs that support that kind of message, but that `speaker` value is not currently supported
Expand All @@ -243,6 +243,7 @@ export interface ContextMessage extends Required<Message> {
* The context item that this message introduces into the conversation.
*/
file: ContextItem
cache_enabled?: boolean | null
}

export const GENERAL_HELP_LABEL = 'Search for a file to include, or type # for symbols...'
Expand Down
16 changes: 15 additions & 1 deletion lib/shared/src/sourcegraph-api/clientConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ export interface CodyNotice {
//
// This is fetched from the Sourcegraph instance and is specific to the current user.
//
// For the canonical type definition, see https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/internal/clientconfig/types.go
// For the canonical type definition, see model ClientConfig in https://sourcegraph.sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/internal/openapi/internal.tsp
// API Spec: https://sourcegraph.sourcegraph.com/api/openapi/internal#get-api-client-config
export interface CodyClientConfig {
// Whether the site admin allows this user to make use of the Cody chat feature.
chatEnabled: boolean
Expand Down Expand Up @@ -73,6 +74,9 @@ export interface CodyClientConfig {

// Whether code search is enabled for the SG instance.
codeSearchEnabled: boolean

// The latest supported completions stream API version.
latestSupportedCompletionsStreamAPIVersion?: number
}

export const dummyClientConfigForTest: CodyClientConfig = {
Expand Down Expand Up @@ -317,6 +321,7 @@ export class ClientConfigSingleton {
if (isError(clientConfig)) {
throw clientConfig
}
latestCodyClientConfig = clientConfig
return clientConfig
})
}
Expand All @@ -329,3 +334,12 @@ export class ClientConfigSingleton {
return this.fetchConfigEndpoint(signal, config)
}
}
// Module-level escape hatch: utilities (e.g. utils.ts) that cannot easily reach
// ClientConfigSingleton read the most recently fetched client config from here.
export let latestCodyClientConfig: CodyClientConfig | undefined

/**
 * Whether the connected Sourcegraph server is new enough to honor prompt
 * caching: it must advertise a completions stream API version of at least 7.
 * Returns false when no client config has been fetched yet, or when the
 * server does not report a supported stream API version.
 */
export function serverSupportsPromptCaching(): boolean {
    const apiVersion = latestCodyClientConfig?.latestSupportedCompletionsStreamAPIVersion
    return apiVersion !== undefined && apiVersion >= 7
}
16 changes: 16 additions & 0 deletions lib/shared/src/sourcegraph-api/completions/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,23 @@ export type Event = DoneEvent | CompletionEvent | ErrorEvent
/**
 * A single message exchanged with the completions API.
 */
export interface Message {
    // Note: The unified API only supports one system message passed as the first message
    speaker: 'human' | 'assistant' | 'system'
    // `text` used to be the only payload field. Starting from api-version 7, Cody
    // clients are expected to stop sending `text` and send `content` instead, to
    // respect the official API contract and mirror what OpenAI and Anthropic expect.
    text?: PromptString
    // When true (and the server supports prompt caching — see
    // serverSupportsPromptCaching in clientConfig.ts), serialization emits this
    // message as a content block carrying `cache_control: { type: 'ephemeral' }`.
    cache_enabled?: boolean | null
}

/**
 * Token-usage accounting returned alongside a completion response.
 * Fields are null when the server does not report the corresponding count.
 */
export interface CompletionUsage {
    // Tokens in the generated completion.
    completion_tokens: number | null
    // Tokens in the submitted prompt.
    prompt_tokens: number | null
    // Total tokens (prompt + completion).
    total_tokens: number | null
    // Optional breakdown of the prompt tokens, e.g. how many were cache hits.
    prompt_tokens_details?: PromptTokensDetails | null
}

/**
 * Breakdown of prompt tokens related to prompt caching.
 * NOTE(review): the two field names appear to mirror the OpenAI-style
 * (`cached_tokens`) and Anthropic-style (`cache_read_input_tokens`) usage
 * payloads respectively — confirm against the server's actual response shape.
 */
export interface PromptTokensDetails {
    // Prompt tokens served from cache.
    cached_tokens?: number | null
    // Prompt tokens read from the cache.
    cache_read_input_tokens?: number | null
}

export interface CompletionResponse {
Expand Down
24 changes: 20 additions & 4 deletions lib/shared/src/sourcegraph-api/completions/utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { type SerializedChatMessage, contextFiltersProvider } from '../..'
import { serverSupportsPromptCaching } from '../clientConfig'
import type { CompletionParameters, Message, SerializedCompletionParameters } from './types'

/**
Expand Down Expand Up @@ -26,9 +27,24 @@ async function serializePrompts(
}

return Promise.all(
messages.map(async m => ({
...m,
text: await m.text?.toFilteredString(contextFiltersProvider),
}))
messages.map(async m => {
const text = await m.text?.toFilteredString(contextFiltersProvider)
if (serverSupportsPromptCaching() && m.cache_enabled) {
return {
speaker: m.speaker,
content: [
{
type: 'text',
text: text ?? '',
cache_control: { type: 'ephemeral' },
},
],
}
}
return {
...m,
text: text,
}
})
)
}

0 comments on commit f32989a

Please sign in to comment.