From f32989a8ec4e5a2faba0a602304216864eb504e1 Mon Sep 17 00:00:00 2001 From: julialeex Date: Sat, 1 Feb 2025 09:16:03 +0900 Subject: [PATCH] feat(context): Add Prompt Caching to Code Context (CODY-4807) (#6878) **Problem (why)** Daily cost and token usage on models are currently very high, and we want to reduce them. **Solution (context)** Prompt caching can significantly reduce token costs: each cache hit reduces input-token costs by 90%, while each cache miss (a cache write) increases them by 25%. After some [initial analysis](https://docs.google.com/document/d/1y-pOAUgwksyMx-Uq1rutzWYd6qb13iqikTSaAn6UdKM/edit?tab=t.0#heading=h.202nr339v40e), we decided to start by implementing [prompt caching](https://linear.app/sourcegraph/issue/CODY-4000/determine-effort-for-prompt-caching-in-sonnet-35-and-haiku-35) for Claude models. **Implementation (what)** - Add `cache_control: { type: 'ephemeral' }` to cacheable message content, which creates a cache entry with a 5-minute TTL (a sketch of the resulting wire format is appended after the diff). - The server-side implementation is in this [PR](https://github.com/sourcegraph/sourcegraph/pull/3198). **Anthropic Docs (context)** - https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching - https://docs.anthropic.com/en/api/messages ## Test plan - Tested locally and confirmed that `cache_control` is added to outgoing requests. --- .../recording.har.yaml | 73 +++++++++---------- lib/shared/src/chat/chat.ts | 3 +- lib/shared/src/codebase-context/messages.ts | 3 +- .../src/sourcegraph-api/clientConfig.ts | 16 +++- .../src/sourcegraph-api/completions/types.ts | 16 ++++ .../src/sourcegraph-api/completions/utils.ts | 24 +++++- 6 files changed, 90 insertions(+), 45 deletions(-) diff --git a/jetbrains/src/integrationTest/resources/recordings/integration-test_2927926756/recording.har.yaml b/jetbrains/src/integrationTest/resources/recordings/integration-test_2927926756/recording.har.yaml index 2a24b21d26b5..f02b39478515 100644 --- a/jetbrains/src/integrationTest/resources/recordings/integration-test_2927926756/recording.har.yaml +++ b/jetbrains/src/integrationTest/resources/recordings/integration-test_2927926756/recording.har.yaml @@ -358,7 +358,7 @@ log: value: 1; mode=block - name: strict-transport-security value: max-age=31536000; includeSubDomains; preload - headersSize: 1436 + headersSize: 1296 httpVersion: HTTP/1.1 redirectURL: "" status: 200 @@ -431,15 +431,25 @@ log: value: null url: https://sourcegraph.com/.api/graphql?CurrentSiteCodyLlmConfiguration response: - bodySize: 259 + bodySize: 248 content: encoding: base64 mimeType: application/json - size: 259 - text: "[\"H4sIAAAAAAAAAwAAAP//\",\"hM5NDoIwEAXgu8yaaoMQDVu2svMCYztAA3ZIf4yG9O4G\ NhI1cfWSyZsvbwaNAaGawZtASyrWz/O5qdm2posOg2G73nsMDWsaoQLP0SnqHE79Xo0\ YNYnDrhSeraUA2bvb4OPCA1kPVVFKKTNo0Yf6DyV6NEOEj/LGOq6U4ts00rLvF6aJJk\ 80CMWanLjnYjSBxBU9wdfvxs5lcUoppRcAAAD//wMAMOH90BoBAAA=\"]" + size: 248 + text: "[\"H4sIAAAAAAAAA4TOTQ6CMBAF4LvMmmqDEA1btrLzAmM7QAN2SH+MhvTuBjYSNXH1ksmbL\ 28GjQGhmsGbQEsq1s/zuanZtqaLDoNhu957DA1rGqECz9Ep6hxO/V6NGDWJw64Unq2l\ ANm72+DjwgNZD1VRSikzaNGH+g8lejRDhI/yxjqulOLbNNKy7xemiSZPNAjFmpy452I\ 0gcQVPcHX78bOZXFKKaUXAAAA//8DADDh/dAaAQAA\"]" + textDecoded: + data: + site: + codyLLMConfiguration: + chatModel: sourcegraph/claude-3.5-sonnet + chatModelMaxTokens: 45000 + completionModel: sourcegraph/deepseek-coder-v2-lite-base + completionModelMaxTokens: 2048 + fastChatModel: sourcegraph/claude-3-haiku + fastChatModelMaxTokens: 7000 cookies: [] headers: - name: date @@ -469,7 +479,7 @@ log: value: max-age=31536000; includeSubDomains; preload - name: content-encoding value: gzip - headersSize: 1468 + headersSize: 1328 httpVersion: 
HTTP/1.1 redirectURL: "" status: 200 @@ -574,7 +584,7 @@ log: value: max-age=31536000; includeSubDomains; preload - name: content-encoding value: gzip - headersSize: 1468 + headersSize: 1328 httpVersion: HTTP/1.1 redirectURL: "" status: 200 @@ -642,18 +652,13 @@ log: value: null url: https://sourcegraph.com/.api/graphql?CurrentSiteCodyLlmProvider response: - bodySize: 128 + bodySize: 131 content: encoding: base64 mimeType: application/json - size: 128 - text: "[\"H4sIAAAAAAAAA6pWSkksSVSyqlYqzixJBdHJ+SmVPj6+zvl5aZnppUWJJZn5eSDxgqL8s\ - syU1CIlK6Xi/NKi5NT0osSCDKXa2tpaAAAAAP//AwAfFAXARQAAAA==\"]" - textDecoded: - data: - site: - codyLLMConfiguration: - provider: sourcegraph + size: 131 + text: "[\"H4sIAAAAAAAAA6pWSkksSVSyqlY=\",\"Ks4sSQXRyfkplT4+vs75eWmZ6aVFiSWZ+Xkg\ + 8YKi/LLMlNQiJSul4vzSouTU9KLEggyl2traWgAAAAD//wMAHxQFwEUAAAA=\"]" cookies: [] headers: - name: date @@ -683,7 +688,7 @@ log: value: max-age=31536000; includeSubDomains; preload - name: content-encoding value: gzip - headersSize: 1468 + headersSize: 1328 httpVersion: HTTP/1.1 redirectURL: "" status: 200 @@ -814,7 +819,7 @@ log: value: max-age=31536000; includeSubDomains; preload - name: content-encoding value: gzip - headersSize: 1468 + headersSize: 1328 httpVersion: HTTP/1.1 redirectURL: "" status: 200 @@ -933,7 +938,7 @@ log: value: max-age=31536000; includeSubDomains; preload - name: content-encoding value: gzip - headersSize: 1468 + headersSize: 1328 httpVersion: HTTP/1.1 redirectURL: "" status: 200 @@ -1039,7 +1044,7 @@ log: value: max-age=31536000; includeSubDomains; preload - name: content-encoding value: gzip - headersSize: 1468 + headersSize: 1328 httpVersion: HTTP/1.1 redirectURL: "" status: 200 @@ -1105,24 +1110,16 @@ log: value: null url: https://sourcegraph.com/.api/graphql?ViewerSettings response: - bodySize: 280 + bodySize: 283 content: encoding: base64 mimeType: application/json - size: 280 - text: "[\"H4sIAAAAAAAAA4zPwUoDQRAE0H/pc75gbyoGAwrikttcOkk529D2LD292cRl/l0WAsGD4\ - LXqUVALnTiYuoXOghneI0Is1zX5FGOljpZEuIxw+YIF6xYck6Mm6tbG+KB45e/ro5ZD\ - f7Xgy4vkQSUP61KiLnzCJpFh7sF+HD5QJ42taMDrOxv0T1T3u3s3euFjyBm/xEPOjsw\ - hxerd1n+Qocxvk4ao2G3yqVgtiptpm0RlhO3s+SRRfD3cGrXWfgAAAP//AwDHP3NmNg\ - EAAA==\"]" - textDecoded: - data: - viewerSettings: - final: "{\"experimentalFeatures\":{\"enableLazyBlobSyntaxHighlighting\":true,\"\ - newSearchResultFiltersPanel\":true,\"newSearchResultsUI\":tru\ - e,\"proactiveSearchResultsAggregations\":true,\"searchResults\ - Aggregations\":true,\"showMultilineSearchConsole\":true},\"op\ - enInEditor\":{}}" + size: 283 + text: "[\"H4sIAAAAAAAAA4zPwUoDQRAE0H8=\",\"6XO+YG8qBgMK4pLbXDpJOdvQ9iw9vdnEZf5d\ + FgLBg+C16lFQC504mLqFzoIZ3iNCLNc1+RRjpY6WRLiMcPmCBesWHJOjJurWxvigeOX\ + v66OWQ3+14MuL5EElD+tSoi58wiaRYe7Bfhw+UCeNrWjA6zsb9E9U97t7N3rhY8gZv8\ + RDzo7MIcXq3dZ/kKHMb5OGqNht8qlYLYqbaZtEZYTt7PkkUXw93Bq11n4AAAD//wMAx\ + z9zZjYBAAA=\"]" cookies: [] headers: - name: date @@ -1152,7 +1149,7 @@ log: value: max-age=31536000; includeSubDomains; preload - name: content-encoding value: gzip - headersSize: 1468 + headersSize: 1328 httpVersion: HTTP/1.1 redirectURL: "" status: 200 diff --git a/lib/shared/src/chat/chat.ts b/lib/shared/src/chat/chat.ts index 48551d4c10df..57d9774996c2 100644 --- a/lib/shared/src/chat/chat.ts +++ b/lib/shared/src/chat/chat.ts @@ -57,9 +57,10 @@ export class ChatClient { // We only want to send up the speaker and prompt text, regardless of whatever other fields // might be on the messages objects (`file`, `displayText`, `contextFiles`, etc.). 
- const messagesToSend = augmentedMessages.map(({ speaker, text }) => ({ + const messagesToSend = augmentedMessages.map(({ speaker, text, cache_enabled }) => ({ text, speaker, + cache_enabled, })) const completionParams = { diff --git a/lib/shared/src/codebase-context/messages.ts b/lib/shared/src/codebase-context/messages.ts index 97351b8d6856..a99f447a5a6f 100644 --- a/lib/shared/src/codebase-context/messages.ts +++ b/lib/shared/src/codebase-context/messages.ts @@ -231,7 +231,7 @@ export type ContextItemWithContent = ContextItem & { content: string } /** * A system chat message that adds a context item to the conversation. */ -export interface ContextMessage extends Required<Message> { +export interface ContextMessage extends Required<Omit<Message, 'cache_enabled'>> { /** * Context messages are always "from" the human. (In the future, this could be from "system" for * LLMs that support that kind of message, but that `speaker` value is not currently supported @@ -243,6 +243,7 @@ export interface ContextMessage extends Required<Message> { * The context item that this message introduces into the conversation. */ file: ContextItem + cache_enabled?: boolean | null } export const GENERAL_HELP_LABEL = 'Search for a file to include, or type # for symbols...' diff --git a/lib/shared/src/sourcegraph-api/clientConfig.ts b/lib/shared/src/sourcegraph-api/clientConfig.ts index 8b91b5dfc0fa..3706b116e532 100644 --- a/lib/shared/src/sourcegraph-api/clientConfig.ts +++ b/lib/shared/src/sourcegraph-api/clientConfig.ts @@ -33,7 +33,8 @@ export interface CodyNotice { // // This is fetched from the Sourcegraph instance and is specific to the current user. // -// For the canonical type definition, see https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/internal/clientconfig/types.go +// For the canonical type definition, see model ClientConfig in https://sourcegraph.sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/internal/openapi/internal.tsp +// API Spec: https://sourcegraph.sourcegraph.com/api/openapi/internal#get-api-client-config export interface CodyClientConfig { // Whether the site admin allows this user to make use of the Cody chat feature. chatEnabled: boolean @@ -73,6 +74,9 @@ export interface CodyClientConfig { // Whether code search is enabled for the SG instance. codeSearchEnabled: boolean + + // The latest supported completions stream API version. 
+ latestSupportedCompletionsStreamAPIVersion?: number } export const dummyClientConfigForTest: CodyClientConfig = { @@ -317,6 +321,7 @@ export class ClientConfigSingleton { if (isError(clientConfig)) { throw clientConfig } + latestCodyClientConfig = clientConfig return clientConfig }) } @@ -329,3 +334,12 @@ export class ClientConfigSingleton { return this.fetchConfigEndpoint(signal, config) } } +// Module-level cache: free functions (e.g. in completions/utils.ts) cannot easily reach the ClientConfigSingleton, so the most recently fetched CodyClientConfig is kept here. +export let latestCodyClientConfig: CodyClientConfig | undefined + +export function serverSupportsPromptCaching(): boolean { + return ( + latestCodyClientConfig?.latestSupportedCompletionsStreamAPIVersion !== undefined && + latestCodyClientConfig?.latestSupportedCompletionsStreamAPIVersion >= 7 + ) +} diff --git a/lib/shared/src/sourcegraph-api/completions/types.ts b/lib/shared/src/sourcegraph-api/completions/types.ts index d606152dbe2b..b82e36a30383 100644 --- a/lib/shared/src/sourcegraph-api/completions/types.ts +++ b/lib/shared/src/sourcegraph-api/completions/types.ts @@ -19,7 +19,23 @@ export type Event = DoneEvent | CompletionEvent | ErrorEvent export interface Message { // Note: The unified API only supports one system message passed as the first message speaker: 'human' | 'assistant' | 'system' + // `content` used to be `text`; starting from api-version 7, we require Cody clients to + // stop sending `text` and send `content` instead, to respect the official API contract and + // mirror what OpenAI and Anthropic expect text?: PromptString + cache_enabled?: boolean | null } + +export interface CompletionUsage { + completion_tokens: number | null + prompt_tokens: number | null + total_tokens: number | null + prompt_tokens_details?: PromptTokensDetails | null +} + +export interface PromptTokensDetails { + cached_tokens?: number | null + cache_read_input_tokens?: number | null } export interface CompletionResponse { diff --git a/lib/shared/src/sourcegraph-api/completions/utils.ts b/lib/shared/src/sourcegraph-api/completions/utils.ts index 794d268ab9b2..8da2d38b0f0a 100644 --- a/lib/shared/src/sourcegraph-api/completions/utils.ts +++ b/lib/shared/src/sourcegraph-api/completions/utils.ts @@ -1,4 +1,5 @@ import { type SerializedChatMessage, contextFiltersProvider } from '../..' +import { serverSupportsPromptCaching } from '../clientConfig' import type { CompletionParameters, Message, SerializedCompletionParameters } from './types' /** @@ -26,9 +27,24 @@ async function serializePrompts( } return Promise.all( - messages.map(async m => ({ - ...m, - text: await m.text?.toFilteredString(contextFiltersProvider), - })) + messages.map(async m => { + const text = await m.text?.toFilteredString(contextFiltersProvider) + if (serverSupportsPromptCaching() && m.cache_enabled) { + return { + speaker: m.speaker, + content: [ + { + type: 'text', + text: text ?? '', + cache_control: { type: 'ephemeral' }, + }, + ], + } + } + return { + ...m, + text: text, + } + }) ) }
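
---

For reviewers, here is a minimal sketch of the wire shape this change produces. It is not the actual Cody code: the types below are simplified stand-ins defined inline, and the `serverSupportsPromptCaching()` result is passed in as a plain boolean; only the branching logic mirrors the new code in `completions/utils.ts`.

```ts
// Sketch only: simplified stand-in types, not Cody's real imports.
type Speaker = 'human' | 'assistant' | 'system'

interface SketchMessage {
    speaker: Speaker
    text?: string
    cache_enabled?: boolean | null
}

type WireMessage =
    // Legacy shape: plain `text` field (pre api-version 7, or caching not requested).
    | { speaker: Speaker; text?: string }
    // Caching shape: `content` blocks, each carrying `cache_control`.
    | {
          speaker: Speaker
          content: { type: 'text'; text: string; cache_control: { type: 'ephemeral' } }[]
      }

// Mirrors the new branch in serializePrompts: when the server advertises
// api-version >= 7 and the message opted in via cache_enabled, emit a
// `content` block carrying `cache_control: { type: 'ephemeral' }` (a cache
// entry with a 5-minute TTL on the Anthropic side); otherwise fall back to
// the legacy `text` field unchanged.
function toWireMessage(m: SketchMessage, promptCachingSupported: boolean): WireMessage {
    if (promptCachingSupported && m.cache_enabled) {
        return {
            speaker: m.speaker,
            content: [{ type: 'text', text: m.text ?? '', cache_control: { type: 'ephemeral' } }],
        }
    }
    return { speaker: m.speaker, text: m.text }
}

// A cache-enabled context message serializes to a content block:
// { "speaker": "human", "content": [{ "type": "text", "text": "...",
//   "cache_control": { "type": "ephemeral" } }] }
console.log(JSON.stringify(toWireMessage({ speaker: 'human', text: 'big shared context', cache_enabled: true }, true)))
```

Note the asymmetry the server-side PR has to handle: cache-enabled messages arrive as `content` arrays, while all other messages still use the legacy `text` field.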