Skip to content

Commit

Permalink
feat(context): Add Prompt Caching to Code Context (CODY-4807) (#6878)
Browse files Browse the repository at this point in the history
**Problem (why)**
Currently the daily cost and token usage on models is very high. We want
to find some ways to reduce them.

**Solution (context)**
Prompt caching can significantly reduce token costs. Each cache hit
reduces costs by 90%, while each cache miss increases costs by 25%.

After some [initial
analysis](https://docs.google.com/document/d/1y-pOAUgwksyMx-Uq1rutzWYd6qb13iqikTSaAn6UdKM/edit?tab=t.0#heading=h.202nr339v40e),
we decided to start by implementing [prompt
caching](https://linear.app/sourcegraph/issue/CODY-4000/determine-effort-for-prompt-caching-in-sonnet-35-and-haiku-35)
for Claude models.

**Implementation (what)**
- Adding the `cache_control: { type: 'ephemeral' }` field to cached content
blocks, which creates a cache with a 5-minute TTL.
- Server Side Implementation in this
[PR](sourcegraph/sourcegraph#3198)

**Anthropic Docs (context)**
- https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
- https://docs.anthropic.com/en/api/messages

## Test plan
- Tested locally and cache is being added
  • Loading branch information
julialeex authored Feb 1, 2025
1 parent 4ca3877 commit f32989a
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ log:
value: 1; mode=block
- name: strict-transport-security
value: max-age=31536000; includeSubDomains; preload
headersSize: 1436
headersSize: 1296
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -431,15 +431,25 @@ log:
value: null
url: https://sourcegraph.com/.api/graphql?CurrentSiteCodyLlmConfiguration
response:
bodySize: 259
bodySize: 248
content:
encoding: base64
mimeType: application/json
size: 259
text: "[\"H4sIAAAAAAAAAwAAAP//\",\"hM5NDoIwEAXgu8yaaoMQDVu2svMCYztAA3ZIf4yG9O4G\
NhI1cfWSyZsvbwaNAaGawZtASyrWz/O5qdm2posOg2G73nsMDWsaoQLP0SnqHE79Xo0\
YNYnDrhSeraUA2bvb4OPCA1kPVVFKKTNo0Yf6DyV6NEOEj/LGOq6U4ts00rLvF6aJJk\
80CMWanLjnYjSBxBU9wdfvxs5lcUoppRcAAAD//wMAMOH90BoBAAA=\"]"
size: 248
text: "[\"H4sIAAAAAAAAA4TOTQ6CMBAF4LvMmmqDEA1btrLzAmM7QAN2SH+MhvTuBjYSNXH1ksmbL\
28GjQGhmsGbQEsq1s/zuanZtqaLDoNhu957DA1rGqECz9Ep6hxO/V6NGDWJw64Unq2l\
ANm72+DjwgNZD1VRSikzaNGH+g8lejRDhI/yxjqulOLbNNKy7xemiSZPNAjFmpy452I\
0gcQVPcHX78bOZXFKKaUXAAAA//8DADDh/dAaAQAA\"]"
textDecoded:
data:
site:
codyLLMConfiguration:
chatModel: sourcegraph/claude-3.5-sonnet
chatModelMaxTokens: 45000
completionModel: sourcegraph/deepseek-coder-v2-lite-base
completionModelMaxTokens: 2048
fastChatModel: sourcegraph/claude-3-haiku
fastChatModelMaxTokens: 7000
cookies: []
headers:
- name: date
Expand Down Expand Up @@ -469,7 +479,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -574,7 +584,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -642,18 +652,13 @@ log:
value: null
url: https://sourcegraph.com/.api/graphql?CurrentSiteCodyLlmProvider
response:
bodySize: 128
bodySize: 131
content:
encoding: base64
mimeType: application/json
size: 128
text: "[\"H4sIAAAAAAAAA6pWSkksSVSyqlYqzixJBdHJ+SmVPj6+zvl5aZnppUWJJZn5eSDxgqL8s\
syU1CIlK6Xi/NKi5NT0osSCDKXa2tpaAAAAAP//AwAfFAXARQAAAA==\"]"
textDecoded:
data:
site:
codyLLMConfiguration:
provider: sourcegraph
size: 131
text: "[\"H4sIAAAAAAAAA6pWSkksSVSyqlY=\",\"Ks4sSQXRyfkplT4+vs75eWmZ6aVFiSWZ+Xkg\
8YKi/LLMlNQiJSul4vzSouTU9KLEggyl2traWgAAAAD//wMAHxQFwEUAAAA=\"]"
cookies: []
headers:
- name: date
Expand Down Expand Up @@ -683,7 +688,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -814,7 +819,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -933,7 +938,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -1039,7 +1044,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down Expand Up @@ -1105,24 +1110,16 @@ log:
value: null
url: https://sourcegraph.com/.api/graphql?ViewerSettings
response:
bodySize: 280
bodySize: 283
content:
encoding: base64
mimeType: application/json
size: 280
text: "[\"H4sIAAAAAAAAA4zPwUoDQRAE0H/pc75gbyoGAwrikttcOkk529D2LD292cRl/l0WAsGD4\
LXqUVALnTiYuoXOghneI0Is1zX5FGOljpZEuIxw+YIF6xYck6Mm6tbG+KB45e/ro5ZD\
f7Xgy4vkQSUP61KiLnzCJpFh7sF+HD5QJ42taMDrOxv0T1T3u3s3euFjyBm/xEPOjsw\
hxerd1n+Qocxvk4ao2G3yqVgtiptpm0RlhO3s+SRRfD3cGrXWfgAAAP//AwDHP3NmNg\
EAAA==\"]"
textDecoded:
data:
viewerSettings:
final: "{\"experimentalFeatures\":{\"enableLazyBlobSyntaxHighlighting\":true,\"\
newSearchResultFiltersPanel\":true,\"newSearchResultsUI\":tru\
e,\"proactiveSearchResultsAggregations\":true,\"searchResults\
Aggregations\":true,\"showMultilineSearchConsole\":true},\"op\
enInEditor\":{}}"
size: 283
text: "[\"H4sIAAAAAAAAA4zPwUoDQRAE0H8=\",\"6XO+YG8qBgMK4pLbXDpJOdvQ9iw9vdnEZf5d\
FgLBg+C16lFQC504mLqFzoIZ3iNCLNc1+RRjpY6WRLiMcPmCBesWHJOjJurWxvigeOX\
v66OWQ3+14MuL5EElD+tSoi58wiaRYe7Bfhw+UCeNrWjA6zsb9E9U97t7N3rhY8gZv8\
RDzo7MIcXq3dZ/kKHMb5OGqNht8qlYLYqbaZtEZYTt7PkkUXw93Bq11n4AAAD//wMAx\
z9zZjYBAAA=\"]"
cookies: []
headers:
- name: date
Expand Down Expand Up @@ -1152,7 +1149,7 @@ log:
value: max-age=31536000; includeSubDomains; preload
- name: content-encoding
value: gzip
headersSize: 1468
headersSize: 1328
httpVersion: HTTP/1.1
redirectURL: ""
status: 200
Expand Down
3 changes: 2 additions & 1 deletion lib/shared/src/chat/chat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,10 @@ export class ChatClient {

// We only want to send up the speaker and prompt text, regardless of whatever other fields
// might be on the messages objects (`file`, `displayText`, `contextFiles`, etc.).
const messagesToSend = augmentedMessages.map(({ speaker, text }) => ({
const messagesToSend = augmentedMessages.map(({ speaker, text, cache_enabled }) => ({
text,
speaker,
cache_enabled,
}))

const completionParams = {
Expand Down
3 changes: 2 additions & 1 deletion lib/shared/src/codebase-context/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ export type ContextItemWithContent = ContextItem & { content: string }
/**
* A system chat message that adds a context item to the conversation.
*/
export interface ContextMessage extends Required<Message> {
export interface ContextMessage extends Required<Omit<Message, 'cache_enabled'>> {
/**
* Context messages are always "from" the human. (In the future, this could be from "system" for
* LLMs that support that kind of message, but that `speaker` value is not currently supported
Expand All @@ -243,6 +243,7 @@ export interface ContextMessage extends Required<Message> {
* The context item that this message introduces into the conversation.
*/
file: ContextItem
cache_enabled?: boolean | null
}

export const GENERAL_HELP_LABEL = 'Search for a file to include, or type # for symbols...'
Expand Down
16 changes: 15 additions & 1 deletion lib/shared/src/sourcegraph-api/clientConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ export interface CodyNotice {
//
// This is fetched from the Sourcegraph instance and is specific to the current user.
//
// For the canonical type definition, see https://sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/internal/clientconfig/types.go
// For the canonical type definition, see model ClientConfig in https://sourcegraph.sourcegraph.com/github.com/sourcegraph/sourcegraph/-/blob/internal/openapi/internal.tsp
// API Spec: https://sourcegraph.sourcegraph.com/api/openapi/internal#get-api-client-config
export interface CodyClientConfig {
// Whether the site admin allows this user to make use of the Cody chat feature.
chatEnabled: boolean
Expand Down Expand Up @@ -73,6 +74,9 @@ export interface CodyClientConfig {

// Whether code search is enabled for the SG instance.
codeSearchEnabled: boolean

// The latest supported completions stream API version.
latestSupportedCompletionsStreamAPIVersion?: number
}

export const dummyClientConfigForTest: CodyClientConfig = {
Expand Down Expand Up @@ -317,6 +321,7 @@ export class ClientConfigSingleton {
if (isError(clientConfig)) {
throw clientConfig
}
latestCodyClientConfig = clientConfig
return clientConfig
})
}
Expand All @@ -329,3 +334,12 @@ export class ClientConfigSingleton {
return this.fetchConfigEndpoint(signal, config)
}
}
// Module-level escape hatch: utilities (e.g. utils.ts) that cannot easily reach
// ClientConfigSingleton read the most recently fetched client config from here.
export let latestCodyClientConfig: CodyClientConfig | undefined

/**
 * Whether the connected Sourcegraph server is new enough to honor prompt
 * caching: it must advertise a completions stream API version of at least 7.
 * Returns false when no client config has been fetched yet, or when the
 * server does not report a supported stream API version.
 */
export function serverSupportsPromptCaching(): boolean {
    const apiVersion = latestCodyClientConfig?.latestSupportedCompletionsStreamAPIVersion
    return apiVersion !== undefined && apiVersion >= 7
}
16 changes: 16 additions & 0 deletions lib/shared/src/sourcegraph-api/completions/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,23 @@ export type Event = DoneEvent | CompletionEvent | ErrorEvent
/**
 * A single message exchanged with the completions API.
 */
export interface Message {
    // Note: The unified API only supports one system message passed as the first message
    speaker: 'human' | 'assistant' | 'system'
    // `text` used to be the only payload field. Starting from api-version 7, Cody
    // clients are expected to stop sending `text` and send `content` instead, to
    // respect the official API contract and mirror what OpenAI and Anthropic expect.
    text?: PromptString
    // When true (and the server supports prompt caching — see
    // serverSupportsPromptCaching in clientConfig.ts), serialization emits this
    // message as a content block carrying `cache_control: { type: 'ephemeral' }`.
    cache_enabled?: boolean | null
}

/**
 * Token-usage accounting returned alongside a completion response.
 * Fields are null when the server does not report the corresponding count.
 */
export interface CompletionUsage {
    // Tokens in the generated completion.
    completion_tokens: number | null
    // Tokens in the submitted prompt.
    prompt_tokens: number | null
    // Total tokens (prompt + completion).
    total_tokens: number | null
    // Optional breakdown of the prompt tokens, e.g. how many were cache hits.
    prompt_tokens_details?: PromptTokensDetails | null
}

/**
 * Breakdown of prompt tokens related to prompt caching.
 * NOTE(review): the two field names appear to mirror the OpenAI-style
 * (`cached_tokens`) and Anthropic-style (`cache_read_input_tokens`) usage
 * payloads respectively — confirm against the server's actual response shape.
 */
export interface PromptTokensDetails {
    // Prompt tokens served from cache.
    cached_tokens?: number | null
    // Prompt tokens read from the cache.
    cache_read_input_tokens?: number | null
}

export interface CompletionResponse {
Expand Down
24 changes: 20 additions & 4 deletions lib/shared/src/sourcegraph-api/completions/utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { type SerializedChatMessage, contextFiltersProvider } from '../..'
import { serverSupportsPromptCaching } from '../clientConfig'
import type { CompletionParameters, Message, SerializedCompletionParameters } from './types'

/**
Expand Down Expand Up @@ -26,9 +27,24 @@ async function serializePrompts(
}

return Promise.all(
messages.map(async m => ({
...m,
text: await m.text?.toFilteredString(contextFiltersProvider),
}))
messages.map(async m => {
const text = await m.text?.toFilteredString(contextFiltersProvider)
if (serverSupportsPromptCaching() && m.cache_enabled) {
return {
speaker: m.speaker,
content: [
{
type: 'text',
text: text ?? '',
cache_control: { type: 'ephemeral' },
},
],
}
}
return {
...m,
text: text,
}
})
)
}

0 comments on commit f32989a

Please sign in to comment.