From 0f060877edbb8ef2c1c6b79a03ee91cf98ed78a0 Mon Sep 17 00:00:00 2001
From: Olafur Geirsson
Date: Fri, 20 Sep 2024 15:20:22 +0200
Subject: [PATCH] Chat: add vision support, upload images

Previously, it was only possible to send text to Cody via Chat. This PR
adds support for sending images to Cody as well. This can be helpful when,
for example, you want to write a basic HTML structure based on a Figma
design.
---
 lib/shared/src/chat/chat.ts                   |  9 +--
 lib/shared/src/chat/transcript/messages.ts    |  1 +
 .../src/sourcegraph-api/completions/types.ts  | 18 ++--
 vscode/src/chat/chat-view/ChatBuilder.ts      | 27 +++++++-
 vscode/src/chat/chat-view/ChatController.ts   |  5 ++
 vscode/src/chat/chat-view/prompt.ts           |  3 +
 vscode/src/chat/protocol.ts                   |  2 +
 vscode/src/completions/nodeClient.ts          |  8 ++-
 vscode/src/prompt-builder/index.ts            | 27 +++++++-
 .../human/editor/HumanMessageEditor.tsx       | 41 ++++++++++-
 .../human/editor/toolbar/Toolbar.tsx          | 16 +++++
 .../editor/toolbar/UploadImageButton.tsx      | 68 +++++++++++++++++++
 .../ModelSelectField.module.css               | 11 ++-
 13 files changed, 217 insertions(+), 19 deletions(-)
 create mode 100644 vscode/webviews/chat/cells/messageCell/human/editor/toolbar/UploadImageButton.tsx

diff --git a/lib/shared/src/chat/chat.ts b/lib/shared/src/chat/chat.ts
index 48551d4c10df..a5432bd677d7 100644
--- a/lib/shared/src/chat/chat.ts
+++ b/lib/shared/src/chat/chat.ts
@@ -57,12 +57,13 @@ export class ChatClient {
 
         // We only want to send up the speaker and prompt text, regardless of whatever other fields
         // might be on the messages objects (`file`, `displayText`, `contextFiles`, etc.).
-        const messagesToSend = augmentedMessages.map(({ speaker, text }) => ({
+        const messagesToSend = augmentedMessages.map(({ speaker, text, content }) => ({
             text,
             speaker,
+            content,
         }))
 
-        const completionParams = {
+        const completionParams: CompletionParameters = {
             ...DEFAULT_CHAT_COMPLETION_PARAMETERS,
             ...params,
             messages: messagesToSend,
@@ -107,8 +108,8 @@ export function sanitizeMessages(messages: Message[]): Message[] {
             // the next one
             const nextMessage = sanitizedMessages[index + 1]
             if (
-                (nextMessage.speaker === 'assistant' && !nextMessage.text?.length) ||
-                (message.speaker === 'assistant' && !message.text?.length)
+                (nextMessage.speaker === 'assistant' && !nextMessage.text?.length && !nextMessage.content) ||
+                (message.speaker === 'assistant' && !message.text?.length && !message.content)
             ) {
                 return false
             }
diff --git a/lib/shared/src/chat/transcript/messages.ts b/lib/shared/src/chat/transcript/messages.ts
index 144f5e1eceb1..74ffab45d537 100644
--- a/lib/shared/src/chat/transcript/messages.ts
+++ b/lib/shared/src/chat/transcript/messages.ts
@@ -22,6 +22,7 @@ export interface SubMessage {
 
 export interface ChatMessage extends Message {
     contextFiles?: ContextItem[]
+    base64Image?: string
 
     contextAlternatives?: RankedContext[]
 
diff --git a/lib/shared/src/sourcegraph-api/completions/types.ts b/lib/shared/src/sourcegraph-api/completions/types.ts
index d606152dbe2b..5c1211f2cacb 100644
--- a/lib/shared/src/sourcegraph-api/completions/types.ts
+++ b/lib/shared/src/sourcegraph-api/completions/types.ts
@@ -5,27 +5,28 @@ interface DoneEvent {
     type: 'done'
 }
 
-interface CompletionEvent extends CompletionResponse {
-    type: 'completion'
-}
+// interface CompletionEvent extends CompletionResponse {
+//     type: 'completion'
+// }
 
 interface ErrorEvent {
     type: 'error'
     error: string
 }
 
-export type Event = DoneEvent | CompletionEvent | ErrorEvent
+export type Event = DoneEvent | ErrorEvent
 
 export interface Message {
     // Note: The 
unified API only supports one system message passed as the first message speaker: 'human' | 'assistant' | 'system' text?: PromptString + content?: string | MessagePart[] + base64Image?: string } -export interface CompletionResponse { - completion: string - stopReason?: string -} +type MessagePart = + | { type: 'text'; text: string } // a normal text message + | { type: 'image_url'; image_url: { url: string } } // image message, per https://platform.openai.com/docs/guides/vision export interface CompletionParameters { fast?: boolean @@ -45,6 +46,7 @@ export interface CompletionParameters { type: 'content' content: string } + base64Image?: string } export interface SerializedCompletionParameters extends Omit { diff --git a/vscode/src/chat/chat-view/ChatBuilder.ts b/vscode/src/chat/chat-view/ChatBuilder.ts index c5dcab311e04..9a51f56821f4 100644 --- a/vscode/src/chat/chat-view/ChatBuilder.ts +++ b/vscode/src/chat/chat-view/ChatBuilder.ts @@ -157,7 +157,7 @@ export class ChatBuilder { if (this.messages.at(-1)?.speaker === 'human') { throw new Error('Cannot add a user message after a user message') } - this.messages.push({ ...message, speaker: 'human' }) + this.messages.push({ ...message, speaker: 'human', base64Image: this.getAndResetImage() }) this.changeNotifications.next() } @@ -322,6 +322,31 @@ export class ChatBuilder { } return result } + + /** + * Store the base64-encoded image uploaded by user to a multi-modal model. + * Requires vision support in the model, added in the PR + * https://github.com/sourcegraph/sourcegraph/pull/546 + */ + private image: string | undefined = undefined + + /** + * Sets the base64-encoded image for the chat model. + * @param base64Image - The base64-encoded image data to set. + */ + public setImage(base64Image: string): void { + this.image = base64Image + } + + /** + * Gets the base64-encoded image for the chat model and resets the internal image property to undefined. + * @returns The base64-encoded image, or undefined if no image has been set. + */ + public getAndResetImage(): string | undefined { + const image = this.image + this.image = undefined + return image + } } function messageToSerializedChatInteraction( diff --git a/vscode/src/chat/chat-view/ChatController.ts b/vscode/src/chat/chat-view/ChatController.ts index 3c7d560ef0cb..1aa2dbbe3f11 100644 --- a/vscode/src/chat/chat-view/ChatController.ts +++ b/vscode/src/chat/chat-view/ChatController.ts @@ -513,6 +513,11 @@ export class ChatController implements vscode.Disposable, vscode.WebviewViewProv } break } + + case 'chat/upload-file': { + this.chatBuilder.setImage(message.base64) + break + } case 'log': { const logger = message.level === 'debug' ? 
logDebug : logError logger(message.filterLabel, message.message) diff --git a/vscode/src/chat/chat-view/prompt.ts b/vscode/src/chat/chat-view/prompt.ts index d528559cc553..f9e18cd0e08e 100644 --- a/vscode/src/chat/chat-view/prompt.ts +++ b/vscode/src/chat/chat-view/prompt.ts @@ -89,6 +89,9 @@ export class DefaultPrompter { `Ignored ${messagesIgnored} chat messages due to context limit` ) } + for (const message of reverseTranscript) { + promptBuilder.tryAddImage(message.base64Image) + } // Counter for context items categorized by source const ignoredContext = { user: 0, corpus: 0, transcript: 0 } diff --git a/vscode/src/chat/protocol.ts b/vscode/src/chat/protocol.ts index 88c54d82f157..7c61aec93aee 100644 --- a/vscode/src/chat/protocol.ts +++ b/vscode/src/chat/protocol.ts @@ -156,6 +156,8 @@ export type WebviewMessage = selectedFilters: NLSSearchDynamicFilter[] } | { command: 'action/confirmation'; id: string; response: boolean } + | { command: 'log'; level: 'debug' | 'error'; filterLabel: string; message: string } + | { command: 'chat/upload-file'; base64: string } export interface SmartApplyResult { taskId: FixupTaskID diff --git a/vscode/src/completions/nodeClient.ts b/vscode/src/completions/nodeClient.ts index 4f2fc6500492..b4c196175658 100644 --- a/vscode/src/completions/nodeClient.ts +++ b/vscode/src/completions/nodeClient.ts @@ -9,7 +9,6 @@ import { type CompletionCallbacks, type CompletionParameters, type CompletionRequestParameters, - type CompletionResponse, NetworkError, RateLimitError, SourcegraphCompletionsClient, @@ -21,6 +20,7 @@ import { getTraceparentHeaders, globalAgentRef, isError, + logDebug, logError, onAbort, parseEvents, @@ -38,6 +38,10 @@ export class SourcegraphNodeCompletionsClient extends SourcegraphCompletionsClie signal?: AbortSignal ): Promise { const { apiVersion, interactionId } = requestParams + for (const message of params.messages) { + logDebug('apiVersion', JSON.stringify(apiVersion, null, 2)) + logDebug('base64Image', JSON.stringify(message, null, 2)) + } const url = new URL(await this.completionsEndpoint()) if (apiVersion >= 1) { @@ -326,7 +330,7 @@ export class SourcegraphNodeCompletionsClient extends SourcegraphCompletionsClie getActiveTraceAndSpanId()?.traceId ) } - const json = (await response.json()) as CompletionResponse + const json = await response.json() if (typeof json?.completion === 'string') { cb.onChange(json.completion) cb.onComplete() diff --git a/vscode/src/prompt-builder/index.ts b/vscode/src/prompt-builder/index.ts index cf55068e1fff..43ad5b7d633c 100644 --- a/vscode/src/prompt-builder/index.ts +++ b/vscode/src/prompt-builder/index.ts @@ -33,6 +33,7 @@ export class PromptBuilder { * A list of context items that are used to build context messages. */ public contextItems: ContextItem[] = [] + public images: string[] = [] /** * Convenience constructor because loading the tokenizer is async due to its large size. 
@@ -47,10 +48,28 @@
         if (this.contextItems.length > 0) {
             this.buildContextMessages()
         }
-
+        this.buildImageMessages()
         return this.prefixMessages.concat([...this.reverseMessages].reverse())
     }
 
+    private buildImageMessages(): void {
+        for (const image of this.images) {
+            const imageMessage: Message = {
+                speaker: 'human',
+                content: [
+                    {
+                        type: 'image_url',
+                        image_url: {
+                            // TODO: Handle PNG/JPEG, don't hardcode to JPEG
+                            url: `data:image/jpeg;base64,${image}`,
+                        },
+                    },
+                ],
+            }
+            this.reverseMessages.push(...[ASSISTANT_MESSAGE, imageMessage])
+        }
+    }
+
     private buildContextMessages(): void {
         for (const item of this.contextItems) {
             // Create context messages for each context item, where
@@ -108,6 +127,12 @@
         return undefined
     }
 
+    public tryAddImage(base64Image: string | undefined): void {
+        if (base64Image) {
+            this.images.push(base64Image)
+        }
+    }
+
     public async tryAddContext(
         type: ContextTokenUsageType | 'history',
         contextItems: ContextItem[]
diff --git a/vscode/webviews/chat/cells/messageCell/human/editor/HumanMessageEditor.tsx b/vscode/webviews/chat/cells/messageCell/human/editor/HumanMessageEditor.tsx
index 6bb138f536bf..e4769a35ebf9 100644
--- a/vscode/webviews/chat/cells/messageCell/human/editor/HumanMessageEditor.tsx
+++ b/vscode/webviews/chat/cells/messageCell/human/editor/HumanMessageEditor.tsx
@@ -31,6 +31,7 @@ import {
 import type { UserAccountInfo } from '../../../../../Chat'
 import { type ClientActionListener, useClientActionListener } from '../../../../../client/clientState'
 import { promptModeToIntent } from '../../../../../prompts/PromptsTab'
+import { getVSCodeAPI } from '../../../../../utils/VSCodeApi'
 import { useTelemetryRecorder } from '../../../../../utils/telemetry'
 import { useFeatureFlag } from '../../../../../utils/useFeatureFlags'
 import { useLinkOpener } from '../../../../../utils/useLinkOpener'
@@ -99,6 +100,8 @@
 }) => {
     const telemetryRecorder = useTelemetryRecorder()
 
+    const [imageFile, setImageFile] = useState<File | undefined>(undefined)
+
     const editorRef = useRef<PromptEditorRefAPI>(null)
     useImperativeHandle(parentEditorRef, (): PromptEditorRefAPI | null => editorRef.current, [])
 
@@ -126,7 +129,7 @@
     const experimentalPromptEditorEnabled = useFeatureFlag(FeatureFlag.CodyExperimentalPromptEditor)
 
     const onSubmitClick = useCallback(
-        (intent?: ChatMessage['intent'], forceSubmit?: boolean): void => {
+        async (intent?: ChatMessage['intent'], forceSubmit?: boolean): Promise<void> => {
             if (!forceSubmit && submitState === 'emptyEditorValue') {
                 return
             }
@@ -142,6 +145,28 @@
 
             const value = editorRef.current.getSerializedValue()
             parentOnSubmit(intent)
 
+            if (imageFile) {
+                const readFileGetBase64String = (file: File): Promise<string> => {
+                    return new Promise((resolve, reject) => {
+                        const reader = new FileReader()
+                        reader.onload = () => {
+                            const base64 = reader.result
+                            if (base64 && typeof base64 === 'string') {
+                                resolve(base64.split(',')[1])
+                            } else {
+                                reject(new Error('Failed to read file'))
+                            }
+                        }
+                        reader.onerror = () => reject(new Error('Failed to read file'))
+                        reader.readAsDataURL(file)
+                    })
+                }
+
+                const base64 = await readFileGetBase64String(imageFile)
+                getVSCodeAPI().postMessage({ command: 'chat/upload-file', base64 })
+                setImageFile(undefined)
+            }
             parentOnSubmit(intent)
 
             telemetryRecorder.recordEvent('cody.humanMessageEditor', 'submit', {
                 metadata: {
@@ -157,7 +182,15 @@ export const 
HumanMessageEditor: FunctionComponent<{
                 },
             })
         },
-        [submitState, parentOnSubmit, onStop, telemetryRecorder.recordEvent, isFirstMessage, isSent]
+        [
+            submitState,
+            parentOnSubmit,
+            onStop,
+            telemetryRecorder.recordEvent,
+            isFirstMessage,
+            isSent,
+            imageFile,
+        ]
     )
 
     const onEditorEnterKey = useCallback(
@@ -423,6 +456,7 @@
     )
 
     const Editor = experimentalPromptEditorEnabled ? PromptEditorV2 : PromptEditor
+    const experimentalOneBoxEnabled = useFeatureFlag(FeatureFlag.CodyExperimentalOneBoxDebug)
 
     return (
         // biome-ignore lint/a11y/useKeyWithClickEvents: only relevant to click areas
@@ -470,6 +504,9 @@
                             hidden={!focused && isSent}
                             className={styles.toolbar}
                             intent={intent}
+                            imageFile={imageFile}
+                            setImageFile={setImageFile}
+                            experimentalOneBoxEnabled={experimentalOneBoxEnabled}
                         />
                     )}
diff --git a/vscode/webviews/chat/cells/messageCell/human/editor/toolbar/Toolbar.tsx b/vscode/webviews/chat/cells/messageCell/human/editor/toolbar/Toolbar.tsx
index 6735273fb14b..e03d5839e94d 100644
--- a/vscode/webviews/chat/cells/messageCell/human/editor/toolbar/Toolbar.tsx
+++ b/vscode/webviews/chat/cells/messageCell/human/editor/toolbar/Toolbar.tsx
@@ -10,6 +10,7 @@ import { useActionSelect } from '../../../../../../prompts/PromptsTab'
 import { useClientConfig } from '../../../../../../utils/useClientConfig'
 import { AddContextButton } from './AddContextButton'
 import { SubmitButton, type SubmitButtonState } from './SubmitButton'
+import { UploadImageButton } from './UploadImageButton'
 
 /**
  * The toolbar for the human message editor.
@@ -35,6 +36,10 @@ export const Toolbar: FunctionComponent<{
     intent?: ChatMessage['intent']
     manuallySelectIntent: (intent: ChatMessage['intent']) => void
 
+    experimentalOneBoxEnabled?: boolean
+
+    imageFile?: File
+    setImageFile: (file: File | undefined) => void
 }> = ({
     userInfo,
     isEditorFocused,
@@ -48,6 +53,9 @@
     models,
     intent,
     manuallySelectIntent,
+    experimentalOneBoxEnabled,
+    imageFile,
+    setImageFile,
 }) => {
     /**
      * If the user clicks in a gap or on the toolbar outside of any of its buttons, report back to
@@ -88,6 +96,14 @@
                         />
                     )}
+                    {
+                        <UploadImageButton
+                            imageFile={imageFile}
+                            onClick={setImageFile}
+                        />
+                    }
+
diff --git a/vscode/webviews/chat/cells/messageCell/human/editor/toolbar/UploadImageButton.tsx b/vscode/webviews/chat/cells/messageCell/human/editor/toolbar/UploadImageButton.tsx
new file mode 100644
--- /dev/null
+++ b/vscode/webviews/chat/cells/messageCell/human/editor/toolbar/UploadImageButton.tsx
@@ -0,0 +1,68 @@
+interface UploadImageButtonProps {
+    imageFile?: File
+    onClick: (file: File | undefined) => void
+}
+
+export const UploadImageButton = (props: UploadImageButtonProps) => {
+    const fileInputRef = useRef<HTMLInputElement>(null)
+
+    const handleButtonClick = () => {
+        fileInputRef.current?.click()
+    }
+
+    const handleFileChange = async (event: React.ChangeEvent<HTMLInputElement>) => {
+        const file = event.target.files?.[0]
+        props.onClick(file)
+    }
+
+    return (
+
+
+
+
+            {props.imageFile ? 
'Remove attached image' : 'Upload an image'} + + + + ) +} diff --git a/vscode/webviews/components/modelSelectField/ModelSelectField.module.css b/vscode/webviews/components/modelSelectField/ModelSelectField.module.css index 6ef4a7f15e62..14350c68bdf4 100644 --- a/vscode/webviews/components/modelSelectField/ModelSelectField.module.css +++ b/vscode/webviews/components/modelSelectField/ModelSelectField.module.css @@ -36,6 +36,10 @@ margin-left: auto; } +.supports-image-upload-icon { + margin-left: auto; +} + .badge { margin-left: auto; line-height: 16px; @@ -45,12 +49,17 @@ border: 1px solid var(--vscode-contrastBorder); } +.supports-image-upload-icon + .badge { + margin-left: 0; +} + button > .model-title-with-icon .model-name { font-weight: normal; } button > .model-title-with-icon .model-icon, button > .model-title-with-icon .model-provider, -button > .model-title-with-icon .badge { +button > .model-title-with-icon .badge, +.supports-image-upload-icon { display: none; }
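
Note for reviewers: the sketch below is only an illustration of the message shape this change builds toward. It restates the MessagePart union added to lib/shared/src/sourcegraph-api/completions/types.ts and the data-URL encoding used by PromptBuilder.buildImageMessages(); the local type names, the prompt text, and the base64 payload are placeholders, and text is shown as a plain string rather than the PromptString used by the real Message interface.

// Sketch only: a multi-modal chat message using the MessagePart union from
// completions/types.ts. All concrete values are placeholders. The current
// PromptBuilder.buildImageMessages() emits the image as its own human message
// with a single image_url part and hardcodes JPEG (see the TODO in the diff).

type SketchMessagePart =
    | { type: 'text'; text: string }
    | { type: 'image_url'; image_url: { url: string } }

interface SketchMessage {
    speaker: 'human' | 'assistant' | 'system'
    content?: string | SketchMessagePart[]
}

// Stands in for the base64 string the webview produces with FileReader.readAsDataURL().
const placeholderBase64 = 'AAAA'

const imageTurn: SketchMessage = {
    speaker: 'human',
    content: [
        { type: 'text', text: 'Write the basic HTML structure for this Figma design.' },
        {
            type: 'image_url',
            // Same data-URL format as buildImageMessages(), following the
            // OpenAI vision docs linked in types.ts.
            image_url: { url: `data:image/jpeg;base64,${placeholderBase64}` },
        },
    ],
}

End to end: the webview reads the uploaded File as base64 and posts it with the new 'chat/upload-file' message, ChatController stores it on ChatBuilder, the next human message carries it as base64Image, and PromptBuilder.tryAddImage()/buildImageMessages() turn it into a message like the one above before the completion request is sent.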