
Commit b59882d

feat: implement multimodal support with user-defined overrides and remove screenshot functionality
1 parent 4eea9dd commit b59882d

16 files changed · +160 −184 lines changed


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -14,4 +14,5 @@ SECRET_CONFIG
 gcp-*.json
 db
 models/*
-!models/add-your-models-here.txt
+!models/add-your-models-here.txt
+scirac

scira-chat

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Subproject commit 892ec4a60d23cd1c19b1a079c799d24a8a79a262

src/lib/components/Switch.svelte

Lines changed: 24 additions & 24 deletions
@@ -1,33 +1,33 @@
 <script lang="ts">
-	interface Props {
-		checked: boolean;
-		name: string;
-	}
+	interface Props {
+		checked: boolean;
+		name: string;
+	}
 
-	let { checked = $bindable(), name }: Props = $props();
+	let { checked = $bindable(), name }: Props = $props();
 
-	function toggle() {
-		checked = !checked;
-	}
+	function toggle() {
+		checked = !checked;
+	}
 
-	function onKeydown(e: KeyboardEvent) {
-		if (e.key === " " || e.key === "Enter") {
-			e.preventDefault();
-			toggle();
-		}
-	}
+	function onKeydown(e: KeyboardEvent) {
+		if (e.key === " " || e.key === "Enter") {
+			e.preventDefault();
+			toggle();
+		}
+	}
 </script>
 
 <input bind:checked type="checkbox" {name} class="peer pointer-events-none absolute opacity-0" />
 <div
-	aria-checked={checked}
-	aria-roledescription="switch"
-	aria-label="switch"
-	role="switch"
-	tabindex="0"
-	onclick={toggle}
-	onkeydown={onKeydown}
-	class="relative inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full bg-gray-300 p-1 shadow-inner ring-gray-400 transition-all hover:bg-gray-400 focus-visible:ring focus-visible:ring-offset-1 peer-checked:bg-blue-600 dark:bg-gray-600 peer-checked:[&>div]:translate-x-3.5"
+	aria-checked={checked}
+	aria-roledescription="switch"
+	aria-label="switch"
+	role="switch"
+	tabindex="0"
+	onclick={toggle}
+	onkeydown={onKeydown}
+	class="relative inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full bg-gray-300 p-1 shadow-inner ring-gray-400 transition-all peer-checked:bg-black hover:bg-gray-400 focus-visible:ring focus-visible:ring-offset-1 peer-checked:[&>div]:translate-x-3.5"
 >
-	<div class="h-3.5 w-3.5 rounded-full bg-white shadow-sm transition-all"></div>
-</div>
+	<div class="h-3.5 w-3.5 rounded-full bg-white shadow-sm transition-all"></div>
+</div>

src/lib/components/chat/ChatInput.svelte

Lines changed: 2 additions & 29 deletions
@@ -5,8 +5,6 @@
 	import IconPaperclip from "$lib/components/icons/IconPaperclip.svelte";
 	import { useSettingsStore } from "$lib/stores/settings";
 	import { page } from "$app/state";
-	import { captureScreen } from "$lib/utils/screenshot";
-	import IconScreenshot from "../icons/IconScreenshot.svelte";
 	import { loginModalOpen } from "$lib/stores/loginModal";
 
 	import { isVirtualKeyboard } from "$lib/utils/isVirtualKeyboard";
@@ -94,7 +92,7 @@
 
 	const settings = useSettingsStore();
 
-	// Tools removed; only show file upload and screenshot when applicable
+	// Tools removed; only show file upload when applicable
 	let showFileUpload = $derived(modelIsMultimodal && mimeTypes.length > 0);
 	let showNoTools = $derived(!showFileUpload);
 </script>
@@ -162,32 +160,7 @@
 				</label>
 			</HoverTooltip>
 		</div>
-		{#if mimeTypes.includes("image/*")}
-			<HoverTooltip
-				label="Capture screenshot"
-				position="top"
-				TooltipClassNames="text-xs !text-left !w-auto whitespace-nowrap !py-1 !mb-0 max-sm:hidden"
-			>
-				<button
-					class="base-tool"
-					onclick={async (e) => {
-						e.preventDefault();
-						const screenshot = await captureScreen();
-
-						// Convert base64 to blob
-						const base64Response = await fetch(screenshot);
-						const blob = await base64Response.blob();
-
-						// Create a File object from the blob
-						const file = new File([blob], "screenshot.png", { type: "image/png" });
-
-						files = [...files, file];
-					}}
-				>
-					<IconScreenshot classNames="text-xl" />
-				</button>
-			</HoverTooltip>
-		{/if}
+
 		{/if}
 	</div>
 {/if}
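With the in-app screenshot tool removed, the paperclip upload is the only remaining attachment control, and it is gated entirely by the two derived flags in the script block above. A minimal sketch of that gating as a plain predicate (the standalone function form and its name are illustrative, not code from this commit; the component expresses the same logic with $derived runes):

	// Sketch only: shouldShowFileUpload is a made-up name for illustration.
	function shouldShowFileUpload(modelIsMultimodal: boolean, mimeTypes: string[]): boolean {
		// Upload is offered only when the model (natively or via user override)
		// is multimodal and at least one accepted mime type is configured.
		return modelIsMultimodal && mimeTypes.length > 0;
	}

	const showNoTools = !shouldShowFileUpload(true, ["image/*"]); // false: the upload button is shown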

src/lib/components/chat/ChatWindow.svelte

Lines changed: 6 additions & 2 deletions
@@ -178,10 +178,14 @@
 
 	const settings = useSettingsStore();
 
+	// Respect per-model multimodal toggle from settings (force enable)
+	let modelIsMultimodal = $derived(
+		currentModel.multimodal || ($settings.multimodalOverrides?.[currentModel.id] ?? false)
+	);
 	let activeMimeTypes = $derived(
 		Array.from(
 			new Set([
-				...(currentModel.multimodal
+				...(modelIsMultimodal
 					? (currentModel.multimodalAcceptedMimetypes ?? ["image/*"])
 					: []),
 			])
@@ -364,7 +368,7 @@
 		on:submit={handleSubmit}
 		{onPaste}
 		disabled={isReadOnly || lastIsError}
-		modelIsMultimodal={currentModel.multimodal}
+		modelIsMultimodal={modelIsMultimodal}
 		bind:focused
 	/>
 {/if}
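The derivation added above is the core of the override feature: a model counts as multimodal when it is natively multimodal or when the user has flipped its entry in the multimodalOverrides map. A small sketch of the same resolution as a plain TypeScript function (resolveMultimodal is a made-up name and the parameter shapes are assumptions drawn from the diff):

	// Sketch of the resolution used in ChatWindow.svelte, under assumed shapes.
	function resolveMultimodal(
		model: { id: string; multimodal: boolean },
		overrides?: Record<string, boolean>
	): boolean {
		// The override can only force-enable; a natively multimodal model stays multimodal.
		return model.multimodal || (overrides?.[model.id] ?? false);
	}

Note the derived value is also what ChatInput now receives as modelIsMultimodal, so the upload button follows the override as well.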

src/lib/components/icons/IconScreenshot.svelte

Lines changed: 0 additions & 24 deletions
This file was deleted.

src/lib/server/api/routes/groups/user.ts

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,7 @@ export const userGroup = new Elysia()
 			DEFAULT_SETTINGS.shareConversationsWithModelAuthors,
 
 		customPrompts: settings?.customPrompts ?? {},
+		multimodalOverrides: settings?.multimodalOverrides ?? {},
 	};
 })
 .post("/settings", async ({ locals, request }) => {
@@ -86,6 +87,7 @@ export const userGroup = new Elysia()
 		ethicsModalAccepted: z.boolean().optional(),
 		activeModel: z.string().default(DEFAULT_SETTINGS.activeModel),
 		customPrompts: z.record(z.string()).default({}),
+		multimodalOverrides: z.record(z.boolean()).default({}),
 		disableStream: z.boolean().default(false),
 		directPaste: z.boolean().default(false),
 	})
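On the API side, multimodalOverrides is now echoed back from the settings GET handler and accepted on POST as a record of model id to boolean. A hedged example of a request body the updated schema would accept (the route prefix in the URL and the model ids are placeholders, not taken from the commit):

	// Hypothetical client-side call; only the multimodalOverrides field is new here.
	const body = {
		activeModel: "some-org/some-model",
		customPrompts: {},
		multimodalOverrides: {
			"some-org/text-only-model": true, // force multimodal handling for this model
		},
		disableStream: false,
		directPaste: false,
	};

	await fetch("/settings", {
		method: "POST",
		headers: { "Content-Type": "application/json" },
		body: JSON.stringify(body),
	});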

src/lib/server/endpoints/openai/endpointOai.ts

Lines changed: 49 additions & 50 deletions
@@ -31,24 +31,21 @@ export const endpointOAIParametersSchema = z.object({
 	defaultHeaders: z.record(z.string()).optional(),
 	defaultQuery: z.record(z.string()).optional(),
 	extraBody: z.record(z.any()).optional(),
-	multimodal: z
-		.object({
-			image: createImageProcessorOptionsValidator({
-				supportedMimeTypes: [
-					"image/png",
-					"image/jpeg",
-					"image/webp",
-					"image/avif",
-					"image/tiff",
-					"image/gif",
-				],
-				preferredMimeType: "image/webp",
-				maxSizeInMB: Infinity,
-				maxWidth: 4096,
-				maxHeight: 4096,
-			}),
-		})
-		.default({}),
+	multimodal: z
+		.object({
+			image: createImageProcessorOptionsValidator({
+				supportedMimeTypes: [
+					// Restrict to the most widely-supported formats
+					"image/png",
+					"image/jpeg",
+				],
+				preferredMimeType: "image/jpeg",
+				maxSizeInMB: 3,
+				maxWidth: 2048,
+				maxHeight: 2048,
+			}),
+		})
+		.default({}),
 	/* enable use of max_completion_tokens in place of max_tokens */
 	useCompletionTokens: z.boolean().default(false),
 	streamingSupported: z.boolean().default(true),
@@ -118,11 +115,15 @@ export async function endpointOai(
 
 		return openAICompletionToTextGenerationStream(openAICompletion);
 	};
-	} else if (completion === "chat_completions") {
-		return async ({ messages, preprompt, generateSettings, conversationId }) => {
+	} else if (completion === "chat_completions") {
+		return async ({ messages, preprompt, generateSettings, conversationId, isMultimodal }) => {
 			// Format messages for the chat API, handling multimodal content if supported
-			let messagesOpenAI: OpenAI.Chat.Completions.ChatCompletionMessageParam[] =
-				await prepareMessages(messages, imageProcessor, model.multimodal);
+			let messagesOpenAI: OpenAI.Chat.Completions.ChatCompletionMessageParam[] =
+				await prepareMessages(
+					messages,
+					imageProcessor,
+					isMultimodal ?? model.multimodal
+				);
 
 			// Check if a system message already exists as the first message
 			const hasSystemMessage = messagesOpenAI.length > 0 && messagesOpenAI[0]?.role === "system";
@@ -214,36 +215,34 @@ async function prepareMessages(
 	imageProcessor: ReturnType<typeof makeImageProcessor>,
 	isMultimodal: boolean
 ): Promise<OpenAI.Chat.Completions.ChatCompletionMessageParam[]> {
-	return Promise.all(
-		messages.map(async (message) => {
-			if (message.from === "user" && isMultimodal) {
-				return {
-					role: message.from,
-					content: [
-						...(await prepareFiles(imageProcessor, message.files ?? [])),
-						{ type: "text", text: message.content },
-					],
-				};
-			}
-			return {
-				role: message.from,
-				content: message.content,
-			};
-		})
-	);
+	return Promise.all(
+		messages.map(async (message) => {
+			if (message.from === "user" && isMultimodal) {
+				const parts = [
+					{ type: "text" as const, text: message.content },
+					...(await prepareFiles(imageProcessor, message.files ?? [])),
+				];
+				return { role: message.from, content: parts };
+			}
+			return { role: message.from, content: message.content };
+		})
+	);
 }
 
 async function prepareFiles(
-	imageProcessor: ReturnType<typeof makeImageProcessor>,
-	files: MessageFile[]
+	imageProcessor: ReturnType<typeof makeImageProcessor>,
+	files: MessageFile[]
 ): Promise<OpenAI.Chat.Completions.ChatCompletionContentPartImage[]> {
-	const processedFiles = await Promise.all(
-		files.filter((file) => file.mime.startsWith("image/")).map(imageProcessor)
-	);
-	return processedFiles.map((file) => ({
-		type: "image_url" as const,
-		image_url: {
-			url: `data:${file.mime};base64,${file.image.toString("base64")}`,
-		},
-	}));
+	const processedFiles = await Promise.all(
+		files.filter((file) => file.mime.startsWith("image/")).map(imageProcessor)
+	);
+	return processedFiles.map((file) => ({
+		type: "image_url" as const,
+		image_url: {
+			url: `data:${file.mime};base64,${file.image.toString("base64")}`,
+			// Improves compatibility with some OpenAI-compatible servers
+			// that expect an explicit detail setting.
+			detail: "auto",
+		},
+	}));
 }
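After this change, a multimodal user turn sent to an OpenAI-compatible server carries the text part first, followed by data-URL image parts with an explicit detail field. A sketch of the resulting message shape (the base64 payload is truncated and the prompt text is purely illustrative):

	import OpenAI from "openai";

	// Example of what prepareMessages produces for one user turn with one image attached.
	const exampleUserTurn: OpenAI.Chat.Completions.ChatCompletionMessageParam = {
		role: "user",
		content: [
			{ type: "text", text: "What is in this picture?" },
			{
				type: "image_url",
				image_url: {
					url: "data:image/jpeg;base64,/9j/4AAQSkZJRg...", // truncated
					detail: "auto",
				},
			},
		],
	};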

src/lib/server/textGeneration/generate.ts

Lines changed: 12 additions & 2 deletions
@@ -14,7 +14,16 @@ import { logger } from "../logger";
 type GenerateContext = Omit<TextGenerationContext, "messages"> & { messages: EndpointMessage[] };
 
 export async function* generate(
-	{ model, endpoint, conv, messages, assistant, isContinue, promptedAt }: GenerateContext,
+	{
+		model,
+		endpoint,
+		conv,
+		messages,
+		assistant,
+		isContinue,
+		promptedAt,
+		forceMultimodal,
+	}: GenerateContext,
 	preprompt?: string
 ): AsyncIterable<MessageUpdate> {
 	// reasoning mode is false by default
@@ -45,7 +54,8 @@ export async function* generate(
 		preprompt,
 		continueMessage: isContinue,
 		generateSettings: assistant?.generateSettings,
-		isMultimodal: model.multimodal,
+		// Allow user-level override to force multimodal
+		isMultimodal: (forceMultimodal ?? false) || model.multimodal,
 		conversationId: conv._id,
 	})) {
 		// text generation completed
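forceMultimodal is only consumed here; populating it is left to whatever builds the GenerateContext. A hedged sketch of how a call site could derive it from the user's per-model override before invoking generate (the userSettings, ctx, and send names are assumptions for illustration, not code from this commit):

	// Hypothetical call site: ctx already carries model, endpoint, conv, messages,
	// assistant, isContinue, promptedAt, ip and username.
	const forceMultimodal = userSettings.multimodalOverrides?.[ctx.model.id] ?? false;

	for await (const update of generate({ ...ctx, forceMultimodal }, preprompt)) {
		send(update); // send() stands in for however MessageUpdates reach the client
	}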

src/lib/server/textGeneration/types.ts

Lines changed: 11 additions & 9 deletions
@@ -5,13 +5,15 @@ import type { Message } from "$lib/types/Message";
 import type { Assistant } from "$lib/types/Assistant";
 
 export interface TextGenerationContext {
-	model: ProcessedModel;
-	endpoint: Endpoint;
-	conv: Conversation;
-	messages: Message[];
-	assistant?: Pick<Assistant, "dynamicPrompt" | "generateSettings">;
-	isContinue: boolean;
-	promptedAt: Date;
-	ip: string;
-	username?: string;
+	model: ProcessedModel;
+	endpoint: Endpoint;
+	conv: Conversation;
+	messages: Message[];
+	assistant?: Pick<Assistant, "dynamicPrompt" | "generateSettings">;
+	isContinue: boolean;
+	promptedAt: Date;
+	ip: string;
+	username?: string;
+	/** Force-enable multimodal handling for endpoints that support it */
+	forceMultimodal?: boolean;
 }
