Multimodality: Upload PDFs and Images (#135)

This commit is contained in:
Brace Sproul
2025-05-20 10:23:21 -07:00
committed by GitHub
10 changed files with 1947 additions and 1214 deletions

View File

@@ -1,4 +1,10 @@
/** @type {import('next').NextConfig} */ /** @type {import('next').NextConfig} */
const nextConfig = {}; const nextConfig = {
experimental: {
serverActions: {
bodySizeLimit: "10mb",
},
},
};
export default nextConfig; export default nextConfig;

View File

@@ -53,7 +53,7 @@
"tailwind-merge": "^3.0.2", "tailwind-merge": "^3.0.2",
"tailwindcss-animate": "^1.0.7", "tailwindcss-animate": "^1.0.7",
"use-stick-to-bottom": "^1.0.46", "use-stick-to-bottom": "^1.0.46",
"uuid": "^11.0.5", "uuid": "^11.1.0",
"zod": "^3.24.2" "zod": "^3.24.2"
}, },
"devDependencies": { "devDependencies": {
@@ -64,6 +64,7 @@
"@types/react": "^19.0.8", "@types/react": "^19.0.8",
"@types/react-dom": "^19.0.3", "@types/react-dom": "^19.0.3",
"@types/react-syntax-highlighter": "^15.5.13", "@types/react-syntax-highlighter": "^15.5.13",
"@types/uuid": "^10.0.0",
"autoprefixer": "^10.4.20", "autoprefixer": "^10.4.20",
"dotenv": "^16.4.7", "dotenv": "^16.4.7",
"eslint": "^9.19.0", "eslint": "^9.19.0",
@@ -81,8 +82,7 @@
"typescript-eslint": "^8.22.0" "typescript-eslint": "^8.22.0"
}, },
"overrides": { "overrides": {
"react-is": "^19.0.0-rc-69d4b800-20241021", "react-is": "^19.0.0-rc-69d4b800-20241021"
"@langchain/langgraph-checkpoint": "^0.0.16"
}, },
"packageManager": "pnpm@10.5.1" "packageManager": "pnpm@10.5.1"
} }

2634
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,37 @@
import React from "react";
import type { Base64ContentBlock } from "@langchain/core/messages";
import { MultimodalPreview } from "../ui/MultimodalPreview";
import { cn } from "@/lib/utils";
/** Props accepted by the ContentBlocksPreview component. */
interface ContentBlocksPreviewProps {
/** Content blocks to preview; the component renders nothing when this array is empty. */
blocks: Base64ContentBlock[];
/** Invoked with the index (within `blocks`) of the block whose preview requested removal. */
onRemove: (idx: number) => void;
/** Preview size forwarded to every MultimodalPreview; the component defaults it to "md". */
size?: "sm" | "md" | "lg";
/** Extra class names merged onto the wrapping element. */
className?: string;
}
/**
 * Shows a wrapping row of removable previews, one per content block.
 *
 * Returns `null` when `blocks` is empty so no empty container is rendered.
 * `className` is merged with the default layout classes through `cn`.
 */
export const ContentBlocksPreview: React.FC<ContentBlocksPreviewProps> = (
  props,
) => {
  const { blocks, onRemove, size = "md", className } = props;

  if (blocks.length === 0) {
    return null;
  }

  // Each preview reports removal using its position in `blocks`.
  const previews = blocks.map((block, idx) => (
    <MultimodalPreview
      key={idx}
      block={block}
      removable
      onRemove={() => onRemove(idx)}
      size={size}
    />
  ));

  return (
    <div className={cn("flex flex-wrap gap-2 p-3.5 pb-0", className)}>
      {previews}
    </div>
  );
};

View File

@@ -21,6 +21,8 @@ import {
PanelRightClose, PanelRightClose,
SquarePen, SquarePen,
XIcon, XIcon,
Plus,
CircleX,
} from "lucide-react"; } from "lucide-react";
import { useQueryState, parseAsBoolean } from "nuqs"; import { useQueryState, parseAsBoolean } from "nuqs";
import { StickToBottom, useStickToBottomContext } from "use-stick-to-bottom"; import { StickToBottom, useStickToBottomContext } from "use-stick-to-bottom";
@@ -36,6 +38,8 @@ import {
TooltipProvider, TooltipProvider,
TooltipTrigger, TooltipTrigger,
} from "../ui/tooltip"; } from "../ui/tooltip";
import { useFileUpload } from "@/hooks/use-file-upload";
import { ContentBlocksPreview } from "./ContentBlocksPreview";
import { import {
useArtifactOpen, useArtifactOpen,
ArtifactContent, ArtifactContent,
@@ -122,6 +126,14 @@ export function Thread() {
parseAsBoolean.withDefault(false), parseAsBoolean.withDefault(false),
); );
const [input, setInput] = useState(""); const [input, setInput] = useState("");
const {
contentBlocks,
setContentBlocks,
handleFileUpload,
dropRef,
removeBlock,
resetBlocks,
} = useFileUpload();
const [firstTokenReceived, setFirstTokenReceived] = useState(false); const [firstTokenReceived, setFirstTokenReceived] = useState(false);
const isLargeScreen = useMediaQuery("(min-width: 1024px)"); const isLargeScreen = useMediaQuery("(min-width: 1024px)");
@@ -183,13 +195,17 @@ export function Thread() {
const handleSubmit = (e: FormEvent) => { const handleSubmit = (e: FormEvent) => {
e.preventDefault(); e.preventDefault();
if (!input.trim() || isLoading) return; if ((input.trim().length === 0 && contentBlocks.length === 0) || isLoading)
return;
setFirstTokenReceived(false); setFirstTokenReceived(false);
const newHumanMessage: Message = { const newHumanMessage: Message = {
id: uuidv4(), id: uuidv4(),
type: "human", type: "human",
content: input, content: [
...(input.trim().length > 0 ? [{ type: "text", text: input }] : []),
...contentBlocks,
] as Message["content"],
}; };
const toolMessages = ensureToolCallsHaveResponses(stream.messages); const toolMessages = ensureToolCallsHaveResponses(stream.messages);
@@ -214,6 +230,7 @@ export function Thread() {
); );
setInput(""); setInput("");
setContentBlocks([]);
}; };
const handleRegenerate = ( const handleRegenerate = (
@@ -423,11 +440,18 @@ export function Thread() {
<ScrollToBottom className="animate-in fade-in-0 zoom-in-95 absolute bottom-full left-1/2 mb-4 -translate-x-1/2" /> <ScrollToBottom className="animate-in fade-in-0 zoom-in-95 absolute bottom-full left-1/2 mb-4 -translate-x-1/2" />
<div className="bg-muted relative z-10 mx-auto mb-8 w-full max-w-3xl rounded-2xl border shadow-xs"> <div
ref={dropRef}
className="bg-muted relative z-10 mx-auto mb-8 w-full max-w-3xl rounded-2xl border shadow-xs"
>
<form <form
onSubmit={handleSubmit} onSubmit={handleSubmit}
className="mx-auto grid max-w-3xl grid-rows-[1fr_auto] gap-2" className="mx-auto grid max-w-3xl grid-rows-[1fr_auto] gap-2"
> >
<ContentBlocksPreview
blocks={contentBlocks}
onRemove={removeBlock}
/>
<textarea <textarea
value={input} value={input}
onChange={(e) => setInput(e.target.value)} onChange={(e) => setInput(e.target.value)}
@@ -448,7 +472,7 @@ export function Thread() {
className="field-sizing-content resize-none border-none bg-transparent p-3.5 pb-0 shadow-none ring-0 outline-none focus:ring-0 focus:outline-none" className="field-sizing-content resize-none border-none bg-transparent p-3.5 pb-0 shadow-none ring-0 outline-none focus:ring-0 focus:outline-none"
/> />
<div className="flex items-center justify-between p-2 pt-4"> <div className="flex items-center gap-6 p-2 pt-4">
<div> <div>
<div className="flex items-center space-x-2"> <div className="flex items-center space-x-2">
<Switch <Switch
@@ -464,10 +488,28 @@ export function Thread() {
</Label> </Label>
</div> </div>
</div> </div>
<Label
htmlFor="file-input"
className="flex cursor-pointer items-center gap-2"
>
<Plus className="size-5 text-gray-600" />
<span className="text-sm text-gray-600">
Upload PDF or Image
</span>
</Label>
<input
id="file-input"
type="file"
onChange={handleFileUpload}
multiple
accept="image/jpeg,image/png,image/gif,image/webp,application/pdf"
className="hidden"
/>
{stream.isLoading ? ( {stream.isLoading ? (
<Button <Button
key="stop" key="stop"
onClick={() => stream.stop()} onClick={() => stream.stop()}
className="ml-auto"
> >
<LoaderCircle className="h-4 w-4 animate-spin" /> <LoaderCircle className="h-4 w-4 animate-spin" />
Cancel Cancel
@@ -475,8 +517,11 @@ export function Thread() {
) : ( ) : (
<Button <Button
type="submit" type="submit"
className="shadow-md transition-all" className="ml-auto shadow-md transition-all"
disabled={isLoading || !input.trim()} disabled={
isLoading ||
(!input.trim() && contentBlocks.length === 0)
}
> >
Send Send
</Button> </Button>

View File

@@ -5,6 +5,8 @@ import { getContentString } from "../utils";
import { cn } from "@/lib/utils"; import { cn } from "@/lib/utils";
import { Textarea } from "@/components/ui/textarea"; import { Textarea } from "@/components/ui/textarea";
import { BranchSwitcher, CommandBar } from "./shared"; import { BranchSwitcher, CommandBar } from "./shared";
import { MultimodalPreview } from "@/components/ui/MultimodalPreview";
import type { Base64ContentBlock } from "@langchain/core/messages";
function EditableContent({ function EditableContent({
value, value,
@@ -32,6 +34,36 @@ function EditableContent({
); );
} }
/**
 * Type guard for Base64ContentBlock.
 *
 * Accepts an object only when `source_type` is "base64", `mime_type` is a
 * string, and one of the following holds:
 * - `type` is "file" (legacy) and the MIME type is an image or "application/pdf";
 * - `type` is "image" (new) and the MIME type is an image.
 */
function isBase64ContentBlock(block: unknown): block is Base64ContentBlock {
  if (block === null || typeof block !== "object" || !("type" in block)) {
    return false;
  }

  const { type, source_type, mime_type } = block as Record<string, unknown>;

  // Both accepted shapes share these two requirements.
  if (source_type !== "base64" || typeof mime_type !== "string") {
    return false;
  }

  const hasImageMime = mime_type.startsWith("image/");
  switch (type) {
    case "file":
      return hasImageMime || mime_type === "application/pdf";
    case "image":
      return hasImageMime;
    default:
      return false;
  }
}
export function HumanMessage({ export function HumanMessage({
message, message,
isLoading, isLoading,
@@ -84,9 +116,34 @@ export function HumanMessage({
onSubmit={handleSubmitEdit} onSubmit={handleSubmitEdit}
/> />
) : ( ) : (
<p className="bg-muted ml-auto w-fit rounded-3xl px-4 py-2 whitespace-pre-wrap"> <div className="flex flex-col gap-2">
{contentString} {/* Render images and files if no text */}
</p> {Array.isArray(message.content) && message.content.length > 0 && (
<div className="flex flex-col items-end gap-2">
{message.content.reduce<React.ReactNode[]>(
(acc, block, idx) => {
if (isBase64ContentBlock(block)) {
acc.push(
<MultimodalPreview
key={idx}
block={block}
size="md"
/>,
);
}
return acc;
},
[],
)}
</div>
)}
{/* Render text if present, otherwise fallback to file/image name */}
{contentString ? (
<p className="bg-muted ml-auto w-fit rounded-3xl px-4 py-2 text-right whitespace-pre-wrap">
{contentString}
</p>
) : null}
</div>
)} )}
<div <div

View File

@@ -1,5 +1,11 @@
import type { Message } from "@langchain/langgraph-sdk"; import type { Message } from "@langchain/langgraph-sdk";
/**
* Extracts a string summary from a message's content, supporting multimodal (text, image, file, etc.).
* - If text is present, returns the joined text.
* - If not, returns a label for the first non-text modality (e.g., 'Image', 'Other').
* - If unknown, returns 'Multimodal message'.
*/
export function getContentString(content: Message["content"]): string { export function getContentString(content: Message["content"]): string {
if (typeof content === "string") return content; if (typeof content === "string") return content;
const texts = content const texts = content

View File

@@ -0,0 +1,129 @@
import React from "react";
import { File, Image as ImageIcon, X as XIcon } from "lucide-react";
import type { Base64ContentBlock } from "@langchain/core/messages";
import { cn } from "@/lib/utils";
import Image from "next/image";
/** Props accepted by the MultimodalPreview component. */
export interface MultimodalPreviewProps {
/** The base64 content block to preview (image, PDF file, or anything else via the fallback). */
block: Base64ContentBlock;
/** When true a remove button is rendered; the component defaults this to false. */
removable?: boolean;
/** Click handler attached to the remove button. */
onRemove?: () => void;
/** Extra class names merged onto the outermost element of the preview. */
className?: string;
/** Visual size of the preview; the component defaults this to "md". */
size?: "sm" | "md" | "lg";
}
type PreviewSize = NonNullable<MultimodalPreviewProps["size"]>;

/** Tailwind sizing classes applied to an image thumbnail, per preview size. */
const IMAGE_SIZE_CLASSES: Record<PreviewSize, string> = {
  sm: "h-10 w-10 text-base",
  md: "h-16 w-16 text-lg",
  lg: "h-24 w-24 text-xl",
};

/** `width`/`height` values handed to `next/image`, per preview size. */
const IMAGE_PIXEL_SIZE: Record<PreviewSize, number> = {
  sm: 16,
  md: 32,
  lg: 48,
};

/** Small "x" button shared by all three preview variants. */
function RemoveButton({
  label,
  className,
  onClick,
}: {
  label: string;
  className: string;
  onClick?: () => void;
}) {
  return (
    <button
      type="button"
      className={className}
      onClick={onClick}
      aria-label={label}
    >
      <XIcon className="h-4 w-4" />
    </button>
  );
}

/**
 * Renders a compact preview of a single base64 content block.
 *
 * - `type: "image"` blocks with an `image/*` MIME type render as a thumbnail
 *   built from a `data:` URL.
 * - `type: "file"` blocks with MIME type `application/pdf` render as a file
 *   chip showing `metadata.filename`, then `metadata.name`, then "PDF file".
 * - Anything else renders an "Unsupported file type" chip.
 *
 * When `removable` is true each variant also renders a button that calls
 * `onRemove` on click.
 */
export const MultimodalPreview: React.FC<MultimodalPreviewProps> = ({
  block,
  removable = false,
  onRemove,
  className,
  size = "md",
}) => {
  // Image block
  if (
    block.type === "image" &&
    block.source_type === "base64" &&
    typeof block.mime_type === "string" &&
    block.mime_type.startsWith("image/")
  ) {
    const url = `data:${block.mime_type};base64,${block.data}`;
    const pixels = IMAGE_PIXEL_SIZE[size];
    return (
      <div className={cn("relative inline-block", className)}>
        <Image
          src={url}
          alt={String(block.metadata?.name || "uploaded image")}
          className={`rounded-md object-cover ${IMAGE_SIZE_CLASSES[size]}`}
          width={pixels}
          height={pixels}
        />
        {removable && (
          <RemoveButton
            label="Remove image"
            className="absolute top-1 right-1 z-10 rounded-full bg-gray-500 text-white hover:bg-gray-700"
            onClick={onRemove}
          />
        )}
      </div>
    );
  }

  // PDF block
  if (
    block.type === "file" &&
    block.source_type === "base64" &&
    block.mime_type === "application/pdf"
  ) {
    // `||` (not `??`) so an empty-string name also falls through to the next candidate.
    const filename =
      block.metadata?.filename || block.metadata?.name || "PDF file";
    const compact = size === "sm";
    return (
      <div
        className={cn(
          "relative flex items-center gap-2 rounded-md border bg-gray-100 px-3 py-2",
          className,
        )}
      >
        <File
          className={cn(
            "flex-shrink-0 text-teal-700",
            compact ? "h-5 w-5" : "h-7 w-7",
          )}
        />
        <span
          className={cn(
            "truncate text-sm text-gray-800",
            compact ? "max-w-[80px]" : "max-w-[160px]",
          )}
        >
          {String(filename)}
        </span>
        {removable && (
          <RemoveButton
            label="Remove PDF"
            className="ml-2 rounded-full bg-gray-200 p-1 text-teal-700 hover:bg-gray-300"
            onClick={onRemove}
          />
        )}
      </div>
    );
  }

  // Fallback for unknown types
  return (
    <div
      className={cn(
        "flex items-center gap-2 rounded-md border bg-gray-100 px-3 py-2 text-gray-500",
        className,
      )}
    >
      <File className="h-5 w-5 flex-shrink-0" />
      <span className="truncate text-xs">Unsupported file type</span>
      {removable && (
        <RemoveButton
          label="Remove file"
          className="ml-2 rounded-full bg-gray-200 p-1 text-gray-500 hover:bg-gray-300"
          onClick={onRemove}
        />
      )}
    </div>
  );
};

View File

@@ -0,0 +1,164 @@
import { useState, useRef, useEffect, ChangeEvent } from "react";
import { toast } from "sonner";
import type { Base64ContentBlock } from "@langchain/core/messages";
import { fileToContentBlock } from "@/lib/multimodal-utils";
/**
 * MIME types the upload hook accepts: four image formats plus PDF.
 * Files whose `type` is not in this list are reported with an error toast
 * and are not converted into content blocks.
 */
export const SUPPORTED_FILE_TYPES = [
"image/jpeg",
"image/png",
"image/gif",
"image/webp",
"application/pdf",
];
/** Options accepted by the useFileUpload hook. */
interface UseFileUploadOptions {
/** Initial value of the hook's content-block state; the hook defaults it to an empty array. */
initialBlocks?: Base64ContentBlock[];
}
/** Toast text shown when a batch contains at least one unsupported MIME type. */
const INVALID_FILE_TYPE_MESSAGE =
  "You have uploaded invalid file type. Please upload a JPEG, PNG, GIF, WEBP image or a PDF.";

/**
 * Reports whether `file` is already represented in `blocks`.
 * PDFs match on `metadata.filename`; images match on `metadata.name` plus MIME type.
 */
function isDuplicate(file: File, blocks: Base64ContentBlock[]): boolean {
  if (file.type === "application/pdf") {
    return blocks.some(
      (b) =>
        b.type === "file" &&
        b.mime_type === "application/pdf" &&
        b.metadata?.filename === file.name,
    );
  }
  if (SUPPORTED_FILE_TYPES.includes(file.type)) {
    return blocks.some(
      (b) =>
        b.type === "image" &&
        b.metadata?.name === file.name &&
        b.mime_type === file.type,
    );
  }
  return false;
}

/**
 * Validates a batch of files against `existing` blocks, shows a toast for
 * each category of rejected file, and converts the remaining files.
 *
 * Never rejects: a file that fails to convert is reported via toast and
 * skipped, so the other files in the batch are still returned.
 */
async function readNewBlocks(
  files: File[],
  existing: Base64ContentBlock[],
): Promise<Base64ContentBlock[]> {
  const invalid: File[] = [];
  const duplicates: File[] = [];
  const unique: File[] = [];

  for (const file of files) {
    if (!SUPPORTED_FILE_TYPES.includes(file.type)) {
      invalid.push(file);
    } else if (
      isDuplicate(file, existing) ||
      // Also catch the same name/type appearing twice within this batch.
      unique.some((u) => u.name === file.name && u.type === file.type)
    ) {
      duplicates.push(file);
    } else {
      unique.push(file);
    }
  }

  if (invalid.length > 0) {
    toast.error(INVALID_FILE_TYPE_MESSAGE);
  }
  if (duplicates.length > 0) {
    toast.error(
      `Duplicate file(s) detected: ${duplicates.map((f) => f.name).join(", ")}. Each file can only be uploaded once per message.`,
    );
  }

  const failedNames: string[] = [];
  const converted = await Promise.all(
    unique.map(async (file) => {
      try {
        return await fileToContentBlock(file);
      } catch (error: unknown) {
        console.error(`Failed to read file "${file.name}"`, error);
        failedNames.push(file.name);
        return null;
      }
    }),
  );

  if (failedNames.length > 0) {
    toast.error(`Failed to read file(s): ${failedNames.join(", ")}.`);
  }

  return converted.filter(
    (block): block is Base64ContentBlock => block !== null,
  );
}

/**
 * Manages the list of uploaded content blocks for a message composer.
 *
 * @param options.initialBlocks Initial content blocks (defaults to none).
 * @returns
 * - `contentBlocks` / `setContentBlocks`: the block list state and its setter.
 * - `handleFileUpload`: `onChange` handler for an `<input type="file">`.
 * - `dropRef`: ref for the element that should accept dropped files.
 * - `removeBlock(idx)`: removes the block at `idx`.
 * - `resetBlocks()`: clears all blocks.
 */
export function useFileUpload({
  initialBlocks = [],
}: UseFileUploadOptions = {}) {
  const [contentBlocks, setContentBlocks] =
    useState<Base64ContentBlock[]>(initialBlocks);
  const dropRef = useRef<HTMLDivElement>(null);

  const handleFileUpload = async (e: ChangeEvent<HTMLInputElement>) => {
    const input = e.target;
    if (!input.files) return;
    // Copy the FileList first: clearing the input's value empties it.
    const selected = Array.from(input.files);
    // Clear the value so selecting the same file again fires another change event.
    input.value = "";

    const newBlocks = await readNewBlocks(selected, contentBlocks);
    if (newBlocks.length > 0) {
      setContentBlocks((prev) => [...prev, ...newBlocks]);
    }
  };

  // Drag and drop handlers
  useEffect(() => {
    const element = dropRef.current;
    if (!element) return;

    // Used as-is for dragover/dragenter/dragleave and as the first step of drop.
    const suppressDefault = (e: DragEvent) => {
      e.preventDefault();
      e.stopPropagation();
    };

    const handleDrop = async (e: DragEvent) => {
      suppressDefault(e);
      if (!e.dataTransfer) return;

      const dropped = Array.from(e.dataTransfer.files);
      const newBlocks = await readNewBlocks(dropped, contentBlocks);
      if (newBlocks.length > 0) {
        setContentBlocks((prev) => [...prev, ...newBlocks]);
      }
    };

    const passthroughEvents = ["dragover", "dragenter", "dragleave"] as const;
    for (const name of passthroughEvents) {
      element.addEventListener(name, suppressDefault);
    }
    element.addEventListener("drop", handleDrop);

    return () => {
      for (const name of passthroughEvents) {
        element.removeEventListener(name, suppressDefault);
      }
      element.removeEventListener("drop", handleDrop);
    };
    // Re-subscribe when the blocks change so the drop handler checks
    // duplicates against the current list.
  }, [contentBlocks]);

  const removeBlock = (idx: number) => {
    setContentBlocks((prev) => prev.filter((_, i) => i !== idx));
  };

  const resetBlocks = () => setContentBlocks([]);

  return {
    contentBlocks,
    setContentBlocks,
    handleFileUpload,
    dropRef,
    removeBlock,
    resetBlocks,
  };
}

View File

@@ -0,0 +1,57 @@
import type { Base64ContentBlock } from "@langchain/core/messages";
import { toast } from "sonner";
/**
 * Converts a browser `File` into a typed base64 content block.
 *
 * JPEG, PNG, GIF and WEBP files become `type: "image"` blocks with the file
 * name stored in `metadata.name`; PDFs become `type: "file"` blocks with the
 * file name stored in `metadata.filename`.
 *
 * @throws Error (as a rejected promise) after showing an error toast when the
 *   file's MIME type is not one of the supported types.
 */
export async function fileToContentBlock(
  file: File,
): Promise<Base64ContentBlock> {
  const imageMimeTypes = ["image/jpeg", "image/png", "image/gif", "image/webp"];
  const acceptedMimeTypes = imageMimeTypes.concat("application/pdf");
  const mimeType = file.type;

  if (acceptedMimeTypes.indexOf(mimeType) === -1) {
    toast.error(
      `Unsupported file type: ${mimeType}. Supported types are: ${acceptedMimeTypes.join(", ")}`,
    );
    throw new Error(`Unsupported file type: ${mimeType}`);
  }

  const data = await fileToBase64(file);

  if (imageMimeTypes.indexOf(mimeType) !== -1) {
    return {
      type: "image",
      source_type: "base64",
      mime_type: mimeType,
      data,
      metadata: { name: file.name },
    };
  }

  // Only PDF remains at this point.
  return {
    type: "file",
    source_type: "base64",
    mime_type: "application/pdf",
    data,
    metadata: { filename: file.name },
  };
}
/**
 * Reads a `File` and resolves with its contents as a base64 string, without
 * the leading `data:<mime>;base64,` prefix.
 *
 * Rejects with an `Error` when the read fails or is aborted, or when the
 * reader does not produce a data URL.
 */
export function fileToBase64(file: File): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();

    // Use `onload`, not `onloadend`: `loadend` also fires after a failed read,
    // when `reader.result` is null and cannot be split.
    reader.onload = () => {
      const result = reader.result;
      const commaIndex = typeof result === "string" ? result.indexOf(",") : -1;
      if (typeof result !== "string" || commaIndex === -1) {
        reject(new Error(`Unexpected read result for file: ${file.name}`));
        return;
      }
      // Remove the data:...;base64, prefix
      resolve(result.slice(commaIndex + 1));
    };
    reader.onerror = () => {
      reject(reader.error ?? new Error(`Failed to read file: ${file.name}`));
    };
    reader.onabort = () => {
      reject(new Error(`Reading was aborted for file: ${file.name}`));
    };

    reader.readAsDataURL(file);
  });
}