Multimodality: Upload PDFs and Images (#135)

This commit is contained in:
Brace Sproul
2025-05-20 10:23:21 -07:00
committed by GitHub
10 changed files with 1947 additions and 1214 deletions

View File

@@ -1,4 +1,10 @@
/** @type {import('next').NextConfig} */
const nextConfig = {};
const nextConfig = {
experimental: {
serverActions: {
bodySizeLimit: "10mb",
},
},
};
export default nextConfig;

View File

@@ -53,7 +53,7 @@
"tailwind-merge": "^3.0.2",
"tailwindcss-animate": "^1.0.7",
"use-stick-to-bottom": "^1.0.46",
"uuid": "^11.0.5",
"uuid": "^11.1.0",
"zod": "^3.24.2"
},
"devDependencies": {
@@ -64,6 +64,7 @@
"@types/react": "^19.0.8",
"@types/react-dom": "^19.0.3",
"@types/react-syntax-highlighter": "^15.5.13",
"@types/uuid": "^10.0.0",
"autoprefixer": "^10.4.20",
"dotenv": "^16.4.7",
"eslint": "^9.19.0",
@@ -81,8 +82,7 @@
"typescript-eslint": "^8.22.0"
},
"overrides": {
"react-is": "^19.0.0-rc-69d4b800-20241021",
"@langchain/langgraph-checkpoint": "^0.0.16"
"react-is": "^19.0.0-rc-69d4b800-20241021"
},
"packageManager": "pnpm@10.5.1"
}

2634
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,37 @@
import React from "react";
import type { Base64ContentBlock } from "@langchain/core/messages";
import { MultimodalPreview } from "../ui/MultimodalPreview";
import { cn } from "@/lib/utils";
/** Props for {@link ContentBlocksPreview}. */
interface ContentBlocksPreviewProps {
  /** Content blocks to preview, in display order. */
  blocks: Base64ContentBlock[];
  /** Invoked with the index (within `blocks`) of the block to remove. */
  onRemove: (idx: number) => void;
  /** Size forwarded to every preview; "md" when omitted. */
  size?: "sm" | "md" | "lg";
  /** Extra classes merged onto the wrapping element via `cn`. */
  className?: string;
}

/**
 * Lays out one removable `MultimodalPreview` per content block inside a
 * wrapping flex row. Renders nothing when `blocks` is empty.
 */
export const ContentBlocksPreview: React.FC<ContentBlocksPreviewProps> = (
  props,
) => {
  const { blocks, onRemove, size = "md", className } = props;

  if (blocks.length === 0) {
    return null;
  }

  const previews = blocks.map((block, idx) => {
    // Bind the position so the parent learns which block to drop.
    const handleRemove = () => onRemove(idx);
    return (
      <MultimodalPreview
        key={idx}
        block={block}
        removable
        onRemove={handleRemove}
        size={size}
      />
    );
  });

  return (
    <div className={cn("flex flex-wrap gap-2 p-3.5 pb-0", className)}>
      {previews}
    </div>
  );
};

View File

@@ -21,6 +21,8 @@ import {
PanelRightClose,
SquarePen,
XIcon,
Plus,
CircleX,
} from "lucide-react";
import { useQueryState, parseAsBoolean } from "nuqs";
import { StickToBottom, useStickToBottomContext } from "use-stick-to-bottom";
@@ -36,6 +38,8 @@ import {
TooltipProvider,
TooltipTrigger,
} from "../ui/tooltip";
import { useFileUpload } from "@/hooks/use-file-upload";
import { ContentBlocksPreview } from "./ContentBlocksPreview";
import {
useArtifactOpen,
ArtifactContent,
@@ -122,6 +126,14 @@ export function Thread() {
parseAsBoolean.withDefault(false),
);
const [input, setInput] = useState("");
const {
contentBlocks,
setContentBlocks,
handleFileUpload,
dropRef,
removeBlock,
resetBlocks,
} = useFileUpload();
const [firstTokenReceived, setFirstTokenReceived] = useState(false);
const isLargeScreen = useMediaQuery("(min-width: 1024px)");
@@ -183,13 +195,17 @@ export function Thread() {
const handleSubmit = (e: FormEvent) => {
e.preventDefault();
if (!input.trim() || isLoading) return;
if ((input.trim().length === 0 && contentBlocks.length === 0) || isLoading)
return;
setFirstTokenReceived(false);
const newHumanMessage: Message = {
id: uuidv4(),
type: "human",
content: input,
content: [
...(input.trim().length > 0 ? [{ type: "text", text: input }] : []),
...contentBlocks,
] as Message["content"],
};
const toolMessages = ensureToolCallsHaveResponses(stream.messages);
@@ -214,6 +230,7 @@ export function Thread() {
);
setInput("");
setContentBlocks([]);
};
const handleRegenerate = (
@@ -423,11 +440,18 @@ export function Thread() {
<ScrollToBottom className="animate-in fade-in-0 zoom-in-95 absolute bottom-full left-1/2 mb-4 -translate-x-1/2" />
<div className="bg-muted relative z-10 mx-auto mb-8 w-full max-w-3xl rounded-2xl border shadow-xs">
<div
ref={dropRef}
className="bg-muted relative z-10 mx-auto mb-8 w-full max-w-3xl rounded-2xl border shadow-xs"
>
<form
onSubmit={handleSubmit}
className="mx-auto grid max-w-3xl grid-rows-[1fr_auto] gap-2"
>
<ContentBlocksPreview
blocks={contentBlocks}
onRemove={removeBlock}
/>
<textarea
value={input}
onChange={(e) => setInput(e.target.value)}
@@ -448,7 +472,7 @@ export function Thread() {
className="field-sizing-content resize-none border-none bg-transparent p-3.5 pb-0 shadow-none ring-0 outline-none focus:ring-0 focus:outline-none"
/>
<div className="flex items-center justify-between p-2 pt-4">
<div className="flex items-center gap-6 p-2 pt-4">
<div>
<div className="flex items-center space-x-2">
<Switch
@@ -464,10 +488,28 @@ export function Thread() {
</Label>
</div>
</div>
<Label
htmlFor="file-input"
className="flex cursor-pointer items-center gap-2"
>
<Plus className="size-5 text-gray-600" />
<span className="text-sm text-gray-600">
Upload PDF or Image
</span>
</Label>
<input
id="file-input"
type="file"
onChange={handleFileUpload}
multiple
accept="image/jpeg,image/png,image/gif,image/webp,application/pdf"
className="hidden"
/>
{stream.isLoading ? (
<Button
key="stop"
onClick={() => stream.stop()}
className="ml-auto"
>
<LoaderCircle className="h-4 w-4 animate-spin" />
Cancel
@@ -475,8 +517,11 @@ export function Thread() {
) : (
<Button
type="submit"
className="shadow-md transition-all"
disabled={isLoading || !input.trim()}
className="ml-auto shadow-md transition-all"
disabled={
isLoading ||
(!input.trim() && contentBlocks.length === 0)
}
>
Send
</Button>

View File

@@ -5,6 +5,8 @@ import { getContentString } from "../utils";
import { cn } from "@/lib/utils";
import { Textarea } from "@/components/ui/textarea";
import { BranchSwitcher, CommandBar } from "./shared";
import { MultimodalPreview } from "@/components/ui/MultimodalPreview";
import type { Base64ContentBlock } from "@langchain/core/messages";
function EditableContent({
value,
@@ -32,6 +34,36 @@ function EditableContent({
);
}
/**
 * Type guard for Base64ContentBlock.
 *
 * Accepts an object whose `source_type` is "base64" and whose `mime_type` is
 * a string, when it is either:
 *  - a "file" block (legacy) with an `image/*` or `application/pdf` MIME type, or
 *  - an "image" block (new) with an `image/*` MIME type.
 * Everything else, including non-objects and null, is rejected.
 */
function isBase64ContentBlock(block: unknown): block is Base64ContentBlock {
  if (block === null || typeof block !== "object") return false;

  const {
    type: blockType,
    source_type: sourceType,
    mime_type: mimeType,
  } = block as Record<string, unknown>;

  // Both accepted shapes share these two requirements.
  if (sourceType !== "base64" || typeof mimeType !== "string") return false;

  const isImageMime = mimeType.startsWith("image/");
  switch (blockType) {
    case "file":
      // file type (legacy)
      return isImageMime || mimeType === "application/pdf";
    case "image":
      // image type (new)
      return isImageMime;
    default:
      return false;
  }
}
export function HumanMessage({
message,
isLoading,
@@ -84,9 +116,34 @@ export function HumanMessage({
onSubmit={handleSubmitEdit}
/>
) : (
<p className="bg-muted ml-auto w-fit rounded-3xl px-4 py-2 whitespace-pre-wrap">
<div className="flex flex-col gap-2">
{/* Render image and file attachments, whether or not the message also has text */}
{Array.isArray(message.content) && message.content.length > 0 && (
<div className="flex flex-col items-end gap-2">
{message.content.reduce<React.ReactNode[]>(
(acc, block, idx) => {
if (isBase64ContentBlock(block)) {
acc.push(
<MultimodalPreview
key={idx}
block={block}
size="md"
/>,
);
}
return acc;
},
[],
)}
</div>
)}
{/* Render text if present, otherwise fallback to file/image name */}
{contentString ? (
<p className="bg-muted ml-auto w-fit rounded-3xl px-4 py-2 text-right whitespace-pre-wrap">
{contentString}
</p>
) : null}
</div>
)}
<div

View File

@@ -1,5 +1,11 @@
import type { Message } from "@langchain/langgraph-sdk";
/**
* Extracts a string summary from a message's content, supporting multimodal (text, image, file, etc.).
* - If text is present, returns the joined text.
* - If not, returns a label for the first non-text modality (e.g., 'Image', 'Other').
* - If unknown, returns 'Multimodal message'.
*/
export function getContentString(content: Message["content"]): string {
if (typeof content === "string") return content;
const texts = content

View File

@@ -0,0 +1,129 @@
import React from "react";
import { File, Image as ImageIcon, X as XIcon } from "lucide-react";
import type { Base64ContentBlock } from "@langchain/core/messages";
import { cn } from "@/lib/utils";
import Image from "next/image";
/** Props for {@link MultimodalPreview}. */
export interface MultimodalPreviewProps {
  /** The base64 content block to preview. */
  block: Base64ContentBlock;
  /** When true, a remove button is rendered. Defaults to false. */
  removable?: boolean;
  /** Click handler attached to the remove button. */
  onRemove?: () => void;
  /** Extra classes merged onto the outermost element via `cn`. */
  className?: string;
  /** Preview size. Defaults to "md". */
  size?: "sm" | "md" | "lg";
}

type PreviewSize = NonNullable<MultimodalPreviewProps["size"]>;

/** Tailwind sizing classes applied to the image thumbnail, per size. */
const IMAGE_SIZE_CLASSES: Record<PreviewSize, string> = {
  sm: "h-10 w-10 text-base",
  md: "h-16 w-16 text-lg",
  lg: "h-24 w-24 text-xl",
};

/**
 * Value passed as both `width` and `height` to the `Image` component.
 * NOTE(review): these do not match the h-*/w-* classes above — confirm
 * whether the mismatch is intentional.
 */
const IMAGE_INTRINSIC_SIZE: Record<PreviewSize, number> = {
  sm: 16,
  md: 32,
  lg: 48,
};

/**
 * Renders a compact preview of a single content block:
 *  - "image" blocks with base64 data and an `image/*` MIME type render as a
 *    thumbnail built from a `data:` URL;
 *  - "file" blocks with base64 data and the `application/pdf` MIME type render
 *    as a file chip showing `metadata.filename`, then `metadata.name`, then
 *    the literal "PDF file";
 *  - anything else renders an "Unsupported file type" chip.
 *
 * Each variant shows a remove button wired to `onRemove` when `removable`
 * is true.
 */
export const MultimodalPreview: React.FC<MultimodalPreviewProps> = ({
  block,
  removable = false,
  onRemove,
  className,
  size = "md",
}) => {
  // Image block
  if (
    block.type === "image" &&
    block.source_type === "base64" &&
    typeof block.mime_type === "string" &&
    block.mime_type.startsWith("image/")
  ) {
    const url = `data:${block.mime_type};base64,${block.data}`;
    const intrinsicSize = IMAGE_INTRINSIC_SIZE[size];
    return (
      <div className={cn("relative inline-block", className)}>
        <Image
          src={url}
          alt={String(block.metadata?.name || "uploaded image")}
          className={`rounded-md object-cover ${IMAGE_SIZE_CLASSES[size]}`}
          width={intrinsicSize}
          height={intrinsicSize}
        />
        {removable && (
          <button
            type="button"
            className="absolute top-1 right-1 z-10 rounded-full bg-gray-500 text-white hover:bg-gray-700"
            onClick={onRemove}
            aria-label="Remove image"
          >
            <XIcon className="h-4 w-4" />
          </button>
        )}
      </div>
    );
  }

  // PDF block
  if (
    block.type === "file" &&
    block.source_type === "base64" &&
    block.mime_type === "application/pdf"
  ) {
    const filename =
      block.metadata?.filename || block.metadata?.name || "PDF file";
    return (
      <div
        className={cn(
          "relative flex items-center gap-2 rounded-md border bg-gray-100 px-3 py-2",
          className,
        )}
      >
        <File
          className={cn(
            "flex-shrink-0 text-teal-700",
            size === "sm" ? "h-5 w-5" : "h-7 w-7",
          )}
        />
        <span
          className={cn(
            "truncate text-sm text-gray-800",
            size === "sm" ? "max-w-[80px]" : "max-w-[160px]",
          )}
        >
          {String(filename)}
        </span>
        {removable && (
          <button
            type="button"
            className="ml-2 rounded-full bg-gray-200 p-1 text-teal-700 hover:bg-gray-300"
            onClick={onRemove}
            aria-label="Remove PDF"
          >
            <XIcon className="h-4 w-4" />
          </button>
        )}
      </div>
    );
  }

  // Fallback for unknown types
  return (
    <div
      className={cn(
        "flex items-center gap-2 rounded-md border bg-gray-100 px-3 py-2 text-gray-500",
        className,
      )}
    >
      <File className="h-5 w-5 flex-shrink-0" />
      <span className="truncate text-xs">Unsupported file type</span>
      {removable && (
        <button
          type="button"
          className="ml-2 rounded-full bg-gray-200 p-1 text-gray-500 hover:bg-gray-300"
          onClick={onRemove}
          aria-label="Remove file"
        >
          <XIcon className="h-4 w-4" />
        </button>
      )}
    </div>
  );
};

View File

@@ -0,0 +1,164 @@
import { useState, useRef, useEffect, ChangeEvent } from "react";
import { toast } from "sonner";
import type { Base64ContentBlock } from "@langchain/core/messages";
import { fileToContentBlock } from "@/lib/multimodal-utils";
/** MIME types the upload hook accepts: four image formats plus PDF. */
export const SUPPORTED_FILE_TYPES: string[] = [
  // Images
  "image/jpeg",
  "image/png",
  "image/gif",
  "image/webp",
  // Documents
  "application/pdf",
];
/** Options accepted by {@link useFileUpload}. */
interface UseFileUploadOptions {
  /** Content blocks used as the initial state. Defaults to an empty list. */
  initialBlocks?: Base64ContentBlock[];
}

/**
 * Reports whether `file` is already represented in `blocks`.
 * PDFs are matched on `metadata.filename`; other supported types are matched
 * on `metadata.name` plus MIME type against "image" blocks.
 */
function isDuplicateFile(file: File, blocks: Base64ContentBlock[]): boolean {
  if (file.type === "application/pdf") {
    return blocks.some(
      (b) =>
        b.type === "file" &&
        b.mime_type === "application/pdf" &&
        b.metadata?.filename === file.name,
    );
  }
  if (SUPPORTED_FILE_TYPES.includes(file.type)) {
    return blocks.some(
      (b) =>
        b.type === "image" &&
        b.metadata?.name === file.name &&
        b.mime_type === file.type,
    );
  }
  return false;
}

/**
 * Validates `files`, reports unsupported and duplicate entries via toast, and
 * converts the remaining files into content blocks.
 *
 * Never rejects: if a conversion fails, the error is logged, a toast is shown
 * and an empty list is returned so callers can await this from event handlers
 * without producing an unhandled rejection.
 */
async function filesToNewBlocks(
  files: File[],
  existingBlocks: Base64ContentBlock[],
): Promise<Base64ContentBlock[]> {
  const validFiles = files.filter((file) =>
    SUPPORTED_FILE_TYPES.includes(file.type),
  );
  const duplicateFiles: File[] = [];
  const uniqueFiles: File[] = [];
  for (const file of validFiles) {
    if (isDuplicateFile(file, existingBlocks)) {
      duplicateFiles.push(file);
    } else {
      uniqueFiles.push(file);
    }
  }

  if (validFiles.length < files.length) {
    toast.error(
      "You have uploaded invalid file type. Please upload a JPEG, PNG, GIF, WEBP image or a PDF.",
    );
  }
  if (duplicateFiles.length > 0) {
    toast.error(
      `Duplicate file(s) detected: ${duplicateFiles.map((f) => f.name).join(", ")}. Each file can only be uploaded once per message.`,
    );
  }
  if (uniqueFiles.length === 0) return [];

  try {
    return await Promise.all(
      uniqueFiles.map((file) => fileToContentBlock(file)),
    );
  } catch (error: unknown) {
    console.error("Failed to convert uploaded file(s)", error);
    toast.error("Failed to read one or more files. Please try again.");
    return [];
  }
}

/**
 * Manages the list of uploaded content blocks for a message.
 *
 * Files can arrive through a file input (`handleFileUpload`) or by being
 * dropped onto the element that `dropRef` is attached to. Both paths run the
 * same validation: unsupported MIME types and files already present in
 * `contentBlocks` are reported via toast and skipped.
 *
 * @param options.initialBlocks Blocks used as the initial state.
 * @returns The current blocks and their setter, the input change handler, the
 *   drop-target ref, and helpers to remove one block or clear all of them.
 */
export function useFileUpload({
  initialBlocks = [],
}: UseFileUploadOptions = {}) {
  const [contentBlocks, setContentBlocks] =
    useState<Base64ContentBlock[]>(initialBlocks);
  const dropRef = useRef<HTMLDivElement>(null);

  const appendBlocks = (newBlocks: Base64ContentBlock[]) => {
    if (newBlocks.length === 0) return;
    setContentBlocks((prev) => [...prev, ...newBlocks]);
  };

  const handleFileUpload = async (e: ChangeEvent<HTMLInputElement>) => {
    const input = e.target;
    if (!input.files) return;
    appendBlocks(await filesToNewBlocks(Array.from(input.files), contentBlocks));
    // Always reached because filesToNewBlocks never rejects.
    // Presumably reset so the same file can be selected again — confirm.
    input.value = "";
  };

  // Drag and drop handlers
  useEffect(() => {
    const element = dropRef.current;
    if (!element) return;

    const suppressDefault = (e: DragEvent) => {
      e.preventDefault();
      e.stopPropagation();
    };

    const handleDrop = async (e: DragEvent) => {
      suppressDefault(e);
      if (!e.dataTransfer) return;
      // The file list is copied into an array before the first await.
      const droppedFiles = Array.from(e.dataTransfer.files);
      appendBlocks(await filesToNewBlocks(droppedFiles, contentBlocks));
    };

    element.addEventListener("dragover", suppressDefault);
    element.addEventListener("drop", handleDrop);
    element.addEventListener("dragenter", suppressDefault);
    element.addEventListener("dragleave", suppressDefault);

    return () => {
      element.removeEventListener("dragover", suppressDefault);
      element.removeEventListener("drop", handleDrop);
      element.removeEventListener("dragenter", suppressDefault);
      element.removeEventListener("dragleave", suppressDefault);
    };
    // Re-subscribed whenever the blocks change so the drop handler's duplicate
    // check sees the current list.
  }, [contentBlocks]);

  const removeBlock = (idx: number) => {
    setContentBlocks((prev) => prev.filter((_, i) => i !== idx));
  };

  const resetBlocks = () => setContentBlocks([]);

  return {
    contentBlocks,
    setContentBlocks,
    handleFileUpload,
    dropRef,
    removeBlock,
    resetBlocks,
  };
}

View File

@@ -0,0 +1,57 @@
import type { Base64ContentBlock } from "@langchain/core/messages";
import { toast } from "sonner";
/**
 * Converts a browser `File` into a typed base64 content block.
 *
 * JPEG, PNG, GIF and WEBP files become "image" blocks carrying the file name
 * in `metadata.name`; PDFs become "file" blocks carrying it in
 * `metadata.filename`.
 *
 * @throws Error (after showing an error toast) when the file's MIME type is
 *   not one of the supported types.
 */
export async function fileToContentBlock(
  file: File,
): Promise<Base64ContentBlock> {
  const imageMimeTypes = ["image/jpeg", "image/png", "image/gif", "image/webp"];
  const acceptedMimeTypes = imageMimeTypes.concat("application/pdf");

  const isAccepted = acceptedMimeTypes.includes(file.type);
  if (!isAccepted) {
    toast.error(
      `Unsupported file type: ${file.type}. Supported types are: ${acceptedMimeTypes.join(", ")}`,
    );
    throw new Error(`Unsupported file type: ${file.type}`);
  }

  const data = await fileToBase64(file);

  if (!imageMimeTypes.includes(file.type)) {
    // PDF
    return {
      type: "file",
      source_type: "base64",
      mime_type: "application/pdf",
      data,
      metadata: { filename: file.name },
    };
  }

  return {
    type: "image",
    source_type: "base64",
    mime_type: file.type,
    data,
    metadata: { name: file.name },
  };
}
/**
 * Reads `file` with a `FileReader` and resolves with its base64 payload, i.e.
 * the data URL with the leading `data:...;base64,` prefix removed.
 *
 * Resolves only from the `load` event. The previous `loadend` handler also ran
 * after failed or aborted reads, where `reader.result` is null.
 *
 * @returns The base64 string; empty when the data URL has no payload.
 * @throws Rejects with an Error when the read fails, is aborted, or does not
 *   produce a string result.
 */
export async function fileToBase64(file: File): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const result = reader.result;
      if (typeof result !== "string") {
        reject(new Error(`Failed to read file as a data URL: ${file.name}`));
        return;
      }
      // Remove the data:...;base64, prefix
      const separatorIndex = result.indexOf(",");
      resolve(separatorIndex === -1 ? "" : result.slice(separatorIndex + 1));
    };

    reader.onerror = () => {
      // Reject with a real Error rather than the raw event object.
      reject(reader.error ?? new Error(`Failed to read file: ${file.name}`));
    };

    reader.onabort = () => {
      reject(new Error(`Reading file was aborted: ${file.name}`));
    };

    reader.readAsDataURL(file);
  });
}