Spaces:
Running
Running
Upload 51 files
Browse files
src/components/MultiSourceCaptioningView.tsx
CHANGED
|
@@ -1,717 +1,721 @@
|
|
| 1 |
-
import * as React from "react";
|
| 2 |
-
import { useState, useRef, useEffect } from "react";
|
| 3 |
-
import { useVLMContext } from "../context/useVLMContext";
|
| 4 |
-
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
| 5 |
-
|
| 6 |
-
// Input-source modes offered by the UI. Only local file upload is wired up.
const MODES = ["File"] as const;
type Mode = typeof MODES[number];

// Demo assets shown before the user uploads anything.
const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
const EXAMPLE_PROMPT = "Describe the video";
|
| 11 |
-
|
| 12 |
-
function isImageFile(file: File) {
|
| 13 |
-
return file.type.startsWith("image/");
|
| 14 |
-
}
|
| 15 |
-
function isVideoFile(file: File) {
|
| 16 |
-
return file.type.startsWith("video/");
|
| 17 |
-
}
|
| 18 |
-
|
| 19 |
-
function denormalizeBox(box: number[], width: number, height: number) {
|
| 20 |
-
// If all values are between 0 and 1, treat as normalized
|
| 21 |
-
if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
|
| 22 |
-
return [
|
| 23 |
-
box[0] * width,
|
| 24 |
-
box[1] * height,
|
| 25 |
-
box[2] * width,
|
| 26 |
-
box[3] * height
|
| 27 |
-
];
|
| 28 |
-
}
|
| 29 |
-
return box;
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
// Add this robust fallback parser near the top
|
| 33 |
-
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
|
| 34 |
-
// Try to parse as JSON first
|
| 35 |
-
try {
|
| 36 |
-
const parsed = JSON.parse(output);
|
| 37 |
-
if (Array.isArray(parsed)) {
|
| 38 |
-
const result: { label: string, bbox_2d: number[] }[] = [];
|
| 39 |
-
for (const obj of parsed) {
|
| 40 |
-
if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
|
| 41 |
-
if (Array.isArray(obj.bbox_2d[0])) {
|
| 42 |
-
for (const arr of obj.bbox_2d) {
|
| 43 |
-
if (Array.isArray(arr) && arr.length === 4) {
|
| 44 |
-
result.push({ label: obj.label, bbox_2d: arr });
|
| 45 |
-
}
|
| 46 |
-
}
|
| 47 |
-
} else if (obj.bbox_2d.length === 4) {
|
| 48 |
-
result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
|
| 49 |
-
}
|
| 50 |
-
}
|
| 51 |
-
}
|
| 52 |
-
if (result.length > 0) return result;
|
| 53 |
-
}
|
| 54 |
-
} catch (e) {}
|
| 55 |
-
// Fallback: extract all [x1, y1, x2, y2] arrays from the string
|
| 56 |
-
const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
|
| 57 |
-
const boxes: { label: string, bbox_2d: number[] }[] = [];
|
| 58 |
-
let match;
|
| 59 |
-
while ((match = boxRegex.exec(output)) !== null) {
|
| 60 |
-
const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
|
| 61 |
-
boxes.push({ label: '', bbox_2d: arr });
|
| 62 |
-
}
|
| 63 |
-
return boxes;
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
// NOTE: You must install onnxruntime-web:
|
| 67 |
-
// npm install onnxruntime-web
|
| 68 |
-
// @ts-ignore
|
| 69 |
-
import * as ort from 'onnxruntime-web';
|
| 70 |
-
// If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
|
| 71 |
-
|
| 72 |
-
// Set your YOLOv8 ONNX model URL here:
const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE

// Model input size (W x H). preprocessFrameToTensor and the draw loop's box
// scaling must agree with these values.
// NOTE(review): stock YOLOv8 exports usually use 640x640; 480 here must match
// the actual exported ONNX — confirm.
const YOLOV8_INPUT_WIDTH = 640;
const YOLOV8_INPUT_HEIGHT = 480;

// Module-level singleton ONNX session, created lazily by loadYoloModel().
let yoloSession: ort.InferenceSession | null = null;
// Busy flag preventing concurrent YOLOv8 inferences across animation frames.
let isYoloBusy = false;
|
| 83 |
-
async function loadYoloModel() {
|
| 84 |
-
if (!yoloSession) {
|
| 85 |
-
yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
|
| 86 |
-
}
|
| 87 |
-
return yoloSession;
|
| 88 |
-
}
|
| 89 |
-
|
| 90 |
-
// COCO class names for YOLOv8 (80 classes). Array index must line up with the
// class-id channel of the model output; postprocessYoloOutput indexes into it.
const YOLO_CLASSES: string[] = [
  "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
  "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
  "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
];
|
| 101 |
-
|
| 102 |
-
// Preprocess a video frame into a YOLOv8 input tensor [1, 3, 480, 640]
// (planar CHW, RGB, values normalized to [0, 1]; alpha discarded).
// NOTE(review): 640/480 here duplicate YOLOV8_INPUT_WIDTH/HEIGHT — keep in sync.
function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
  const width = 640;
  const height = 480;
  const canvas = document.createElement('canvas');
  canvas.width = width;
  canvas.height = height;
  const ctx = canvas.getContext('2d');
  if (!ctx) throw new Error('Could not get 2D context');
  // Stretch the current frame to the model input size (no letterboxing,
  // aspect ratio is not preserved).
  ctx.drawImage(video, 0, 0, width, height);
  const imageData = ctx.getImageData(0, 0, width, height);
  const { data } = imageData;
  // Interleaved RGBA bytes -> planar float32 CHW in [0, 1].
  const floatData = new Float32Array(1 * 3 * height * width);
  for (let i = 0; i < width * height; i++) {
    floatData[i] = data[i * 4] / 255; // R plane
    floatData[i + width * height] = data[i * 4 + 1] / 255; // G plane
    floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B plane
  }
  return new ort.Tensor('float32', floatData, [1, 3, height, width]);
}
|
| 123 |
-
|
| 124 |
-
// Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
|
| 125 |
-
function postprocessYoloOutput(output: ort.Tensor) {
|
| 126 |
-
// output.dims: [1, num_detections, 6]
|
| 127 |
-
const data = output.data;
|
| 128 |
-
const numDetections = output.dims[1];
|
| 129 |
-
const results = [];
|
| 130 |
-
for (let i = 0; i < numDetections; i++) {
|
| 131 |
-
const offset = i * 6;
|
| 132 |
-
const x1 = data[offset];
|
| 133 |
-
const y1 = data[offset + 1];
|
| 134 |
-
const x2 = data[offset + 2];
|
| 135 |
-
const y2 = data[offset + 3];
|
| 136 |
-
const score = data[offset + 4];
|
| 137 |
-
const classId = data[offset + 5];
|
| 138 |
-
if (score < 0.2) continue; // adjust threshold as needed
|
| 139 |
-
results.push({
|
| 140 |
-
bbox: [x1, y1, x2, y2],
|
| 141 |
-
label: YOLO_CLASSES[classId] || `class_${classId}`,
|
| 142 |
-
score
|
| 143 |
-
});
|
| 144 |
-
}
|
| 145 |
-
return results;
|
| 146 |
-
}
|
| 147 |
-
|
| 148 |
-
// Helper type guard for annotation
|
| 149 |
-
function hasAnnotation(obj: any): obj is { annotation: string } {
|
| 150 |
-
return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
|
| 151 |
-
}
|
| 152 |
-
|
| 153 |
-
export default function MultiSourceCaptioningView() {
|
| 154 |
-
// --- UI / processing state --------------------------------------------------
const [mode, setMode] = useState<Mode>("File");
const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);            // example video source (never changed)
const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);      // VLM prompt text
const [processing, setProcessing] = useState(false);               // image inference in flight
const [error, setError] = useState<string | null>(null);
const [uploadedFile, setUploadedFile] = useState<File | null>(null);
const [uploadedUrl, setUploadedUrl] = useState<string>("");        // object URL for the upload
const [videoProcessing, setVideoProcessing] = useState(false);     // uploaded-video loop on/off
const [imageProcessed, setImageProcessed] = useState(false);
const [exampleProcessing, setExampleProcessing] = useState(false); // example-video loop on/off
const [debugOutput, setDebugOutput] = useState<string>("");        // raw VLM output for display
const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
const [inferenceStatus, setInferenceStatus] = useState<string>("");
const [showProcessingVideo, setShowProcessingVideo] = useState(false);

// --- DOM / data refs ----------------------------------------------------------
const videoRef = useRef<HTMLVideoElement | null>(null);            // main visible video
const overlayVideoRef = useRef<HTMLVideoElement | null>(null);     // video the canvas overlays
const processingVideoRef = useRef<HTMLVideoElement | null>(null);  // hidden frame source for YOLO
const canvasRef = useRef<HTMLCanvasElement | null>(null);
const imageRef = useRef<HTMLImageElement | null>(null);
const boxHistoryRef = useRef<any[]>([]);                           // timestamped VLM boxes (2 s TTL in draw effect)
// Latest YOLOv8 results (with optional FastVLM annotation), read by the draw loop.
const lastYoloBoxesRef = React.useRef<any[]>([]);
const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

// Single flag gating all processing loops (replaces the old per-mode refs).
const processingLoopRef = React.useRef(false);
|
| 183 |
-
|
| 184 |
-
// Drives detection for the uploaded video: one yoloDetectionLoop call per
// animation frame, skipping frames while a previous inference is in flight.
// NOTE(review): yoloDetectionLoop also re-schedules itself via rAF in its
// finally block, so scheduling here too can multiply callbacks — confirm.
const processVideoLoop = async () => {
  if (!processingLoopRef.current) return; // loop was stopped
  if (isYoloBusy) {
    // An inference is already running — skip this frame, retry next tick.
    requestAnimationFrame(processVideoLoop);
    return;
  }
  await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
  // Schedule the next frame as soon as possible
  requestAnimationFrame(processVideoLoop);
};
|
| 195 |
-
// Example-video path: run detection roughly once per second in a while-loop
// until processingLoopRef is cleared.
const processExampleLoop = async () => {
  while (processingLoopRef.current) {
    await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
    await new Promise(res => setTimeout(res, 1000)); // ~1 fps pacing
  }
};
|
| 201 |
-
|
| 202 |
-
// Set your YOLOv8 ONNX backend API endpoint here:
|
| 203 |
-
// const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE
|
| 204 |
-
|
| 205 |
-
// Keep the overlay video in lockstep with the main video: mirror play/pause
// and snap the overlay's currentTime whenever drift exceeds 50 ms.
useEffect(() => {
  const main = videoRef.current;
  const overlay = overlayVideoRef.current;
  if (!main || !overlay) return;
  // Sync play/pause
  const onPlay = () => { if (overlay.paused) overlay.play(); };
  const onPause = () => { if (!overlay.paused) overlay.pause(); };
  // Sync seeking and time (50 ms tolerance avoids constant re-seeking)
  const onSeekOrTime = () => {
    if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
      overlay.currentTime = main.currentTime;
    }
  };
  main.addEventListener('play', onPlay);
  main.addEventListener('pause', onPause);
  main.addEventListener('seeked', onSeekOrTime);
  main.addEventListener('timeupdate', onSeekOrTime);
  // Remove every listener on re-run/unmount.
  return () => {
    main.removeEventListener('play', onPlay);
    main.removeEventListener('pause', onPause);
    main.removeEventListener('seeked', onSeekOrTime);
    main.removeEventListener('timeupdate', onSeekOrTime);
  };
}, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
|
| 231 |
-
|
| 232 |
-
// Auto-play the hidden processing video whenever the source changes;
// the empty .catch() swallows autoplay-policy rejections on purpose.
useEffect(() => {
  if ((mode === "File") && processingVideoRef.current) {
    processingVideoRef.current.play().catch(() => {});
  }
}, [mode, videoUrl, uploadedUrl]);
|
| 237 |
-
|
| 238 |
-
// Remove old prompt-based box extraction logic and only use the above for video frames.
|
| 239 |
-
|
| 240 |
-
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
|
| 241 |
-
const file = e.target.files?.[0] || null;
|
| 242 |
-
setUploadedFile(file);
|
| 243 |
-
setUploadedUrl(file ? URL.createObjectURL(file) : "");
|
| 244 |
-
setError(null);
|
| 245 |
-
setImageProcessed(false);
|
| 246 |
-
setVideoProcessing(false);
|
| 247 |
-
setExampleProcessing(false);
|
| 248 |
-
};
|
| 249 |
-
|
| 250 |
-
// Uploaded-video mode: kick off the per-frame processing loop once the model
// is loaded, a video file is selected, and processing is toggled on.
// (Old comment said "webcam ... setInterval"; it actually uses the rAF loop.)
useEffect(() => {
  if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
  processVideoLoop();
}, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
|
| 255 |
-
|
| 256 |
-
// Example-video mode: when nothing is uploaded and the example toggle is on,
// run the ~1 fps example loop. (Old comment said "setInterval"; it is a
// while-loop with setTimeout inside processExampleLoop.)
useEffect(() => {
  if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
  processExampleLoop();
}, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
|
| 261 |
-
|
| 262 |
-
// File mode: run VLM inference on the uploaded image (button click only).
// Draws the image to the canvas, runs inference, then overlays any boxes
// parsed from the model output.
const handleProcessImage = async () => {
  if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
  const img = imageRef.current;
  const canvas = canvasRef.current;
  // Size the canvas to the image's intrinsic pixels so boxes align 1:1.
  canvas.width = img.naturalWidth;
  canvas.height = img.naturalHeight;
  setCanvasDims({w:canvas.width,h:canvas.height});
  setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
  const ctx = canvas.getContext("2d");
  if (!ctx) return;
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
  setProcessing(true);
  setError(null);
  setInferenceStatus("Running inference...");
  await runInference(img, prompt, (output: string) => {
    setDebugOutput(output);
    setInferenceStatus("Inference complete.");
    // Repaint the image before overlaying boxes from this result.
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    let boxes = extractAllBoundingBoxes(output);
    console.log("Model output:", output);
    console.log("Boxes after normalization:", boxes);
    console.log("Canvas size:", canvas.width, canvas.height);
    if (boxes.length > 0) {
      const [x1, y1, x2, y2] = boxes[0].bbox_2d;
      console.log("First box coords:", x1, y1, x2, y2);
    }
    if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
    if (Array.isArray(boxes) && boxes.length > 0) {
      // Canvas was sized to naturalWidth/Height above, so both scales are 1.
      const scaleX = canvas.width / img.naturalWidth;
      const scaleY = canvas.height / img.naturalHeight;
      drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
    }
    setImageProcessed(true);
  });
  setProcessing(false);
};
|
| 299 |
-
|
| 300 |
-
// File mode: start/stop processing of the uploaded video. Stops any running
// loop first, then restarts after 50 ms so in-flight iterations observe the
// cleared flag before the new loop begins.
const handleToggleVideoProcessing = () => {
  setVideoProcessing((prev: boolean) => {
    const next = !prev;
    // Always stop all loops before starting
    processingLoopRef.current = false;
    setTimeout(() => {
      if (next) {
        processingLoopRef.current = true;
        processVideoLoop();
      }
    }, 50);
    return next;
  });
};
|
| 315 |
-
|
| 316 |
-
// Start/stop processing of the example video. Same stop-then-restart dance
// as the uploaded-video toggle.
// NOTE(review): this starts processVideoLoop, not processExampleLoop —
// confirm that is intended (the example useEffect uses processExampleLoop).
const handleToggleExampleProcessing = () => {
  setExampleProcessing((prev: boolean) => {
    const next = !prev;
    // Always stop all loops before starting
    processingLoopRef.current = false;
    setTimeout(() => {
      if (next) {
        processingLoopRef.current = true;
        processVideoLoop();
      }
    }, 50);
    return next;
  });
};
|
| 331 |
-
|
| 332 |
-
// Test draw box function
|
| 333 |
-
const handleTestDrawBox = () => {
|
| 334 |
-
if (!canvasRef.current) return;
|
| 335 |
-
const canvas = canvasRef.current;
|
| 336 |
-
const ctx = canvas.getContext("2d");
|
| 337 |
-
if (!ctx) return;
|
| 338 |
-
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
| 339 |
-
ctx.strokeStyle = "#FF00FF";
|
| 340 |
-
ctx.lineWidth = 4;
|
| 341 |
-
ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
|
| 342 |
-
ctx.font = "20px Arial";
|
| 343 |
-
ctx.fillStyle = "#FF00FF";
|
| 344 |
-
ctx.fillText("Test Box", 50, 35);
|
| 345 |
-
};
|
| 346 |
-
|
| 347 |
-
// Redraw VLM box history (entries younger than 2 s) over the overlay video,
// on a 100 ms interval and again on window resize.
useEffect(() => {
  const draw = () => {
    const overlayVideo = overlayVideoRef.current;
    const canvas = canvasRef.current;
    if (!overlayVideo || !canvas) return;
    // Match the canvas to the video's on-screen (CSS) size.
    const displayWidth = overlayVideo.clientWidth;
    const displayHeight = overlayVideo.clientHeight;
    canvas.width = displayWidth;
    canvas.height = displayHeight;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    const now = Date.now();
    // Keep only boxes recorded within the last 2 seconds.
    const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
    if (boxHistory.length > 0) {
      // Draw all boxes, even if bbox_2d is an array of arrays.
      const denormalizedBoxes: any[] = [];
      for (const b of boxHistory) {
        if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
          // Multiple boxes per label
          for (const arr of b.bbox_2d) {
            if (Array.isArray(arr) && arr.length === 4) {
              denormalizedBoxes.push({
                ...b,
                bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
              });
            }
          }
        } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
          // Single box
          denormalizedBoxes.push({
            ...b,
            bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
          });
        }
      }
      drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
    }
  };
  draw();
  const interval = setInterval(draw, 100);
  // Redraw on window resize
  const handleResize = () => draw();
  window.addEventListener('resize', handleResize);
  return () => {
    clearInterval(interval);
    window.removeEventListener('resize', handleResize);
  };
}, [overlayVideoRef, canvasRef]);
|
| 396 |
-
|
| 397 |
-
// Drawing loop: every animation frame, paints the latest YOLOv8 detections
// (and any FastVLM annotation attached to them) onto the overlay canvas,
// scaling from the model input size to the video's on-screen size.
React.useEffect(() => {
  let running = true; // cleared on unmount to break the rAF chain
  function drawLoop() {
    if (!running) return;
    const overlayVideo = overlayVideoRef.current;
    const canvas = canvasRef.current;
    const processingVideo = processingVideoRef.current;
    if (canvas && overlayVideo && processingVideo) {
      // Set canvas size to match the visible video
      canvas.width = overlayVideo.clientWidth;
      canvas.height = overlayVideo.clientHeight;
      const ctx = canvas.getContext('2d');
      if (ctx) {
        ctx.clearRect(0, 0, canvas.width, canvas.height);
        // Draw all YOLOv8 boxes from the last detection pass.
        const yoloBoxes = lastYoloBoxesRef.current;
        yoloBoxes.forEach((obj: any) => {
          // Scale from YOLOv8 input size to canvas size
          const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
          const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
          const [x1, y1, x2, y2] = obj.bbox;
          const drawX = x1 * scaleX;
          const drawY = y1 * scaleY;
          const drawW = (x2 - x1) * scaleX;
          const drawH = (y2 - y1) * scaleY;
          ctx.strokeStyle = '#00FFFF';
          ctx.lineWidth = 5;
          ctx.strokeRect(drawX, drawY, drawW, drawH);
          ctx.font = 'bold 22px Arial';
          // Label + confidence on a dark strip above the box.
          const yoloLabel = obj.label || '';
          const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
          const yoloText = `${yoloLabel}${yoloScore}`;
          ctx.fillStyle = 'rgba(0,0,0,0.7)';
          const yoloTextWidth = ctx.measureText(yoloText).width + 8;
          ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
          ctx.fillStyle = '#00FFFF';
          ctx.fillText(yoloText, drawX, drawY - 4);
          // Draw FastVLM annotation below the box if available
          if (hasAnnotation(obj)) {
            ctx.font = 'bold 18px Arial';
            ctx.fillStyle = 'rgba(0,0,0,0.7)';
            const annTextWidth = ctx.measureText(obj.annotation).width + 8;
            ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
            ctx.fillStyle = '#00FFFF';
            ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
          }
        });
      }
    }
    requestAnimationFrame(drawLoop);
  }
  drawLoop();
  return () => { running = false; };
}, [overlayVideoRef, canvasRef, processingVideoRef]);
|
| 453 |
-
|
| 454 |
-
// YOLOv8 detection loop: grabs the current frame from the hidden processing
// video, runs ONNX detection, publishes results via lastYoloBoxesRef (read by
// the drawing loop), then runs FastVLM on the same frame for a caption.
// The module-level isYoloBusy flag serializes inferences across callers.
const yoloDetectionLoop = async () => {
  if (!processingLoopRef.current) return; // loop stopped by a toggle/effect
  if (isYoloBusy) {
    requestAnimationFrame(yoloDetectionLoop);
    return;
  }
  isYoloBusy = true;
  try {
    const processingVideo = processingVideoRef.current;
    // Skip frames while the video is not actually playing/decoded yet.
    if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
      isYoloBusy = false;
      requestAnimationFrame(yoloDetectionLoop);
      return;
    }
    // Run YOLOv8 detection
    const session = await loadYoloModel();
    const inputTensor = preprocessFrameToTensor(processingVideo);
    const feeds: Record<string, ort.Tensor> = {};
    feeds[session.inputNames[0]] = inputTensor;
    const results = await session.run(feeds);
    const output = results[session.outputNames[0]];
    const detections = postprocessYoloOutput(output);
    lastYoloBoxesRef.current = detections;
    // Run FastVLM on the full frame (waits for YOLOv8 to finish first).
    await runInference(processingVideo, prompt, (output: string) => {
      setDebugOutput(output);
    });
  } catch (err) {
    console.error('YOLOv8+FastVLM error:', err);
  } finally {
    isYoloBusy = false;
    // NOTE(review): the loop re-schedules itself here AND callers such as
    // processVideoLoop schedule it per frame — rAF callbacks can multiply.
    requestAnimationFrame(yoloDetectionLoop);
  }
};
|
| 489 |
-
|
| 490 |
-
// Restart the detection loop whenever the source or a processing toggle
// changes: stop the current loop, then (after 100 ms, so refs point at the
// new video) start the loop matching the active source.
useEffect(() => {
  // Stop processing loop on video source change or processing toggle
  processingLoopRef.current = false;
  // Start processing loop for the correct video after refs update
  setTimeout(() => {
    if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
      processingLoopRef.current = true;
      yoloDetectionLoop();
    } else if (exampleProcessing && !uploadedFile) {
      processingLoopRef.current = true;
      yoloDetectionLoop();
    }
  }, 100);
  // eslint-disable-next-line
}, [uploadedFile, videoProcessing, exampleProcessing]);
|
| 506 |
-
|
| 507 |
-
return (
|
| 508 |
-
<div className="absolute inset-0 text-white">
|
| 509 |
-
<div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
|
| 510 |
-
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
|
| 511 |
-
</div>
|
| 512 |
-
<div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
|
| 513 |
-
<div className="flex flex-col items-center justify-center h-full w-full">
|
| 514 |
-
{/* Mode Selector */}
|
| 515 |
-
<div className="mb-6">
|
| 516 |
-
<div className="flex space-x-4">
|
| 517 |
-
{MODES.map((m) => (
|
| 518 |
-
<button
|
| 519 |
-
key={m}
|
| 520 |
-
className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
|
| 521 |
-
mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
|
| 522 |
-
}`}
|
| 523 |
-
onClick={() => setMode(m)}
|
| 524 |
-
>
|
| 525 |
-
{m}
|
| 526 |
-
</button>
|
| 527 |
-
))}
|
| 528 |
-
</div>
|
| 529 |
-
</div>
|
| 530 |
-
|
| 531 |
-
{/* Mode Content */}
|
| 532 |
-
<div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
|
| 533 |
-
{mode === "File" && (
|
| 534 |
-
<div className="w-full text-center flex flex-col items-center">
|
| 535 |
-
<div className="mb-4 w-full max-w-xl">
|
| 536 |
-
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
|
| 537 |
-
<textarea
|
| 538 |
-
className="w-full p-2 rounded-lg text-black"
|
| 539 |
-
rows={3}
|
| 540 |
-
value={prompt}
|
| 541 |
-
onChange={(e) => setPrompt(e.target.value)}
|
| 542 |
-
/>
|
| 543 |
-
</div>
|
| 544 |
-
<div className="mb-4 w-full max-w-xl">
|
| 545 |
-
<input
|
| 546 |
-
type="file"
|
| 547 |
-
accept="image/*,video/*"
|
| 548 |
-
onChange={handleFileChange}
|
| 549 |
-
className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
|
| 550 |
-
/>
|
| 551 |
-
</div>
|
| 552 |
-
{/* Add toggle button above video area */}
|
| 553 |
-
<div className="mb-2 w-full max-w-xl flex justify-end">
|
| 554 |
-
<button
|
| 555 |
-
className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
|
| 556 |
-
onClick={() => setShowProcessingVideo(v => !v)}
|
| 557 |
-
type="button"
|
| 558 |
-
>
|
| 559 |
-
{showProcessingVideo ? 'Hide' : 'Show'} Processed Video
|
| 560 |
-
</button>
|
| 561 |
-
</div>
|
| 562 |
-
{/* Show uploaded image */}
|
| 563 |
-
{uploadedFile && isImageFile(uploadedFile) && (
|
| 564 |
-
<div className="relative w-full max-w-xl">
|
| 565 |
-
<img
|
| 566 |
-
ref={imageRef}
|
| 567 |
-
src={uploadedUrl}
|
| 568 |
-
alt="Uploaded"
|
| 569 |
-
className="w-full rounded-lg shadow-lg mb-2"
|
| 570 |
-
style={{ background: "#222" }}
|
| 571 |
-
/>
|
| 572 |
-
<canvas
|
| 573 |
-
ref={canvasRef}
|
| 574 |
-
className="absolute top-0 left-0 w-full h-full pointer-events-none"
|
| 575 |
-
style={{ zIndex: 10, pointerEvents: "none" }}
|
| 576 |
-
/>
|
| 577 |
-
<button
|
| 578 |
-
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
| 579 |
-
onClick={handleProcessImage}
|
| 580 |
-
disabled={processing}
|
| 581 |
-
>
|
| 582 |
-
{processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
|
| 583 |
-
</button>
|
| 584 |
-
</div>
|
| 585 |
-
)}
|
| 586 |
-
{/* Show uploaded video */}
|
| 587 |
-
{uploadedFile && isVideoFile(uploadedFile) && (
|
| 588 |
-
<div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
|
| 589 |
-
{/* Visible overlay video for user */}
|
| 590 |
-
<video
|
| 591 |
-
ref={overlayVideoRef}
|
| 592 |
-
src={uploadedUrl}
|
| 593 |
-
controls
|
| 594 |
-
autoPlay
|
| 595 |
-
loop
|
| 596 |
-
muted
|
| 597 |
-
playsInline
|
| 598 |
-
className="w-full rounded-lg shadow-lg mb-2"
|
| 599 |
-
style={{ background: "#222", display: "block" }}
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
canvasRef.current.
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
canvasRef.current.
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
{
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
>
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
}
|
|
|
|
| 1 |
+
import * as React from "react";
|
| 2 |
+
import { useState, useRef, useEffect } from "react";
|
| 3 |
+
import { useVLMContext } from "../context/useVLMContext";
|
| 4 |
+
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
| 5 |
+
|
| 6 |
+
// Input-source modes offered by the UI. Only local file upload is wired up.
const MODES = ["File"] as const;
type Mode = typeof MODES[number];

// Demo assets shown before the user uploads anything.
const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
const EXAMPLE_PROMPT = "Describe the video";
|
| 11 |
+
|
| 12 |
+
function isImageFile(file: File) {
|
| 13 |
+
return file.type.startsWith("image/");
|
| 14 |
+
}
|
| 15 |
+
function isVideoFile(file: File) {
|
| 16 |
+
return file.type.startsWith("video/");
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
function denormalizeBox(box: number[], width: number, height: number) {
|
| 20 |
+
// If all values are between 0 and 1, treat as normalized
|
| 21 |
+
if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
|
| 22 |
+
return [
|
| 23 |
+
box[0] * width,
|
| 24 |
+
box[1] * height,
|
| 25 |
+
box[2] * width,
|
| 26 |
+
box[3] * height
|
| 27 |
+
];
|
| 28 |
+
}
|
| 29 |
+
return box;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
// Add this robust fallback parser near the top
|
| 33 |
+
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
|
| 34 |
+
// Try to parse as JSON first
|
| 35 |
+
try {
|
| 36 |
+
const parsed = JSON.parse(output);
|
| 37 |
+
if (Array.isArray(parsed)) {
|
| 38 |
+
const result: { label: string, bbox_2d: number[] }[] = [];
|
| 39 |
+
for (const obj of parsed) {
|
| 40 |
+
if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
|
| 41 |
+
if (Array.isArray(obj.bbox_2d[0])) {
|
| 42 |
+
for (const arr of obj.bbox_2d) {
|
| 43 |
+
if (Array.isArray(arr) && arr.length === 4) {
|
| 44 |
+
result.push({ label: obj.label, bbox_2d: arr });
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
} else if (obj.bbox_2d.length === 4) {
|
| 48 |
+
result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
if (result.length > 0) return result;
|
| 53 |
+
}
|
| 54 |
+
} catch (e) {}
|
| 55 |
+
// Fallback: extract all [x1, y1, x2, y2] arrays from the string
|
| 56 |
+
const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
|
| 57 |
+
const boxes: { label: string, bbox_2d: number[] }[] = [];
|
| 58 |
+
let match;
|
| 59 |
+
while ((match = boxRegex.exec(output)) !== null) {
|
| 60 |
+
const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
|
| 61 |
+
boxes.push({ label: '', bbox_2d: arr });
|
| 62 |
+
}
|
| 63 |
+
return boxes;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
// NOTE: You must install onnxruntime-web:
// npm install onnxruntime-web
// @ts-ignore
import * as ort from 'onnxruntime-web';
// If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
// NOTE(review): this import sits mid-file; ES module imports are hoisted so
// behavior is unchanged, but moving it to the top of the file would be tidier.

// Set your YOLOv8 ONNX model URL here:
const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE

// Add these constants to match the YOLOv8 input size
// (width x height of the frames fed to the model; also used when scaling
// detection boxes back onto the display canvas).
const YOLOV8_INPUT_WIDTH = 640;
const YOLOV8_INPUT_HEIGHT = 480;

// 1. Load the ONNX model once
// Module-level cache: a single InferenceSession shared across renders.
let yoloSession: ort.InferenceSession | null = null;
// Add a busy flag to prevent concurrent YOLOv8 inferences
// (module-level so the guard spans every detection-loop instance).
let isYoloBusy = false;
async function loadYoloModel() {
|
| 84 |
+
if (!yoloSession) {
|
| 85 |
+
yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
|
| 86 |
+
}
|
| 87 |
+
return yoloSession;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
// COCO class names for YOLOv8
// Array index corresponds to the class id emitted by the model
// (standard 80-class COCO ordering).
const YOLO_CLASSES: string[] = [
  "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
  "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
  "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
];
// Preprocess video frame to YOLOv8 input tensor [1,3,640,640]
|
| 103 |
+
function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
|
| 104 |
+
const width = 640;
|
| 105 |
+
const height = 480;
|
| 106 |
+
const canvas = document.createElement('canvas');
|
| 107 |
+
canvas.width = width;
|
| 108 |
+
canvas.height = height;
|
| 109 |
+
const ctx = canvas.getContext('2d');
|
| 110 |
+
if (!ctx) throw new Error('Could not get 2D context');
|
| 111 |
+
ctx.drawImage(video, 0, 0, width, height);
|
| 112 |
+
const imageData = ctx.getImageData(0, 0, width, height);
|
| 113 |
+
const { data } = imageData;
|
| 114 |
+
// Convert to Float32Array [1,3,480,640], normalize to [0,1]
|
| 115 |
+
const floatData = new Float32Array(1 * 3 * height * width);
|
| 116 |
+
for (let i = 0; i < width * height; i++) {
|
| 117 |
+
floatData[i] = data[i * 4] / 255; // R
|
| 118 |
+
floatData[i + width * height] = data[i * 4 + 1] / 255; // G
|
| 119 |
+
floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
|
| 120 |
+
}
|
| 121 |
+
return new ort.Tensor('float32', floatData, [1, 3, height, width]);
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
// Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
|
| 125 |
+
function postprocessYoloOutput(output: ort.Tensor) {
|
| 126 |
+
// output.dims: [1, num_detections, 6]
|
| 127 |
+
const data = output.data;
|
| 128 |
+
const numDetections = output.dims[1];
|
| 129 |
+
const results = [];
|
| 130 |
+
for (let i = 0; i < numDetections; i++) {
|
| 131 |
+
const offset = i * 6;
|
| 132 |
+
const x1 = data[offset];
|
| 133 |
+
const y1 = data[offset + 1];
|
| 134 |
+
const x2 = data[offset + 2];
|
| 135 |
+
const y2 = data[offset + 3];
|
| 136 |
+
const score = data[offset + 4];
|
| 137 |
+
const classId = data[offset + 5];
|
| 138 |
+
if (score < 0.2) continue; // adjust threshold as needed
|
| 139 |
+
results.push({
|
| 140 |
+
bbox: [x1, y1, x2, y2],
|
| 141 |
+
label: YOLO_CLASSES[classId] || `class_${classId}`,
|
| 142 |
+
score
|
| 143 |
+
});
|
| 144 |
+
}
|
| 145 |
+
return results;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
// Helper type guard for annotation
|
| 149 |
+
function hasAnnotation(obj: any): obj is { annotation: string } {
|
| 150 |
+
return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
/**
 * Main captioning view: lets the user upload an image/video (or use the
 * example video), runs YOLOv8 object detection (onnxruntime-web) plus a
 * FastVLM caption pass on video frames, and draws detections on a canvas
 * overlaid on the playing video.
 *
 * Two <video> elements are used per source: a visible "overlay" video for
 * the user, and a (normally hidden) "processing" video whose frames feed
 * the models. Detection results flow through module-level refs so the rAF
 * drawing loop can render them without re-rendering React.
 */
export default function MultiSourceCaptioningView() {
  // UI / processing state
  const [mode, setMode] = useState<Mode>("File");
  const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [showProcessingVideo, setShowProcessingVideo] = useState(false);

  // DOM refs.
  // NOTE(review): videoRef is never attached to any element in the JSX below,
  // so the play/seek sync effect exits early -- confirm whether it is a leftover.
  const videoRef = useRef<HTMLVideoElement | null>(null);
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  // NOTE(review): canvasRef/overlayVideoRef are bound by several mutually
  // exclusive branches of the JSX; only one branch renders at a time.
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  // NOTE(review): boxHistoryRef is only read in this component (never written
  // here), so the history-based draw effect currently renders nothing --
  // confirm whether another module is expected to populate it.
  const boxHistoryRef = useRef<any[]>([]);
  // Add a ref to store the latest YOLOv8 results (with optional FastVLM annotation)
  const lastYoloBoxesRef = React.useRef<any[]>([]);
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Remove videoProcessingRef and exampleProcessingRef
  // Add a single processingLoopRef
  // Single on/off switch checked by every processing loop iteration.
  const processingLoopRef = React.useRef(false);

  // rAF-driven loop for uploaded videos; calls yoloDetectionLoop (declared
  // further down -- safe because the call happens after render).
  const processVideoLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      // Optionally log: "Inference already running, skipping frame"
      requestAnimationFrame(processVideoLoop);
      return;
    }
    await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
    // Schedule the next frame as soon as possible
    requestAnimationFrame(processVideoLoop);
  };
  // Slower (1 s interval) loop variant used for the example video.
  const processExampleLoop = async () => {
    while (processingLoopRef.current) {
      await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
      await new Promise(res => setTimeout(res, 1000));
    }
  };

  // Set your YOLOv8 ONNX backend API endpoint here:
  // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE

  // Add this useEffect for overlay video synchronization
  // Keeps the overlay video in lockstep (play/pause/seek) with a main video.
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    // Sync play/pause
    const onPlay = () => { if (overlay.paused) overlay.play(); };
    const onPause = () => { if (!overlay.paused) overlay.pause(); };
    // Sync seeking and time
    const onSeekOrTime = () => {
      // 50 ms tolerance avoids a feedback loop of tiny corrections.
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
        overlay.currentTime = main.currentTime;
      }
    };
    main.addEventListener('play', onPlay);
    main.addEventListener('pause', onPause);
    main.addEventListener('seeked', onSeekOrTime);
    main.addEventListener('timeupdate', onSeekOrTime);
    // Clean up
    return () => {
      main.removeEventListener('play', onPlay);
      main.removeEventListener('pause', onPause);
      main.removeEventListener('seeked', onSeekOrTime);
      main.removeEventListener('timeupdate', onSeekOrTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);

  // Make sure the (hidden) processing video is actually playing in File mode.
  useEffect(() => {
    if ((mode === "File") && processingVideoRef.current) {
      processingVideoRef.current.play().catch(() => {});
    }
  }, [mode, videoUrl, uploadedUrl]);

  // Remove old prompt-based box extraction logic and only use the above for video frames.

  // Reset all processing state when the user picks a new file.
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    // NOTE(review): previous object URLs are never revoked
    // (URL.revokeObjectURL), so repeated uploads leak blob URLs.
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };

  // Webcam mode: process frames with setInterval
  // Kick off the uploaded-video loop once the model is loaded and enabled.
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    processVideoLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    processExampleLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);

  // File mode: process uploaded image (only on button click)
  // Runs FastVLM on the uploaded image, parses boxes from its text output,
  // and draws them on the canvas overlaying the image.
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({w:canvas.width,h:canvas.height});
    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      // Redraw the image to clear any previous boxes before drawing new ones.
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      let boxes = extractAllBoundingBoxes(output);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // scaleX/scaleY are 1 here since canvas matches natural size.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };

  // File mode: process uploaded video frames (start/stop)
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      // Short delay lets any in-flight loop iteration observe the stop flag.
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Handle start/stop for example video processing
  // NOTE(review): this starts processVideoLoop (rAF-paced) rather than
  // processExampleLoop (1 s-paced) -- confirm which cadence is intended here.
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Test draw box function
  // Debug helper: draws a fixed magenta rectangle to verify canvas overlay wiring.
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };

  // History-based drawing effect: every 100 ms (and on resize) redraws boxes
  // recorded in boxHistoryRef from the last 2 s, denormalized to display size.
  useEffect(() => {
    const draw = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      const displayWidth = overlayVideo.clientWidth;
      const displayHeight = overlayVideo.clientHeight;
      canvas.width = displayWidth;
      canvas.height = displayHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const now = Date.now();
      // Keep only boxes younger than 2 seconds.
      const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
      if (boxHistory.length > 0) {
        // Fix: Draw all boxes, even if bbox_2d is an array of arrays
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes per label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({
                  ...b,
                  bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
                });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({
              ...b,
              bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
            });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    // Redraw on window resize
    const handleResize = () => draw();
    window.addEventListener('resize', handleResize);
    return () => {
      clearInterval(interval);
      window.removeEventListener('resize', handleResize);
    };
  }, [overlayVideoRef, canvasRef]);

  // Drawing loop: draws the latest YOLOv8 boxes every frame
  React.useEffect(() => {
    let running = true;
    function drawLoop() {
      if (!running) return;
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      const processingVideo = processingVideoRef.current;
      if (canvas && overlayVideo && processingVideo) {
        // Set canvas size to match the visible video
        canvas.width = overlayVideo.clientWidth;
        canvas.height = overlayVideo.clientHeight;
        const ctx = canvas.getContext('2d');
        if (ctx) {
          ctx.clearRect(0, 0, canvas.width, canvas.height);
          // Draw all YOLOv8 boxes from last detection
          const yoloBoxes = lastYoloBoxesRef.current;
          yoloBoxes.forEach((obj: any) => {
            // Scale from YOLOv8 input size to canvas size
            const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
            const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
            const [x1, y1, x2, y2] = obj.bbox;
            const drawX = x1 * scaleX;
            const drawY = y1 * scaleY;
            const drawW = (x2 - x1) * scaleX;
            const drawH = (y2 - y1) * scaleY;
            ctx.strokeStyle = '#00FFFF';
            ctx.lineWidth = 5;
            ctx.strokeRect(drawX, drawY, drawW, drawH);
            ctx.font = 'bold 22px Arial';
            // Draw YOLOv8 label and confidence
            const yoloLabel = obj.label || '';
            const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
            const yoloText = `${yoloLabel}${yoloScore}`;
            ctx.fillStyle = 'rgba(0,0,0,0.7)';
            const yoloTextWidth = ctx.measureText(yoloText).width + 8;
            ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
            ctx.fillStyle = '#00FFFF';
            ctx.fillText(yoloText, drawX, drawY - 4);
            // Draw FastVLM annotation below the box if available
            if (hasAnnotation(obj)) {
              ctx.font = 'bold 18px Arial';
              ctx.fillStyle = 'rgba(0,0,0,0.7)';
              const annTextWidth = ctx.measureText(obj.annotation).width + 8;
              ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
              ctx.fillStyle = '#00FFFF';
              ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
            }
          });
        }
      }
      requestAnimationFrame(drawLoop);
    }
    drawLoop();
    return () => { running = false; };
  }, [overlayVideoRef, canvasRef, processingVideoRef]);

  // YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
  const yoloDetectionLoop = async () => {
    if (!processingLoopRef.current) return;
    // isYoloBusy is a module-level mutex so only one inference runs at a time.
    if (isYoloBusy) {
      requestAnimationFrame(yoloDetectionLoop);
      return;
    }
    isYoloBusy = true;
    try {
      const processingVideo = processingVideoRef.current;
      // Skip frames while the source video is not actually playing.
      if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
        isYoloBusy = false;
        requestAnimationFrame(yoloDetectionLoop);
        return;
      }
      // Run YOLOv8 detection
      const session = await loadYoloModel();
      const inputTensor = preprocessFrameToTensor(processingVideo);
      const feeds: Record<string, ort.Tensor> = {};
      feeds[session.inputNames[0]] = inputTensor;
      const results = await session.run(feeds);
      const output = results[session.outputNames[0]];
      const detections = postprocessYoloOutput(output);
      lastYoloBoxesRef.current = detections;
      // Run FastVLM on the full frame (wait for YOLOv8 to finish)
      await runInference(processingVideo, prompt, (output: string) => {
        setDebugOutput(output);
      });
    } catch (err) {
      console.error('YOLOv8+FastVLM error:', err);
    } finally {
      isYoloBusy = false;
      requestAnimationFrame(yoloDetectionLoop);
    }
  };

  // Add this effect after the processing loop and toggle handlers
  // Restart the detection loop whenever the source or the on/off toggles change.
  useEffect(() => {
    // Stop processing loop on video source change or processing toggle
    processingLoopRef.current = false;
    // Start processing loop for the correct video after refs update
    setTimeout(() => {
      if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      } else if (exampleProcessing && !uploadedFile) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      }
    }, 100);
    // eslint-disable-next-line
  }, [uploadedFile, videoProcessing, exampleProcessing]);

  return (
    <div className="absolute inset-0 text-white">
      {/* Model-loading status banner */}
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Add toggle button above video area */}
              <div className="mb-2 w-full max-w-xl flex justify-end">
                <button
                  className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
                  onClick={() => setShowProcessingVideo(v => !v)}
                  type="button"
                >
                  {showProcessingVideo ? 'Hide' : 'Show'} Processed Video
                </button>
              </div>
              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                    onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
                      if (canvasRef.current) {
                        canvasRef.current.width = e.currentTarget.clientWidth;
                        canvasRef.current.height = e.currentTarget.clientHeight;
                      }
                    }}
                    onResize={() => {
                      if (canvasRef.current && overlayVideoRef.current) {
                        canvasRef.current.width = overlayVideoRef.current.clientWidth;
                        canvasRef.current.height = overlayVideoRef.current.clientHeight;
                      }
                    }}
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={uploadedUrl}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* Show example video if no file uploaded */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: canvas/video dimensions and raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}
| 218 |
+
};
|
| 219 |
+
main.addEventListener('play', onPlay);
|
| 220 |
+
main.addEventListener('pause', onPause);
|
| 221 |
+
main.addEventListener('seeked', onSeekOrTime);
|
| 222 |
+
main.addEventListener('timeupdate', onSeekOrTime);
|
| 223 |
+
// Clean up
|
| 224 |
+
return () => {
|
| 225 |
+
main.removeEventListener('play', onPlay);
|
| 226 |
+
main.removeEventListener('pause', onPause);
|
| 227 |
+
main.removeEventListener('seeked', onSeekOrTime);
|
| 228 |
+
main.removeEventListener('timeupdate', onSeekOrTime);
|
| 229 |
+
};
|
| 230 |
+
}, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
|
| 231 |
+
|
| 232 |
+
useEffect(() => {
|
| 233 |
+
if ((mode === "File") && processingVideoRef.current) {
|
| 234 |
+
processingVideoRef.current.play().catch(() => {});
|
| 235 |
+
}
|
| 236 |
+
}, [mode, videoUrl, uploadedUrl]);
|
| 237 |
+
|
| 238 |
+
// Remove old prompt-based box extraction logic and only use the above for video frames.
|
| 239 |
+
|
| 240 |
+
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
|
| 241 |
+
const file = e.target.files?.[0] || null;
|
| 242 |
+
setUploadedFile(file);
|
| 243 |
+
setUploadedUrl(file ? URL.createObjectURL(file) : "");
|
| 244 |
+
setError(null);
|
| 245 |
+
setImageProcessed(false);
|
| 246 |
+
setVideoProcessing(false);
|
| 247 |
+
setExampleProcessing(false);
|
| 248 |
+
};
|
| 249 |
+
|
| 250 |
+
// Webcam mode: process frames with setInterval
|
| 251 |
+
useEffect(() => {
|
| 252 |
+
if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
|
| 253 |
+
processVideoLoop();
|
| 254 |
+
}, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
|
| 255 |
+
|
| 256 |
+
// Example video mode: process frames with setInterval
|
| 257 |
+
useEffect(() => {
|
| 258 |
+
if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
|
| 259 |
+
processExampleLoop();
|
| 260 |
+
}, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
|
| 261 |
+
|
| 262 |
+
// File mode: process uploaded image (only on button click)
|
| 263 |
+
const handleProcessImage = async () => {
|
| 264 |
+
if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
|
| 265 |
+
const img = imageRef.current;
|
| 266 |
+
const canvas = canvasRef.current;
|
| 267 |
+
canvas.width = img.naturalWidth;
|
| 268 |
+
canvas.height = img.naturalHeight;
|
| 269 |
+
setCanvasDims({w:canvas.width,h:canvas.height});
|
| 270 |
+
setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
|
| 271 |
+
const ctx = canvas.getContext("2d");
|
| 272 |
+
if (!ctx) return;
|
| 273 |
+
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
| 274 |
+
setProcessing(true);
|
| 275 |
+
setError(null);
|
| 276 |
+
setInferenceStatus("Running inference...");
|
| 277 |
+
await runInference(img, prompt, (output: string) => {
|
| 278 |
+
setDebugOutput(output);
|
| 279 |
+
setInferenceStatus("Inference complete.");
|
| 280 |
+
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
| 281 |
+
let boxes = extractAllBoundingBoxes(output);
|
| 282 |
+
console.log("Model output:", output);
|
| 283 |
+
console.log("Boxes after normalization:", boxes);
|
| 284 |
+
console.log("Canvas size:", canvas.width, canvas.height);
|
| 285 |
+
if (boxes.length > 0) {
|
| 286 |
+
const [x1, y1, x2, y2] = boxes[0].bbox_2d;
|
| 287 |
+
console.log("First box coords:", x1, y1, x2, y2);
|
| 288 |
+
}
|
| 289 |
+
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
| 290 |
+
if (Array.isArray(boxes) && boxes.length > 0) {
|
| 291 |
+
const scaleX = canvas.width / img.naturalWidth;
|
| 292 |
+
const scaleY = canvas.height / img.naturalHeight;
|
| 293 |
+
drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
|
| 294 |
+
}
|
| 295 |
+
setImageProcessed(true);
|
| 296 |
+
});
|
| 297 |
+
setProcessing(false);
|
| 298 |
+
};
|
| 299 |
+
|
| 300 |
+
// File mode: process uploaded video frames (start/stop)
|
| 301 |
+
const handleToggleVideoProcessing = () => {
|
| 302 |
+
setVideoProcessing((prev: boolean) => {
|
| 303 |
+
const next = !prev;
|
| 304 |
+
// Always stop all loops before starting
|
| 305 |
+
processingLoopRef.current = false;
|
| 306 |
+
setTimeout(() => {
|
| 307 |
+
if (next) {
|
| 308 |
+
processingLoopRef.current = true;
|
| 309 |
+
processVideoLoop();
|
| 310 |
+
}
|
| 311 |
+
}, 50);
|
| 312 |
+
return next;
|
| 313 |
+
});
|
| 314 |
+
};
|
| 315 |
+
|
| 316 |
+
// Handle start/stop for example video processing
|
| 317 |
+
const handleToggleExampleProcessing = () => {
|
| 318 |
+
setExampleProcessing((prev: boolean) => {
|
| 319 |
+
const next = !prev;
|
| 320 |
+
// Always stop all loops before starting
|
| 321 |
+
processingLoopRef.current = false;
|
| 322 |
+
setTimeout(() => {
|
| 323 |
+
if (next) {
|
| 324 |
+
processingLoopRef.current = true;
|
| 325 |
+
processVideoLoop();
|
| 326 |
+
}
|
| 327 |
+
}, 50);
|
| 328 |
+
return next;
|
| 329 |
+
});
|
| 330 |
+
};
|
| 331 |
+
|
| 332 |
+
// Test draw box function
|
| 333 |
+
const handleTestDrawBox = () => {
|
| 334 |
+
if (!canvasRef.current) return;
|
| 335 |
+
const canvas = canvasRef.current;
|
| 336 |
+
const ctx = canvas.getContext("2d");
|
| 337 |
+
if (!ctx) return;
|
| 338 |
+
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
| 339 |
+
ctx.strokeStyle = "#FF00FF";
|
| 340 |
+
ctx.lineWidth = 4;
|
| 341 |
+
ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
|
| 342 |
+
ctx.font = "20px Arial";
|
| 343 |
+
ctx.fillStyle = "#FF00FF";
|
| 344 |
+
ctx.fillText("Test Box", 50, 35);
|
| 345 |
+
};
|
| 346 |
+
|
| 347 |
+
useEffect(() => {
|
| 348 |
+
const draw = () => {
|
| 349 |
+
const overlayVideo = overlayVideoRef.current;
|
| 350 |
+
const canvas = canvasRef.current;
|
| 351 |
+
if (!overlayVideo || !canvas) return;
|
| 352 |
+
const displayWidth = overlayVideo.clientWidth;
|
| 353 |
+
const displayHeight = overlayVideo.clientHeight;
|
| 354 |
+
canvas.width = displayWidth;
|
| 355 |
+
canvas.height = displayHeight;
|
| 356 |
+
const ctx = canvas.getContext("2d");
|
| 357 |
+
if (!ctx) return;
|
| 358 |
+
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
| 359 |
+
const now = Date.now();
|
| 360 |
+
const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
|
| 361 |
+
if (boxHistory.length > 0) {
|
| 362 |
+
// Fix: Draw all boxes, even if bbox_2d is an array of arrays
|
| 363 |
+
const denormalizedBoxes: any[] = [];
|
| 364 |
+
for (const b of boxHistory) {
|
| 365 |
+
if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
|
| 366 |
+
// Multiple boxes per label
|
| 367 |
+
for (const arr of b.bbox_2d) {
|
| 368 |
+
if (Array.isArray(arr) && arr.length === 4) {
|
| 369 |
+
denormalizedBoxes.push({
|
| 370 |
+
...b,
|
| 371 |
+
bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
|
| 372 |
+
});
|
| 373 |
+
}
|
| 374 |
+
}
|
| 375 |
+
} else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
|
| 376 |
+
// Single box
|
| 377 |
+
denormalizedBoxes.push({
|
| 378 |
+
...b,
|
| 379 |
+
bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
|
| 380 |
+
});
|
| 381 |
+
}
|
| 382 |
+
}
|
| 383 |
+
drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
|
| 384 |
+
}
|
| 385 |
+
};
|
| 386 |
+
draw();
|
| 387 |
+
const interval = setInterval(draw, 100);
|
| 388 |
+
// Redraw on window resize
|
| 389 |
+
const handleResize = () => draw();
|
| 390 |
+
window.addEventListener('resize', handleResize);
|
| 391 |
+
return () => {
|
| 392 |
+
clearInterval(interval);
|
| 393 |
+
window.removeEventListener('resize', handleResize);
|
| 394 |
+
};
|
| 395 |
+
}, [overlayVideoRef, canvasRef]);
|
| 396 |
+
|
| 397 |
+
// Drawing loop: draws the latest YOLOv8 boxes every frame
|
| 398 |
+
React.useEffect(() => {
|
| 399 |
+
let running = true;
|
| 400 |
+
function drawLoop() {
|
| 401 |
+
if (!running) return;
|
| 402 |
+
const overlayVideo = overlayVideoRef.current;
|
| 403 |
+
const canvas = canvasRef.current;
|
| 404 |
+
const processingVideo = processingVideoRef.current;
|
| 405 |
+
if (canvas && overlayVideo && processingVideo) {
|
| 406 |
+
// Set canvas size to match the visible video
|
| 407 |
+
canvas.width = overlayVideo.clientWidth;
|
| 408 |
+
canvas.height = overlayVideo.clientHeight;
|
| 409 |
+
const ctx = canvas.getContext('2d');
|
| 410 |
+
if (ctx) {
|
| 411 |
+
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
| 412 |
+
// Draw all YOLOv8 boxes from last detection
|
| 413 |
+
const yoloBoxes = lastYoloBoxesRef.current;
|
| 414 |
+
yoloBoxes.forEach((obj: any) => {
|
| 415 |
+
// Scale from YOLOv8 input size to canvas size
|
| 416 |
+
const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
|
| 417 |
+
const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
|
| 418 |
+
const [x1, y1, x2, y2] = obj.bbox;
|
| 419 |
+
const drawX = x1 * scaleX;
|
| 420 |
+
const drawY = y1 * scaleY;
|
| 421 |
+
const drawW = (x2 - x1) * scaleX;
|
| 422 |
+
const drawH = (y2 - y1) * scaleY;
|
| 423 |
+
ctx.strokeStyle = '#00FFFF';
|
| 424 |
+
ctx.lineWidth = 5;
|
| 425 |
+
ctx.strokeRect(drawX, drawY, drawW, drawH);
|
| 426 |
+
ctx.font = 'bold 22px Arial';
|
| 427 |
+
// Draw YOLOv8 label and confidence
|
| 428 |
+
const yoloLabel = obj.label || '';
|
| 429 |
+
const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
|
| 430 |
+
const yoloText = `${yoloLabel}${yoloScore}`;
|
| 431 |
+
ctx.fillStyle = 'rgba(0,0,0,0.7)';
|
| 432 |
+
const yoloTextWidth = ctx.measureText(yoloText).width + 8;
|
| 433 |
+
ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
|
| 434 |
+
ctx.fillStyle = '#00FFFF';
|
| 435 |
+
ctx.fillText(yoloText, drawX, drawY - 4);
|
| 436 |
+
// Draw FastVLM annotation below the box if available
|
| 437 |
+
if (hasAnnotation(obj)) {
|
| 438 |
+
ctx.font = 'bold 18px Arial';
|
| 439 |
+
ctx.fillStyle = 'rgba(0,0,0,0.7)';
|
| 440 |
+
const annTextWidth = ctx.measureText(obj.annotation).width + 8;
|
| 441 |
+
ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
|
| 442 |
+
ctx.fillStyle = '#00FFFF';
|
| 443 |
+
ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
|
| 444 |
+
}
|
| 445 |
+
});
|
| 446 |
+
}
|
| 447 |
+
}
|
| 448 |
+
requestAnimationFrame(drawLoop);
|
| 449 |
+
}
|
| 450 |
+
drawLoop();
|
| 451 |
+
return () => { running = false; };
|
| 452 |
+
}, [overlayVideoRef, canvasRef, processingVideoRef]);
|
| 453 |
+
|
| 454 |
+
// YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
|
| 455 |
+
const yoloDetectionLoop = async () => {
|
| 456 |
+
if (!processingLoopRef.current) return;
|
| 457 |
+
if (isYoloBusy) {
|
| 458 |
+
requestAnimationFrame(yoloDetectionLoop);
|
| 459 |
+
return;
|
| 460 |
+
}
|
| 461 |
+
isYoloBusy = true;
|
| 462 |
+
try {
|
| 463 |
+
const processingVideo = processingVideoRef.current;
|
| 464 |
+
if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
|
| 465 |
+
isYoloBusy = false;
|
| 466 |
+
requestAnimationFrame(yoloDetectionLoop);
|
| 467 |
+
return;
|
| 468 |
+
}
|
| 469 |
+
// Run YOLOv8 detection
|
| 470 |
+
const session = await loadYoloModel();
|
| 471 |
+
const inputTensor = preprocessFrameToTensor(processingVideo);
|
| 472 |
+
const feeds: Record<string, ort.Tensor> = {};
|
| 473 |
+
feeds[session.inputNames[0]] = inputTensor;
|
| 474 |
+
const results = await session.run(feeds);
|
| 475 |
+
const output = results[session.outputNames[0]];
|
| 476 |
+
const detections = postprocessYoloOutput(output);
|
| 477 |
+
lastYoloBoxesRef.current = detections;
|
| 478 |
+
// Run FastVLM on the full frame (wait for YOLOv8 to finish)
|
| 479 |
+
await runInference(processingVideo, prompt, (output: string) => {
|
| 480 |
+
setDebugOutput(output);
|
| 481 |
+
});
|
| 482 |
+
} catch (err) {
|
| 483 |
+
console.error('YOLOv8+FastVLM error:', err);
|
| 484 |
+
} finally {
|
| 485 |
+
isYoloBusy = false;
|
| 486 |
+
requestAnimationFrame(yoloDetectionLoop);
|
| 487 |
+
}
|
| 488 |
+
};
|
| 489 |
+
|
| 490 |
+
// Add this effect after the processing loop and toggle handlers
|
| 491 |
+
useEffect(() => {
|
| 492 |
+
// Stop processing loop on video source change or processing toggle
|
| 493 |
+
processingLoopRef.current = false;
|
| 494 |
+
// Start processing loop for the correct video after refs update
|
| 495 |
+
setTimeout(() => {
|
| 496 |
+
if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
|
| 497 |
+
processingLoopRef.current = true;
|
| 498 |
+
yoloDetectionLoop();
|
| 499 |
+
} else if (exampleProcessing && !uploadedFile) {
|
| 500 |
+
processingLoopRef.current = true;
|
| 501 |
+
yoloDetectionLoop();
|
| 502 |
+
}
|
| 503 |
+
}, 100);
|
| 504 |
+
// eslint-disable-next-line
|
| 505 |
+
}, [uploadedFile, videoProcessing, exampleProcessing]);
|
| 506 |
+
|
| 507 |
+
return (
|
| 508 |
+
<div className="absolute inset-0 text-white">
|
| 509 |
+
<div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
|
| 510 |
+
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
|
| 511 |
+
</div>
|
| 512 |
+
<div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
|
| 513 |
+
<div className="flex flex-col items-center justify-center h-full w-full">
|
| 514 |
+
{/* Mode Selector */}
|
| 515 |
+
<div className="mb-6">
|
| 516 |
+
<div className="flex space-x-4">
|
| 517 |
+
{MODES.map((m) => (
|
| 518 |
+
<button
|
| 519 |
+
key={m}
|
| 520 |
+
className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
|
| 521 |
+
mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
|
| 522 |
+
}`}
|
| 523 |
+
onClick={() => setMode(m)}
|
| 524 |
+
>
|
| 525 |
+
{m}
|
| 526 |
+
</button>
|
| 527 |
+
))}
|
| 528 |
+
</div>
|
| 529 |
+
</div>
|
| 530 |
+
|
| 531 |
+
{/* Mode Content */}
|
| 532 |
+
<div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
|
| 533 |
+
{mode === "File" && (
|
| 534 |
+
<div className="w-full text-center flex flex-col items-center">
|
| 535 |
+
<div className="mb-4 w-full max-w-xl">
|
| 536 |
+
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
|
| 537 |
+
<textarea
|
| 538 |
+
className="w-full p-2 rounded-lg text-black"
|
| 539 |
+
rows={3}
|
| 540 |
+
value={prompt}
|
| 541 |
+
onChange={(e) => setPrompt(e.target.value)}
|
| 542 |
+
/>
|
| 543 |
+
</div>
|
| 544 |
+
<div className="mb-4 w-full max-w-xl">
|
| 545 |
+
<input
|
| 546 |
+
type="file"
|
| 547 |
+
accept="image/*,video/*"
|
| 548 |
+
onChange={handleFileChange}
|
| 549 |
+
className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
|
| 550 |
+
/>
|
| 551 |
+
</div>
|
| 552 |
+
{/* Add toggle button above video area */}
|
| 553 |
+
<div className="mb-2 w-full max-w-xl flex justify-end">
|
| 554 |
+
<button
|
| 555 |
+
className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
|
| 556 |
+
onClick={() => setShowProcessingVideo(v => !v)}
|
| 557 |
+
type="button"
|
| 558 |
+
>
|
| 559 |
+
{showProcessingVideo ? 'Hide' : 'Show'} Processed Video
|
| 560 |
+
</button>
|
| 561 |
+
</div>
|
| 562 |
+
{/* Show uploaded image */}
|
| 563 |
+
{uploadedFile && isImageFile(uploadedFile) && (
|
| 564 |
+
<div className="relative w-full max-w-xl">
|
| 565 |
+
<img
|
| 566 |
+
ref={imageRef}
|
| 567 |
+
src={uploadedUrl}
|
| 568 |
+
alt="Uploaded"
|
| 569 |
+
className="w-full rounded-lg shadow-lg mb-2"
|
| 570 |
+
style={{ background: "#222" }}
|
| 571 |
+
/>
|
| 572 |
+
<canvas
|
| 573 |
+
ref={canvasRef}
|
| 574 |
+
className="absolute top-0 left-0 w-full h-full pointer-events-none"
|
| 575 |
+
style={{ zIndex: 10, pointerEvents: "none" }}
|
| 576 |
+
/>
|
| 577 |
+
<button
|
| 578 |
+
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
| 579 |
+
onClick={handleProcessImage}
|
| 580 |
+
disabled={processing}
|
| 581 |
+
>
|
| 582 |
+
{processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
|
| 583 |
+
</button>
|
| 584 |
+
</div>
|
| 585 |
+
)}
|
| 586 |
+
{/* Show uploaded video */}
|
| 587 |
+
{uploadedFile && isVideoFile(uploadedFile) && (
|
| 588 |
+
<div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
|
| 589 |
+
{/* Visible overlay video for user */}
|
| 590 |
+
<video
|
| 591 |
+
ref={overlayVideoRef}
|
| 592 |
+
src={uploadedUrl}
|
| 593 |
+
controls
|
| 594 |
+
autoPlay
|
| 595 |
+
loop
|
| 596 |
+
muted
|
| 597 |
+
playsInline
|
| 598 |
+
className="w-full rounded-lg shadow-lg mb-2"
|
| 599 |
+
style={{ background: "#222", display: "block" }}
|
| 600 |
+
crossOrigin="anonymous"
|
| 601 |
+
onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
|
| 602 |
+
if (canvasRef.current) {
|
| 603 |
+
canvasRef.current.width = e.currentTarget.clientWidth;
|
| 604 |
+
canvasRef.current.height = e.currentTarget.clientHeight;
|
| 605 |
+
}
|
| 606 |
+
}}
|
| 607 |
+
onResize={() => {
|
| 608 |
+
if (canvasRef.current && overlayVideoRef.current) {
|
| 609 |
+
canvasRef.current.width = overlayVideoRef.current.clientWidth;
|
| 610 |
+
canvasRef.current.height = overlayVideoRef.current.clientHeight;
|
| 611 |
+
}
|
| 612 |
+
}}
|
| 613 |
+
/>
|
| 614 |
+
{/* Canvas overlay */}
|
| 615 |
+
<canvas
|
| 616 |
+
ref={canvasRef}
|
| 617 |
+
style={{
|
| 618 |
+
position: "absolute",
|
| 619 |
+
top: 0,
|
| 620 |
+
left: 0,
|
| 621 |
+
width: "100%",
|
| 622 |
+
height: "100%",
|
| 623 |
+
zIndex: 100,
|
| 624 |
+
pointerEvents: "none",
|
| 625 |
+
display: "block"
|
| 626 |
+
}}
|
| 627 |
+
width={overlayVideoRef.current?.clientWidth || 640}
|
| 628 |
+
height={overlayVideoRef.current?.clientHeight || 480}
|
| 629 |
+
/>
|
| 630 |
+
{/* Hidden or visible processing video for FastVLM/canvas */}
|
| 631 |
+
<video
|
| 632 |
+
ref={processingVideoRef}
|
| 633 |
+
src={uploadedUrl}
|
| 634 |
+
autoPlay
|
| 635 |
+
loop
|
| 636 |
+
muted
|
| 637 |
+
playsInline
|
| 638 |
+
crossOrigin="anonymous"
|
| 639 |
+
style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
|
| 640 |
+
onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
|
| 641 |
+
/>
|
| 642 |
+
<button
|
| 643 |
+
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
| 644 |
+
onClick={handleToggleVideoProcessing}
|
| 645 |
+
>
|
| 646 |
+
{videoProcessing ? "Stop Processing" : "Start Processing"}
|
| 647 |
+
</button>
|
| 648 |
+
</div>
|
| 649 |
+
)}
|
| 650 |
+
{/* Show example video if no file uploaded */}
|
| 651 |
+
{!uploadedFile && (
|
| 652 |
+
<div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
|
| 653 |
+
{/* Visible overlay video for user */}
|
| 654 |
+
<video
|
| 655 |
+
ref={overlayVideoRef}
|
| 656 |
+
src={EXAMPLE_VIDEO_URL}
|
| 657 |
+
controls
|
| 658 |
+
autoPlay
|
| 659 |
+
loop
|
| 660 |
+
muted
|
| 661 |
+
playsInline
|
| 662 |
+
className="w-full rounded-lg shadow-lg mb-2"
|
| 663 |
+
style={{ background: "#222", display: "block" }}
|
| 664 |
+
crossOrigin="anonymous"
|
| 665 |
+
/>
|
| 666 |
+
{/* Canvas overlay */}
|
| 667 |
+
<canvas
|
| 668 |
+
ref={canvasRef}
|
| 669 |
+
style={{
|
| 670 |
+
position: "absolute",
|
| 671 |
+
top: 0,
|
| 672 |
+
left: 0,
|
| 673 |
+
width: "100%",
|
| 674 |
+
height: "100%",
|
| 675 |
+
zIndex: 100,
|
| 676 |
+
pointerEvents: "none",
|
| 677 |
+
display: "block"
|
| 678 |
+
}}
|
| 679 |
+
width={overlayVideoRef.current?.clientWidth || 640}
|
| 680 |
+
height={overlayVideoRef.current?.clientHeight || 480}
|
| 681 |
+
/>
|
| 682 |
+
{/* Hidden or visible processing video for FastVLM/canvas */}
|
| 683 |
+
<video
|
| 684 |
+
ref={processingVideoRef}
|
| 685 |
+
src={EXAMPLE_VIDEO_URL}
|
| 686 |
+
autoPlay
|
| 687 |
+
loop
|
| 688 |
+
muted
|
| 689 |
+
playsInline
|
| 690 |
+
crossOrigin="anonymous"
|
| 691 |
+
style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
|
| 692 |
+
onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
|
| 693 |
+
/>
|
| 694 |
+
<button
|
| 695 |
+
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
| 696 |
+
onClick={handleToggleExampleProcessing}
|
| 697 |
+
>
|
| 698 |
+
{exampleProcessing ? "Stop Processing" : "Start Processing"}
|
| 699 |
+
</button>
|
| 700 |
+
</div>
|
| 701 |
+
)}
|
| 702 |
+
{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
|
| 703 |
+
{error && <div className="text-red-400 mt-2">Error: {error}</div>}
|
| 704 |
+
<button
|
| 705 |
+
className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
|
| 706 |
+
onClick={handleTestDrawBox}
|
| 707 |
+
>
|
| 708 |
+
Test Draw Box
|
| 709 |
+
</button>
|
| 710 |
+
<div className="mt-2 p-2 bg-gray-800 rounded text-xs">
|
| 711 |
+
<div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
|
| 712 |
+
<div>Raw Model Output:</div>
|
| 713 |
+
<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
|
| 714 |
+
</div>
|
| 715 |
+
</div>
|
| 716 |
+
)}
|
| 717 |
+
</div>
|
| 718 |
+
</div>
|
| 719 |
+
</div>
|
| 720 |
+
);
|
| 721 |
}
|