Spaces:

developer0hye
/

InternVL2_5-2B

Running on Zero

App Files Files Community

InternVL2_5-2B / app.py

developer0hye

Update app.py

fa48fd5 verified 10 months ago

raw

history blame contribute delete

11.6 kB

	import gradio as gr
	import spaces
	import torch
	import os
	import uuid
	import io
	import numpy as np
	from PIL import Image
	import torchvision.transforms as T
	from torchvision.transforms.functional import InterpolationMode
	from transformers import AutoModel, AutoTokenizer
	from decord import VideoReader, cpu

	# =============================================================================
	# InternVL preprocessing/loading code (excerpted from the original example)
	# =============================================================================
	# Per-channel mean and standard deviation handed to T.Normalize by
	# build_transform(); the names indicate these are ImageNet statistics.
	IMAGENET_MEAN = (0.485, 0.456, 0.406)
	IMAGENET_STD = (0.229, 0.224, 0.225)

	def build_transform(input_size):
	    """Build the preprocessing pipeline applied to every image tile.

	    The returned torchvision transform converts a PIL image to RGB when it is
	    in another mode, resizes it to ``input_size`` x ``input_size`` with bicubic
	    interpolation, converts it to a tensor and normalizes it with
	    IMAGENET_MEAN / IMAGENET_STD.
	    """
	    def _ensure_rgb(img):
	        # RGB images pass through untouched; every other mode is converted.
	        return img if img.mode == 'RGB' else img.convert('RGB')

	    steps = [
	        T.Lambda(_ensure_rgb),
	        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
	        T.ToTensor(),
	        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
	    ]
	    return T.Compose(steps)

	def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
	    """Pick the (columns, rows) grid whose aspect ratio is closest to the image's.

	    Args:
	        aspect_ratio: Width / height of the source image.
	        target_ratios: Iterable of (columns, rows) candidates, visited in order.
	        width: Source image width in pixels.
	        height: Source image height in pixels.
	        image_size: Edge length of one square tile.

	    Returns:
	        The best candidate, or ``(1, 1)`` when no candidate is supplied.
	    """
	    chosen = (1, 1)
	    smallest_gap = float('inf')
	    pixel_area = width * height
	    for candidate in target_ratios:
	        gap = abs(aspect_ratio - candidate[0] / candidate[1])
	        if gap < smallest_gap:
	            smallest_gap, chosen = gap, candidate
	        elif gap == smallest_gap and (
	            pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]
	        ):
	            # On an exact tie, move to this candidate only when the image area
	            # exceeds half of the area this candidate's tile grid would cover.
	            chosen = candidate
	    return chosen

	def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
	    """Resize an image onto the best-fitting tile grid and cut it into square tiles.

	    Args:
	        image: PIL image to split.
	        min_num: Minimum number of tiles a candidate grid may contain.
	        max_num: Maximum number of tiles a candidate grid may contain.
	        image_size: Edge length of each square tile in pixels.
	        use_thumbnail: When true and more than one tile was produced, append a
	            ``image_size`` x ``image_size`` resize of the whole image.

	    Returns:
	        List of PIL images: the tiles in row-major order, optionally followed
	        by the thumbnail.
	    """
	    src_w, src_h = image.size
	    src_ratio = src_w / src_h

	    # Every (columns, rows) grid whose tile count lies in [min_num, max_num],
	    # ordered by tile count.
	    grids = set(
	        (cols, rows) for n in range(min_num, max_num + 1)
	        for cols in range(1, n + 1)
	        for rows in range(1, n + 1)
	        if min_num <= cols * rows <= max_num
	    )
	    grids = sorted(grids, key=lambda g: g[0] * g[1])

	    best_grid = find_closest_aspect_ratio(src_ratio, grids, src_w, src_h, image_size)
	    grid_cols, grid_rows = best_grid[0], best_grid[1]

	    resized = image.resize((image_size * grid_cols, image_size * grid_rows))

	    tiles = []
	    for tile_idx in range(grid_cols * grid_rows):
	        # Row-major walk over the grid: left to right, then top to bottom.
	        row, col = divmod(tile_idx, grid_cols)
	        left = col * image_size
	        top = row * image_size
	        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))

	    if use_thumbnail and len(tiles) != 1:
	        tiles.append(image.resize((image_size, image_size)))
	    return tiles

	def load_image(image_file, input_size=448, max_num=12):
	    """Load an image and return its preprocessed tiles as one stacked tensor.

	    Args:
	        image_file: Path (or file object) accepted by ``PIL.Image.open``.
	        input_size: Edge length of each square tile in pixels.
	        max_num: Maximum number of tiles passed on to ``dynamic_preprocess``.

	    Returns:
	        Tensor produced by ``torch.stack`` over the transformed tiles (the
	        thumbnail is included whenever more than one tile is produced).
	    """
	    # Open the image as a context manager so the underlying file handle is
	    # released as soon as the independent RGB copy has been created.
	    with Image.open(image_file) as opened:
	        image = opened.convert('RGB')

	    transform = build_transform(input_size=input_size)
	    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
	    return torch.stack([transform(tile) for tile in tiles])

	def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
	    """Compute one frame index per segment of a video.

	    Args:
	        bound: Optional ``(start, end)`` pair in seconds; when falsy a very wide
	            window is used so the clamping below selects the whole video.
	        fps: Frames per second used to convert seconds to frame numbers.
	        max_frame: Upper clamp for the end frame index.
	        first_idx: Lower clamp for the start frame index.
	        num_segments: Number of indices to produce.

	    Returns:
	        NumPy array holding ``num_segments`` frame indices.
	    """
	    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)

	    # Clamp the requested window to the range of available frames.
	    start_idx = max(first_idx, round(start * fps))
	    end_idx = min(round(end * fps), max_frame)

	    seg_size = float(end_idx - start_idx) / num_segments
	    half_segment = seg_size / 2

	    indices = []
	    for segment in range(num_segments):
	        # Offset by half a segment plus the rounded start of this segment.
	        indices.append(int(start_idx + half_segment + np.round(seg_size * segment)))
	    return np.array(indices)

	def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=8):
	    """Sample frames from a video and preprocess each one into tiles.

	    Follows the InternVL example code: several frames are extracted and
	    ``dynamic_preprocess`` is applied to each. ``num_segments`` defaults to 8
	    here.

	    Args:
	        video_path: Path handed to decord's ``VideoReader``.
	        bound: Optional ``(start, end)`` window in seconds, forwarded to
	            ``get_index``.
	        input_size: Edge length of each square tile in pixels.
	        max_num: Maximum number of tiles per frame.
	        num_segments: Number of frames to sample.

	    Returns:
	        Tuple ``(pixel_values, num_patches_list)``: the tiles of all frames
	        concatenated along dim 0, and the tile count of each frame.
	    """
	    reader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
	    last_frame = len(reader) - 1
	    avg_fps = float(reader.get_avg_fps())

	    to_tensor = build_transform(input_size=input_size)
	    sampled_indices = get_index(bound, avg_fps, last_frame, first_idx=0, num_segments=num_segments)

	    per_frame_tiles = []
	    for frame_index in sampled_indices:
	        frame_img = Image.fromarray(reader[frame_index].asnumpy()).convert('RGB')
	        tiles = dynamic_preprocess(frame_img, image_size=input_size, use_thumbnail=True, max_num=max_num)
	        per_frame_tiles.append(torch.stack([to_tensor(tile) for tile in tiles]))

	    num_patches_list = [stacked.shape[0] for stacked in per_frame_tiles]

	    # Join the frames into the final pixel_values: (sum(num_patches_list), 3, H, W)
	    pixel_values = torch.cat(per_frame_tiles, dim=0)
	    return pixel_values, num_patches_list


	# =============================================================================
	# InternVL model loading
	# =============================================================================
	# Checkpoint identifier used for both the model and the tokenizer below.
	MODEL_ID = "OpenGVLab/InternVL2_5-2B"

	# Load the model once at import time in bfloat16, switch it to eval mode and
	# move it to CUDA.
	# NOTE(review): use_flash_attn is forwarded to the checkpoint's own remote code
	# (trust_remote_code=True); its exact effect is not visible in this file.
	model = AutoModel.from_pretrained(
	MODEL_ID,
	torch_dtype=torch.bfloat16,
	low_cpu_mem_usage=True,
	use_flash_attn=True,
	trust_remote_code=True
	).eval().cuda()

	# Tokenizer for the same checkpoint; the slow (non-fast) implementation is requested.
	tokenizer = AutoTokenizer.from_pretrained(
	MODEL_ID,
	trust_remote_code=True,
	use_fast=False
	)

	# Description text displayed at the top of the Gradio app
	DESCRIPTION = "[InternVL2_5-2B Demo](https://github.com/OpenGVLab/InternVL) - Using the InternVL2_5-2B"

	# Extension -> format mapping reported by Pillow; only its keys are used below
	# to recognise image uploads by file name.
	image_extensions = Image.registered_extensions()
	# File-name suffixes treated as video uploads.
	# NOTE(review): the entries have no leading dot, so str.endswith() also matches
	# names that merely end in these letters, and "wav" is normally an audio
	# format - confirm both are intended.
	video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")

	def identify_and_save_blob(blob_path):
	    """Classify an extension-less upload and save a copy with an extension.

	    Same approach as the Qwen example code: the blob is read, Pillow is asked
	    to verify it as an image, and anything that fails verification is assumed
	    to be a video. The bytes are then written unchanged to a uniquely named
	    file in the current working directory.

	    Args:
	        blob_path: Path of the uploaded blob.

	    Returns:
	        Tuple ``(filename, media_type)`` where ``media_type`` is ``"image"``
	        (saved as ``.png``) or ``"video"`` (saved as ``.mp4``).

	    Raises:
	        ValueError: If the blob does not exist or any other error occurs while
	            processing it.
	    """
	    try:
	        with open(blob_path, 'rb') as source:
	            payload = source.read()

	        try:
	            # verify() raises when the bytes are not a valid image.
	            Image.open(io.BytesIO(payload)).verify()
	        except (IOError, SyntaxError):
	            # Not a valid image: assume it is a video and default to MP4.
	            extension, media_type = ".mp4", "video"
	        else:
	            # Valid image: default to PNG for the saved copy.
	            extension, media_type = ".png", "image"

	        # Unique name so concurrent uploads never overwrite each other.
	        filename = f"temp_{uuid.uuid4()}_media{extension}"
	        with open(filename, "wb") as sink:
	            sink.write(payload)
	        return filename, media_type
	    except FileNotFoundError:
	        raise ValueError(f"The file {blob_path} was not found.")
	    except Exception as e:
	        raise ValueError(f"An error occurred while processing the file: {e}")

	def process_file_upload(file_path):
	    """Handle a new upload: return the path to use and an optional image preview.

	    Args:
	        file_path: Path of the uploaded file; any non-string value is treated
	            as "nothing uploaded".

	    Returns:
	        Tuple ``(path, preview)``. ``preview`` is an opened PIL image for image
	        uploads and ``None`` for videos. ``(None, None)`` is returned when
	        ``file_path`` is not a string.

	    Raises:
	        ValueError: If a file without a recognised extension cannot be
	            identified and saved by ``identify_and_save_blob``.
	    """
	    if not isinstance(file_path, str):
	        return None, None

	    # Match extensions case-insensitively so names such as "photo.JPG" or
	    # "clip.MP4" are recognised directly instead of being copied through the
	    # blob path under a default extension.
	    lowered = file_path.lower()
	    if lowered.endswith(tuple(image_extensions)):
	        # Open the image and hand it to the preview component.
	        return file_path, Image.open(file_path)
	    if lowered.endswith(video_extensions):
	        # Videos get no preview.
	        return file_path, None

	    # No recognised extension: treat the upload as a blob.
	    try:
	        media_path, media_type = identify_and_save_blob(file_path)
	        if media_type == "image":
	            return media_path, Image.open(media_path)
	        return media_path, None
	    except Exception as e:
	        print(e)
	        raise ValueError("Unsupported media type. Please upload an image or video.") from e

	@spaces.GPU
	def internvl_inference(media_input, text_input=None):
	    """Answer a question about an uploaded image or video with InternVL.

	    Replaces the ``qwen_inference`` function of the Qwen example: the media
	    file is converted into the ``pixel_values`` InternVL expects and
	    ``model.chat()`` is called to generate the answer.

	    Args:
	        media_input: Path of the uploaded file. Anything that is not a string
	            is treated as "no upload".
	        text_input: Optional question, placed after the image placeholder(s).

	    Returns:
	        The text returned by ``model.chat``, or a short status message when no
	        media was supplied or the media type is not recognised.

	    Raises:
	        ValueError: If a file without a recognised extension cannot be
	            identified and saved by ``identify_and_save_blob``.
	    """
	    if not isinstance(media_input, str):
	        return "No media input found"

	    media_path = media_input
	    created_temp_copy = False

	    # Identify the media type; extensions are compared case-insensitively so
	    # "photo.JPG" / "clip.MP4" do not fall through to the blob path.
	    lowered = media_path.lower()
	    if lowered.endswith(tuple(image_extensions)):
	        media_type = "image"
	    elif lowered.endswith(video_extensions):
	        media_type = "video"
	    else:
	        # No recognised extension: check whether it is a blob.
	        try:
	            media_path, media_type = identify_and_save_blob(media_input)
	        except Exception as e:
	            print(e)
	            raise ValueError("Unsupported media type. Please upload an image or video.") from e
	        created_temp_copy = True

	    generation_config = dict(max_new_tokens=1024, do_sample=True)

	    try:
	        if media_type == "image":
	            # Assumes a single image (could be extended to multiple images).
	            pixel_values = load_image(media_path, max_num=12)
	            pixel_values = pixel_values.to(torch.bfloat16).cuda()  # (N, 3, H, W)
	            question = f"<image>\n{text_input}" if text_input else "<image>\n"
	            return model.chat(
	                tokenizer,
	                pixel_values,
	                question,
	                generation_config
	            )

	        if media_type == "video":
	            # Video: 8 frames sampled across the clip, one tile per frame.
	            pixel_values, num_patches_list = load_video(
	                media_path,
	                num_segments=8,
	                max_num=1
	            )
	            pixel_values = pixel_values.to(torch.bfloat16).cuda()
	            # One "FrameK: <image>" placeholder line per sampled frame.
	            question_prefix = "".join(
	                f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))
	            )
	            question = question_prefix + (text_input if text_input else "")
	            # The same chat() function is used for video input.
	            return model.chat(
	                tokenizer,
	                pixel_values,
	                question,
	                generation_config,
	                num_patches_list=num_patches_list
	            )

	        return "Unsupported media type"
	    finally:
	        # The copy written by identify_and_save_blob() is only needed for this
	        # call; remove it so repeated requests do not fill the working directory.
	        if created_temp_copy:
	            try:
	                os.remove(media_path)
	            except OSError as e:
	                print(f"Could not remove temporary file {media_path}: {e}")

	# Minimal custom CSS passed to gr.Blocks(css=css).
	# NOTE(review): the rule targets the element id "output", but no component
	# visible in this file sets elem_id="output" - confirm whether the output
	# textbox was meant to carry that id.
	css = """
	#output {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""

	# Gradio demo layout
	with gr.Blocks(css=css) as demo:
	    gr.Markdown(DESCRIPTION)

	    with gr.Tab(label="Image/Video Input"):
	        with gr.Row():
	            # Left column: upload, preview, question and submit button.
	            with gr.Column():
	                input_media = gr.File(label="Upload Image or Video", type="filepath")
	                preview_image = gr.Image(label="Preview", visible=True)
	                text_input = gr.Textbox(label="Question")
	                submit_btn = gr.Button(value="Submit")
	            # Right column: the model's answer.
	            with gr.Column():
	                output_text = gr.Textbox(label="Output Text")

	        # Refresh the stored path and the preview whenever the upload changes.
	        input_media.change(
	            fn=process_file_upload,
	            inputs=[input_media],
	            outputs=[input_media, preview_image],
	        )

	        # Run inference on the current upload and question.
	        submit_btn.click(
	            fn=internvl_inference,
	            inputs=[input_media, text_input],
	            outputs=[output_text],
	        )

	demo.launch(debug=True)