Spaces:

SynaptechX
/

ImgTextParser

Sleeping

App Files Files Community

ImgTextParser / app.py

SynaptechX

Update app.py

3bf5b34 verified 3 months ago

raw

history blame contribute delete

12 kB

	import gradio as gr
	import torch
	from PIL import Image
	from transformers import AutoModel, AutoTokenizer
	import warnings
	import os
	import spaces

	# Silence every warning category for the whole process.
	warnings.filterwarnings("ignore")

	# Module-level cache for the loaded model and tokenizer; both stay None
	# until load_model() fills them in.
	model = None
	tokenizer = None

	@spaces.GPU
	def load_model():
	    """Load the MiniCPM-o model and tokenizer once and cache them globally.

	    The first call downloads/initialises ``openbmb/MiniCPM-o-2_6`` with only
	    the vision tower enabled (audio and TTS are switched off) and stores the
	    result in the module-level ``model`` / ``tokenizer`` globals. Later calls
	    return the cached objects.

	    Returns:
	        tuple: ``(model, tokenizer)``.

	    Raises:
	        Whatever ``from_pretrained`` raises; in that case the globals are left
	        untouched so that the next call retries from scratch.
	    """
	    global model, tokenizer
	    if model is None or tokenizer is None:
	        print("正在加载MiniCPM-o模型...")
	        use_cuda = torch.cuda.is_available()
	        device = "cuda" if use_cuda else "cpu"
	        loaded_model = AutoModel.from_pretrained(
	            'openbmb/MiniCPM-o-2_6',
	            trust_remote_code=True,
	            torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
	            device_map="auto" if use_cuda else None,
	            init_vision=True,
	            init_audio=False,
	            init_tts=False,
	        )
	        loaded_model = loaded_model.eval()
	        if not use_cuda:
	            # With device_map="auto" the weights are already placed; an extra
	            # .to() is redundant there and fails if any module was offloaded.
	            loaded_model = loaded_model.to(device)
	        loaded_tokenizer = AutoTokenizer.from_pretrained(
	            'openbmb/MiniCPM-o-2_6', trust_remote_code=True
	        )
	        # Publish both objects together so a failed tokenizer load can never
	        # leave a cached model paired with a missing tokenizer.
	        model, tokenizer = loaded_model, loaded_tokenizer
	        print("模型加载完成")
	    return model, tokenizer

	def clean_markdown_output(text):
	    """Reduce raw model output to the Markdown table it contains.

	    Scans the text line by line. The table starts at the first line that
	    contains a pipe character and is not a code-fence line; it ends at the
	    first blank line after that. Code-fence lines are skipped. Non-pipe lines
	    that directly follow the table (before the blank line) are kept.

	    If no table row is found, falls back to the text with code-fence markers
	    removed, blank lines dropped, and lines starting with a few known
	    explanatory phrases dropped.

	    Args:
	        text: Raw text produced by the model.

	    Returns:
	        str: The extracted table, or the cleaned fallback text.
	    """
	    table_lines = []
	    in_table = False
	    for raw_line in text.strip().split('\n'):
	        line = raw_line.strip()
	        is_fence = line.startswith('```')
	        # A table row is any non-fence line containing a literal pipe.
	        if '|' in line and not is_fence:
	            in_table = True
	            table_lines.append(line)
	        elif in_table and line == '':
	            # A blank line after the table is treated as its end.
	            break
	        elif in_table and not is_fence:
	            # Keep collecting lines that belong to the table section.
	            table_lines.append(line)

	    if table_lines:
	        return '\n'.join(table_lines)

	    # No table found: strip fence markers and explanatory sentences instead.
	    unfenced = text.replace('```markdown', '').replace('```', '').strip()
	    explanation_prefixes = ('这个表格', '该表格', '表格显示')
	    kept = []
	    for raw_line in unfenced.split('\n'):
	        line = raw_line.strip()
	        if line and not line.startswith(explanation_prefixes):
	            kept.append(line)
	    return '\n'.join(kept)

	def clean_formula_output(text):
	    """Reduce raw model output to the lines that look like formulas.

	    A stripped, non-empty line is kept when it does not start with one of the
	    known explanatory phrases and either contains a LaTeX-style marker
	    (``$``, backslash, braces, ``^``, ``_``, ``=``) or contains a digit or a
	    basic math/bracket character.

	    If nothing qualifies, the fallback removes code-fence markers and returns
	    every non-empty line that does not start with an explanatory phrase.

	    Args:
	        text: Raw text produced by the model.

	    Returns:
	        str: The kept lines joined with newlines.
	    """
	    narrative_prefixes = ('这个公式', '该公式', '公式表示', '根据图片', '图片中的')
	    latex_markers = ('$', '\\', '{', '}', '^', '_', '=')
	    math_chars = '+-*/=()[]{}^_'

	    def looks_like_formula(candidate):
	        # LaTeX syntax first, then plain arithmetic expressions.
	        if any(marker in candidate for marker in latex_markers):
	            return True
	        return any(ch.isdigit() or ch in math_chars for ch in candidate)

	    stripped_lines = [piece.strip() for piece in text.strip().split('\n')]
	    formulas = [
	        entry
	        for entry in stripped_lines
	        if entry
	        and not entry.startswith(narrative_prefixes + ('识别结果',))
	        and looks_like_formula(entry)
	    ]
	    if formulas:
	        return '\n'.join(formulas)

	    # Nothing formula-like: return the de-fenced text minus explanations.
	    unfenced = text.replace('```latex', '').replace('```', '').strip()
	    leftovers = [
	        entry
	        for entry in (piece.strip() for piece in unfenced.split('\n'))
	        if entry and not entry.startswith(narrative_prefixes)
	    ]
	    return '\n'.join(leftovers)

	def clean_text_output(text):
	    """Reduce raw model output to the recognised text content.

	    Code-fence markers are removed first. Then, line by line:

	    * blank lines and lines starting with a known explanatory phrase are
	      dropped;
	    * a short leading label such as ``标题：`` or ``标题:`` (full-width or ASCII
	      colon, label of at most 10 characters containing a label keyword) is
	      stripped and only the content after it is kept; a label with nothing
	      after it is dropped;
	    * every other line is kept unchanged.

	    Args:
	        text: Raw text produced by the model.

	    Returns:
	        str: The kept lines joined with newlines.
	    """
	    narrative_prefixes = (
	        '图片中的文字', '识别结果', '文字内容', '根据图片', '这张图片', '该图片',
	    )
	    label_keywords = (
	        '标题', '正文', '内容', '文本', '题目', '段落', '文字', '主题', '副标题',
	    )
	    max_label_length = 10  # labels are usually very short

	    unfenced = text.replace('```text', '').replace('```', '').strip()
	    kept = []
	    for raw_line in unfenced.split('\n'):
	        line = raw_line.strip()
	        if not line or line.startswith(narrative_prefixes):
	            continue

	        # Treat both colon forms alike so that "标题: x" keeps "x" instead of
	        # being discarded; the earliest colon delimits the candidate label.
	        colon_positions = [
	            pos for pos in (line.find('：'), line.find(':')) if pos != -1
	        ]
	        if colon_positions:
	            split_at = min(colon_positions)
	            label = line[:split_at]
	            is_label = len(label) <= max_label_length and any(
	                keyword in label for keyword in label_keywords
	            )
	            if is_label:
	                content = line[split_at + 1:].strip()
	                if content:
	                    kept.append(content)
	                continue

	        kept.append(line)

	    return '\n'.join(kept)

	@spaces.GPU
	def parse_image(image, parse_type):
	    """Run the model on an image and return the cleaned result plus a status.

	    Args:
	        image: A file path, an object exposing ``convert`` (e.g. a PIL image),
	            or ``None`` when nothing was uploaded.
	        parse_type: One of "表格解析", "公式解析", "文本解析". Any other value
	            uses the table prompt and returns the raw, stripped model output.

	    Returns:
	        tuple[str, str]: ``(result_text, status_text)``. Failures are reported
	        through the returned strings rather than raised, so the UI always
	        receives two values.
	    """
	    # Validate the input first so an empty submission never triggers a
	    # model load.
	    if image is None:
	        return "请上传一张图片", ""

	    try:
	        # Make sure the model is available (loaded on first use).
	        vision_model, text_tokenizer = load_model()

	        # Normalise the input to an RGB image.
	        if isinstance(image, str):
	            with Image.open(image) as opened:
	                image = opened.convert('RGB')
	        elif hasattr(image, 'convert'):
	            image = image.convert('RGB')

	        # Prompt per parse type; unknown types fall back to the table prompt.
	        questions = {
	            "表格解析": "解析一下这个表格为markdown格式,不需要任何解释和思考,直接输出markdown格式",
	            "公式解析": "识别并提取图片中的数学公式，用LaTeX格式输出，不需要任何解释，直接输出公式",
	            "文本解析": "识别并提取图片中的所有文字内容，保持原有格式，不需要任何解释，直接输出文字内容"
	        }
	        question = questions.get(parse_type, questions["表格解析"])
	        msgs = [{'role': 'user', 'content': [image, question]}]

	        # Request streamed output and concatenate every chunk.
	        stream = vision_model.chat(
	            msgs=msgs,
	            tokenizer=text_tokenizer,
	            sampling=True,
	            stream=True
	        )
	        generated_text = "".join(stream)

	        # Post-process according to the requested parse type.
	        cleaners = {
	            "表格解析": (clean_markdown_output, "Markdown"),
	            "公式解析": (clean_formula_output, "LaTeX"),
	            "文本解析": (clean_text_output, "纯文本"),
	        }
	        if parse_type in cleaners:
	            cleaner, output_format = cleaners[parse_type]
	            result = cleaner(generated_text)
	        else:
	            result = generated_text.strip()
	            output_format = "原始输出"

	        return result, f"解析完成 - 输出格式: {output_format}"

	    except Exception as e:
	        # UI boundary: surface the failure as text instead of crashing.
	        return f"解析失败: {str(e)}", "错误"

	def create_interface():
	    """Build and return the Gradio Blocks UI.

	    Layout: an intro Markdown section, a row with an input column (image,
	    parse-type radio, run button) and an output column (status and result
	    textboxes), an examples row, and a footer. The button click is wired to
	    ``parse_image``.

	    Returns:
	        The constructed ``gr.Blocks`` instance (not yet launched).
	    """
	    # Custom CSS for the page.
	    page_css = """
	.gradio-container {
	font-family: 'Helvetica Neue', Arial, sans-serif;
	}
	.output-text {
	font-family: 'Courier New', monospace;
	font-size: 14px;
	}
	"""

	    intro_markdown = """
	# 🚀 MiniCPM 多模态内容解析工具

	基于MiniCPM-o多模态模型的智能图片内容解析工具，支持表格、公式、文本三种解析模式。

	## 📋 使用说明
	1. 上传图片: 支持 PNG、JPG、JPEG 等格式
	2. 选择解析类型: 根据图片内容选择相应的解析模式
	3. 获取结果: 自动清理输出，获得纯净的解析结果

	## 🎯 解析类型说明
	- 📊 表格解析: 将表格图片转换为Markdown格式
	- 🧮 公式解析: 识别数学公式并输出LaTeX格式
	- 📝 文本解析: 提取图片中的所有文字内容
	"""

	    footer_markdown = """
	---
	### 💡 使用提示
	- 确保图片清晰，内容结构明显
	- 复杂表格建议分段处理
	- 公式图片建议使用高分辨率
	- 文字图片避免模糊、倾斜或光线不足

	### 🔧 技术支持
	- 模型: MiniCPM-o-2.6
	- 框架: Gradio + Transformers
	- GPU: CUDA加速推理
	"""

	    parse_modes = ["表格解析", "公式解析", "文本解析"]
	    sample_rows = [
	        ["./table.png", "表格解析"],
	        ["./formulas.png", "公式解析"],
	        ["./text.png", "文本解析"]
	    ]

	    with gr.Blocks(
	        title="MiniCPM 多模态内容解析工具",
	        css=page_css,
	        analytics_enabled=False,
	    ) as demo:
	        gr.Markdown(intro_markdown)

	        with gr.Row():
	            # Input widgets.
	            with gr.Column(scale=1):
	                picture_input = gr.Image(
	                    type="pil",
	                    label="📷 上传图片",
	                    height=400,
	                )
	                mode_selector = gr.Radio(
	                    label="🎛️ 选择解析类型",
	                    choices=parse_modes,
	                    value="表格解析",
	                    info="根据图片内容选择合适的解析模式",
	                )
	                run_button = gr.Button(
	                    "🔍 开始解析",
	                    size="lg",
	                    variant="primary",
	                )

	            # Output widgets.
	            with gr.Column(scale=1):
	                status_box = gr.Textbox(
	                    value="等待上传图片...",
	                    label="📊 解析状态",
	                    interactive=False,
	                )
	                result_box = gr.Textbox(
	                    label="📄 解析结果",
	                    placeholder="解析结果将在这里显示...",
	                    lines=20,
	                    max_lines=30,
	                    interactive=True,
	                    show_copy_button=True,
	                    elem_classes=["output-text"],
	                )

	        # Example images.
	        gr.Markdown("## 📖 示例图片")
	        with gr.Row():
	            gr.Examples(
	                examples=sample_rows,
	                inputs=[picture_input, mode_selector],
	                cache_examples=False,
	                label="点击示例快速体验",
	            )

	        # Event binding: result text first, status second.
	        run_button.click(
	            fn=parse_image,
	            inputs=[picture_input, mode_selector],
	            outputs=[result_box, status_box],
	        )

	        # Footer.
	        gr.Markdown(footer_markdown)

	    return demo

	if __name__ == "__main__":
	    # The model is not preloaded here; it is loaded on first use.
	    print(
	        "🚀 启动MiniCPM多模态内容解析工具",
	        "📝 模型将在首次使用时自动加载",
	        sep="\n",
	    )

	    # Build the UI, then start the server.
	    interface = create_interface()
	    interface.launch(
	        share=False,            # no public share link
	        debug=False,            # debug mode off
	        quiet=False,            # keep startup output
	        show_error=True,        # show detailed errors in the UI
	        server_port=7860,       # port the app listens on
	        server_name="0.0.0.0",  # listen on all interfaces
	    )