zRzRzRzRzRzRzR committed
Commit a735de3 · 1 Parent(s): f0ae490
Files changed (1)
  1. app.py +143 -100
app.py CHANGED
@@ -7,8 +7,14 @@ import re
 import argparse
 import copy
 import spaces
+import fitz
+import subprocess
+import tempfile
+import os
+import time

-MODEL_PATH = "/model/glm-4v-9b-0529"
+MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
+stop_generation = False


 class GLM4VModel:
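Note: `fitz` is the import name of PyMuPDF, which the new PDF helper below relies on, and the PPT/PPTX path shells out to LibreOffice. A minimal startup check, sketched here as a hypothetical helper (not part of the commit):

import shutil

def check_deps():
    # PyMuPDF is imported as "fitz"; a missing module means `pip install pymupdf`.
    try:
        import fitz  # noqa: F401
    except ImportError:
        return False, "PyMuPDF (fitz) is not installed"
    # PPT/PPTX conversion shells out to LibreOffice, so it must be on PATH.
    if shutil.which("libreoffice") is None:
        return False, "libreoffice not found on PATH"
    return True, ""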
@@ -32,6 +38,26 @@ class GLM4VModel:
     def _wrap_text(self, t):
         return [{"type": "text", "text": t}]

+    def _pdf_to_imgs(self, pdf_path):
+        doc = fitz.open(pdf_path)
+        imgs = []
+        for i in range(doc.page_count):
+            pix = doc.load_page(i).get_pixmap(dpi=180)
+            img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
+            pix.save(img_p)
+            imgs.append(img_p)
+        doc.close()
+        return imgs
+
+    def _ppt_to_imgs(self, ppt_path):
+        tmp = tempfile.mkdtemp()
+        subprocess.run(
+            ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, ppt_path],
+            check=True,
+        )
+        pdf_path = os.path.join(tmp, Path(ppt_path).stem + ".pdf")
+        return self._pdf_to_imgs(pdf_path)
+
     def _files_to_content(self, media):
         out = []
         for f in media or []:
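Note: `_pdf_to_imgs` rasterizes every page at 180 dpi into the system temp directory, and `_ppt_to_imgs` reuses it after a LibreOffice PPT-to-PDF conversion. A standalone sketch for trying the conversion outside the Space (the file name is a placeholder):

import os, tempfile
from pathlib import Path
import fitz  # PyMuPDF

def pdf_to_imgs(pdf_path, dpi=180):
    # One PNG per page, written to the temp dir, mirroring GLM4VModel._pdf_to_imgs.
    doc = fitz.open(pdf_path)
    imgs = []
    for i in range(doc.page_count):
        pix = doc.load_page(i).get_pixmap(dpi=dpi)
        img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
        pix.save(img_p)
        imgs.append(img_p)
    doc.close()
    return imgs

print(pdf_to_imgs("slides.pdf"))  # e.g. ['/tmp/slides_0.png', '/tmp/slides_1.png', ...]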
@@ -40,94 +66,70 @@ class GLM4VModel:
             out.append({"type": "video", "url": f.name})
         elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
             out.append({"type": "image", "url": f.name})
+        elif ext in [".ppt", ".pptx"]:
+            for p in self._ppt_to_imgs(f.name):
+                out.append({"type": "image", "url": p})
+        elif ext == ".pdf":
+            for p in self._pdf_to_imgs(f.name):
+                out.append({"type": "image", "url": p})
         return out

-    # -----------------------------------------------------------
-    # 🖼️ Output formatting
-    # -----------------------------------------------------------
-    def _format_output(self, txt):
-        """Called once when generation has fully finished."""
-        think_pat, ans_pat = r"<think>(.*?)</think>", r"<answer>(.*?)</answer>"
-        think = re.findall(think_pat, txt, re.DOTALL)
-        ans = re.findall(ans_pat, txt, re.DOTALL)
-        html = ""
-        if think:
-            html += (
-                "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking Process</summary>"
-                "<div style='color:#cccccc;line-height:1.4;'>"
-                + think[0].strip()
-                + "</div></details><br>"
-            )
-        body = ans[0] if ans else re.sub(think_pat, "", txt, flags=re.DOTALL)
-        html += f"<div style='color:#ffffff;'>{body.strip()}</div>"
-        return html
-
     def _stream_fragment(self, buf: str) -> str:
         think_html = ""
         if "<think>" in buf:
             if "</think>" in buf:
-                think_content = re.search(r"<think>(.*?)</think>", buf, re.DOTALL)
-                if think_content:
+                seg = re.search(r"<think>(.*?)</think>", buf, re.DOTALL)
+                if seg:
                     think_html = (
-                        "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking Process</summary>"
-                        "<div style='color:#cccccc;line-height:1.4;'>"
-                        + think_content.group(1).strip()
-                        + "</div></details><br>"
+                        "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking Process</summary>"
+                        "<div style='color:#cccccc;line-height:1.4;padding:10px;border-left:3px solid #666;margin:5px 0;background-color:rgba(128,128,128,0.1);'>"
+                        + seg.group(1).strip().replace("\n", "<br>")
+                        + "</div></details>"
                     )
             else:
-                partial = buf.split("<think>", 1)[1]
+                part = buf.split("<think>", 1)[1]
                 think_html = (
-                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking Process</summary>"
-                    "<div style='color:#cccccc;line-height:1.4;'>" + partial
+                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking Process</summary>"
+                    "<div style='color:#cccccc;line-height:1.4;padding:10px;border-left:3px solid #666;margin:5px 0;background-color:rgba(128,128,128,0.1);'>"
+                    + part.replace("\n", "<br>")
+                    + "</div></details>"
                 )

         answer_html = ""
         if "<answer>" in buf:
             if "</answer>" in buf:
-                ans_content = re.search(r"<answer>(.*?)</answer>", buf, re.DOTALL)
-                if ans_content:
-                    answer_html = (
-                        "<div style='color:#ffffff;'>" + ans_content.group(1).strip() + "</div>"
-                    )
+                seg = re.search(r"<answer>(.*?)</answer>", buf, re.DOTALL)
+                if seg:
+                    answer_html = seg.group(1).strip()
             else:
-                partial = buf.split("<answer>", 1)[1]
-                answer_html = "<div style='color:#ffffff;'>" + partial
+                answer_html = buf.split("<answer>", 1)[1]

         if not think_html and not answer_html:
             return self._strip_html(buf)
-
         return think_html + answer_html

-    def _build_messages(self, hist, sys_prompt):
+    def _build_messages(self, raw_hist, sys_prompt):
         msgs = []

         if sys_prompt.strip():
-            msgs.append({
-                "role": "system",
-                "content": [{"type": "text", "text": sys_prompt.strip()}]
-            })
+            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})

-        for h in hist:
+        for h in raw_hist:
             if h["role"] == "user":
-                payload = h.get("file_info") or self._wrap_text(
-                    self._strip_html(h["content"])
-                )
-                msgs.append({"role": "user", "content": payload})
-
+                msgs.append({"role": "user", "content": h["content"]})
             else:
                 raw = h["content"]
                 raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
                 raw = re.sub(r"<details.*?</details>", "", raw, flags=re.DOTALL)
                 clean = self._strip_html(raw).strip()
-
                 msgs.append({"role": "assistant", "content": self._wrap_text(clean)})
-
         return msgs

     @spaces.GPU(duration=240)
-    def stream_generate(self, hist, sys_prompt):
-        msgs = self._build_messages(hist, sys_prompt)
-        print(msgs)
+    def stream_generate(self, raw_hist, sys_prompt):
+        global stop_generation
+        stop_generation = False
+        msgs = self._build_messages(raw_hist, sys_prompt)
         inputs = self.processor.apply_chat_template(
             msgs,
             tokenize=True,
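Note: `_stream_fragment` is re-run on the whole buffer at every token, so a partially streamed <think> block renders as an open <details> element and is replaced by the complete block once the closing tag arrives; a closed <answer> is returned as bare text. An illustrative rehearsal, assuming `m` is an initialized GLM4VModel instance:

buf = ""
for tok in ["<think>compare ", "the inputs</think>", "<answer>They match.", "</answer>"]:
    buf += tok
    print(m._stream_fragment(buf))
# prints 1-2: the "Thinking Process" <details> block (partial, then complete)
# prints 3-4: the details block followed by the partial, then final, answer text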
@@ -137,9 +139,7 @@ class GLM4VModel:
             padding=True,
         ).to(self.device)

-        streamer = TextIteratorStreamer(
-            self.processor.tokenizer, skip_prompt=True, skip_special_tokens=False
-        )
+        streamer = TextIteratorStreamer(self.processor.tokenizer, skip_prompt=True, skip_special_tokens=False)
         gen_args = dict(
             inputs,
             max_new_tokens=8192,
@@ -150,13 +150,46 @@ class GLM4VModel:
             top_p=1e-5,
             streamer=streamer,
         )
-        threading.Thread(target=self.model.generate, kwargs=gen_args).start()
+
+        generation_thread = threading.Thread(target=self.model.generate, kwargs=gen_args)
+        generation_thread.start()

         buf = ""
         for tok in streamer:
+            if stop_generation:
+                break
             buf += tok
             yield self._stream_fragment(buf)
-        yield self._format_output(buf)
+
+        generation_thread.join()
+
+
+def format_display_content(content):
+    if isinstance(content, list):
+        text_parts = []
+        file_count = 0
+        for item in content:
+            if item["type"] == "text":
+                text_parts.append(item["text"])
+            else:
+                file_count += 1
+
+        display_text = " ".join(text_parts)
+        if file_count > 0:
+            return f"[{file_count} file(s) uploaded]\n{display_text}"
+        return display_text
+    return content
+
+
+def create_display_history(raw_hist):
+    display_hist = []
+    for h in raw_hist:
+        if h["role"] == "user":
+            display_content = format_display_content(h["content"])
+            display_hist.append({"role": "user", "content": display_content})
+        else:
+            display_hist.append({"role": "assistant", "content": h["content"]})
+    return display_hist


 glm4v = GLM4VModel()
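Note: generation runs on a background thread while the request thread drains the TextIteratorStreamer, and the new `generation_thread.join()` keeps the worker from outliving the request. Breaking on `stop_generation` only stops yielding; `model.generate` still runs to completion, which is why the join matters. A self-contained sketch of the pattern with a small stand-in model (names illustrative):

import threading
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("Hello", return_tensors="pt")

streamer = TextIteratorStreamer(tok, skip_prompt=True)
t = threading.Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=20))
t.start()
text = "".join(streamer)  # iterating blocks until generation finishes
t.join()                  # wait for the worker before returning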
@@ -164,27 +197,35 @@ glm4v.load()


 def check_files(files):
-    vids = imgs = 0
+    vids = imgs = ppts = pdfs = 0
     for f in files or []:
         ext = Path(f.name).suffix.lower()
         if ext in [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"]:
             vids += 1
         elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
             imgs += 1
-    if vids > 1:
-        return False, "Only 1 video allowed"
+        elif ext in [".ppt", ".pptx"]:
+            ppts += 1
+        elif ext == ".pdf":
+            pdfs += 1
+    if vids > 1 or ppts > 1 or pdfs > 1:
+        return False, "Only one video, PPT, or PDF allowed"
     if imgs > 10:
-        return False, "Max 10 images"
-    if vids and imgs:
-        return False, "Cannot mix video and images"
+        return False, "Maximum 10 images allowed"
+    if (ppts or pdfs) and (vids or imgs) or (vids and imgs):
+        return False, "Cannot mix documents, videos, and images"
     return True, ""


-def chat(files, msg, hist, sys_prompt):
+def chat(files, msg, raw_hist, sys_prompt):
+    global stop_generation
+    stop_generation = False
+
     ok, err = check_files(files)
     if not ok:
-        hist.append({"role": "assistant", "content": err})
-        yield copy.deepcopy(hist), None, ""
+        raw_hist.append({"role": "assistant", "content": err})
+        display_hist = create_display_history(raw_hist)
+        yield display_hist, copy.deepcopy(raw_hist), None, ""
         return

     payload = glm4v._files_to_content(files) if files else None
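Note: the upload rules are now: at most one video, one PPT, or one PDF; at most ten images; and documents cannot be mixed with videos or images. Quick sanity checks (SimpleNamespace stands in for Gradio's file objects, which expose a .name path):

from types import SimpleNamespace as F

assert check_files([F(name="a.pdf")])[0] is True
assert check_files([F(name=f"{i}.png") for i in range(11)])[0] is False  # > 10 images
assert check_files([F(name="a.pdf"), F(name="b.png")])[0] is False       # document + image
assert check_files([F(name="a.mp4"), F(name="b.png")])[0] is False       # video + image
assert check_files([F(name="a.pdf"), F(name="b.pdf")])[0] is False       # two PDFs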
@@ -194,24 +235,33 @@ def chat(files, msg, hist, sys_prompt):
     else:
         payload.append({"type": "text", "text": msg.strip()})

-    display = f"[{len(files)} file(s) uploaded]\n{msg}" if files else msg
-    user_rec = {"role": "user", "content": display}
-    if payload:
-        user_rec["file_info"] = payload
-    hist.append(user_rec)
+    user_rec = {"role": "user", "content": payload if payload else msg.strip()}
+    if raw_hist is None:
+        raw_hist = []
+    raw_hist.append(user_rec)

     place = {"role": "assistant", "content": ""}
-    hist.append(place)
-    yield copy.deepcopy(hist), None, ""
+    raw_hist.append(place)
+
+    display_hist = create_display_history(raw_hist)
+    yield display_hist, copy.deepcopy(raw_hist), None, ""

-    for chunk in glm4v.stream_generate(hist[:-1], sys_prompt):
+    for chunk in glm4v.stream_generate(raw_hist[:-1], sys_prompt):
+        if stop_generation:
+            break
         place["content"] = chunk
-        yield copy.deepcopy(hist), None, ""
-    yield copy.deepcopy(hist), None, ""
+        display_hist = create_display_history(raw_hist)
+        yield display_hist, copy.deepcopy(raw_hist), None, ""
+
+    display_hist = create_display_history(raw_hist)
+    yield display_hist, copy.deepcopy(raw_hist), None, ""


 def reset():
-    return [], None, ""
+    global stop_generation
+    stop_generation = True
+    time.sleep(0.1)
+    return [], [], None, ""


 css = """.chatbot-container .message-wrap .message{font-size:14px!important}
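Note: chat() now yields a 4-tuple (display history, raw history, cleared upload box, cleared textbox). The raw history keeps the full multimodal payload for the model, while create_display_history flattens it for the Chatbot. For example:

raw = [
    {"role": "user", "content": [
        {"type": "image", "url": "/tmp/slides_0.png"},
        {"type": "text", "text": "Summarize this page"},
    ]},
    {"role": "assistant", "content": "It introduces the model."},
]
create_display_history(raw)
# [{'role': 'user', 'content': '[1 file(s) uploaded]\nSummarize this page'},
#  {'role': 'assistant', 'content': 'It introduces the model.'}]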
@@ -222,14 +272,16 @@ demo = gr.Blocks(title="GLM-4.1V Chat", theme=gr.themes.Soft(), css=css)
 with demo:
     gr.Markdown("""
     <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
-        GLM-4.1V-9B Gradio Space🤗
+        GLM-4.1V-9B-Thinking Gradio Space🤗
     </div>
     <div style="text-align: center;">
         <a href="https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking">🤗 Model Hub</a> |
-        <a href="https://github.com/THUDM/CogVLM">🌐 Github</a> |
-        <a href="https://arxiv.org/abs/">📜 arxiv</a>
+        <a href="https://github.com/THUDM/GLM-4.1V-Thinking">🌐 Github</a>
     </div>
     """)
+
+    raw_history = gr.State([])
+
     with gr.Row():
         with gr.Column(scale=7):
             chatbox = gr.Chatbot(
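Note: raw_history lives in a gr.State, a per-session value that round-trips through event handlers without being rendered. A minimal sketch of the same pattern:

import gradio as gr

with gr.Blocks() as state_demo:
    state = gr.State([])  # per-session list, never displayed
    btn = gr.Button("add")
    out = gr.Textbox()

    def bump(hist):
        hist = hist + ["tick"]  # return a new list; Gradio stores it back into state
        return hist, str(len(hist))

    btn.click(bump, inputs=[state], outputs=[state, out])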
@@ -246,27 +298,18 @@ with demo:
             up = gr.File(
                 label="📁 Upload",
                 file_count="multiple",
-                file_types=["image", "video"],
+                file_types=["file"],
                 type="filepath",
             )
-            gr.Markdown("""
-            <span style="color:red">Please upload the Bay image before entering text.</span>
-            """)
+            gr.Markdown("Supports images / videos / PPT / PDF")
+            gr.Markdown(
+                "The maximum supported input is 10 images or 1 video/PPT/PDF. Video and images cannot be mixed in the same conversation."
+            )
             sys = gr.Textbox(label="⚙️ System Prompt", lines=6)

-    send.click(chat, inputs=[up, textbox, chatbox, sys], outputs=[chatbox, up, textbox])
-    textbox.submit(chat, inputs=[up, textbox, chatbox, sys], outputs=[chatbox, up, textbox])
-    clear.click(reset, outputs=[chatbox, up, textbox])
+    send.click(chat, inputs=[up, textbox, raw_history, sys], outputs=[chatbox, raw_history, up, textbox])
+    textbox.submit(chat, inputs=[up, textbox, raw_history, sys], outputs=[chatbox, raw_history, up, textbox])
+    clear.click(reset, outputs=[chatbox, raw_history, up, textbox])

 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--host", type=str, default="0.0.0.0")
-    parser.add_argument("--share", action="store_true")
-    args = parser.parse_args()
-
-    demo.launch(
-        server_port=args.port,
-        server_name=args.host,
-        share=args.share,
-    )
+    demo.launch()
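Note: each event's outputs list must stay aligned with the 4-tuple that chat() yields: [chatbox, raw_history, up, textbox] receives (display_hist, raw_hist, None, ""). With argparse gone, the Space relies on Gradio's defaults; if local host/port control is still wanted, a hedged equivalent (not in the commit) would be:

import os

demo.launch(
    server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
    server_port=int(os.environ.get("GRADIO_SERVER_PORT", "8000")),
)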
 