Update app.py
app.py CHANGED
@@ -149,12 +149,54 @@ class PodcastGenerator:
 
             add_log(f"✅ PDF extraction complete. Text length: {len(text)} characters")
             return text.strip()
-
+
         except Exception as e:
             error_msg = f"❌ PDF extraction failed: {str(e)}"
             add_log(error_msg)
             raise Exception(error_msg)
 
+    async def postprocess_conversation(self, raw_text: str) -> str:
+        """Run LLM again to enforce strict Speaker 1/2 format"""
+        prompt = f"""
+You are a podcast formatter.
+
+Take the following input conversation, and reformat it so that:
+- Every line begins with exactly `Speaker 1:` or `Speaker 2:` (with colon)
+- No timestamps, names, parentheses, or extra formatting
+- No blank lines
+- Do not invent or change the content
+
+Example output:
+Speaker 1: Hello and welcome.
+Speaker 2: Thanks! Glad to be here.
+
+Now format the following:
+{raw_text}
+"""
+
+        inputs = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=2048
+        )
+        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=1024,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id
+            )
+
+        formatted = self.tokenizer.decode(
+            outputs[0][inputs['input_ids'].shape[1]:],
+            skip_special_tokens=True
+        )
+        return formatted.strip()
+
+
     def clean_and_validate_json(self, text: str) -> Dict:
         """Improved JSON extraction and validation - CRITICAL FIX #4"""
         add_log("Attempting to extract JSON from generated text")

@@ -379,6 +421,12 @@ Speaker 2: ...
 
         add_log(f"Generated text length: {len(generated_text)} characters")
         add_log(f"Generated text preview: {generated_text[:2000]}...")
+
+        formatted_text = await self.postprocess_conversation(generated_text)
+        add_log(f"🧼 Post-processed text:\n{formatted_text[:2000]}")
+
+        # Proceed with parsing to JSON
+        generated_text = self.conversation_to_json(formatted_text)
 
         if progress:
             progress(0.4, "Processing generated script...")
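
The second hunk hands the reformatted transcript to `self.conversation_to_json(...)`, a helper that is not part of this diff. Purely as an illustration, a minimal standalone sketch of such a parser might look like the following, assuming the target is a simple dict of speaker/text turns (the actual method in app.py may use a different schema or key names):

import re
from typing import Dict

def conversation_to_json(formatted_text: str) -> Dict:
    """Hypothetical stand-in for the conversation_to_json helper referenced in the diff."""
    # Parse each "Speaker 1:" / "Speaker 2:" line produced by postprocess_conversation
    # into a list of {"speaker", "text"} turns. The real method in app.py may differ.
    turns = []
    for line in formatted_text.splitlines():
        match = re.match(r"^(Speaker [12]):\s*(.+)$", line.strip())
        if match:
            turns.append({"speaker": match.group(1), "text": match.group(2)})
    return {"conversation": turns}

Because postprocess_conversation forces every line to begin with exactly `Speaker 1:` or `Speaker 2:`, one regular expression per line is enough to recover the turn structure before the result is passed on to JSON validation.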