Waqas167 committed
Commit 9b548a6 · verified · Parent: 54a79f1

Upload 3 files

Files changed (3):
  1. app.py +513 -0
  2. packages.txt +2 -0
  3. requirements (4).txt +13 -0
app.py ADDED
@@ -0,0 +1,513 @@
+ import io, os, numpy as np, streamlit as st, librosa, torch, soundfile as sf
+ from transformers import AutoProcessor, Wav2Vec2ForCTC
+ from pydub import AudioSegment
+ from moviepy.editor import VideoFileClip
+ from google import genai
+ from google.genai import types
+
+ # ✅ programmatic Start/Stop mic (no WebRTC)
+ from streamlit_mic_recorder import mic_recorder
+
+ # ---------------- Config ----------------
+ st.set_page_config(page_title="Urdu Speech Analyzer", page_icon="🎙️", layout="wide")
+ PAGE_TITLE = "🎙️ Urdu Audio & Video Speech Analyzer"
+ model_id = "facebook/mms-1b-l1107"
+ lang_code = "urd-script_arabic"
+ # Read the Gemini key from the environment (or export it from Streamlit secrets);
+ # never hard-code credentials in source.
+ api_key = os.environ.get("GEMINI_API_KEY", "")
+
+ # ---------------- Model ----------------
+ @st.cache_resource
+ def load_model_and_processor():
+     processor = AutoProcessor.from_pretrained(model_id, target_lang=lang_code)
+     model = Wav2Vec2ForCTC.from_pretrained(
+         model_id, target_lang=lang_code, ignore_mismatched_sizes=True
+     )
+     model.load_adapter(lang_code)
+     return processor, model
+
+ processor, model = load_model_and_processor()
+
+ # ---------------- Helpers ----------------
+ def get_wav_from_input(file_path, output_path="converted.wav"):
+     """Extract/convert any supported input to 16 kHz mono WAV."""
+     ext = os.path.splitext(file_path)[-1].lower()
+     if ext in [".mp4", ".mkv", ".avi", ".mov"]:
+         video = VideoFileClip(file_path)
+         video.audio.write_audiofile(output_path, fps=16000)
+     elif ext in [".mp3", ".aac", ".flac", ".ogg", ".m4a", ".wav"]:
+         audio = AudioSegment.from_file(file_path)
+         audio = audio.set_frame_rate(16000).set_channels(1)
+         audio.export(output_path, format="wav")
+     else:
+         raise ValueError("Unsupported file format.")
+     return output_path
+
+ def save_wav_resampled(audio_f32: np.ndarray, sr_in: int, path: str):
+     """Resample to 16 kHz, peak-normalize, and write a mono float32 WAV."""
+     if sr_in != 16000:
+         audio_f32 = librosa.resample(audio_f32, orig_sr=sr_in, target_sr=16000)
+     audio_f32 = librosa.util.normalize(audio_f32)
+     sf.write(path, audio_f32.astype(np.float32), 16000)
+
+ def transcribe(wav_path) -> str:
+     audio, sr = librosa.load(wav_path, sr=16000, mono=True)
+     inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = model(**inputs).logits
+     pred_ids = torch.argmax(logits, dim=-1)
+     return processor.batch_decode(pred_ids)[0]
+
+ def analyze_transcript(transcript: str) -> str:
+     client = genai.Client(api_key=api_key)
+     system_instr = """
+ You are a speech analyst. The following transcription is in Urdu and contains no punctuation — your first task is to correct the transcript by segmenting it into grammatically correct sentences.
+
+ Then:
+ 1. Translate the corrected Urdu transcript into English.
+ 2. Determine whether the transcript involves a single speaker or multiple speakers.
+ 3. If multiple speakers are detected, perform diarization by segmenting the transcript with clear speaker labels.
+
+ ⚠️ Format the segmented transcript *exactly* like this:
+
+ **Segmented Transcript**
+
+ **Urdu:**
+ Person 01:
+ [Urdu line here]
+
+ Person 02:
+ [Urdu line here]
+
+ ...
+
+ **English:**
+ Person 01:
+ [English line here]
+
+ Person 02:
+ [English line here]
+
+ ...
+
+ After that, provide your analysis in the following format:
+
+ **Speaker-wise Analysis**
+ [One or two sentences per speaker about tone, emotion, behavior]
+
+ **Sentiment and Communication Style**
+ [Concise overall tone: e.g., friendly, formal, tense, etc.]
+
+ **Summary of Discussion**
+ [A 2–3 line summary of what the speakers talked about, in English]
+ """
+     resp = client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=[transcript],
+         config=types.GenerateContentConfig(system_instruction=system_instr, temperature=0.0)
+     )
+     return resp.text
+
+ def format_transcript_block(text: str) -> str:
+     """Bold the 'Person NN:' labels Gemini emits; pass other lines through."""
+     out = ""
+     for chunk in text.split("Person "):
+         chunk = chunk.strip()
+         if not chunk:
+             continue
+         # Any two-digit speaker label ("01:", "02:", "03:", ...) starts a speaker turn.
+         if len(chunk) > 3 and chunk[:2].isdigit() and chunk[2] == ":":
+             out += f"\n**Person {chunk[:2]}**:\n{chunk[3:].strip()}\n\n"
+         else:
+             out += f"{chunk}\n\n"
+     return out
+
+ def split_report(report: str):
+     """Split Gemini's reply into segmented Urdu, segmented English, and the analysis tail."""
+     segmented_urdu = segmented_english = analysis_only = ""
+     if "Urdu:" in report and "English:" in report:
+         u0 = report.find("Urdu:")
+         e0 = report.find("English:")
+         segmented_urdu = report[u0 + len("Urdu:"):e0].strip()
+         english_section = report[e0 + len("English:"):].strip()
+         if "**Speaker-wise Analysis**" in english_section:
+             parts = english_section.split("**Speaker-wise Analysis**")
+             segmented_english = parts[0].strip()
+             analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
+         else:
+             segmented_english = english_section
+             analysis_only = "⚠️ Could not extract structured analysis."
+     return segmented_urdu, segmented_english, analysis_only
+
+ def show_results(segmented_urdu: str, segmented_english: str, analysis_only: str):
+     """Render the side-by-side transcript columns and the analysis summary."""
+     if segmented_urdu or segmented_english:
+         st.markdown("### 🗣️ Segmented Transcript")
+         c1, c2 = st.columns(2)
+         with c1:
+             st.markdown("#### Urdu")
+             st.markdown(format_transcript_block(segmented_urdu) if segmented_urdu else "_(none)_")
+         with c2:
+             st.markdown("#### English")
+             st.markdown(format_transcript_block(segmented_english) if segmented_english else "_(none)_")
+     if analysis_only:
+         st.markdown("### 🧠 Gemini Analysis Summary")
+         st.markdown(analysis_only)
+
+ # ---------------- Header ----------------
+ st.markdown(f"""
+ <div style="text-align: left; padding-bottom: 1rem;">
+     <h1 style='color:#1f77b4; font-size: 2.5em; font-weight: 800; margin-bottom: 0.2em;'>
+         {PAGE_TITLE}
+     </h1>
+     <p style='color: #7c8a98; font-size: 1.05em; margin-top: 0;'>
+         Record or upload Urdu speech for structured transcription, diarization, and smart AI analysis.
+     </p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # ================= Mic: true Start/Stop + narrow Analyze =================
+ st.markdown("### 🎤 Live recording")
+
+ # The component renders Start and Stop buttons and keeps recording until Stop is pressed.
+ rec = mic_recorder(
+     start_prompt="▶️ Start",
+     stop_prompt="⏹️ Stop",
+     just_once=False,  # allow multiple recordings in a session
+     key="recorder",
+     format="wav"      # returns WAV bytes
+ )
+
+ # `rec` is populated after Stop. Different versions return bytes or a dict — handle both.
+ audio_bytes, sr_in = None, 44100
+ if rec is not None:
+     if isinstance(rec, dict) and "bytes" in rec:
+         audio_bytes = rec["bytes"]
+         sr_in = int(rec.get("sample_rate", 44100))
+     elif isinstance(rec, (bytes, bytearray)):
+         audio_bytes = rec
+         sr_in = 44100  # component default
+     else:
+         # Fallback in case a future version changes the payload shape.
+         audio_bytes = rec.get("audio") if isinstance(rec, dict) else None
+
+ if audio_bytes:
+     st.success("Audio captured.")
+     # Convert to mono float32.
+     data, sr_read = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=False)
+     if data.ndim > 1:
+         data = data.mean(axis=1)
+     if sr_read:  # prefer the rate embedded in the WAV header
+         sr_in = sr_read
+
+     # Save as 16 kHz mono for the model.
+     tmp_wav = "mic_recording.wav"
+     save_wav_resampled(data, sr_in, tmp_wav)
+
+     # Minimal playback (no waveform).
+     st.audio(audio_bytes, format="audio/wav")
+     st.caption(f"Duration: {data.size / sr_in:.2f} s")
+
+     # Slim Analyze button (not full width).
+     if st.button("🔍 Analyze", type="primary"):
+         with st.spinner("⏳ Transcribing & analyzing..."):
+             transcript = transcribe(tmp_wav)  # raw transcript is not displayed here
+             report = analyze_transcript(transcript)
+         show_results(*split_report(report))
+
+ st.markdown("---")
+
+ # ================= Upload =================
+ st.markdown("### 📂 Or upload an audio/video file")
+ uploaded_file = st.file_uploader(
+     label="Audio or video file",
+     type=["mp3", "mp4", "wav", "mkv", "aac", "ogg", "m4a", "flac"],
+     label_visibility="collapsed"
+ )
+ if uploaded_file is not None:
+     with st.spinner("⏳ Transcribing..."):
+         temp_path = f"temp_input{os.path.splitext(uploaded_file.name)[-1]}"
+         with open(temp_path, "wb") as f:
+             f.write(uploaded_file.read())
+         wav_path = get_wav_from_input(temp_path)
+         transcript = transcribe(wav_path)
+
+     with st.spinner("🔍 Analyzing with Gemini..."):
+         report = analyze_transcript(transcript)
+
+     show_results(*split_report(report))
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ffmpeg
+ libsndfile1
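(These apt packages back the Python audio stack: ffmpeg handles decoding for pydub and moviepy, and libsndfile1 backs soundfile.)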
requirements (4).txt ADDED
@@ -0,0 +1,13 @@
+ streamlit
+ torch==2.3.1
+ torchaudio==2.3.1
+ accelerate
+ datasets
+ transformers>=4.41.0
+ moviepy==1.0.3
+ imageio-ffmpeg
+ pydub
+ librosa
+ soundfile
+ google-genai
+ streamlit-mic-recorder
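
A minimal local-run sketch, assuming the three files above sit in one folder, that GEMINI_API_KEY (the variable name app.py reads) is set in the environment, and that the packages.txt entries are already installed on the host:

    pip install -r "requirements (4).txt"
    export GEMINI_API_KEY=...   # your own key; never commit it
    streamlit run app.py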