Spaces:

nvidia
/

canary-1b-flash

Running on Zero

App Files Community

erastorgueva-nv committed on Mar 18

Commit

f71078c

1 Parent(s): b43c4a1

styling, move frame_asr init to transcribe function

Browse files

Signed-off-by: Elena Rastorgueva <[email protected]>

Files changed (1) hide show

app.py +29 -20

app.py CHANGED Viewed

@@ -32,20 +32,6 @@ model.cfg.preprocessor.pad_to = 0
 feature_stride = model.cfg.preprocessor['window_stride']
 model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer
-frame_asr_10s = FrameBatchMultiTaskAED(
-	asr_model=model,
-	frame_len=10.0,
-	total_buffer=10.0,
-	batch_size=16,
-)
-frame_asr_40s = FrameBatchMultiTaskAED(
-	asr_model=model,
-	frame_len=40.0,
-	total_buffer=40.0,
-	batch_size=16,
-)
 amp_dtype = torch.float16
 def convert_audio(audio_filepath, tmpdir, utt_id):
@@ -139,16 +125,23 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
 			<html lang="en">
 			<head>
 				<style>
 					.transcript {
 						font-family: Arial, sans-serif;
 						line-height: 1.6;
 					}
 					.timestamp {
 						color: gray;
 						font-size: 0.8em;
 						margin-right: 5px;
 					}
 				</style>
 			</head>
 			<body>
@@ -160,8 +153,15 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
 			if duration < 10:
 				output = model.transcribe(manifest_filepath)
 			else:
 				output = get_buffered_pred_feat_multitaskAED(
-					frame_asr_10s,
 					model.cfg.preprocessor,
 					model_stride_in_secs,
 					model.device,
@@ -172,14 +172,14 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
 			# process output to get word and segment level timestamps
 			word_level_timestamps = output[0].timestamp["word"]
-			output_html += "<p><b>Transcript with word-level timestamps (in seconds)</b></p>\n"
 			output_html += "<div class='transcript'>\n"
 			for entry in word_level_timestamps:
 				output_html += f'<span>{entry["word"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
 			output_html += "</div>\n"
 			segment_level_timestamps = output[0].timestamp["segment"]
-			output_html += "<p><b>Transcript with segment-level timestamps (in seconds)</b></p>\n"
 			output_html += "<div class='transcript'>\n"
 			for entry in segment_level_timestamps:
 				output_html += f'<span>{entry["segment"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
@@ -191,8 +191,14 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
 				output = model.transcribe(manifest_filepath)
 			else: # do buffered inference
 				output = get_buffered_pred_feat_multitaskAED(
-					frame_asr_40s,
 					model.cfg.preprocessor,
 					model_stride_in_secs,
 					model.device,
@@ -200,7 +206,10 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc, gen_ts):
 					filepaths=None,
 				)
-			output_html += "<p><b>Transcript</b></p>\n"
 			output_text = output[0].text
 			output_html += f'<div class="transcript">{output_text}</div>\n'

 feature_stride = model.cfg.preprocessor['window_stride']
 model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer
 amp_dtype = torch.float16
 def convert_audio(audio_filepath, tmpdir, utt_id):
 			<html lang="en">
 			<head>
 				<style>
 					.transcript {
 						font-family: Arial, sans-serif;
 						line-height: 1.6;
+						margin: 20px 0;
 					}
 					.timestamp {
 						color: gray;
 						font-size: 0.8em;
 						margin-right: 5px;
 					}
+					.heading {
+						color: #2c3e50;
+						font-family: Arial, sans-serif;
+						font-weight: bold;
+						margin: 15px 0 8px 0;
+						border-bottom: 1px solid #eee;
+					}
 				</style>
 			</head>
 			<body>
 			if duration < 10:
 				output = model.transcribe(manifest_filepath)
 			else:
+				frame_asr = FrameBatchMultiTaskAED(
+					asr_model=model,
+					frame_len=10.0,
+					total_buffer=10.0,
+					batch_size=16,
+				)
 				output = get_buffered_pred_feat_multitaskAED(
+					frame_asr,
 					model.cfg.preprocessor,
 					model_stride_in_secs,
 					model.device,
 			# process output to get word and segment level timestamps
 			word_level_timestamps = output[0].timestamp["word"]
+			output_html += "<div class='heading'>Transcript with word-level timestamps (in seconds)</div>\n"
 			output_html += "<div class='transcript'>\n"
 			for entry in word_level_timestamps:
 				output_html += f'<span>{entry["word"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
 			output_html += "</div>\n"
 			segment_level_timestamps = output[0].timestamp["segment"]
+			output_html += "<div class='heading'>Transcript with segment-level timestamps (in seconds)</div>\n"
 			output_html += "<div class='transcript'>\n"
 			for entry in segment_level_timestamps:
 				output_html += f'<span>{entry["segment"]} <span class="timestamp">({entry["start"]:.2f}-{entry["end"]:.2f})</span></span>\n'
 				output = model.transcribe(manifest_filepath)
 			else: # do buffered inference
+				frame_asr = FrameBatchMultiTaskAED(
+					asr_model=model,
+					frame_len=40.0,
+					total_buffer=40.0,
+					batch_size=16,
+				)
 				output = get_buffered_pred_feat_multitaskAED(
+					frame_asr,
 					model.cfg.preprocessor,
 					model_stride_in_secs,
 					model.device,
 					filepaths=None,
 				)
+			if taskname == "asr":
+				output_html += "<div class='heading'>Transcript</div>\n"
+			else:
+				output_html += "<div class='heading'>Translated Text</div>\n"
 			output_text = output[0].text
 			output_html += f'<div class="transcript">{output_text}</div>\n'