from whisper_online import *
from voice_activity_controller import *

import soundfile
import io
import sys
import time

SAMPLING_RATE = 16000


class VACOnlineASRProcessor(OnlineASRProcessor):
    """Wraps OnlineASRProcessor with a voice activity controller (VAC).

    Incoming audio is first passed through the VAD. The online ASR buffer is
    processed either when it has accumulated online_chunk_size seconds of
    audio, or immediately when the VAD detects the end of an utterance.
    """

    def __init__(self, online_chunk_size, *a, **kw):
        self.online_chunk_size = online_chunk_size

        self.online = OnlineASRProcessor(*a, **kw)
        self.vac = VoiceActivityController(use_vad_result=False)

        self.logfile = self.online.logfile

        self.init()

    def init(self):
        self.online.init()
        self.vac.reset_states()
        self.current_online_chunk_buffer_size = 0
        self.is_currently_final = False

    def insert_audio_chunk(self, audio):
        # The VAD filters the chunk and reports whether it ends an utterance.
        audio, is_final = self.vac.detect_speech_iter(audio, audio_in_int16=False)
        self.is_currently_final = is_final
        self.online.insert_audio_chunk(audio)
        self.current_online_chunk_buffer_size += len(audio)

    def process_iter(self):
        if self.is_currently_final:
            # The VAD closed an utterance: flush everything that is buffered.
            return self.finish()
        elif self.current_online_chunk_buffer_size > SAMPLING_RATE * self.online_chunk_size:
            # Enough audio has accumulated for a regular online update.
            self.current_online_chunk_buffer_size = 0
            return self.online.process_iter()
        else:
            print("no online update, only VAD", file=self.logfile)
            return (None, None, "")

    def finish(self):
        ret = self.online.finish()
        # Re-initialize for the next utterance, keeping the time offset so
        # that timestamps stay relative to the start of the stream.
        self.online.init(keep_offset=True)
        self.current_online_chunk_buffer_size = 0
        return ret
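

# A minimal usage sketch (comments only, not executed): drive the processor
# from any source of 16 kHz float32 PCM chunks. `stream_pcm_chunks` is a
# hypothetical helper named here only for illustration; `asr` and `tokenizer`
# are constructed as in the __main__ block below.
#
#   online = VACOnlineASRProcessor(1.0, asr, tokenizer)
#   for chunk in stream_pcm_chunks():
#       online.insert_audio_chunk(chunk)
#       beg, end, text = online.process_iter()
#       if text:
#           print(text)
#   beg, end, text = online.finish()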


if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('audio_path', type=str, help="Filename of a 16kHz mono wav file on which live streaming is simulated.")
    add_shared_args(parser)
    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC chunk size in seconds.')
    args = parser.parse_args()
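
    # Example invocation (script and audio file names are illustrative):
    #   python whisper_online_vac.py audio.wav --lan en --model large-v2 --vac-chunk-size 0.04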

    logfile = sys.stderr

    if args.offline and args.comp_unaware:
        print("Options --offline and --comp_unaware are mutually exclusive; use at most one. Exiting.", file=logfile)
        sys.exit(1)

    audio_path = args.audio_path

    duration = len(load_audio(audio_path)) / SAMPLING_RATE
    print("Audio duration is: %2.2f seconds" % duration, file=logfile)

    size = args.model
    language = args.lan

    t = time.time()
    print(f"Loading Whisper {size} model for {language}...", file=logfile, end=" ", flush=True)

    if args.backend == "faster-whisper":
        asr_cls = FasterWhisperASR
    else:
        asr_cls = WhisperTimestampedASR

    asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)

    if args.task == "translate":
        asr.set_translate_task()
        tgt_language = "en"  # Whisper translates into English
    else:
        tgt_language = language  # Whisper transcribes in the source language

    e = time.time()
    print(f"done. It took {round(e-t,2)} seconds.", file=logfile)

    if args.vad:
        print("setting VAD filter", file=logfile)
        asr.use_vad()

    # args.min_chunk_size drives the online ASR updates; the (much smaller)
    # VAC chunk size is the step at which audio is fed to the VAD.
    min_chunk = args.vac_chunk_size
    if args.buffer_trimming == "sentence":
        tokenizer = create_tokenizer(tgt_language)
    else:
        tokenizer = None
    online = VACOnlineASRProcessor(args.min_chunk_size, asr, tokenizer, logfile=logfile,
                                   buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))

    # Warm up the ASR: the first transcribe call is much slower than the
    # following ones, so run it on the first second of audio before the
    # simulation starts.
    a = load_audio_chunk(audio_path, 0, 1)
    asr.transcribe(a)

    beg = args.start_at
    start = time.time() - beg

    def output_transcript(o, now=None):
        # Output line format (both stdout and logfile):
        #   <emission_time_ms> <segment_beg_ms> <segment_end_ms> <transcript>
        # Emission time is measured from the beginning of processing; the
        # segment timestamps are the Whisper model's estimates.
        if now is None:
            now = time.time() - start
        if o[0] is not None:
            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000, o[1]*1000, o[2]), file=logfile, flush=True)
            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000, o[1]*1000, o[2]), flush=True)
        else:
            # Nothing to emit in this iteration; log the empty result only.
            print(o, file=logfile, flush=True)

    if args.offline:  # process the whole audio at once (for testing/debugging)
        a = load_audio(audio_path)
        online.insert_audio_chunk(a)
        try:
            o = online.process_iter()
        except AssertionError:
            print("assertion error", file=logfile)
        else:
            output_transcript(o)
        now = None

    elif args.comp_unaware:  # computation-unaware simulation: output timestamps follow the audio, not the wall clock
        end = beg + min_chunk
        while True:
            a = load_audio_chunk(audio_path, beg, end)
            online.insert_audio_chunk(a)
            try:
                o = online.process_iter()
            except AssertionError:
                print("assertion error", file=logfile)
            else:
                output_transcript(o, now=end)

            print(f"## last processed {end:.2f}s", file=logfile, flush=True)

            if end >= duration:
                break

            beg = end

            if end + min_chunk > duration:
                end = duration
            else:
                end += min_chunk
        now = duration

    else:  # real-time simulation: audio is fed at the pace of the wall clock
        end = 0
        while True:
            now = time.time() - start
            if now < end + min_chunk:
                # Wait until the next chunk of audio "arrives".
                time.sleep(min_chunk + end - now)
            end = time.time() - start
            a = load_audio_chunk(audio_path, beg, end)
            beg = end
            online.insert_audio_chunk(a)

            try:
                o = online.process_iter()
            except AssertionError:
                print("assertion error", file=logfile)
            else:
                output_transcript(o)
            now = time.time() - start
            print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}", file=logfile, flush=True)

            if end >= duration:
                break
        now = None

    # Flush whatever remains in the buffer at the end of the stream.
    o = online.finish()
    output_transcript(o, now=now)