Merge branch 'main' into ayo-logging-fixes
Browse files- README.md +1 -1
- line_packet.py +1 -2
- whisper_online.py +19 -18
- whisper_online_server.py +15 -22
README.md
CHANGED
|
@@ -183,7 +183,7 @@ online.init() # refresh if you're going to re-use the object for the next audio
|
|
| 183 |
|
| 184 |
### Server -- real-time from mic
|
| 185 |
|
| 186 |
-
`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection
|
| 187 |
|
| 188 |
Client example:
|
| 189 |
|
|
|
|
| 183 |
|
| 184 |
### Server -- real-time from mic
|
| 185 |
|
| 186 |
+
`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` for the TCP connection and `--warmup-file` for an optional audio file used to warm up Whisper. See the help message (`-h` option) for details.
|
| 187 |
|
| 188 |
Client example:
|
| 189 |
|
line_packet.py
CHANGED
|
@@ -2,8 +2,6 @@
|
|
| 2 |
|
| 3 |
"""Functions for sending and receiving individual lines of text over a socket.
|
| 4 |
|
| 5 |
-
Used by marian-server-server.py to communicate with the Marian worker.
|
| 6 |
-
|
| 7 |
A line is transmitted using one or more fixed-size packets of UTF-8 bytes
|
| 8 |
containing:
|
| 9 |
|
|
@@ -11,6 +9,7 @@ containing:
|
|
| 11 |
|
| 12 |
- Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
|
| 13 |
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
PACKET_SIZE = 65536
|
|
|
|
| 2 |
|
| 3 |
"""Functions for sending and receiving individual lines of text over a socket.
|
| 4 |
|
|
|
|
|
|
|
| 5 |
A line is transmitted using one or more fixed-size packets of UTF-8 bytes
|
| 6 |
containing:
|
| 7 |
|
|
|
|
| 9 |
|
| 10 |
- Zero or more \0 bytes as required to pad the packet to PACKET_SIZE
|
| 11 |
|
| 12 |
+
Originally from the UEDIN team of the ELITR project.
|
| 13 |
"""
|
| 14 |
|
| 15 |
PACKET_SIZE = 65536
|
whisper_online.py
CHANGED
|
@@ -559,7 +559,7 @@ def add_shared_args(parser):
|
|
| 559 |
|
| 560 |
def asr_factory(args, logfile=sys.stderr):
|
| 561 |
"""
|
| 562 |
-
Creates and configures an ASR instance based on the specified backend and arguments.
|
| 563 |
"""
|
| 564 |
backend = args.backend
|
| 565 |
if backend == "openai-api":
|
|
@@ -584,8 +584,23 @@ def asr_factory(args, logfile=sys.stderr):
|
|
| 584 |
logging.info("Setting VAD filter")
|
| 585 |
asr.use_vad()
|
| 586 |
|
| 587 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 588 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
## main:
|
| 590 |
|
| 591 |
if __name__ == "__main__":
|
|
@@ -613,27 +628,13 @@ if __name__ == "__main__":
|
|
| 613 |
duration = len(load_audio(audio_path))/SAMPLING_RATE
|
| 614 |
logging.info("Audio duration is: %2.2f seconds" % duration)
|
| 615 |
|
| 616 |
-
asr = asr_factory(args, logfile=logfile)
|
| 617 |
-
language = args.lan
|
| 618 |
-
|
| 619 |
-
if args.task == "translate":
|
| 620 |
-
asr.set_translate_task()
|
| 621 |
-
tgt_language = "en" # Whisper translates into English
|
| 622 |
-
else:
|
| 623 |
-
tgt_language = language # Whisper transcribes in this language
|
| 624 |
-
|
| 625 |
min_chunk = args.min_chunk_size
|
| 626 |
-
if args.buffer_trimming == "sentence":
|
| 627 |
-
tokenizer = create_tokenizer(tgt_language)
|
| 628 |
-
else:
|
| 629 |
-
tokenizer = None
|
| 630 |
-
online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
| 631 |
-
|
| 632 |
|
| 633 |
# load the audio into the LRU cache before we start the timer
|
| 634 |
a = load_audio_chunk(audio_path,0,1)
|
| 635 |
|
| 636 |
-
# warm up the ASR
|
| 637 |
asr.transcribe(a)
|
| 638 |
|
| 639 |
beg = args.start_at
|
|
|
|
| 559 |
|
| 560 |
def asr_factory(args, logfile=sys.stderr):
|
| 561 |
"""
|
| 562 |
+
Creates and configures an ASR instance and an OnlineASRProcessor instance based on the specified backend and arguments.
|
| 563 |
"""
|
| 564 |
backend = args.backend
|
| 565 |
if backend == "openai-api":
|
|
|
|
| 584 |
logging.info("Setting VAD filter")
|
| 585 |
asr.use_vad()
|
| 586 |
|
| 587 |
+
language = args.lan
|
| 588 |
+
if args.task == "translate":
|
| 589 |
+
asr.set_translate_task()
|
| 590 |
+
tgt_language = "en" # Whisper translates into English
|
| 591 |
+
else:
|
| 592 |
+
tgt_language = language # Whisper transcribes in this language
|
| 593 |
+
|
| 594 |
+
# Create the tokenizer
|
| 595 |
+
if args.buffer_trimming == "sentence":
|
| 596 |
+
tokenizer = create_tokenizer(tgt_language)
|
| 597 |
+
else:
|
| 598 |
+
tokenizer = None
|
| 599 |
|
| 600 |
+
# Create the OnlineASRProcessor
|
| 601 |
+
online = OnlineASRProcessor(asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
| 602 |
+
|
| 603 |
+
return asr, online
|
| 604 |
## main:
|
| 605 |
|
| 606 |
if __name__ == "__main__":
|
|
|
|
| 628 |
duration = len(load_audio(audio_path))/SAMPLING_RATE
|
| 629 |
logging.info("Audio duration is: %2.2f seconds" % duration)
|
| 630 |
|
| 631 |
+
asr, online = asr_factory(args, logfile=logfile)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
min_chunk = args.min_chunk_size
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
|
| 634 |
# load the audio into the LRU cache before we start the timer
|
| 635 |
a = load_audio_chunk(audio_path,0,1)
|
| 636 |
|
| 637 |
+
# warm up the ASR because the very first transcribe takes much more time than the others
|
| 638 |
asr.transcribe(a)
|
| 639 |
|
| 640 |
beg = args.start_at
|
whisper_online_server.py
CHANGED
|
@@ -12,6 +12,8 @@ parser = argparse.ArgumentParser()
|
|
| 12 |
# server options
|
| 13 |
parser.add_argument("--host", type=str, default='localhost')
|
| 14 |
parser.add_argument("--port", type=int, default=43007)
|
|
|
|
|
|
|
| 15 |
|
| 16 |
parser.add_argument("-l", "--log-level", dest="log_level",
|
| 17 |
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
|
|
@@ -33,37 +35,28 @@ SAMPLING_RATE = 16000
|
|
| 33 |
|
| 34 |
size = args.model
|
| 35 |
language = args.lan
|
| 36 |
-
|
| 37 |
-
asr = asr_factory(args)
|
| 38 |
-
|
| 39 |
-
if args.task == "translate":
|
| 40 |
-
asr.set_translate_task()
|
| 41 |
-
tgt_language = "en"
|
| 42 |
-
else:
|
| 43 |
-
tgt_language = language
|
| 44 |
-
|
| 45 |
min_chunk = args.min_chunk_size
|
| 46 |
|
|
|
|
| 47 |
if args.buffer_trimming == "sentence":
|
| 48 |
tokenizer = create_tokenizer(tgt_language)
|
| 49 |
else:
|
| 50 |
tokenizer = None
|
| 51 |
online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
if
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
asr.transcribe(a)
|
| 64 |
-
logging.debug("Whisper is warmed up")
|
| 65 |
else:
|
| 66 |
-
|
| 67 |
|
| 68 |
|
| 69 |
######### Server objects
|
|
|
|
| 12 |
# server options
|
| 13 |
parser.add_argument("--host", type=str, default='localhost')
|
| 14 |
parser.add_argument("--port", type=int, default=43007)
|
| 15 |
+
parser.add_argument("--warmup-file", type=str, dest="warmup_file",
|
| 16 |
+
help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")
|
| 17 |
|
| 18 |
parser.add_argument("-l", "--log-level", dest="log_level",
|
| 19 |
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
|
|
|
|
| 35 |
|
| 36 |
size = args.model
|
| 37 |
language = args.lan
|
| 38 |
+
asr, online = asr_factory(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
min_chunk = args.min_chunk_size
|
| 40 |
|
| 41 |
+
|
| 42 |
if args.buffer_trimming == "sentence":
|
| 43 |
tokenizer = create_tokenizer(tgt_language)
|
| 44 |
else:
|
| 45 |
tokenizer = None
|
| 46 |
online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
|
| 47 |
|
| 48 |
+
# warm up the ASR because the very first transcribe takes more time than the others.
|
| 49 |
+
# Test results in https://github.com/ufal/whisper_streaming/pull/81
|
| 50 |
+
msg = "Whisper is not warmed up. The first chunk processing may take longer."
|
| 51 |
+
if args.warmup_file:
|
| 52 |
+
if os.path.isfile(args.warmup_file):
|
| 53 |
+
a = load_audio_chunk(args.warmup_file,0,1)
|
| 54 |
+
asr.transcribe(a)
|
| 55 |
+
print("INFO: Whisper is warmed up.",file=sys.stderr)
|
| 56 |
+
else:
|
| 57 |
+
print("WARNING: The warm up file is not available. "+msg,file=sys.stderr)
|
|
|
|
|
|
|
| 58 |
else:
|
| 59 |
+
print("WARNING: " + msg, file=sys.stderr)
|
| 60 |
|
| 61 |
|
| 62 |
######### Server objects
|