Dominik Macháček committed on
Commit 36d4bc9 · 0 Parent(s)

feat: HF / git LFS

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +140 -0
  3. CONTRIBUTING.md +46 -0
  4. Dockerfile +32 -0
  5. LICENSE +52 -0
  6. README.md +308 -0
  7. architecture.png +3 -0
  8. demo.png +3 -0
  9. pyproject.toml +59 -0
  10. whisperlivekit/__init__.py +12 -0
  11. whisperlivekit/audio_processor.py +586 -0
  12. whisperlivekit/basic_server.py +120 -0
  13. whisperlivekit/core.py +152 -0
  14. whisperlivekit/diarization/__init__.py +0 -0
  15. whisperlivekit/diarization/diarization_online.py +311 -0
  16. whisperlivekit/ffmpeg_manager.py +193 -0
  17. whisperlivekit/parse_args.py +253 -0
  18. whisperlivekit/remove_silences.py +103 -0
  19. whisperlivekit/simul_whisper/__init__.py +6 -0
  20. whisperlivekit/simul_whisper/backend.py +223 -0
  21. whisperlivekit/simul_whisper/beam.py +17 -0
  22. whisperlivekit/simul_whisper/config.py +29 -0
  23. whisperlivekit/simul_whisper/eow_detection.py +65 -0
  24. whisperlivekit/simul_whisper/generation_progress.py +43 -0
  25. whisperlivekit/simul_whisper/license_simulstreaming.py +5 -0
  26. whisperlivekit/simul_whisper/simul_whisper.py +602 -0
  27. whisperlivekit/simul_whisper/token_buffer.py +73 -0
  28. whisperlivekit/simul_whisper/whisper/__init__.py +160 -0
  29. whisperlivekit/simul_whisper/whisper/__main__.py +3 -0
  30. whisperlivekit/simul_whisper/whisper/assets/__init__.py +0 -0
  31. whisperlivekit/simul_whisper/whisper/assets/gpt2.tiktoken +0 -0
  32. whisperlivekit/simul_whisper/whisper/assets/mel_filters.npz +0 -0
  33. whisperlivekit/simul_whisper/whisper/assets/multilingual.tiktoken +0 -0
  34. whisperlivekit/simul_whisper/whisper/audio.py +157 -0
  35. whisperlivekit/simul_whisper/whisper/decoding.py +826 -0
  36. whisperlivekit/simul_whisper/whisper/model.py +348 -0
  37. whisperlivekit/simul_whisper/whisper/normalizers/__init__.py +2 -0
  38. whisperlivekit/simul_whisper/whisper/normalizers/basic.py +80 -0
  39. whisperlivekit/simul_whisper/whisper/normalizers/english.json +1741 -0
  40. whisperlivekit/simul_whisper/whisper/normalizers/english.py +550 -0
  41. whisperlivekit/simul_whisper/whisper/timing.py +388 -0
  42. whisperlivekit/simul_whisper/whisper/tokenizer.py +395 -0
  43. whisperlivekit/simul_whisper/whisper/transcribe.py +623 -0
  44. whisperlivekit/simul_whisper/whisper/triton_ops.py +117 -0
  45. whisperlivekit/simul_whisper/whisper/utils.py +318 -0
  46. whisperlivekit/simul_whisper/whisper/version.py +1 -0
  47. whisperlivekit/timed_objects.py +32 -0
  48. whisperlivekit/warmup.py +62 -0
  49. whisperlivekit/web/__init__.py +0 -0
  50. whisperlivekit/web/live_transcription.html +861 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,140 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ *.wav
+ run_*.sh
+
+ # Downloaded models
+ *.pt
+
+ # Debug & testing
+ test_*.py
+ launch.json
+ .DS_Store
+ test/*
CONTRIBUTING.md ADDED
@@ -0,0 +1,46 @@
+ # Contributing
+
+ Thank you for considering contributing! We appreciate your time and effort to help make this project better.
+
+ ## Before You Start
+
+ 1. **Search for Existing Issues or Discussions:**
+    - Before opening a new issue or discussion, please check if there's already an existing one related to your topic. This helps avoid duplicates and keeps discussions centralized.
+
+ 2. **Discuss Your Contribution:**
+    - If you plan to make a significant change, it's advisable to discuss it in an issue first. This ensures that your contribution aligns with the project's goals and avoids duplicated effort.
+
+ 3. **General questions about whisper streaming web:**
+    - For general questions about whisper streaming web, use the discussion space on GitHub. This helps foster a collaborative environment and encourages knowledge-sharing.
+
+ ## Opening Issues
+
+ If you encounter a problem with WhisperLiveKit or want to suggest an improvement, please follow these guidelines when opening an issue:
+
+ - **Bug Reports:**
+   - Clearly describe the error. **Please indicate the parameters you use, especially the model(s).**
+   - Provide a minimal, reproducible example that demonstrates the issue.
+
+ - **Feature Requests:**
+   - Clearly outline the new feature you are proposing.
+   - Explain how it would benefit the project.
+
+ ## Opening Pull Requests
+
+ We welcome and appreciate contributions! To ensure a smooth review process, please follow these guidelines when opening a pull request:
+
+ - **Commit Messages:**
+   - Write clear and concise commit messages, explaining the purpose of each change.
+
+ - **Documentation:**
+   - Update documentation when introducing new features or making changes that impact existing functionality.
+
+ - **Tests:**
+   - If applicable, add or update tests to cover your changes.
+
+ - **Discuss Before Major Changes:**
+   - If your PR includes significant changes, discuss it in an issue first.
+
+ ## Thank You
+
+ Your contributions make WhisperLiveKit better for everyone. Thank you for your time and dedication!
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV PYTHONUNBUFFERED=1
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+     python3 \
+     python3-pip \
+     ffmpeg \
+     git \
+     build-essential \
+     python3-dev && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+
+ COPY . .
+
+ # Install WhisperLiveKit with extras for CZ
+ RUN pip install --no-cache-dir .[whisper-timestamped,diart]
+
+ # Expose port for Hugging Face Spaces
+ EXPOSE 7860
+
+ ENTRYPOINT ["whisperlivekit-server", "--host", "0.0.0.0", "--port", "7860"]
+
+ # Default args for CZ transcription
+ CMD ["--model", "tiny", "--language", "cs"]
LICENSE ADDED
@@ -0,0 +1,52 @@
+ # License
+
+ ## Main Software License
+
+ MIT License
+
+ Copyright (c) 2025 Quentin Fuxa.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ ## SimulStreaming Backend License
+
+ **When using the SimulStreaming backend (SimulWhisper), additional licensing terms apply:**
+
+ SimulStreaming (https://github.com/ufal/SimulStreaming) is dual-licensed:
+
+ ### 🔹 Non-Commercial Use
+ You may use SimulStreaming under the **PolyForm Noncommercial License 1.0.0** if you obtain the code through the GitHub repository. This license is **free of charge** and comes with **no obligations** for non-commercial users.
+
+ ### 🔸 Commercial Use
+ Understanding who uses SimulStreaming commercially helps improve and prioritize development. Therefore, **registration is required** for those who acquire a commercial license.
+
+ Commercial licenses are planned to be **affordable** for SMEs and individuals. The SimulStreaming authors are considering providing commercial licenses either for free or for a symbolic one-time fee, and may also provide additional support. You can share your preference via the [questionnaire](https://forms.cloud.microsoft.com/e/7tCxb4gJfB).
+
+ You can also leave your contact [there](https://forms.cloud.microsoft.com/e/7tCxb4gJfB) to be notified when commercial licenses become available.
+
+ **Contact for SimulStreaming licensing:**
+ [Dominik Macháček](https://ufal.mff.cuni.cz/dominik-machacek/), [email protected]
+
+ ---
+
+ ## Based on
+ - **whisper_streaming** by ÚFAL – MIT License – https://github.com/ufal/whisper_streaming. License: https://github.com/ufal/whisper_streaming/blob/main/LICENSE
+ - **silero-vad** by Snakers4 – MIT License – https://github.com/snakers4/silero-vad. License: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
+ - **Diart** by juanmc2005 – MIT License – https://github.com/juanmc2005/diart. License: https://github.com/juanmc2005/diart/blob/main/LICENSE
+ - **SimulStreaming** by ÚFAL – Dual License (PolyForm Noncommercial License 1.0.0 / Commercial License) – https://github.com/ufal/SimulStreaming
README.md ADDED
@@ -0,0 +1,308 @@
+ ---
+ title: Whisper Live Kit
+ emoji: 🐳
+ colorFrom: purple
+ colorTo: gray
+ sdk: docker
+ app_port: 7860
+ ---
+
+ <h1 align="center">WhisperLiveKit</h1>
+
+ <p align="center">
+ <img src="https://raw.githubusercontent.com/QuentinFuxa/WhisperLiveKit/refs/heads/main/demo.png" alt="WhisperLiveKit Demo" width="730">
+ </p>
+
+ <p align="center"><b>Real-time, Fully Local Speech-to-Text with Speaker Diarization</b></p>
+
+ <p align="center">
+ <a href="https://pypi.org/project/whisperlivekit/"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/whisperlivekit?color=g"></a>
+ <a href="https://pepy.tech/project/whisperlivekit"><img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/whisperlivekit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads"></a>
+ <a href="https://pypi.org/project/whisperlivekit/"><img alt="Python Versions" src="https://img.shields.io/badge/python-3.9--3.13-dark_green"></a>
+ <a href="https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-MIT/Dual Licensed-dark_green"></a>
+ </p>
+
+
+ WhisperLiveKit brings real-time speech transcription directly to your browser, with a ready-to-use backend+server and a simple frontend. ✨
+
+ Built on [SimulStreaming](https://github.com/ufal/SimulStreaming) (SOTA 2025) and [WhisperStreaming](https://github.com/ufal/whisper_streaming) (SOTA 2023) for transcription, plus [Streaming Sortformer](https://arxiv.org/abs/2507.18446) (SOTA 2025) and [Diart](https://github.com/juanmc2005/diart) (SOTA 2021) for diarization.
+
+
+ ### Key Features
+
+ - **Real-time Transcription** - Convert speech to text instantly as you speak, locally or on-prem
+ - **Speaker Diarization** - Identify different speakers in real time (⚠️ the Streaming Sortformer backend is in development)
+ - **Multi-User Support** - Handle multiple users simultaneously with a single backend/server
+ - **Automatic Silence Chunking** – Automatically chunks when no audio is detected to limit buffer size
+ - **Confidence Validation** – Immediately validate high-confidence tokens for faster inference (WhisperStreaming only)
+ - **Buffering Preview** – Displays unvalidated transcription segments (not compatible with SimulStreaming yet)
+ - **Punctuation-Based Speaker Splitting [BETA]** - Align speaker changes with natural sentence boundaries for more readable transcripts
+ - **SimulStreaming Backend** - [Dual-licensed](https://github.com/ufal/SimulStreaming#-licence-and-contributions) - Ultra-low latency transcription using the SOTA AlignAtt policy
+
+ ### Architecture
+
+ <img alt="Architecture" src="architecture.png" />
+
+
+ ## Quick Start
+
+ ```bash
+ # Install the package
+ pip install whisperlivekit
+
+ # Start the transcription server
+ whisperlivekit-server --model tiny.en
+
+ # Open your browser at http://localhost:8000 to see the interface.
+ # Use the --ssl-certfile public.crt --ssl-keyfile private.key parameters to enable SSL
+ ```
+
+ That's it! Start speaking and watch your words appear on screen.
+
+ ## Installation
+
+ ```bash
+ # Install from PyPI (recommended)
+ pip install whisperlivekit
+
+ # Install from source
+ git clone https://github.com/QuentinFuxa/WhisperLiveKit
+ cd WhisperLiveKit
+ pip install -e .
+ ```
+
+ ### FFmpeg Dependency
+
+ ```bash
+ # Ubuntu/Debian
+ sudo apt install ffmpeg
+
+ # macOS
+ brew install ffmpeg
+
+ # Windows
+ # Download from https://ffmpeg.org/download.html and add to PATH
+ ```
+
+ ### Optional Dependencies
+
+ ```bash
+ # Voice Activity Controller (prevents hallucinations)
+ pip install torch
+
+ # Sentence-based buffer trimming
+ pip install mosestokenizer wtpsplit
+ pip install tokenize_uk  # If you work with Ukrainian text
+
+ # Speaker diarization
+ pip install diart
+
+ # Alternative Whisper backends (default is faster-whisper)
+ pip install whisperlivekit[whisper]              # Original Whisper
+ pip install whisperlivekit[whisper-timestamped]  # Improved timestamps
+ pip install whisperlivekit[mlx-whisper]          # Apple Silicon optimization
+ pip install whisperlivekit[openai]               # OpenAI API
+ pip install whisperlivekit[simulstreaming]       # SimulStreaming backend
+ ```
+
+ ### 🎹 Pyannote Models Setup
+
+ For diarization, you need access to the pyannote.audio models:
+
+ 1. [Accept user conditions](https://huggingface.co/pyannote/segmentation) for the `pyannote/segmentation` model
+ 2. [Accept user conditions](https://huggingface.co/pyannote/segmentation-3.0) for the `pyannote/segmentation-3.0` model
+ 3. [Accept user conditions](https://huggingface.co/pyannote/embedding) for the `pyannote/embedding` model
+ 4. Log in with Hugging Face:
+ ```bash
+ pip install huggingface_hub
+ huggingface-cli login
+ ```
+
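+ If you prefer to authenticate from Python rather than the CLI (for example inside a notebook or a Space), here is a minimal sketch using `huggingface_hub`; the token value is a placeholder for a token you create at https://huggingface.co/settings/tokens:
+
+ ```python
+ from huggingface_hub import login
+
+ # Stores the token in the local Hugging Face cache so the gated pyannote
+ # models above can be downloaded. Read access is sufficient.
+ login(token="hf_XXXXXXXXXXXXXXXXXXXX")  # placeholder token
+ ```
+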
+ ## 💻 Usage Examples
+
+ ### Command-line Interface
+
+ Start the transcription server with various options:
+
+ ```bash
+ # Basic server with English model
+ whisperlivekit-server --model tiny.en
+
+ # Advanced configuration with diarization
+ whisperlivekit-server --host 0.0.0.0 --port 8000 --model medium --diarization --language auto
+
+ # SimulStreaming backend for ultra-low latency
+ whisperlivekit-server --backend simulstreaming --model large-v3 --frame-threshold 20
+ ```
+
+
+ ### Python API Integration (Backend)
+ Check [basic_server.py](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/basic_server.py) for a complete example.
+
+ ```python
+ from whisperlivekit import TranscriptionEngine, AudioProcessor, parse_args
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+ from fastapi.responses import HTMLResponse
+ from contextlib import asynccontextmanager
+ import asyncio
+
+ transcription_engine = None
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     global transcription_engine
+     transcription_engine = TranscriptionEngine(model="medium", diarization=True, lan="en")
+     # You can also load from command-line arguments using parse_args()
+     # args = parse_args()
+     # transcription_engine = TranscriptionEngine(**vars(args))
+     yield
+
+ app = FastAPI(lifespan=lifespan)
+
+ # Process WebSocket connections
+ async def handle_websocket_results(websocket: WebSocket, results_generator):
+     async for response in results_generator:
+         await websocket.send_json(response)
+     await websocket.send_json({"type": "ready_to_stop"})
+
+ @app.websocket("/asr")
+ async def websocket_endpoint(websocket: WebSocket):
+     global transcription_engine
+
+     # Create a new AudioProcessor for each connection, passing the shared engine
+     audio_processor = AudioProcessor(transcription_engine=transcription_engine)
+     results_generator = await audio_processor.create_tasks()
+     results_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
+     await websocket.accept()
+     while True:
+         message = await websocket.receive_bytes()
+         await audio_processor.process_audio(message)
+ ```
+
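+ To serve this snippet, point an ASGI server at the `app` object. A minimal sketch, assuming the code above is saved as `my_server.py` (a placeholder module name); `uvicorn` is already installed as a WhisperLiveKit dependency:
+
+ ```python
+ import uvicorn
+
+ if __name__ == "__main__":
+     # "my_server:app" is an import string: module my_server.py, variable app
+     uvicorn.run("my_server:app", host="0.0.0.0", port=8000)
+ ```
+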
+ ### Frontend Implementation
+
+ The package includes a simple HTML/JavaScript implementation that you can adapt for your project. You can find it [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/web/live_transcription.html), or load its content using `get_web_interface_html()`:
+
+ ```python
+ from whisperlivekit import get_web_interface_html
+ html_content = get_web_interface_html()
+ ```
+
+ ## ⚙️ Configuration Reference
+
+ WhisperLiveKit offers extensive configuration options:
+
+ | Parameter | Description | Default |
+ |-----------|-------------|---------|
+ | `--host` | Server host address | `localhost` |
+ | `--port` | Server port | `8000` |
+ | `--model` | Whisper model size. Caution: '.en' models do not work with SimulStreaming | `tiny` |
+ | `--language` | Source language code or `auto` | `en` |
+ | `--task` | `transcribe` or `translate` | `transcribe` |
+ | `--backend` | Processing backend | `faster-whisper` |
+ | `--diarization` | Enable speaker identification | `False` |
+ | `--punctuation-split` | Use punctuation to improve speaker boundaries | `True` |
+ | `--confidence-validation` | Use confidence scores for faster validation | `False` |
+ | `--min-chunk-size` | Minimum audio chunk size (seconds) | `1.0` |
+ | `--vac` | Use Voice Activity Controller | `False` |
+ | `--no-vad` | Disable Voice Activity Detection | `False` |
+ | `--buffer_trimming` | Buffer trimming strategy (`sentence` or `segment`) | `segment` |
+ | `--warmup-file` | Audio file path for model warmup | `jfk.wav` |
+ | `--ssl-certfile` | Path to the SSL certificate file (for HTTPS support) | `None` |
+ | `--ssl-keyfile` | Path to the SSL private key file (for HTTPS support) | `None` |
+ | `--segmentation-model` | Hugging Face model ID for the pyannote.audio segmentation model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `pyannote/segmentation-3.0` |
+ | `--embedding-model` | Hugging Face model ID for the pyannote.audio embedding model. [Available models](https://github.com/juanmc2005/diart/tree/main?tab=readme-ov-file#pre-trained-models) | `speechbrain/spkrec-ecapa-voxceleb` |
+
+ **SimulStreaming-specific Options:**
+
+ | Parameter | Description | Default |
+ |-----------|-------------|---------|
+ | `--frame-threshold` | AlignAtt frame threshold (lower = faster, higher = more accurate) | `25` |
+ | `--beams` | Number of beams for beam search (1 = greedy decoding) | `1` |
+ | `--decoder` | Force decoder type (`beam` or `greedy`) | `auto` |
+ | `--audio-max-len` | Maximum audio buffer length (seconds) | `30.0` |
+ | `--audio-min-len` | Minimum audio length to process (seconds) | `0.0` |
+ | `--cif-ckpt-path` | Path to CIF model for word boundary detection | `None` |
+ | `--never-fire` | Never truncate incomplete words | `False` |
+ | `--init-prompt` | Initial prompt for the model | `None` |
+ | `--static-init-prompt` | Static prompt that doesn't scroll | `None` |
+ | `--max-context-tokens` | Maximum context tokens | `None` |
+ | `--model-path` | Direct path to a .pt model file; downloaded if not found | `./base.pt` |
+
+ ## 🔧 How It Works
+
+ 1. **Audio Capture**: The browser's MediaRecorder API captures audio in webm/opus format
+ 2. **Streaming**: Audio chunks are sent to the server via WebSocket
+ 3. **Processing**: The server decodes the audio with FFmpeg and streams it into the model for transcription
+ 4. **Real-time Output**: Partial transcriptions appear immediately in light gray (the preview) and finalized text appears in the normal color
+
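+ The bundled web page is the reference client, but the same WebSocket protocol can be driven from Python. Below is a minimal client sketch, assuming a server started locally with `whisperlivekit-server` and an existing webm/opus recording (`sample.webm` is a placeholder file name); the `/asr` endpoint, the JSON fields (`lines`, `type`), and the empty-message stop signal follow the bundled `basic_server.py` and `audio_processor.py`, and `websockets` is already a WhisperLiveKit dependency:
+
+ ```python
+ import asyncio
+ import json
+ import websockets  # already installed as a WhisperLiveKit dependency
+
+ async def transcribe_file(path="sample.webm"):  # placeholder: any webm/opus recording
+     async with websockets.connect("ws://localhost:8000/asr") as ws:
+
+         async def send_audio():
+             # Stream the file in small binary chunks, roughly as the browser would.
+             with open(path, "rb") as f:
+                 while chunk := f.read(4096):
+                     await ws.send(chunk)
+                     await asyncio.sleep(0.1)  # crude pacing to mimic live capture
+             await ws.send(b"")                # empty message signals end of audio
+
+         sender = asyncio.create_task(send_audio())
+
+         # The server pushes JSON results; stop once it reports it is done.
+         async for message in ws:
+             data = json.loads(message)
+             if data.get("type") == "ready_to_stop":
+                 break
+             for line in data.get("lines", []):
+                 print(line.get("speaker"), line.get("text"))
+
+         await sender
+
+ asyncio.run(transcribe_file())
+ ```
+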
+ ## 🚀 Deployment Guide
+
+ To deploy WhisperLiveKit in production:
+
+ 1. **Server Setup** (Backend):
+ ```bash
+ # Install production ASGI server
+ pip install uvicorn gunicorn
+
+ # Launch with multiple workers
+ gunicorn -k uvicorn.workers.UvicornWorker -w 4 your_app:app
+ ```
+
+ 2. **Frontend Integration**:
+    - Host your customized version of the example HTML/JS in your web application
+    - Ensure the WebSocket connection points to your server's address
+
+ 3. **Nginx Configuration** (recommended for production):
+ ```nginx
+ server {
+     listen 80;
+     server_name your-domain.com;
+
+     location / {
+         proxy_pass http://localhost:8000;
+         proxy_set_header Upgrade $http_upgrade;
+         proxy_set_header Connection "upgrade";
+         proxy_set_header Host $host;
+     }
+ }
+ ```
+
+ 4. **HTTPS Support**: For secure deployments, use "wss://" instead of "ws://" in the WebSocket URL
+
+ ### 🐋 Docker
+
+ A basic Dockerfile is provided which allows re-use of the Python package installation options. ⚠️ For **large** models, ensure that your **Docker runtime** has enough **memory** available. See the usage examples below:
+
+
+ #### All defaults
+ - Create a reusable image with only the basics and then run it as a named container:
+ ```bash
+ docker build -t whisperlivekit-defaults .
+ docker create --gpus all --name whisperlivekit -p 8000:8000 whisperlivekit-defaults
+ docker start -i whisperlivekit
+ ```
+
+ > **Note**: If you're running on a system without NVIDIA GPU support (such as a Mac with Apple Silicon or any system without CUDA capabilities), you need to **remove the `--gpus all` flag** from the `docker create` command. Without GPU acceleration, transcription will use the CPU only, which may be significantly slower. Consider using small models for better performance on CPU-only systems.
+
+ #### Customization
+ - Customize the container options:
+ ```bash
+ docker build -t whisperlivekit-defaults .
+ docker create --gpus all --name whisperlivekit-base -p 8000:8000 whisperlivekit-defaults --model base
+ docker start -i whisperlivekit-base
+ ```
+
+ - `--build-arg` options:
+   - `EXTRAS="whisper-timestamped"` - Add extras to the image's installation (no spaces). Remember to set the necessary container options!
+   - `HF_PRECACHE_DIR="./.cache/"` - Pre-load a model cache for faster first-time start
+   - `HF_TKN_FILE="./token"` - Add your Hugging Face Hub access token to download gated models
+
+ ## 🔮 Use Cases
+ Capture discussions in real time for meeting transcription, help hearing-impaired users follow conversations through accessibility tools, transcribe podcasts or videos automatically for content creation, transcribe support calls with speaker identification for customer service...
+
+ ## 🙏 Acknowledgments
+
+ We extend our gratitude to the original authors of:
+
+ | [Whisper Streaming](https://github.com/ufal/whisper_streaming) | [SimulStreaming](https://github.com/ufal/SimulStreaming) | [Diart](https://github.com/juanmc2005/diart) | [OpenAI Whisper](https://github.com/openai/whisper) |
+ | -------- | ------- | -------- | ------- |
architecture.png ADDED

Git LFS Details

  • SHA256: a773a4bd8844ae7dde1fd5989c309fd70f7e280e77964508199d6653fe484430
  • Pointer size: 131 Bytes
  • Size of remote file: 391 kB
demo.png ADDED

Git LFS Details

  • SHA256: b251ccb7f207f5f83e2fb031dd299e83a741f5904960c69a830c8942a26808c3
  • Pointer size: 131 Bytes
  • Size of remote file: 449 kB
pyproject.toml ADDED
@@ -0,0 +1,59 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "whisperlivekit"
+ version = "0.2.5"
+ description = "Real-time, Fully Local Whisper's Speech-to-Text and Speaker Diarization"
+ readme = "README.md"
+ authors = [
+     { name = "Quentin Fuxa" }
+ ]
+ license = { file = "LICENSE" }
+ requires-python = ">=3.9"
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Multimedia :: Sound/Audio :: Speech"
+ ]
+ dependencies = [
+     "fastapi",
+     "librosa",
+     "soundfile",
+     "faster-whisper",
+     "uvicorn",
+     "websockets"
+ ]
+
+ [project.optional-dependencies]
+ diarization = ["diart"]
+ vac = ["torch"]
+ sentence = ["mosestokenizer", "wtpsplit"]
+ whisper = ["whisper"]
+ whisper-timestamped = ["whisper-timestamped"]
+ mlx-whisper = ["mlx-whisper"]
+ openai = ["openai"]
+ simulstreaming = [
+     "torch",
+     "tqdm",
+     "tiktoken",
+     'triton>=2.0.0,<3; platform_machine == "x86_64" and (sys_platform == "linux" or sys_platform == "linux2")'
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/QuentinFuxa/WhisperLiveKit"
+
+ [project.scripts]
+ whisperlivekit-server = "whisperlivekit.basic_server:main"
+
+ [tool.setuptools]
+ packages = ["whisperlivekit", "whisperlivekit.diarization", "whisperlivekit.simul_whisper", "whisperlivekit.simul_whisper.whisper", "whisperlivekit.simul_whisper.whisper.assets", "whisperlivekit.simul_whisper.whisper.normalizers", "whisperlivekit.web", "whisperlivekit.whisper_streaming_custom"]
+
+ [tool.setuptools.package-data]
+ whisperlivekit = ["web/*.html"]
+ "whisperlivekit.simul_whisper.whisper.assets" = ["*.tiktoken", "*.npz"]
whisperlivekit/__init__.py ADDED
@@ -0,0 +1,12 @@
+ from .audio_processor import AudioProcessor
+ from .core import TranscriptionEngine
+ from .parse_args import parse_args
+ from .web.web_interface import get_web_interface_html
+
+ __all__ = [
+     "TranscriptionEngine",
+     "AudioProcessor",
+     "parse_args",
+     "get_web_interface_html",
+     "download_simulstreaming_backend",
+ ]
whisperlivekit/audio_processor.py ADDED
@@ -0,0 +1,586 @@
1
+ import asyncio
2
+ import numpy as np
3
+ from time import time, sleep
4
+ import math
5
+ import logging
6
+ import traceback
7
+ from datetime import timedelta
8
+ from whisperlivekit.timed_objects import ASRToken
9
+ from whisperlivekit.core import TranscriptionEngine, online_factory
10
+ from whisperlivekit.ffmpeg_manager import FFmpegManager, FFmpegState
11
+ from .remove_silences import handle_silences
12
+ # Set up logging once
13
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
14
+ logger = logging.getLogger(__name__)
15
+ logger.setLevel(logging.DEBUG)
16
+
17
+ SENTINEL = object() # unique sentinel object for end of stream marker
18
+
19
+ def format_time(seconds: float) -> str:
20
+ """Format seconds as HH:MM:SS."""
21
+ return str(timedelta(seconds=int(seconds)))
22
+
23
+ class AudioProcessor:
24
+ """
25
+ Processes audio streams for transcription and diarization.
26
+ Handles audio processing, state management, and result formatting.
27
+ """
28
+
29
+ def __init__(self, **kwargs):
30
+ """Initialize the audio processor with configuration, models, and state."""
31
+
32
+ if 'transcription_engine' in kwargs and isinstance(kwargs['transcription_engine'], TranscriptionEngine):
33
+ models = kwargs['transcription_engine']
34
+ else:
35
+ models = TranscriptionEngine(**kwargs)
36
+
37
+ # Audio processing settings
38
+ self.args = models.args
39
+ self.sample_rate = 16000
40
+ self.channels = 1
41
+ self.samples_per_sec = int(self.sample_rate * self.args.min_chunk_size)
42
+ self.bytes_per_sample = 2
43
+ self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
44
+ self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz
45
+ self.last_ffmpeg_activity = time()
46
+ self.ffmpeg_health_check_interval = 5
47
+ self.ffmpeg_max_idle_time = 10
48
+
49
+ # State management
50
+ self.is_stopping = False
51
+ self.tokens = []
52
+ self.buffer_transcription = ""
53
+ self.buffer_diarization = ""
54
+ self.end_buffer = 0
55
+ self.end_attributed_speaker = 0
56
+ self.lock = asyncio.Lock()
57
+ self.beg_loop = time()
58
+ self.sep = " " # Default separator
59
+ self.last_response_content = ""
60
+
61
+ # Models and processing
62
+ self.asr = models.asr
63
+ self.tokenizer = models.tokenizer
64
+ self.diarization = models.diarization
65
+
66
+ self.ffmpeg_manager = FFmpegManager(
67
+ sample_rate=self.sample_rate,
68
+ channels=self.channels
69
+ )
70
+
71
+ async def handle_ffmpeg_error(error_type: str):
72
+ logger.error(f"FFmpeg error: {error_type}")
73
+ self._ffmpeg_error = error_type
74
+
75
+ self.ffmpeg_manager.on_error_callback = handle_ffmpeg_error
76
+ self._ffmpeg_error = None
77
+
78
+ self.transcription_queue = asyncio.Queue() if self.args.transcription else None
79
+ self.diarization_queue = asyncio.Queue() if self.args.diarization else None
80
+ self.pcm_buffer = bytearray()
81
+
82
+ # Task references
83
+ self.transcription_task = None
84
+ self.diarization_task = None
85
+ self.ffmpeg_reader_task = None
86
+ self.watchdog_task = None
87
+ self.all_tasks_for_cleanup = []
88
+
89
+ # Initialize transcription engine if enabled
90
+ if self.args.transcription:
91
+ self.online = online_factory(self.args, models.asr, models.tokenizer)
92
+
93
+ def convert_pcm_to_float(self, pcm_buffer):
94
+ """Convert PCM buffer in s16le format to normalized NumPy array."""
95
+ return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
96
+
97
+ async def update_transcription(self, new_tokens, buffer, end_buffer, sep):
98
+ """Thread-safe update of transcription with new data."""
99
+ async with self.lock:
100
+ self.tokens.extend(new_tokens)
101
+ self.buffer_transcription = buffer
102
+ self.end_buffer = end_buffer
103
+ self.sep = sep
104
+
105
+ async def update_diarization(self, end_attributed_speaker, buffer_diarization=""):
106
+ """Thread-safe update of diarization with new data."""
107
+ async with self.lock:
108
+ self.end_attributed_speaker = end_attributed_speaker
109
+ if buffer_diarization:
110
+ self.buffer_diarization = buffer_diarization
111
+
112
+ async def add_dummy_token(self):
113
+ """Placeholder token when no transcription is available."""
114
+ async with self.lock:
115
+ current_time = time() - self.beg_loop
116
+ self.tokens.append(ASRToken(
117
+ start=current_time, end=current_time + 1,
118
+ text=".", speaker=-1, is_dummy=True
119
+ ))
120
+
121
+ async def get_current_state(self):
122
+ """Get current state."""
123
+ async with self.lock:
124
+ current_time = time()
125
+
126
+ # Calculate remaining times
127
+ remaining_transcription = 0
128
+ if self.end_buffer > 0:
129
+ remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 1))
130
+
131
+ remaining_diarization = 0
132
+ if self.tokens:
133
+ latest_end = max(self.end_buffer, self.tokens[-1].end if self.tokens else 0)
134
+ remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 1))
135
+
136
+ return {
137
+ "tokens": self.tokens.copy(),
138
+ "buffer_transcription": self.buffer_transcription,
139
+ "buffer_diarization": self.buffer_diarization,
140
+ "end_buffer": self.end_buffer,
141
+ "end_attributed_speaker": self.end_attributed_speaker,
142
+ "sep": self.sep,
143
+ "remaining_time_transcription": remaining_transcription,
144
+ "remaining_time_diarization": remaining_diarization
145
+ }
146
+
147
+ async def reset(self):
148
+ """Reset all state variables to initial values."""
149
+ async with self.lock:
150
+ self.tokens = []
151
+ self.buffer_transcription = self.buffer_diarization = ""
152
+ self.end_buffer = self.end_attributed_speaker = 0
153
+ self.beg_loop = time()
154
+
155
+ async def ffmpeg_stdout_reader(self):
156
+ """Read audio data from FFmpeg stdout and process it."""
157
+ beg = time()
158
+
159
+ while True:
160
+ try:
161
+ # Check if FFmpeg is running
162
+ state = await self.ffmpeg_manager.get_state()
163
+ if state == FFmpegState.FAILED:
164
+ logger.error("FFmpeg is in FAILED state, cannot read data")
165
+ break
166
+ elif state == FFmpegState.STOPPED:
167
+ logger.info("FFmpeg is stopped")
168
+ break
169
+ elif state != FFmpegState.RUNNING:
170
+ logger.warning(f"FFmpeg is in {state} state, waiting...")
171
+ await asyncio.sleep(0.5)
172
+ continue
173
+
174
+ current_time = time()
175
+ elapsed_time = math.floor((current_time - beg) * 10) / 10
176
+ buffer_size = max(int(32000 * elapsed_time), 4096)
177
+ beg = current_time
178
+
179
+ chunk = await self.ffmpeg_manager.read_data(buffer_size)
180
+
181
+ if not chunk:
182
+ if self.is_stopping:
183
+ logger.info("FFmpeg stdout closed, stopping.")
184
+ break
185
+ else:
186
+ # No data available, but not stopping - FFmpeg might be restarting
187
+ await asyncio.sleep(0.1)
188
+ continue
189
+
190
+ self.pcm_buffer.extend(chunk)
191
+
192
+ # Process when enough data
193
+ if len(self.pcm_buffer) >= self.bytes_per_sec:
194
+ if len(self.pcm_buffer) > self.max_bytes_per_sec:
195
+ logger.warning(
196
+ f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. "
197
+ f"Consider using a smaller model."
198
+ )
199
+
200
+ # Process audio chunk
201
+ pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:self.max_bytes_per_sec])
202
+ self.pcm_buffer = self.pcm_buffer[self.max_bytes_per_sec:]
203
+
204
+ # Send to transcription if enabled
205
+ if self.args.transcription and self.transcription_queue:
206
+ await self.transcription_queue.put(pcm_array.copy())
207
+
208
+ # Send to diarization if enabled
209
+ if self.args.diarization and self.diarization_queue:
210
+ await self.diarization_queue.put(pcm_array.copy())
211
+
212
+ # Sleep if no processing is happening
213
+ if not self.args.transcription and not self.args.diarization:
214
+ await asyncio.sleep(0.1)
215
+
216
+ except Exception as e:
217
+ logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
218
+ logger.warning(f"Traceback: {traceback.format_exc()}")
219
+ # Try to recover by waiting a bit
220
+ await asyncio.sleep(1)
221
+
222
+ # Check if we should exit
223
+ if self.is_stopping:
224
+ break
225
+
226
+ logger.info("FFmpeg stdout processing finished. Signaling downstream processors.")
227
+ if self.args.transcription and self.transcription_queue:
228
+ await self.transcription_queue.put(SENTINEL)
229
+ logger.debug("Sentinel put into transcription_queue.")
230
+ if self.args.diarization and self.diarization_queue:
231
+ await self.diarization_queue.put(SENTINEL)
232
+ logger.debug("Sentinel put into diarization_queue.")
233
+
234
+
235
+ async def transcription_processor(self):
236
+ """Process audio chunks for transcription."""
237
+ self.sep = self.online.asr.sep
238
+ cumulative_pcm_duration_stream_time = 0.0
239
+
240
+ while True:
241
+ try:
242
+ pcm_array = await self.transcription_queue.get()
243
+ if pcm_array is SENTINEL:
244
+ logger.debug("Transcription processor received sentinel. Finishing.")
245
+ self.transcription_queue.task_done()
246
+ break
247
+
248
+ if not self.online:
249
+ logger.warning("Transcription processor: self.online not initialized.")
250
+ self.transcription_queue.task_done()
251
+ continue
252
+
253
+ asr_internal_buffer_duration_s = len(getattr(self.online, 'audio_buffer', [])) / self.online.SAMPLING_RATE
254
+ transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer)
255
+
256
+ logger.info(
257
+ f"ASR processing: internal_buffer={asr_internal_buffer_duration_s:.2f}s, "
258
+ f"lag={transcription_lag_s:.2f}s."
259
+ )
260
+
261
+ # Process transcription
262
+ duration_this_chunk = len(pcm_array) / self.sample_rate if isinstance(pcm_array, np.ndarray) else 0
263
+ cumulative_pcm_duration_stream_time += duration_this_chunk
264
+ stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
265
+
266
+ self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
267
+ new_tokens, current_audio_processed_upto = self.online.process_iter()
268
+
269
+ # Get buffer information
270
+ _buffer_transcript_obj = self.online.get_buffer()
271
+ buffer_text = _buffer_transcript_obj.text
272
+
273
+ if new_tokens:
274
+ validated_text = self.sep.join([t.text for t in new_tokens])
275
+ if buffer_text.startswith(validated_text):
276
+ buffer_text = buffer_text[len(validated_text):].lstrip()
277
+
278
+ candidate_end_times = [self.end_buffer]
279
+
280
+ if new_tokens:
281
+ candidate_end_times.append(new_tokens[-1].end)
282
+
283
+ if _buffer_transcript_obj.end is not None:
284
+ candidate_end_times.append(_buffer_transcript_obj.end)
285
+
286
+ candidate_end_times.append(current_audio_processed_upto)
287
+
288
+ new_end_buffer = max(candidate_end_times)
289
+
290
+ await self.update_transcription(
291
+ new_tokens, buffer_text, new_end_buffer, self.sep
292
+ )
293
+ self.transcription_queue.task_done()
294
+
295
+ except Exception as e:
296
+ logger.warning(f"Exception in transcription_processor: {e}")
297
+ logger.warning(f"Traceback: {traceback.format_exc()}")
298
+ if 'pcm_array' in locals() and pcm_array is not SENTINEL : # Check if pcm_array was assigned from queue
299
+ self.transcription_queue.task_done()
300
+ logger.info("Transcription processor task finished.")
301
+
302
+
303
+ async def diarization_processor(self, diarization_obj):
304
+ """Process audio chunks for speaker diarization."""
305
+ buffer_diarization = ""
306
+
307
+ while True:
308
+ try:
309
+ pcm_array = await self.diarization_queue.get()
310
+ if pcm_array is SENTINEL:
311
+ logger.debug("Diarization processor received sentinel. Finishing.")
312
+ self.diarization_queue.task_done()
313
+ break
314
+
315
+ # Process diarization
316
+ await diarization_obj.diarize(pcm_array)
317
+
318
+ async with self.lock:
319
+ self.tokens = diarization_obj.assign_speakers_to_tokens(
320
+ self.tokens,
321
+ use_punctuation_split=self.args.punctuation_split
322
+ )
323
+ if len(self.tokens) > 0:
324
+ self.end_attributed_speaker = max(self.tokens[-1].end, self.end_attributed_speaker)
325
+ if buffer_diarization:
326
+ self.buffer_diarization = buffer_diarization
327
+
328
+ self.diarization_queue.task_done()
329
+
330
+ except Exception as e:
331
+ logger.warning(f"Exception in diarization_processor: {e}")
332
+ logger.warning(f"Traceback: {traceback.format_exc()}")
333
+ if 'pcm_array' in locals() and pcm_array is not SENTINEL:
334
+ self.diarization_queue.task_done()
335
+ logger.info("Diarization processor task finished.")
336
+
337
+
338
+ async def results_formatter(self):
339
+ """Format processing results for output."""
340
+ last_sent_trans = None
341
+ last_sent_diar = None
342
+ while True:
343
+ try:
344
+ ffmpeg_state = await self.ffmpeg_manager.get_state()
345
+ if ffmpeg_state == FFmpegState.FAILED and self._ffmpeg_error:
346
+ yield {
347
+ "status": "error",
348
+ "error": f"FFmpeg error: {self._ffmpeg_error}",
349
+ "lines": [],
350
+ "buffer_transcription": "",
351
+ "buffer_diarization": "",
352
+ "remaining_time_transcription": 0,
353
+ "remaining_time_diarization": 0
354
+ }
355
+ self._ffmpeg_error = None
356
+ await asyncio.sleep(1)
357
+ continue
358
+
359
+ # Get current state
360
+ state = await self.get_current_state()
361
+ tokens = state["tokens"]
362
+ buffer_transcription = state["buffer_transcription"]
363
+ buffer_diarization = state["buffer_diarization"]
364
+ end_attributed_speaker = state["end_attributed_speaker"]
365
+ sep = state["sep"]
366
+
367
+ # Add dummy tokens if needed
368
+ if (not tokens or tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization:
369
+ await self.add_dummy_token()
370
+ sleep(0.5)
371
+ state = await self.get_current_state()
372
+ tokens = state["tokens"]
373
+
374
+ # Format output
375
+ previous_speaker = -1
376
+ lines = []
377
+ last_end_diarized = 0
378
+ undiarized_text = []
379
+ current_time = time() - self.beg_loop
380
+ tokens = handle_silences(tokens, current_time)
381
+ for token in tokens:
382
+ speaker = token.speaker
383
+
384
+ # Handle diarization
385
+ if self.args.diarization:
386
+ if (speaker in [-1, 0]) and token.end >= end_attributed_speaker:
387
+ undiarized_text.append(token.text)
388
+ continue
389
+ elif (speaker in [-1, 0]) and token.end < end_attributed_speaker:
390
+ speaker = previous_speaker
391
+ if speaker not in [-1, 0]:
392
+ last_end_diarized = max(token.end, last_end_diarized)
393
+
394
+ # Group by speaker
395
+ if speaker != previous_speaker or not lines:
396
+ lines.append({
397
+ "speaker": speaker,
398
+ "text": token.text,
399
+ "beg": format_time(token.start),
400
+ "end": format_time(token.end),
401
+ "diff": round(token.end - last_end_diarized, 2)
402
+ })
403
+ previous_speaker = speaker
404
+ elif token.text: # Only append if text isn't empty
405
+ lines[-1]["text"] += sep + token.text
406
+ lines[-1]["end"] = format_time(token.end)
407
+ lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
408
+
409
+ # Handle undiarized text
410
+ if undiarized_text:
411
+ combined = sep.join(undiarized_text)
412
+ if buffer_transcription:
413
+ combined += sep
414
+ await self.update_diarization(end_attributed_speaker, combined)
415
+ buffer_diarization = combined
416
+
417
+ response_status = "active_transcription"
418
+ final_lines_for_response = lines.copy()
419
+
420
+ if not tokens and not buffer_transcription and not buffer_diarization:
421
+ response_status = "no_audio_detected"
422
+ final_lines_for_response = []
423
+ elif response_status == "active_transcription" and not final_lines_for_response:
424
+ final_lines_for_response = [{
425
+ "speaker": 1,
426
+ "text": "",
427
+ "beg": format_time(state.get("end_buffer", 0)),
428
+ "end": format_time(state.get("end_buffer", 0)),
429
+ "diff": 0
430
+ }]
431
+
432
+ response = {
433
+ "status": response_status,
434
+ "lines": final_lines_for_response,
435
+ "buffer_transcription": buffer_transcription,
436
+ "buffer_diarization": buffer_diarization,
437
+ "remaining_time_transcription": state["remaining_time_transcription"],
438
+ "remaining_time_diarization": state["remaining_time_diarization"]
439
+ }
440
+
441
+ current_response_signature = f"{response_status} | " + \
442
+ ' '.join([f"{line['speaker']} {line['text']}" for line in final_lines_for_response]) + \
443
+ f" | {buffer_transcription} | {buffer_diarization}"
444
+
445
+ trans = state["remaining_time_transcription"]
446
+ diar = state["remaining_time_diarization"]
447
+ should_push = (
448
+ current_response_signature != self.last_response_content
449
+ or last_sent_trans is None
450
+ or round(trans, 1) != round(last_sent_trans, 1)
451
+ or round(diar, 1) != round(last_sent_diar, 1)
452
+ )
453
+ if should_push and (final_lines_for_response or buffer_transcription or buffer_diarization or response_status == "no_audio_detected" or trans > 0 or diar > 0):
454
+ yield response
455
+ self.last_response_content = current_response_signature
456
+ last_sent_trans = trans
457
+ last_sent_diar = diar
458
+
459
+ # Check for termination condition
460
+ if self.is_stopping:
461
+ all_processors_done = True
462
+ if self.args.transcription and self.transcription_task and not self.transcription_task.done():
463
+ all_processors_done = False
464
+ if self.args.diarization and self.diarization_task and not self.diarization_task.done():
465
+ all_processors_done = False
466
+
467
+ if all_processors_done:
468
+ logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
469
+ final_state = await self.get_current_state()
470
+ return
471
+
472
+ await asyncio.sleep(0.1) # Avoid overwhelming the client
473
+
474
+ except Exception as e:
475
+ logger.warning(f"Exception in results_formatter: {e}")
476
+ logger.warning(f"Traceback: {traceback.format_exc()}")
477
+ await asyncio.sleep(0.5) # Back off on error
478
+
479
+ async def create_tasks(self):
480
+ """Create and start processing tasks."""
481
+ self.all_tasks_for_cleanup = []
482
+ processing_tasks_for_watchdog = []
483
+
484
+ success = await self.ffmpeg_manager.start()
485
+ if not success:
486
+ logger.error("Failed to start FFmpeg manager")
487
+ async def error_generator():
488
+ yield {
489
+ "status": "error",
490
+ "error": "FFmpeg failed to start. Please check that FFmpeg is installed.",
491
+ "lines": [],
492
+ "buffer_transcription": "",
493
+ "buffer_diarization": "",
494
+ "remaining_time_transcription": 0,
495
+ "remaining_time_diarization": 0
496
+ }
497
+ return error_generator()
498
+
499
+ if self.args.transcription and self.online:
500
+ self.transcription_task = asyncio.create_task(self.transcription_processor())
501
+ self.all_tasks_for_cleanup.append(self.transcription_task)
502
+ processing_tasks_for_watchdog.append(self.transcription_task)
503
+
504
+ if self.args.diarization and self.diarization:
505
+ self.diarization_task = asyncio.create_task(self.diarization_processor(self.diarization))
506
+ self.all_tasks_for_cleanup.append(self.diarization_task)
507
+ processing_tasks_for_watchdog.append(self.diarization_task)
508
+
509
+ self.ffmpeg_reader_task = asyncio.create_task(self.ffmpeg_stdout_reader())
510
+ self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
511
+ processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
512
+
513
+ # Monitor overall system health
514
+ self.watchdog_task = asyncio.create_task(self.watchdog(processing_tasks_for_watchdog))
515
+ self.all_tasks_for_cleanup.append(self.watchdog_task)
516
+
517
+ return self.results_formatter()
518
+
519
+ async def watchdog(self, tasks_to_monitor):
520
+ """Monitors the health of critical processing tasks."""
521
+ while True:
522
+ try:
523
+ await asyncio.sleep(10)
524
+
525
+ for i, task in enumerate(tasks_to_monitor):
526
+ if task.done():
527
+ exc = task.exception()
528
+ task_name = task.get_name() if hasattr(task, 'get_name') else f"Monitored Task {i}"
529
+ if exc:
530
+ logger.error(f"{task_name} unexpectedly completed with exception: {exc}")
531
+ else:
532
+ logger.info(f"{task_name} completed normally.")
533
+
534
+ # Check FFmpeg status through the manager
535
+ ffmpeg_state = await self.ffmpeg_manager.get_state()
536
+ if ffmpeg_state == FFmpegState.FAILED:
537
+ logger.error("FFmpeg is in FAILED state, notifying results formatter")
538
+ # FFmpeg manager will handle its own recovery
539
+ elif ffmpeg_state == FFmpegState.STOPPED and not self.is_stopping:
540
+ logger.warning("FFmpeg unexpectedly stopped, attempting restart")
541
+ await self.ffmpeg_manager.restart()
542
+
543
+ except asyncio.CancelledError:
544
+ logger.info("Watchdog task cancelled.")
545
+ break
546
+ except Exception as e:
547
+ logger.error(f"Error in watchdog task: {e}", exc_info=True)
548
+
549
+ async def cleanup(self):
550
+ """Clean up resources when processing is complete."""
551
+ logger.info("Starting cleanup of AudioProcessor resources.")
552
+ for task in self.all_tasks_for_cleanup:
553
+ if task and not task.done():
554
+ task.cancel()
555
+
556
+ created_tasks = [t for t in self.all_tasks_for_cleanup if t]
557
+ if created_tasks:
558
+ await asyncio.gather(*created_tasks, return_exceptions=True)
559
+ logger.info("All processing tasks cancelled or finished.")
560
+ await self.ffmpeg_manager.stop()
561
+ logger.info("FFmpeg manager stopped.")
562
+ if self.args.diarization and hasattr(self, 'diarization') and hasattr(self.diarization, 'close'):
563
+ self.diarization.close()
564
+ logger.info("AudioProcessor cleanup complete.")
565
+
566
+
567
+ async def process_audio(self, message):
568
+ """Process incoming audio data."""
569
+ if not message:
570
+ logger.info("Empty audio message received, initiating stop sequence.")
571
+ self.is_stopping = True
572
+ # Signal FFmpeg manager to stop accepting data
573
+ await self.ffmpeg_manager.stop()
574
+ return
575
+
576
+ if self.is_stopping:
577
+ logger.warning("AudioProcessor is stopping. Ignoring incoming audio.")
578
+ return
579
+
580
+ success = await self.ffmpeg_manager.write_data(message)
581
+ if not success:
582
+ ffmpeg_state = await self.ffmpeg_manager.get_state()
583
+ if ffmpeg_state == FFmpegState.FAILED:
584
+ logger.error("FFmpeg is in FAILED state, cannot process audio")
585
+ else:
586
+ logger.warning("Failed to write audio data to FFmpeg")
whisperlivekit/basic_server.py ADDED
@@ -0,0 +1,120 @@
1
+ from contextlib import asynccontextmanager
2
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
3
+ from fastapi.responses import HTMLResponse
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from whisperlivekit import TranscriptionEngine, AudioProcessor, get_web_interface_html, parse_args
6
+ import asyncio
7
+ import logging
8
+
9
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
10
+ logging.getLogger().setLevel(logging.WARNING)
11
+ logger = logging.getLogger(__name__)
12
+ logger.setLevel(logging.DEBUG)
13
+
14
+ args = parse_args()
15
+ transcription_engine = None
16
+
17
+ @asynccontextmanager
18
+ async def lifespan(app: FastAPI):
19
+ global transcription_engine
20
+ transcription_engine = TranscriptionEngine(
21
+ **vars(args),
22
+ )
23
+ yield
24
+
25
+ app = FastAPI(lifespan=lifespan)
26
+ app.add_middleware(
27
+ CORSMiddleware,
28
+ allow_origins=["*"],
29
+ allow_credentials=True,
30
+ allow_methods=["*"],
31
+ allow_headers=["*"],
32
+ )
33
+
34
+ @app.get("/")
35
+ async def get():
36
+ return HTMLResponse(get_web_interface_html())
37
+
38
+
39
+ async def handle_websocket_results(websocket, results_generator):
40
+ """Consumes results from the audio processor and sends them via WebSocket."""
41
+ try:
42
+ async for response in results_generator:
43
+ await websocket.send_json(response)
44
+ # when the results_generator finishes it means all audio has been processed
45
+ logger.info("Results generator finished. Sending 'ready_to_stop' to client.")
46
+ await websocket.send_json({"type": "ready_to_stop"})
47
+ except WebSocketDisconnect:
48
+ logger.info("WebSocket disconnected while handling results (client likely closed connection).")
49
+ except Exception as e:
50
+ logger.warning(f"Error in WebSocket results handler: {e}")
51
+
52
+
53
+ @app.websocket("/asr")
54
+ async def websocket_endpoint(websocket: WebSocket):
55
+ global transcription_engine
56
+ audio_processor = AudioProcessor(
57
+ transcription_engine=transcription_engine,
58
+ )
59
+ await websocket.accept()
60
+ logger.info("WebSocket connection opened.")
61
+
62
+ results_generator = await audio_processor.create_tasks()
63
+ websocket_task = asyncio.create_task(handle_websocket_results(websocket, results_generator))
64
+
65
+ try:
66
+ while True:
67
+ message = await websocket.receive_bytes()
68
+ await audio_processor.process_audio(message)
69
+ except KeyError as e:
70
+ if 'bytes' in str(e):
71
+ logger.warning(f"Client has closed the connection.")
72
+ else:
73
+ logger.error(f"Unexpected KeyError in websocket_endpoint: {e}", exc_info=True)
74
+ except WebSocketDisconnect:
75
+ logger.info("WebSocket disconnected by client during message receiving loop.")
76
+ except Exception as e:
77
+ logger.error(f"Unexpected error in websocket_endpoint main loop: {e}", exc_info=True)
78
+ finally:
79
+ logger.info("Cleaning up WebSocket endpoint...")
80
+ if not websocket_task.done():
81
+ websocket_task.cancel()
82
+ try:
83
+ await websocket_task
84
+ except asyncio.CancelledError:
85
+ logger.info("WebSocket results handler task was cancelled.")
86
+ except Exception as e:
87
+ logger.warning(f"Exception while awaiting websocket_task completion: {e}")
88
+
89
+ await audio_processor.cleanup()
90
+ logger.info("WebSocket endpoint cleaned up successfully.")
91
+
92
+ def main():
93
+ """Entry point for the CLI command."""
94
+ import uvicorn
95
+
96
+ uvicorn_kwargs = {
97
+ "app": "whisperlivekit.basic_server:app",
98
+ "host":args.host,
99
+ "port":args.port,
100
+ "reload": False,
101
+ "log_level": "info",
102
+ "lifespan": "on",
103
+ }
104
+
105
+ ssl_kwargs = {}
106
+ if args.ssl_certfile or args.ssl_keyfile:
107
+ if not (args.ssl_certfile and args.ssl_keyfile):
108
+ raise ValueError("Both --ssl-certfile and --ssl-keyfile must be specified together.")
109
+ ssl_kwargs = {
110
+ "ssl_certfile": args.ssl_certfile,
111
+ "ssl_keyfile": args.ssl_keyfile
112
+ }
113
+
114
+ if ssl_kwargs:
115
+ uvicorn_kwargs = {**uvicorn_kwargs, **ssl_kwargs}
116
+
117
+ uvicorn.run(**uvicorn_kwargs)
118
+
119
+ if __name__ == "__main__":
120
+ main()
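Usage note (illustrative, not part of the commit): a minimal client sketch for the /asr endpoint defined above. It assumes the server is reachable at ws://localhost:8000/asr, that the third-party `websockets` package is installed, and that sample.webm is a short recording; the file name, chunk size and pacing are made up.

    import asyncio, json
    import websockets  # assumed dependency, not added by this commit

    async def stream_file(path="sample.webm", url="ws://localhost:8000/asr"):
        async with websockets.connect(url) as ws:
            async def receive():
                # print transcription updates until the server signals completion
                async for raw in ws:
                    msg = json.loads(raw)
                    print(msg)
                    if msg.get("type") == "ready_to_stop":
                        return
            receiver = asyncio.create_task(receive())
            with open(path, "rb") as f:
                while chunk := f.read(4096):
                    await ws.send(chunk)
                    await asyncio.sleep(0.1)  # crude pacing to mimic live capture
            await ws.send(b"")  # an empty message triggers the server's stop sequence
            await receiver

    asyncio.run(stream_file())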
whisperlivekit/core.py ADDED
@@ -0,0 +1,152 @@
1
+ try:
2
+ from whisperlivekit.whisper_streaming_custom.whisper_online import backend_factory
3
+ from whisperlivekit.whisper_streaming_custom.online_asr import VACOnlineASRProcessor, OnlineASRProcessor
4
+ except ImportError:
5
+ from .whisper_streaming_custom.whisper_online import backend_factory
6
+ from .whisper_streaming_custom.online_asr import VACOnlineASRProcessor, OnlineASRProcessor
7
+ from whisperlivekit.warmup import warmup_asr, warmup_online
8
+ from argparse import Namespace
9
+ import sys
10
+
11
+ class TranscriptionEngine:
12
+ _instance = None
13
+ _initialized = False
14
+
15
+ def __new__(cls, *args, **kwargs):
16
+ if cls._instance is None:
17
+ cls._instance = super().__new__(cls)
18
+ return cls._instance
19
+
20
+ def __init__(self, **kwargs):
21
+ if TranscriptionEngine._initialized:
22
+ return
23
+
24
+ defaults = {
25
+ "host": "localhost",
26
+ "port": 8000,
27
+ "warmup_file": None,
28
+ "diarization": False,
29
+ "punctuation_split": False,
30
+ "min_chunk_size": 0.5,
31
+ "model": "tiny",
32
+ "model_cache_dir": None,
33
+ "model_dir": None,
34
+ "lan": "auto",
35
+ "task": "transcribe",
36
+ "backend": "faster-whisper",
37
+ "vac": False,
38
+ "vac_chunk_size": 0.04,
39
+ "log_level": "DEBUG",
40
+ "ssl_certfile": None,
41
+ "ssl_keyfile": None,
42
+ "transcription": True,
43
+ "vad": True,
44
+ # whisperstreaming params:
45
+ "buffer_trimming": "segment",
46
+ "confidence_validation": False,
47
+ "buffer_trimming_sec": 15,
48
+ # simulstreaming params:
49
+ "frame_threshold": 25,
50
+ "beams": 1,
51
+ "decoder_type": None,
52
+ "audio_max_len": 30.0,
53
+ "audio_min_len": 0.0,
54
+ "cif_ckpt_path": None,
55
+ "never_fire": False,
56
+ "init_prompt": None,
57
+ "static_init_prompt": None,
58
+ "max_context_tokens": None,
59
+ "model_path": './base.pt',
60
+ # diart params:
61
+ "segmentation_model": "pyannote/segmentation-3.0",
62
+ "embedding_model": "pyannote/embedding",
63
+
64
+ }
65
+
66
+ config_dict = {**defaults, **kwargs}
67
+
68
+ if 'no_transcription' in kwargs:
69
+ config_dict['transcription'] = not kwargs['no_transcription']
70
+ if 'no_vad' in kwargs:
71
+ config_dict['vad'] = not kwargs['no_vad']
72
+
73
+ config_dict.pop('no_transcription', None)
74
+ config_dict.pop('no_vad', None)
75
+
76
+ if 'language' in kwargs:
77
+ config_dict['lan'] = kwargs['language']
78
+ config_dict.pop('language', None)
79
+
80
+ self.args = Namespace(**config_dict)
81
+
82
+ self.asr = None
83
+ self.tokenizer = None
84
+ self.diarization = None
85
+
86
+ if self.args.transcription:
87
+ if self.args.backend == "simulstreaming":
88
+ from whisperlivekit.simul_whisper import SimulStreamingASR
89
+ self.tokenizer = None
90
+ simulstreaming_kwargs = {}
91
+ for attr in ['frame_threshold', 'beams', 'decoder_type', 'audio_max_len', 'audio_min_len',
92
+ 'cif_ckpt_path', 'never_fire', 'init_prompt', 'static_init_prompt',
93
+ 'max_context_tokens', 'model_path']:
94
+ if hasattr(self.args, attr):
95
+ simulstreaming_kwargs[attr] = getattr(self.args, attr)
96
+
97
+ # Add segment_length from min_chunk_size
98
+ simulstreaming_kwargs['segment_length'] = getattr(self.args, 'min_chunk_size', 0.5)
99
+ simulstreaming_kwargs['task'] = self.args.task
100
+
101
+ size = self.args.model
102
+ self.asr = SimulStreamingASR(
103
+ modelsize=size,
104
+ lan=self.args.lan,
105
+ cache_dir=getattr(self.args, 'model_cache_dir', None),
106
+ model_dir=getattr(self.args, 'model_dir', None),
107
+ **simulstreaming_kwargs
108
+ )
109
+
110
+ else:
111
+ self.asr, self.tokenizer = backend_factory(self.args)
112
+ warmup_asr(self.asr, self.args.warmup_file) #for simulstreaming, warmup should be done in the online class not here
113
+
114
+ if self.args.diarization:
115
+ from whisperlivekit.diarization.diarization_online import DiartDiarization
116
+ self.diarization = DiartDiarization(
117
+ block_duration=self.args.min_chunk_size,
118
+ segmentation_model_name=self.args.segmentation_model,
119
+ embedding_model_name=self.args.embedding_model
120
+ )
121
+
122
+ TranscriptionEngine._initialized = True
123
+
124
+
125
+
126
+ def online_factory(args, asr, tokenizer, logfile=sys.stderr):
127
+ if args.backend == "simulstreaming":
128
+ from whisperlivekit.simul_whisper import SimulStreamingOnlineProcessor
129
+ online = SimulStreamingOnlineProcessor(
130
+ asr,
131
+ logfile=logfile,
132
+ )
133
+ # warmup_online(online, args.warmup_file)
134
+ elif args.vac:
135
+ online = VACOnlineASRProcessor(
136
+ args.min_chunk_size,
137
+ asr,
138
+ tokenizer,
139
+ logfile=logfile,
140
+ buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
141
+ confidence_validation = args.confidence_validation
142
+ )
143
+ else:
144
+ online = OnlineASRProcessor(
145
+ asr,
146
+ tokenizer,
147
+ logfile=logfile,
148
+ buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
149
+ confidence_validation = args.confidence_validation
150
+ )
151
+ return online
152
+
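Usage note (illustrative): a sketch of how TranscriptionEngine and online_factory above might be used directly from Python, outside the FastAPI server. It assumes the faster-whisper backend is installed; the model size, language and the silent PCM buffer are placeholders, and insert_audio_chunk/process_iter belong to the online processor classes imported at the top of this file.

    import numpy as np
    from whisperlivekit.core import TranscriptionEngine, online_factory

    engine = TranscriptionEngine(model="tiny", language="en", backend="faster-whisper")
    online = online_factory(engine.args, engine.asr, engine.tokenizer)

    pcm = np.zeros(16000, dtype=np.float32)   # 1 s of silence as a stand-in for real audio
    online.insert_audio_chunk(pcm)
    print(online.process_iter())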
whisperlivekit/diarization/__init__.py ADDED
File without changes
whisperlivekit/diarization/diarization_online.py ADDED
@@ -0,0 +1,311 @@
1
+ import asyncio
2
+ import re
3
+ import threading
4
+ import numpy as np
5
+ import logging
6
+ import time
7
+ from queue import SimpleQueue, Empty
8
+
9
+ from diart import SpeakerDiarization, SpeakerDiarizationConfig
10
+ from diart.inference import StreamingInference
11
+ from diart.sources import AudioSource
12
+ from whisperlivekit.timed_objects import SpeakerSegment
13
+ from diart.sources import MicrophoneAudioSource
14
+ from rx.core import Observer
15
+ from typing import Tuple, Any, List
16
+ from pyannote.core import Annotation
17
+ import diart.models as m
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ def extract_number(s: str) -> int:
22
+ m = re.search(r'\d+', s)
23
+ return int(m.group()) if m else None
24
+
25
+ class DiarizationObserver(Observer):
26
+ """Observer that logs all data emitted by the diarization pipeline and stores speaker segments."""
27
+
28
+ def __init__(self):
29
+ self.speaker_segments = []
30
+ self.processed_time = 0
31
+ self.segment_lock = threading.Lock()
32
+
33
+ def on_next(self, value: Tuple[Annotation, Any]):
34
+ annotation, audio = value
35
+
36
+ logger.debug("\n--- New Diarization Result ---")
37
+
38
+ duration = audio.extent.end - audio.extent.start
39
+ logger.debug(f"Audio segment: {audio.extent.start:.2f}s - {audio.extent.end:.2f}s (duration: {duration:.2f}s)")
40
+ logger.debug(f"Audio shape: {audio.data.shape}")
41
+
42
+ with self.segment_lock:
43
+ if audio.extent.end > self.processed_time:
44
+ self.processed_time = audio.extent.end
45
+ if annotation and len(annotation._labels) > 0:
46
+ logger.debug("\nSpeaker segments:")
47
+ for speaker, label in annotation._labels.items():
48
+ for start, end in zip(label.segments_boundaries_[:-1], label.segments_boundaries_[1:]):
49
+ print(f" {speaker}: {start:.2f}s-{end:.2f}s")
50
+ self.speaker_segments.append(SpeakerSegment(
51
+ speaker=speaker,
52
+ start=start,
53
+ end=end
54
+ ))
55
+ else:
56
+ logger.debug("\nNo speakers detected in this segment")
57
+
58
+ def get_segments(self) -> List[SpeakerSegment]:
59
+ """Get a copy of the current speaker segments."""
60
+ with self.segment_lock:
61
+ return self.speaker_segments.copy()
62
+
63
+ def clear_old_segments(self, older_than: float = 30.0):
64
+ """Clear segments older than the specified time."""
65
+ with self.segment_lock:
66
+ current_time = self.processed_time
67
+ self.speaker_segments = [
68
+ segment for segment in self.speaker_segments
69
+ if current_time - segment.end < older_than
70
+ ]
71
+
72
+ def on_error(self, error):
73
+ """Handle an error in the stream."""
74
+ logger.debug(f"Error in diarization stream: {error}")
75
+
76
+ def on_completed(self):
77
+ """Handle the completion of the stream."""
78
+ logger.debug("Diarization stream completed")
79
+
80
+
81
+ class WebSocketAudioSource(AudioSource):
82
+ """
83
+ Buffers incoming audio and releases it in fixed-size chunks at regular intervals.
84
+ """
85
+ def __init__(self, uri: str = "websocket", sample_rate: int = 16000, block_duration: float = 0.5):
86
+ super().__init__(uri, sample_rate)
87
+ self.block_duration = block_duration
88
+ self.block_size = int(np.rint(block_duration * sample_rate))
89
+ self._queue = SimpleQueue()
90
+ self._buffer = np.array([], dtype=np.float32)
91
+ self._buffer_lock = threading.Lock()
92
+ self._closed = False
93
+ self._close_event = threading.Event()
94
+ self._processing_thread = None
95
+ self._last_chunk_time = time.time()
96
+
97
+ def read(self):
98
+ """Start processing buffered audio and emit fixed-size chunks."""
99
+ self._processing_thread = threading.Thread(target=self._process_chunks)
100
+ self._processing_thread.daemon = True
101
+ self._processing_thread.start()
102
+
103
+ self._close_event.wait()
104
+ if self._processing_thread:
105
+ self._processing_thread.join(timeout=2.0)
106
+
107
+ def _process_chunks(self):
108
+ """Process audio from queue and emit fixed-size chunks at regular intervals."""
109
+ while not self._closed:
110
+ try:
111
+ audio_chunk = self._queue.get(timeout=0.1)
112
+
113
+ with self._buffer_lock:
114
+ self._buffer = np.concatenate([self._buffer, audio_chunk])
115
+
116
+ while len(self._buffer) >= self.block_size:
117
+ chunk = self._buffer[:self.block_size]
118
+ self._buffer = self._buffer[self.block_size:]
119
+
120
+ current_time = time.time()
121
+ time_since_last = current_time - self._last_chunk_time
122
+ if time_since_last < self.block_duration:
123
+ time.sleep(self.block_duration - time_since_last)
124
+
125
+ chunk_reshaped = chunk.reshape(1, -1)
126
+ self.stream.on_next(chunk_reshaped)
127
+ self._last_chunk_time = time.time()
128
+
129
+ except Empty:
130
+ with self._buffer_lock:
131
+ if len(self._buffer) > 0 and time.time() - self._last_chunk_time > self.block_duration:
132
+ padded_chunk = np.zeros(self.block_size, dtype=np.float32)
133
+ padded_chunk[:len(self._buffer)] = self._buffer
134
+ self._buffer = np.array([], dtype=np.float32)
135
+
136
+ chunk_reshaped = padded_chunk.reshape(1, -1)
137
+ self.stream.on_next(chunk_reshaped)
138
+ self._last_chunk_time = time.time()
139
+ except Exception as e:
140
+ logger.error(f"Error in audio processing thread: {e}")
141
+ self.stream.on_error(e)
142
+ break
143
+
144
+ with self._buffer_lock:
145
+ if len(self._buffer) > 0:
146
+ padded_chunk = np.zeros(self.block_size, dtype=np.float32)
147
+ padded_chunk[:len(self._buffer)] = self._buffer
148
+ chunk_reshaped = padded_chunk.reshape(1, -1)
149
+ self.stream.on_next(chunk_reshaped)
150
+
151
+ self.stream.on_completed()
152
+
153
+ def close(self):
154
+ if not self._closed:
155
+ self._closed = True
156
+ self._close_event.set()
157
+
158
+ def push_audio(self, chunk: np.ndarray):
159
+ """Add audio chunk to the processing queue."""
160
+ if not self._closed:
161
+ if chunk.ndim > 1:
162
+ chunk = chunk.flatten()
163
+ self._queue.put(chunk)
164
+ logger.debug(f'Added chunk to queue with {len(chunk)} samples')
165
+
166
+
167
+ class DiartDiarization:
168
+ def __init__(self, sample_rate: int = 16000, config : SpeakerDiarizationConfig = None, use_microphone: bool = False, block_duration: float = 1.5, segmentation_model_name: str = "pyannote/segmentation-3.0", embedding_model_name: str = "pyannote/embedding"):
169
+ segmentation_model = m.SegmentationModel.from_pretrained(segmentation_model_name)
170
+ embedding_model = m.EmbeddingModel.from_pretrained(embedding_model_name)
171
+
172
+ if config is None:
173
+ config = SpeakerDiarizationConfig(
174
+ segmentation=segmentation_model,
175
+ embedding=embedding_model,
176
+ )
177
+
178
+ self.pipeline = SpeakerDiarization(config=config)
179
+ self.observer = DiarizationObserver()
180
+ self.lag_diart = None
181
+
182
+ if use_microphone:
183
+ self.source = MicrophoneAudioSource(block_duration=block_duration)
184
+ self.custom_source = None
185
+ else:
186
+ self.custom_source = WebSocketAudioSource(
187
+ uri="websocket_source",
188
+ sample_rate=sample_rate,
189
+ block_duration=block_duration
190
+ )
191
+ self.source = self.custom_source
192
+
193
+ self.inference = StreamingInference(
194
+ pipeline=self.pipeline,
195
+ source=self.source,
196
+ do_plot=False,
197
+ show_progress=False,
198
+ )
199
+ self.inference.attach_observers(self.observer)
200
+ asyncio.get_event_loop().run_in_executor(None, self.inference)
201
+
202
+ async def diarize(self, pcm_array: np.ndarray):
203
+ """
204
+ Process audio data for diarization.
205
+ Only used when working with WebSocketAudioSource.
206
+ """
207
+ if self.custom_source:
208
+ self.custom_source.push_audio(pcm_array)
209
+ # self.observer.clear_old_segments()
210
+
211
+ def close(self):
212
+ """Close the audio source."""
213
+ if self.custom_source:
214
+ self.custom_source.close()
215
+
216
+ def assign_speakers_to_tokens(self, tokens: list, use_punctuation_split: bool = False) -> list:
217
+ """
218
+ Assign speakers to tokens based on timing overlap with speaker segments.
219
+ Uses the segments collected by the observer.
220
+
221
+ If use_punctuation_split is True, uses punctuation marks to refine speaker boundaries.
222
+ """
223
+ segments = self.observer.get_segments()
224
+
225
+ # Debug logging
226
+ logger.debug(f"assign_speakers_to_tokens called with {len(tokens)} tokens")
227
+ logger.debug(f"Available segments: {len(segments)}")
228
+ for i, seg in enumerate(segments[:5]): # Show first 5 segments
229
+ logger.debug(f" Segment {i}: {seg.speaker} [{seg.start:.2f}-{seg.end:.2f}]")
230
+
231
+ if not self.lag_diart and segments and tokens:
232
+ self.lag_diart = segments[0].start - tokens[0].start
233
+
234
+ if not use_punctuation_split:
235
+ for token in tokens:
236
+ for segment in segments:
237
+ if not (segment.end <= token.start + self.lag_diart or segment.start >= token.end + self.lag_diart):
238
+ token.speaker = extract_number(segment.speaker) + 1
239
+ else:
240
+ tokens = add_speaker_to_tokens(segments, tokens)
241
+ return tokens
242
+
243
+ def concatenate_speakers(segments):
244
+ segments_concatenated = [{"speaker": 1, "begin": 0.0, "end": 0.0}]
245
+ for segment in segments:
246
+ speaker = extract_number(segment.speaker) + 1
247
+ if segments_concatenated[-1]['speaker'] != speaker:
248
+ segments_concatenated.append({"speaker": speaker, "begin": segment.start, "end": segment.end})
249
+ else:
250
+ segments_concatenated[-1]['end'] = segment.end
251
+ # print("Segments concatenated:")
252
+ # for entry in segments_concatenated:
253
+ # print(f"Speaker {entry['speaker']}: {entry['begin']:.2f}s - {entry['end']:.2f}s")
254
+ return segments_concatenated
255
+
256
+
257
+ def add_speaker_to_tokens(segments, tokens):
258
+ """
259
+ Assign speakers to tokens based on diarization segments, with punctuation-aware boundary adjustment.
260
+ """
261
+ punctuation_marks = {'.', '!', '?'}
262
+ punctuation_tokens = [token for token in tokens if token.text.strip() in punctuation_marks]
263
+ segments_concatenated = concatenate_speakers(segments)
264
+ for ind, segment in enumerate(segments_concatenated):
265
+ for i, punctuation_token in enumerate(punctuation_tokens):
266
+ if punctuation_token.start > segment['end']:
267
+ after_length = punctuation_token.start - segment['end']
268
+ before_length = segment['end'] - punctuation_tokens[i - 1].end
269
+ if before_length > after_length:
270
+ segment['end'] = punctuation_token.start
271
+ if i < len(punctuation_tokens) - 1 and ind + 1 < len(segments_concatenated):
272
+ segments_concatenated[ind + 1]['begin'] = punctuation_token.start
273
+ else:
274
+ segment['end'] = punctuation_tokens[i - 1].end
275
+ if i < len(punctuation_tokens) - 1 and ind - 1 >= 0:
276
+ segments_concatenated[ind - 1]['begin'] = punctuation_tokens[i - 1].end
277
+ break
278
+
279
+ last_end = 0.0
280
+ for token in tokens:
281
+ start = max(last_end + 0.01, token.start)
282
+ token.start = start
283
+ token.end = max(start, token.end)
284
+ last_end = token.end
285
+
286
+ ind_last_speaker = 0
287
+ for segment in segments_concatenated:
288
+ for i, token in enumerate(tokens[ind_last_speaker:]):
289
+ if token.end <= segment['end']:
290
+ token.speaker = segment['speaker']
291
+ ind_last_speaker = i + 1
292
+ # print(
293
+ # f"Token '{token.text}' ('begin': {token.start:.2f}, 'end': {token.end:.2f}) "
294
+ # f"assigned to Speaker {segment['speaker']} ('segment': {segment['begin']:.2f}-{segment['end']:.2f})"
295
+ # )
296
+ elif token.start > segment['end']:
297
+ break
298
+ return tokens
299
+
300
+
301
+ def visualize_tokens(tokens):
302
+ conversation = [{"speaker": -1, "text": ""}]
303
+ for token in tokens:
304
+ speaker = conversation[-1]['speaker']
305
+ if token.speaker != speaker:
306
+ conversation.append({"speaker": token.speaker, "text": token.text})
307
+ else:
308
+ conversation[-1]['text'] += token.text
309
+ print("Conversation:")
310
+ for entry in conversation:
311
+ print(f"Speaker {entry['speaker']}: {entry['text']}")
whisperlivekit/ffmpeg_manager.py ADDED
@@ -0,0 +1,193 @@
1
+ import asyncio
2
+ import logging
3
+ from enum import Enum
4
+ from typing import Optional, Callable
5
+ import contextlib
6
+
7
+ logger = logging.getLogger(__name__)
8
+ logging.basicConfig(level=logging.INFO)
9
+
10
+ ERROR_INSTALL_INSTRUCTIONS = """
11
+ FFmpeg is not installed or not found in your system's PATH.
12
+ Please install FFmpeg to enable audio processing.
13
+
14
+ Installation instructions:
15
+
16
+ # Ubuntu/Debian:
17
+ sudo apt update && sudo apt install ffmpeg
18
+
19
+ # macOS (using Homebrew):
20
+ brew install ffmpeg
21
+
22
+ # Windows:
23
+ # 1. Download the latest static build from https://ffmpeg.org/download.html
24
+ # 2. Extract the archive (e.g., to C:\\FFmpeg).
25
+ # 3. Add the 'bin' directory (e.g., C:\\FFmpeg\\bin) to your system's PATH environment variable.
26
+
27
+ After installation, please restart the application.
28
+ """
29
+
30
+ class FFmpegState(Enum):
31
+ STOPPED = "stopped"
32
+ STARTING = "starting"
33
+ RUNNING = "running"
34
+ RESTARTING = "restarting"
35
+ FAILED = "failed"
36
+
37
+ class FFmpegManager:
38
+ def __init__(self, sample_rate: int = 16000, channels: int = 1):
39
+ self.sample_rate = sample_rate
40
+ self.channels = channels
41
+
42
+ self.process: Optional[asyncio.subprocess.Process] = None
43
+ self._stderr_task: Optional[asyncio.Task] = None
44
+
45
+ self.on_error_callback: Optional[Callable[[str], None]] = None
46
+
47
+ self.state = FFmpegState.STOPPED
48
+ self._state_lock = asyncio.Lock()
49
+
50
+ async def start(self) -> bool:
51
+ async with self._state_lock:
52
+ if self.state != FFmpegState.STOPPED:
53
+ logger.warning(f"FFmpeg already running in state: {self.state}")
54
+ return False
55
+ self.state = FFmpegState.STARTING
56
+
57
+ try:
58
+ cmd = [
59
+ "ffmpeg",
60
+ "-hide_banner",
61
+ "-loglevel", "error",
62
+ "-i", "pipe:0",
63
+ "-f", "s16le",
64
+ "-acodec", "pcm_s16le",
65
+ "-ac", str(self.channels),
66
+ "-ar", str(self.sample_rate),
67
+ "pipe:1"
68
+ ]
69
+
70
+ self.process = await asyncio.create_subprocess_exec(
71
+ *cmd,
72
+ stdin=asyncio.subprocess.PIPE,
73
+ stdout=asyncio.subprocess.PIPE,
74
+ stderr=asyncio.subprocess.PIPE
75
+ )
76
+
77
+ self._stderr_task = asyncio.create_task(self._drain_stderr())
78
+
79
+ async with self._state_lock:
80
+ self.state = FFmpegState.RUNNING
81
+
82
+ logger.info("FFmpeg started.")
83
+ return True
84
+
85
+ except FileNotFoundError:
86
+ logger.error(ERROR_INSTALL_INSTRUCTIONS)
87
+ async with self._state_lock:
88
+ self.state = FFmpegState.FAILED
89
+ if self.on_error_callback:
90
+ await self.on_error_callback("ffmpeg_not_found")
91
+ return False
92
+
93
+ except Exception as e:
94
+ logger.error(f"Error starting FFmpeg: {e}")
95
+ async with self._state_lock:
96
+ self.state = FFmpegState.FAILED
97
+ if self.on_error_callback:
98
+ await self.on_error_callback("start_failed")
99
+ return False
100
+
101
+ async def stop(self):
102
+ async with self._state_lock:
103
+ if self.state == FFmpegState.STOPPED:
104
+ return
105
+ self.state = FFmpegState.STOPPED
106
+
107
+ if self.process:
108
+ if self.process.stdin and not self.process.stdin.is_closing():
109
+ self.process.stdin.close()
110
+ await self.process.stdin.wait_closed()
111
+ await self.process.wait()
112
+ self.process = None
113
+
114
+ if self._stderr_task:
115
+ self._stderr_task.cancel()
116
+ with contextlib.suppress(asyncio.CancelledError):
117
+ await self._stderr_task
118
+
119
+ logger.info("FFmpeg stopped.")
120
+
121
+ async def write_data(self, data: bytes) -> bool:
122
+ async with self._state_lock:
123
+ if self.state != FFmpegState.RUNNING:
124
+ logger.warning(f"Cannot write, FFmpeg state: {self.state}")
125
+ return False
126
+
127
+ try:
128
+ self.process.stdin.write(data)
129
+ await self.process.stdin.drain()
130
+ return True
131
+ except Exception as e:
132
+ logger.error(f"Error writing to FFmpeg: {e}")
133
+ if self.on_error_callback:
134
+ await self.on_error_callback("write_error")
135
+ return False
136
+
137
+ async def read_data(self, size: int) -> Optional[bytes]:
138
+ async with self._state_lock:
139
+ if self.state != FFmpegState.RUNNING:
140
+ logger.warning(f"Cannot read, FFmpeg state: {self.state}")
141
+ return None
142
+
143
+ try:
144
+ data = await asyncio.wait_for(
145
+ self.process.stdout.read(size),
146
+ timeout=5.0
147
+ )
148
+ return data
149
+ except asyncio.TimeoutError:
150
+ logger.warning("FFmpeg read timeout.")
151
+ return None
152
+ except Exception as e:
153
+ logger.error(f"Error reading from FFmpeg: {e}")
154
+ if self.on_error_callback:
155
+ await self.on_error_callback("read_error")
156
+ return None
157
+
158
+ async def get_state(self) -> FFmpegState:
159
+ async with self._state_lock:
160
+ return self.state
161
+
162
+ async def restart(self) -> bool:
163
+ async with self._state_lock:
164
+ if self.state == FFmpegState.RESTARTING:
165
+ logger.warning("Restart already in progress.")
166
+ return False
167
+ self.state = FFmpegState.RESTARTING
168
+
169
+ logger.info("Restarting FFmpeg...")
170
+
171
+ try:
172
+ await self.stop()
173
+ await asyncio.sleep(1) # short delay before restarting
174
+ return await self.start()
175
+ except Exception as e:
176
+ logger.error(f"Error during FFmpeg restart: {e}")
177
+ async with self._state_lock:
178
+ self.state = FFmpegState.FAILED
179
+ if self.on_error_callback:
180
+ await self.on_error_callback("restart_failed")
181
+ return False
182
+
183
+ async def _drain_stderr(self):
184
+ try:
185
+ while True:
186
+ line = await self.process.stderr.readline()
187
+ if not line:
188
+ break
189
+ logger.debug(f"FFmpeg stderr: {line.decode(errors='ignore').strip()}")
190
+ except asyncio.CancelledError:
191
+ logger.info("FFmpeg stderr drain task cancelled.")
192
+ except Exception as e:
193
+ logger.error(f"Error draining FFmpeg stderr: {e}")
whisperlivekit/parse_args.py ADDED
@@ -0,0 +1,253 @@
1
+
2
+ from argparse import ArgumentParser
3
+
4
+ def parse_args():
5
+ parser = ArgumentParser(description="Whisper FastAPI Online Server")
6
+ parser.add_argument(
7
+ "--host",
8
+ type=str,
9
+ default="localhost",
10
+ help="The host address to bind the server to.",
11
+ )
12
+ parser.add_argument(
13
+ "--port", type=int, default=8000, help="The port number to bind the server to."
14
+ )
15
+ parser.add_argument(
16
+ "--warmup-file",
17
+ type=str,
18
+ default=None,
19
+ dest="warmup_file",
20
+ help="""
21
+ The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
22
+ If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
23
+ If False, no warmup is performed.
24
+ """,
25
+ )
26
+
27
+ parser.add_argument(
28
+ "--confidence-validation",
29
+ action="store_true",
30
+ help="Accelerates validation of tokens using confidence scores. Transcription will be faster but punctuation might be less accurate.",
31
+ )
32
+
33
+ parser.add_argument(
34
+ "--diarization",
35
+ action="store_true",
36
+ default=False,
37
+ help="Enable speaker diarization.",
38
+ )
39
+
40
+ parser.add_argument(
41
+ "--punctuation-split",
42
+ action="store_true",
43
+ default=False,
44
+ help="Use punctuation marks from transcription to improve speaker boundary detection. Requires both transcription and diarization to be enabled.",
45
+ )
46
+
47
+ parser.add_argument(
48
+ "--segmentation-model",
49
+ type=str,
50
+ default="pyannote/segmentation-3.0",
51
+ help="Hugging Face model ID for pyannote.audio segmentation model.",
52
+ )
53
+
54
+ parser.add_argument(
55
+ "--embedding-model",
56
+ type=str,
57
+ default="pyannote/embedding",
58
+ help="Hugging Face model ID for pyannote.audio embedding model.",
59
+ )
60
+
61
+ parser.add_argument(
62
+ "--no-transcription",
63
+ action="store_true",
64
+ help="Disable transcription to only see live diarization results.",
65
+ )
66
+
67
+ parser.add_argument(
68
+ "--min-chunk-size",
69
+ type=float,
70
+ default=0.5,
71
+ help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
72
+ )
73
+
74
+ parser.add_argument(
75
+ "--model",
76
+ type=str,
77
+ default="tiny",
78
+ help="Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.",
79
+ )
80
+
81
+ parser.add_argument(
82
+ "--model_cache_dir",
83
+ type=str,
84
+ default=None,
85
+ help="Overriding the default model cache dir where models downloaded from the hub are saved",
86
+ )
87
+ parser.add_argument(
88
+ "--model_dir",
89
+ type=str,
90
+ default=None,
91
+ help="Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.",
92
+ )
93
+ parser.add_argument(
94
+ "--lan",
95
+ "--language",
96
+ type=str,
97
+ default="auto",
98
+ help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
99
+ )
100
+ parser.add_argument(
101
+ "--task",
102
+ type=str,
103
+ default="transcribe",
104
+ choices=["transcribe", "translate"],
105
+ help="Transcribe or translate.",
106
+ )
107
+ parser.add_argument(
108
+ "--backend",
109
+ type=str,
110
+ default="faster-whisper",
111
+ choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api", "simulstreaming"],
112
+ help="Load only this backend for Whisper processing.",
113
+ )
114
+ parser.add_argument(
115
+ "--vac",
116
+ action="store_true",
117
+ default=False,
118
+ help="Use VAC = voice activity controller. Recommended. Requires torch.",
119
+ )
120
+ parser.add_argument(
121
+ "--vac-chunk-size", type=float, default=0.04, help="VAC sample size in seconds."
122
+ )
123
+
124
+ parser.add_argument(
125
+ "--no-vad",
126
+ action="store_true",
127
+ help="Disable VAD (voice activity detection).",
128
+ )
129
+
130
+ parser.add_argument(
131
+ "--buffer_trimming",
132
+ type=str,
133
+ default="segment",
134
+ choices=["sentence", "segment"],
135
+ help='Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter must be installed for "sentence" option.',
136
+ )
137
+ parser.add_argument(
138
+ "--buffer_trimming_sec",
139
+ type=float,
140
+ default=15,
141
+ help="Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.",
142
+ )
143
+ parser.add_argument(
144
+ "-l",
145
+ "--log-level",
146
+ dest="log_level",
147
+ choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
148
+ help="Set the log level",
149
+ default="DEBUG",
150
+ )
151
+ parser.add_argument("--ssl-certfile", type=str, help="Path to the SSL certificate file.", default=None)
152
+ parser.add_argument("--ssl-keyfile", type=str, help="Path to the SSL private key file.", default=None)
153
+
154
+ # SimulStreaming-specific arguments
155
+ simulstreaming_group = parser.add_argument_group('SimulStreaming arguments (only used with --backend simulstreaming)')
156
+
157
+ simulstreaming_group.add_argument(
158
+ "--frame-threshold",
159
+ type=int,
160
+ default=25,
161
+ dest="frame_threshold",
162
+ help="Threshold for the attention-guided decoding. The AlignAtt policy will decode only until this number of frames from the end of audio. In frames: one frame is 0.02 seconds for large-v3 model.",
163
+ )
164
+
165
+ simulstreaming_group.add_argument(
166
+ "--beams",
167
+ "-b",
168
+ type=int,
169
+ default=1,
170
+ help="Number of beams for beam search decoding. If 1, GreedyDecoder is used.",
171
+ )
172
+
173
+ simulstreaming_group.add_argument(
174
+ "--decoder",
175
+ type=str,
176
+ default=None,
177
+ dest="decoder_type",
178
+ choices=["beam", "greedy"],
179
+ help="Override automatic selection of beam or greedy decoder. If beams > 1 and greedy: invalid.",
180
+ )
181
+
182
+ simulstreaming_group.add_argument(
183
+ "--audio-max-len",
184
+ type=float,
185
+ default=30.0,
186
+ dest="audio_max_len",
187
+ help="Max length of the audio buffer, in seconds.",
188
+ )
189
+
190
+ simulstreaming_group.add_argument(
191
+ "--audio-min-len",
192
+ type=float,
193
+ default=0.0,
194
+ dest="audio_min_len",
195
+ help="Skip processing if the audio buffer is shorter than this length, in seconds. Useful when the --min-chunk-size is small.",
196
+ )
197
+
198
+ simulstreaming_group.add_argument(
199
+ "--cif-ckpt-path",
200
+ type=str,
201
+ default=None,
202
+ dest="cif_ckpt_path",
203
+ help="The file path to the Simul-Whisper's CIF model checkpoint that detects whether there is end of word at the end of the chunk. If not, the last decoded space-separated word is truncated because it is often wrong -- transcribing a word in the middle. The CIF model adapted for the Whisper model version should be used. Find the models in https://github.com/backspacetg/simul_whisper/tree/main/cif_models . Note that there is no model for large-v3.",
204
+ )
205
+
206
+ simulstreaming_group.add_argument(
207
+ "--never-fire",
208
+ action="store_true",
209
+ default=False,
210
+ dest="never_fire",
211
+ help="Override the CIF model. If True, the last word is NEVER truncated, no matter what the CIF model detects. If False: if CIF model path is set, the last word is SOMETIMES truncated, depending on the CIF detection. Otherwise, if the CIF model path is not set, the last word is ALWAYS trimmed.",
212
+ )
213
+
214
+ simulstreaming_group.add_argument(
215
+ "--init-prompt",
216
+ type=str,
217
+ default=None,
218
+ dest="init_prompt",
219
+ help="Init prompt for the model. It should be in the target language.",
220
+ )
221
+
222
+ simulstreaming_group.add_argument(
223
+ "--static-init-prompt",
224
+ type=str,
225
+ default=None,
226
+ dest="static_init_prompt",
227
+ help="Do not scroll over this text. It can contain terminology that should be relevant over all document.",
228
+ )
229
+
230
+ simulstreaming_group.add_argument(
231
+ "--max-context-tokens",
232
+ type=int,
233
+ default=None,
234
+ dest="max_context_tokens",
235
+ help="Max context tokens for the model. Default is 0.",
236
+ )
237
+
238
+ simulstreaming_group.add_argument(
239
+ "--model-path",
240
+ type=str,
241
+ default=None,
242
+ dest="model_path",
243
+ help="Direct path to the SimulStreaming Whisper .pt model file. Overrides --model for SimulStreaming backend.",
244
+ )
245
+
246
+ args = parser.parse_args()
247
+
248
+ args.transcription = not args.no_transcription
249
+ args.vad = not args.no_vad
250
+ delattr(args, 'no_transcription')
251
+ delattr(args, 'no_vad')
252
+
253
+ return args
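Usage note (illustrative): parse_args() reads sys.argv, so a quick way to exercise the flag post-processing above from a script or test is to substitute argv first; the program name and option values below are placeholders.

    import sys
    from whisperlivekit.parse_args import parse_args

    sys.argv = ["whisperlivekit-server", "--model", "small", "--language", "en", "--no-vad"]
    args = parse_args()
    print(args.model, args.lan, args.vad, args.transcription)   # small en False True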
whisperlivekit/remove_silences.py ADDED
@@ -0,0 +1,103 @@
1
+ from whisperlivekit.timed_objects import ASRToken
2
+ import re
3
+
4
+ MIN_SILENCE_DURATION = 4 #in seconds
5
+ END_SILENCE_DURATION = 8 #in seconds. Keep this value large to avoid false positives when the model lags significantly
6
+
7
+ def blank_to_silence(tokens):
8
+ full_string = ''.join([t.text for t in tokens])
9
+ patterns = [re.compile(r'(?:\s*\[BLANK_AUDIO\]\s*)+'), re.compile(r'(?:\s*\[typing\]\s*)+')]
10
+ matches = []
11
+ for pattern in patterns:
12
+ for m in pattern.finditer(full_string):
13
+ matches.append({
14
+ 'start': m.start(),
15
+ 'end': m.end()
16
+ })
17
+ if matches:
18
+ # cleaned = pattern.sub(' ', full_string).strip()
19
+ # print("Cleaned:", cleaned)
20
+ cumulated_len = 0
21
+ silence_token = None
22
+ cleaned_tokens = []
23
+ for token in tokens:
24
+ if matches:
25
+ start = cumulated_len
26
+ end = cumulated_len + len(token.text)
27
+ cumulated_len = end
28
+ if start >= matches[0]['start'] and end <= matches[0]['end']:
29
+ if silence_token: #previous token was already silence
30
+ silence_token.start = min(silence_token.start, token.start)
31
+ silence_token.end = max(silence_token.end, token.end)
32
+ else: #new silence
33
+ silence_token = ASRToken(
34
+ start=token.start,
35
+ end=token.end,
36
+ speaker=-2,
37
+ probability=0.95
38
+ )
39
+ else:
40
+ if silence_token: #there was silence but no more
41
+ if silence_token.end - silence_token.start >= MIN_SILENCE_DURATION:
42
+ cleaned_tokens.append(
43
+ silence_token
44
+ )
45
+ silence_token = None
46
+ matches.pop(0)
47
+ cleaned_tokens.append(token)
48
+ # print(cleaned_tokens)
49
+ return cleaned_tokens
50
+ return tokens
51
+
52
+ def no_token_to_silence(tokens):
53
+ new_tokens = []
54
+ silence_token = None
55
+ for token in tokens:
56
+ if token.speaker == -2:
57
+ if new_tokens and new_tokens[-1].speaker == -2: #if token is silence and previous one too
58
+ new_tokens[-1].end = token.end
59
+ else:
60
+ new_tokens.append(token)
61
+
62
+ last_end = new_tokens[-1].end if new_tokens else 0.0
63
+ if token.start - last_end >= MIN_SILENCE_DURATION: #token is not silence, but a large gap precedes it
64
+ if new_tokens and new_tokens[-1].speaker == -2:
65
+ new_tokens[-1].end = token.start
66
+ else:
67
+ silence_token = ASRToken(
68
+ start=last_end,
69
+ end=token.start,
70
+ speaker=-2,
71
+ probability=0.95
72
+ )
73
+ new_tokens.append(silence_token)
74
+
75
+ if token.speaker != -2:
76
+ new_tokens.append(token)
77
+ return new_tokens
78
+
79
+ def ends_with_silence(tokens, current_time):
80
+ if not tokens:
81
+ return []
82
+ last_token = tokens[-1]
83
+ if tokens and current_time - last_token.end >= END_SILENCE_DURATION:
84
+ if last_token.speaker == -2:
85
+ last_token.end = current_time
86
+ else:
87
+ tokens.append(
88
+ ASRToken(
89
+ start=tokens[-1].end,
90
+ end=current_time,
91
+ speaker=-2,
92
+ probability=0.95
93
+ )
94
+ )
95
+ return tokens
96
+
97
+
98
+ def handle_silences(tokens, current_time):
99
+ tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
100
+ tokens = no_token_to_silence(tokens)
101
+ tokens = ends_with_silence(tokens, current_time)
102
+ return tokens
103
+
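Usage note (illustrative): a toy run of handle_silences above. Token fields and timings are made up, and ASRToken is assumed to default to a non-silence speaker when none is given.

    from whisperlivekit.timed_objects import ASRToken
    from whisperlivekit.remove_silences import handle_silences

    tokens = [
        ASRToken(start=0.0, end=0.5, text=" Hello"),
        ASRToken(start=0.5, end=1.0, text=" there."),
        ASRToken(start=7.0, end=7.4, text=" Anyway"),   # 6 s gap before this token
    ]
    tokens = handle_silences(tokens, current_time=20.0)
    for t in tokens:
        print(t.start, t.end, t.speaker)
    # the 1.0-7.0 s gap and the 7.4-20.0 s tail show up as speaker == -2 silence tokens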
whisperlivekit/simul_whisper/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .backend import SimulStreamingASR, SimulStreamingOnlineProcessor
2
+
3
+ __all__ = [
4
+ "SimulStreamingASR",
5
+ "SimulStreamingOnlineProcessor",
6
+ ]
whisperlivekit/simul_whisper/backend.py ADDED
@@ -0,0 +1,223 @@
1
+ import sys
2
+ import numpy as np
3
+ import logging
4
+ from typing import List, Tuple, Optional
5
+ import logging
6
+ from whisperlivekit.timed_objects import ASRToken, Transcript
7
+ from whisperlivekit.simul_whisper.license_simulstreaming import SIMULSTREAMING_LICENSE
8
+ from .whisper import load_model, tokenizer
9
+ import os
10
+ logger = logging.getLogger(__name__)
11
+
12
+ try:
13
+ import torch
14
+ from whisperlivekit.simul_whisper.config import AlignAttConfig
15
+ from whisperlivekit.simul_whisper.simul_whisper import PaddedAlignAttWhisper
16
+ from whisperlivekit.simul_whisper.whisper import tokenizer
17
+ except ImportError as e:
18
+ raise ImportError(
19
+ """SimulStreaming dependencies are not available.
20
+ Please install WhisperLiveKit using pip install "whisperlivekit[simulstreaming]".""")
21
+
22
+ class SimulStreamingOnlineProcessor:
23
+ SAMPLING_RATE = 16000
24
+
25
+ def __init__(
26
+ self,
27
+ asr,
28
+ logfile=sys.stderr,
29
+ warmup_file=None
30
+ ):
31
+ self.asr = asr
32
+ self.logfile = logfile
33
+ self.is_last = False
34
+ self.beg = 0.0
35
+ self.end = 0.0
36
+ self.cumulative_audio_duration = 0.0
37
+
38
+ self.committed: List[ASRToken] = []
39
+ self.last_result_tokens: List[ASRToken] = []
40
+ self.model = PaddedAlignAttWhisper(
41
+ cfg=asr.cfg,
42
+ loaded_model=asr.whisper_model)
43
+ if asr.tokenizer:
44
+ self.model.tokenizer = asr.tokenizer
45
+
46
+ def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: Optional[float] = None):
47
+ """Append an audio chunk to be processed by SimulStreaming."""
48
+
49
+ # Convert numpy array to torch tensor
50
+ audio_tensor = torch.from_numpy(audio).float()
51
+
52
+ # Update timing
53
+ chunk_duration = len(audio) / self.SAMPLING_RATE
54
+ self.cumulative_audio_duration += chunk_duration
55
+
56
+ if audio_stream_end_time is not None:
57
+ self.end = audio_stream_end_time
58
+ else:
59
+ self.end = self.cumulative_audio_duration
60
+ self.model.insert_audio(audio_tensor)
61
+
62
+ def get_buffer(self):
63
+ return Transcript(
64
+ start=None,
65
+ end=None,
66
+ text='',
67
+ probability=None
68
+ )
69
+
70
+ def timestamped_text(self, tokens, generation):
71
+ # Adapted from the SimulStreaming repo: recovers word-level timestamps from the attention progress.
72
+ pr = generation["progress"]
73
+ if "result" not in generation:
74
+ split_words, split_tokens = self.model.tokenizer.split_to_word_tokens(tokens)
75
+ else:
76
+ split_words, split_tokens = generation["result"]["split_words"], generation["result"]["split_tokens"]
77
+
78
+ frames = [p["most_attended_frames"][0] for p in pr]
79
+ tokens = tokens.copy()
80
+ ret = []
81
+ for sw,st in zip(split_words,split_tokens):
82
+ b = None
83
+ for stt in st:
84
+ t,f = tokens.pop(0), frames.pop(0)
85
+ if t != stt:
86
+ raise ValueError(f"Token mismatch: {t} != {stt} at frame {f}.")
87
+ if b is None:
88
+ b = f
89
+ e = f
90
+ out = (b*0.02, e*0.02, sw)
91
+ ret.append(out)
92
+ logger.debug(f"TS-WORD:\t{' '.join(map(str, out))}")
93
+ return ret
94
+
95
+ def process_iter(self) -> Tuple[List[ASRToken], float]:
96
+ """
97
+ Process accumulated audio chunks using SimulStreaming.
98
+
99
+ Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
100
+ """
101
+ try:
102
+ tokens, generation_progress = self.model.infer(is_last=self.is_last)
103
+ ts_words = self.timestamped_text(tokens, generation_progress)
104
+
105
+ new_tokens = []
106
+ for ts_word in ts_words:
107
+
108
+ start, end, word = ts_word
109
+ token = ASRToken(
110
+ start=start,
111
+ end=end,
112
+ text=word,
113
+ probability=0.95 # fake prob. Maybe we can extract it from the model?
114
+ )
115
+ new_tokens.append(token)
116
+ self.committed.extend(new_tokens)
117
+
118
+ return new_tokens, self.end
119
+
120
+
121
+ except Exception as e:
122
+ logger.exception(f"SimulStreaming processing error: {e}")
123
+ return [], self.end
124
+
125
+ def warmup(self, audio, init_prompt=""):
126
+ """Warmup the SimulStreaming model."""
127
+ try:
128
+ self.model.insert_audio(audio)
129
+ self.model.infer(True)
130
+ self.model.refresh_segment(complete=True)
131
+ logger.info("SimulStreaming model warmed up successfully")
132
+ except Exception as e:
133
+ logger.exception(f"SimulStreaming warmup failed: {e}")
134
+
135
+
136
+ class SimulStreamingASR():
137
+ """SimulStreaming backend with AlignAtt policy."""
138
+ sep = ""
139
+
140
+ def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr, **kwargs):
141
+ logger.warning(SIMULSTREAMING_LICENSE)
142
+ self.logfile = logfile
143
+ self.transcribe_kargs = {}
144
+ self.original_language = None if lan == "auto" else lan
145
+
146
+ self.model_path = kwargs.get('model_path', './large-v3.pt')
147
+ self.frame_threshold = kwargs.get('frame_threshold', 25)
148
+ self.audio_max_len = kwargs.get('audio_max_len', 30.0)
149
+ self.audio_min_len = kwargs.get('audio_min_len', 0.0)
150
+ self.segment_length = kwargs.get('segment_length', 0.5)
151
+ self.beams = kwargs.get('beams', 1)
152
+ self.decoder_type = kwargs.get('decoder_type', 'greedy' if self.beams == 1 else 'beam')
153
+ self.task = kwargs.get('task', 'transcribe')
154
+ self.cif_ckpt_path = kwargs.get('cif_ckpt_path', None)
155
+ self.never_fire = kwargs.get('never_fire', False)
156
+ self.init_prompt = kwargs.get('init_prompt', None)
157
+ self.static_init_prompt = kwargs.get('static_init_prompt', None)
158
+ self.max_context_tokens = kwargs.get('max_context_tokens', None)
159
+
160
+ if model_dir is not None:
161
+ self.model_path = model_dir
162
+ elif modelsize is not None:
163
+ model_mapping = {
164
+ 'tiny': './tiny.pt',
165
+ 'base': './base.pt',
166
+ 'small': './small.pt',
167
+ 'medium': './medium.pt',
168
+ 'medium.en': './medium.en.pt',
169
+ 'large-v1': './large-v1.pt',
170
+ 'base.en': './base.en.pt',
171
+ 'small.en': './small.en.pt',
172
+ 'tiny.en': './tiny.en.pt',
173
+ 'large-v2': './large-v2.pt',
174
+ 'large-v3': './large-v3.pt',
175
+ 'large': './large-v3.pt'
176
+ }
177
+ self.model_path = model_mapping.get(modelsize, f'./{modelsize}.pt')
178
+
179
+ self.model = self.load_model(modelsize)
180
+
181
+ # Set up tokenizer for translation if needed
182
+ if self.task == "translate":
183
+ self.tokenizer = self.set_translate_task()
184
+ else:
185
+ self.tokenizer = None
186
+
187
+
188
+ def load_model(self, modelsize):
189
+ self.cfg = AlignAttConfig(
190
+ model_path=self.model_path,
191
+ segment_length=self.segment_length,
192
+ frame_threshold=self.frame_threshold,
193
+ language=self.original_language,
194
+ audio_max_len=self.audio_max_len,
195
+ audio_min_len=self.audio_min_len,
196
+ cif_ckpt_path=self.cif_ckpt_path,
197
+ decoder_type="beam",
198
+ beam_size=self.beams,
199
+ task=self.task,
200
+ never_fire=self.never_fire,
201
+ init_prompt=self.init_prompt,
202
+ max_context_tokens=self.max_context_tokens,
203
+ static_init_prompt=self.static_init_prompt,
204
+ )
205
+ model_name = os.path.basename(self.cfg.model_path).replace(".pt", "")
206
+ model_path = os.path.dirname(os.path.abspath(self.cfg.model_path))
207
+ self.whisper_model = load_model(name=model_name, download_root=model_path)
208
+
209
+
210
+ def set_translate_task(self):
211
+ """Set up translation task."""
212
+ return tokenizer.get_tokenizer(
213
+ multilingual=True,
214
+ language=self.cfg.language,
215
+ num_languages=self.whisper_model.num_languages,
216
+ task="translate"
217
+ )
218
+
219
+ def transcribe(self, audio):
220
+ """
221
+ Only used for warmup. It's a direct whisper call, not a simulstreaming call
222
+ """
223
+ self.whisper_model.transcribe(audio, language=self.original_language)
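Usage note (illustrative): a hypothetical end-to-end loop over the two classes above. It assumes the simulstreaming extras are installed, that ./base.pt exists locally, and that `audio_chunks` is an iterable of 16 kHz float32 numpy arrays coming from some capture loop; nothing here is prescribed by the commit itself.

    from whisperlivekit.simul_whisper import SimulStreamingASR, SimulStreamingOnlineProcessor

    asr = SimulStreamingASR(lan="en", modelsize="base", model_path="./base.pt")
    online = SimulStreamingOnlineProcessor(asr)

    for chunk in audio_chunks:                     # 16 kHz float32 numpy arrays
        online.insert_audio_chunk(chunk)
        new_tokens, audio_time = online.process_iter()
        for tok in new_tokens:
            print(f"[{tok.start:5.2f}-{tok.end:5.2f}] {tok.text}")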
whisperlivekit/simul_whisper/beam.py ADDED
@@ -0,0 +1,17 @@
1
+ from .whisper.decoding import PyTorchInference
2
+
3
+ # extension of PyTorchInference for beam search
4
+ class BeamPyTorchInference(PyTorchInference):
5
+
6
+ def _kv_modules(self):
7
+ key_modules = [block.attn.key.cache_id for block in self.model.decoder.blocks]
8
+ value_modules = [block.attn.value.cache_id for block in self.model.decoder.blocks]
9
+ return key_modules + value_modules
10
+
11
+ def rearrange_kv_cache(self, source_indices):
12
+ if source_indices != list(range(len(source_indices))):
13
+ for module_cache_id in self._kv_modules():
14
+ self.kv_cache[module_cache_id] = self.kv_cache[module_cache_id][source_indices].detach()
15
+ from torch import Tensor
16
+ def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
17
+ return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
whisperlivekit/simul_whisper/config.py ADDED
@@ -0,0 +1,29 @@
1
+ # This code was originally in simul_whisper/transcriber/simul_whisper.py . It is adapted a lot for SimulStreaming.
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Literal
5
+
6
+ @dataclass
7
+ class SimulWhisperConfig:
8
+ '''Options that are common for all simul policies that could be implemented in SimulWhisper.'''
9
+ model_path: str
10
+ language: str = field(default="zh")
11
+ nonspeech_prob: float = 0.5
12
+ audio_min_len: float = 1.0
13
+ decoder_type: Literal["greedy","beam"] = "greedy"
14
+ beam_size: int = 5
15
+ task: Literal["transcribe","translate"] = "transcribe"
16
+ init_prompt: str = field(default=None)
17
+ static_init_prompt: str = field(default=None)
18
+ max_context_tokens: int = field(default=None)
19
+
20
+ @dataclass
21
+ class AlignAttConfig(SimulWhisperConfig):
22
+ '''Options specific to the AlignAtt policy.'''
23
+ eval_data_path: str = "tmp"
24
+ segment_length: float = field(default=1.0, metadata = {"help": "in second"})
25
+ frame_threshold: int = 4
26
+ rewind_threshold: int = 200
27
+ audio_max_len: float = 30.0
28
+ cif_ckpt_path: str = ""
29
+ never_fire: bool = False
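Usage note (illustrative): one way to instantiate AlignAttConfig above. The checkpoint path is an assumption and the other values simply mirror the CLI defaults elsewhere in this commit.

    from whisperlivekit.simul_whisper.config import AlignAttConfig

    cfg = AlignAttConfig(
        model_path="./base.pt",     # any local Whisper .pt checkpoint
        language="en",
        segment_length=0.5,
        frame_threshold=25,
        audio_max_len=30.0,
        decoder_type="greedy",
        beam_size=1,
    )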
whisperlivekit/simul_whisper/eow_detection.py ADDED
@@ -0,0 +1,65 @@
1
+ import torch
2
+
3
+ # code for the end-of-word detection based on the CIF model proposed in Simul-Whisper
4
+
5
+ def load_cif(cfg, n_audio_state, device):
6
+ """cfg: AlignAttConfig, n_audio_state: int, device: torch.device"""
7
+ cif_linear = torch.nn.Linear(n_audio_state, 1)
8
+ if cfg.cif_ckpt_path is None or not cfg.cif_ckpt_path:
9
+ if cfg.never_fire:
10
+ never_fire = True
11
+ always_fire = False
12
+ else:
13
+ always_fire = True
14
+ never_fire = False
15
+ else:
16
+ always_fire = False
17
+ never_fire = cfg.never_fire
18
+ checkpoint = torch.load(cfg.cif_ckpt_path)
19
+ cif_linear.load_state_dict(checkpoint)
20
+ cif_linear.to(device)
21
+ return cif_linear, always_fire, never_fire
22
+
23
+
24
+ # from https://github.com/dqqcasia/mosst/blob/master/fairseq/models/speech_to_text/convtransformer_wav2vec_cif.py
25
+ def resize(alphas, target_lengths, threshold=0.999):
26
+ """
27
+ alpha in thresh=1.0 | (0.0, +0.21)
28
+ target_lengths: if None, apply round and resize, else apply scaling
29
+ """
30
+ # sum
31
+ _num = alphas.sum(-1)
32
+ num = target_lengths.float()
33
+ # scaling
34
+ _alphas = alphas * (num / _num)[:, None].repeat(1, alphas.size(1))
35
+ # remove attention values that exceed the threshold
36
+ count = 0
37
+ while len(torch.where(_alphas > threshold)[0]):
38
+ count += 1
39
+ if count > 10:
40
+ break
41
+ xs, ys = torch.where(_alphas > threshold)
42
+ for x, y in zip(xs, ys):
43
+ if _alphas[x][y] >= threshold:
44
+ mask = _alphas[x].ne(0).float()
45
+ mean = 0.5 * _alphas[x].sum() / mask.sum()
46
+ _alphas[x] = _alphas[x] * 0.5 + mean * mask
47
+
48
+ return _alphas, _num
49
+
50
+ def fire_at_boundary(chunked_encoder_feature: torch.Tensor, cif_linear):
51
+ content_mel_len = chunked_encoder_feature.shape[1] # B, T, D
52
+ alphas = cif_linear(chunked_encoder_feature).squeeze(dim=2) # B, T
53
+ alphas = torch.sigmoid(alphas)
54
+ decode_length = torch.round(alphas.sum(-1)).int()
55
+ alphas, _ = resize(alphas, decode_length)
56
+ alphas = alphas.squeeze(0) # (T, )
57
+ threshold = 0.999
58
+ integrate = torch.cumsum(alphas[:-1], dim=0) # ignore the peak value at the end of the content chunk
59
+ exceed_count = integrate[-1] // threshold
60
+ integrate = integrate - exceed_count*1.0 # subtract 1 every time the integrated value exceeds the threshold
61
+ important_positions = (integrate >= 0).nonzero(as_tuple=True)[0]
62
+ if important_positions.numel() == 0:
63
+ return False
64
+ else:
65
+ return important_positions[0] >= content_mel_len-2
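Usage note (illustrative): a shape-only sketch of fire_at_boundary above, using an untrained linear layer and random encoder features. Sizes are made up; in real use the layer comes from load_cif and the features from the Whisper encoder.

    import torch
    from whisperlivekit.simul_whisper.eow_detection import fire_at_boundary

    encoder_feature = torch.randn(1, 150, 512)   # (B, T, D) chunk of encoder states
    cif_linear = torch.nn.Linear(512, 1)         # stands in for the trained CIF checkpoint
    print(fire_at_boundary(encoder_feature, cif_linear))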
whisperlivekit/simul_whisper/generation_progress.py ADDED
@@ -0,0 +1,43 @@
1
+ class Tokens:
2
+ def __init__(self, tokens):
3
+ self.tokens = tokens
4
+
5
+ # def clone(self):
6
+ # return Tokens(self.tokens.clone())
7
+
8
+ def __str__(self):
9
+ return str(self.tokens.tolist())
10
+
11
+ def __repr__(self):
12
+ return self.__str__()
13
+
14
+ class BeamTokens(Tokens):
15
+ def __init__(self, tokens, beam_size):
16
+ self.tokens = tokens
17
+ self.beam_size = beam_size
18
+
19
+ def clone(self):
20
+ return BeamTokens(self.tokens.clone(), self.beam_size)
21
+
22
+ def __str__(self):
23
+ return f"BeamTokens({self.tokens.tolist()}, beam_size={self.beam_size})"
24
+
25
+ def __repr__(self):
26
+ return self.__str__()
27
+
28
+ def as_text(self, tokenizer):
29
+ return tokenizer.decode(self.tokens)
30
+
31
+ class Logits(Tokens):
32
+ def __init__(self, logits):
33
+ super().__init__(logits)
34
+
35
+ # def clone(self):
36
+ # return Logits(self.tokens.clone(), self.beam_size)
37
+
38
+ def __str__(self):
39
+ # return "abc"
40
+ return f"Logits({self.tokens.shape})"
41
+
42
+ def __repr__(self):
43
+ return self.__str__()
whisperlivekit/simul_whisper/license_simulstreaming.py ADDED
@@ -0,0 +1,5 @@
1
+ SIMULSTREAMING_LICENSE = f"""
2
+ SimulStreaming backend is dual-licensed:
3
+ • Non-Commercial Use: PolyForm Noncommercial License 1.0.0.
4
+ • Commercial Use: Check SimulStreaming README (github.com/ufal/SimulStreaming) for more details.
5
+ """
whisperlivekit/simul_whisper/simul_whisper.py ADDED
@@ -0,0 +1,602 @@
1
+ # This code was originally in simul_whisper/transcriber/simul_whisper.py . It is adapted a lot for SimulStreaming.
2
+
3
+ import os
4
+ import logging
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ from .whisper import load_model, DecodingOptions, tokenizer
10
+ from .config import AlignAttConfig
11
+ from .whisper.audio import log_mel_spectrogram, TOKENS_PER_SECOND, pad_or_trim, N_SAMPLES, N_FRAMES
12
+ from .whisper.timing import median_filter
13
+ from .whisper.decoding import GreedyDecoder, BeamSearchDecoder, SuppressTokens, detect_language
14
+ from .beam import BeamPyTorchInference
15
+ from .eow_detection import fire_at_boundary, load_cif
16
+ import os
17
+
18
+ from .token_buffer import TokenBuffer
19
+
20
+ import numpy as np
21
+ from .generation_progress import *
22
+
23
+ DEC_PAD = 50257
24
+ logger = logging.getLogger(__name__)
25
+
26
+ import sys
27
+ import wave
28
+
29
+ # New features added to the original version of Simul-Whisper:
30
+ # - large-v3 model support
31
+ # - translation support
32
+ # - beam search
33
+ # - prompt -- static vs. non-static
34
+ # - context
35
+ class PaddedAlignAttWhisper:
36
+ def __init__(self, cfg: AlignAttConfig, loaded_model=None) -> None:
37
+ self.log_segments = 0
38
+ model_name = os.path.basename(cfg.model_path).replace(".pt", "")
39
+ model_path = os.path.dirname(os.path.abspath(cfg.model_path))
40
+ if loaded_model:
41
+ self.model = loaded_model
42
+ else:
43
+ self.model = load_model(name=model_name, download_root=model_path)
44
+
45
+ logger.info(f"Model dimensions: {self.model.dims}")
46
+
47
+ self.decode_options = DecodingOptions(
48
+ language = cfg.language,
49
+ without_timestamps = True,
50
+ task=cfg.task
51
+ )
52
+ self.tokenizer_is_multilingual = not model_name.endswith(".en")
53
+ self.create_tokenizer(cfg.language if cfg.language != "auto" else None)
54
+ self.detected_language = cfg.language if cfg.language != "auto" else None
55
+
56
+ self.max_text_len = self.model.dims.n_text_ctx
57
+ self.num_decoder_layers = len(self.model.decoder.blocks)
58
+ self.cfg = cfg
59
+
60
+ # model to detect end-of-word boundary at the end of the segment
61
+ self.CIFLinear, self.always_fire, self.never_fire = load_cif(cfg,
62
+ n_audio_state=self.model.dims.n_audio_state,
63
+ device=self.model.device)
64
+
65
+ # install hooks to access encoder-decoder attention
66
+ self.dec_attns = []
67
+ def layer_hook(module, net_input, net_output):
68
+ # net_output[1]: B*num_head*token_len*audio_len
69
+ t = F.softmax(net_output[1], dim=-1)
70
+ self.dec_attns.append(t.squeeze(0))
71
+ for b in self.model.decoder.blocks:
72
+ b.cross_attn.register_forward_hook(layer_hook)
73
+
74
+ self.kv_cache = {}
75
+ def kv_hook(module: torch.nn.Linear, _, net_output: torch.Tensor):
76
+ if module.cache_id not in self.kv_cache or net_output.shape[1] > self.max_text_len:
77
+ # save as-is, for the first token or cross attention
78
+ self.kv_cache[module.cache_id] = net_output
79
+ else:
80
+ x = self.kv_cache[module.cache_id]
81
+ self.kv_cache[module.cache_id] = torch.cat([x, net_output], dim=1).detach()
82
+ return self.kv_cache[module.cache_id]
83
+
84
+ for i,b in enumerate(self.model.decoder.blocks):
85
+ b.attn.key.register_forward_hook(kv_hook)
86
+ b.attn.value.register_forward_hook(kv_hook)
87
+ b.cross_attn.key.register_forward_hook(kv_hook)
88
+ b.cross_attn.value.register_forward_hook(kv_hook)
89
+
90
+ self.align_source = {}
91
+ self.num_align_heads = 0
92
+ for layer_rank, head_id in self.model.alignment_heads.indices().T:
93
+ layer_rank = layer_rank.item()
94
+ heads = self.align_source.get(layer_rank, [])
95
+ heads.append((self.num_align_heads, head_id.item()))
96
+ self.align_source[layer_rank] = heads
97
+ self.num_align_heads += 1
98
+
99
+
100
+ # tokens to be suppressed from decoding, to prevent hallucinations
101
+ suppress_tokens = [
102
+ self.tokenizer.transcribe,
103
+ self.tokenizer.translate,
104
+ self.tokenizer.sot,
105
+ self.tokenizer.sot_prev,
106
+ self.tokenizer.sot_lm,
107
+ # self.tokenizer.eot
108
+ self.tokenizer.no_timestamps, # added by DM
109
+ ] + list(self.tokenizer.all_language_tokens) # added by DM
110
+ if self.tokenizer.no_speech is not None:
111
+ suppress_tokens.append(self.tokenizer.no_speech)
112
+ suppress_tokens = tuple(sorted(set(suppress_tokens)))
113
+ logger.debug(f"Suppress tokens: {suppress_tokens}")
114
+ sup_tokens = SuppressTokens(suppress_tokens)
115
+ self.suppress_tokens = lambda logits: sup_tokens.apply(logits, None)
116
+ # blank tokens are suppressed for new segments near line 334
117
+
118
+ # it's going to be regenerated after lang id
119
+ self.segments = []
120
+ self.init_tokens()
121
+
122
+ self.last_attend_frame = -self.cfg.rewind_threshold
123
+
124
+ if self.cfg.max_context_tokens is None:
125
+ self.max_context_tokens = self.max_text_len
126
+ else:
127
+ self.max_context_tokens = self.cfg.max_context_tokens
128
+ self.init_context()
129
+
130
+ # decoder type: greedy or beam
131
+ if cfg.decoder_type == "greedy":
132
+ logger.info("Using greedy decoder")
133
+ self.token_decoder = GreedyDecoder(0.0, self.tokenizer.eot)
134
+ self.decoder_type = "greedy"
135
+
136
+ elif cfg.decoder_type == "beam":
137
+ self.decoder_type = "beam"
138
+ self.inference = BeamPyTorchInference(self.model, self.initial_token_length)
139
+ self.inference.kv_cache = self.kv_cache
140
+
141
+ self.token_decoder = BeamSearchDecoder(inference=self.inference, eot=self.tokenizer.eot, beam_size=cfg.beam_size)
142
+
143
+ def create_tokenizer(self, language=None):
144
+ self.tokenizer = tokenizer.get_tokenizer(
145
+ multilingual=self.tokenizer_is_multilingual,
146
+ language=language,
147
+ num_languages=self.model.num_languages,
148
+ task=self.decode_options.task
149
+ )
150
+
151
+ def init_context(self):
152
+ kw = {'tokenizer': self.tokenizer,
153
+ 'device': self.model.device,
154
+ 'prefix_token_ids': [self.tokenizer.sot_prev]}
155
+ self.context = TokenBuffer.empty(**kw)
156
+ if self.cfg.static_init_prompt is not None:
157
+ self.context = TokenBuffer.from_text(self.cfg.static_init_prompt, **kw)
158
+ if self.cfg.init_prompt is not None:
159
+ self.context.text += self.cfg.init_prompt
160
+
161
+ def init_tokens(self):
162
+ logger.debug(f"init tokens, {len(self.segments)}")
163
+ # init tokens (mandatory prompt)
164
+ self.initial_tokens = torch.tensor(
165
+ self.tokenizer.sot_sequence_including_notimestamps,
166
+ dtype=torch.long,
167
+ device=self.model.device).unsqueeze(0)
168
+ self.initial_token_length = self.initial_tokens.shape[1]
169
+ self.sot_index = self.tokenizer.sot_sequence.index(self.tokenizer.sot)
170
+ # self.segments = []
171
+ logger.debug(f"init tokens after, {len(self.segments)}")
172
+ self.tokens = [self.initial_tokens]
173
+
174
+ def trim_context(self):
175
+ logger.info("Trimming context")
176
+ c = len(self.context.as_token_ids()) - len(self.context.prefix_token_ids)
177
+ # logger.debug(f"c= {len(self.context.as_token_ids())}, {len(self.context.prefix_token_ids)}")
178
+ logger.info(f"Context text: {self.context.as_text()}")
179
+ # logger.debug(f"Context tensor: {self.context.as_tensor()}")
180
+ l = sum(t.shape[1] for t in self.tokens) + c
181
+ # logger.debug(f"len {l}, c {c}, max_context_tokens {self.max_context_tokens}")
182
+ if self.cfg.static_init_prompt is None:
183
+ after = 0
184
+ else:
185
+ after = len(self.cfg.static_init_prompt)
186
+ # logger.debug(f"len {l}, c {c}, max_context_tokens {self.max_context_tokens}")
187
+ while c > self.max_context_tokens or l > self.max_text_len - 20:
188
+ t = self.context.trim_words(after=after)
189
+ l -= t
190
+ c -= t
191
+ logger.debug(f"len {l}, c {c}, max_context_tokens {self.max_context_tokens}")
192
+ if t == 0:
193
+ break
194
+ # logger.debug(f"len {l}, c {c}, max_context_tokens {self.max_context_tokens}")
195
+ logger.info(f"Context after trim: {self.context.text} (len: {l})")
196
+
197
+
198
+ def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor) -> torch.Tensor:
199
+ if self.cfg.decoder_type == "greedy":
200
+ logit = self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
201
+ else:
202
+ logger.debug(f"Logits shape: {tokens.shape}")
203
+ logit = self.inference.logits(tokens, audio_features)
204
+ return logit
205
+
206
+
207
+ def refresh_segment(self, complete=False):
208
+
209
+ logger.debug("Refreshing segment:")
210
+ self.init_tokens()
211
+ self.last_attend_frame = -self.cfg.rewind_threshold
212
+ self.detected_language = None
213
+ self.init_context()
214
+ logger.debug(f"Context: {self.context}")
215
+ if not complete and len(self.segments) > 2:
216
+ logger.debug("keeping last two segments because they are and it is not complete.")
217
+ self.segments = self.segments[-2:]
218
+ else:
219
+ logger.debug("removing all segments.")
220
+ self.segments = []
221
+ self.log_segments += 1
222
+
223
+
224
+ def fire_at_boundary(self, chunked_encoder_feature: torch.Tensor):
225
+ if self.always_fire: return True
226
+ if self.never_fire: return False
227
+ return fire_at_boundary(chunked_encoder_feature, self.CIFLinear)
228
+
229
+
230
+ def _current_tokens(self):
231
+
232
+ toks = self.tokens
233
+ # very first infer: duplicate start of seq to beam_size
234
+ if toks[0].shape[0] == 1:
235
+ toks[0] = toks[0].repeat_interleave(self.cfg.beam_size,dim=0)
236
+
237
+ if not self.context.is_empty():
238
+ context_toks = self.context.as_tensor_beam(self.cfg.beam_size, device=self.model.device)
239
+ toks = [context_toks] + toks
240
+
241
+ # make it one tensor
242
+ if len(toks) > 1:
243
+ current_tokens = torch.cat(toks, dim=1)
244
+ else:
245
+ current_tokens = toks[0]
246
+ logger.debug("debug print current_tokens:")
247
+ self.debug_print_tokens(current_tokens)
248
+ return current_tokens
249
+
250
+
251
+ def debug_print_tokens(self, tokens):
252
+ for i in range(self.cfg.beam_size):
253
+ logger.debug(self.tokenizer.decode_with_timestamps(tokens[i].tolist()))
254
+
255
+ ### audio buffer
256
+
257
+ def segments_len(self):
258
+ segments_len = sum(s.shape[0] for s in self.segments) / 16000
259
+ return segments_len
260
+
261
+ def _apply_minseglen(self):
262
+ segments_len = self.segments_len()
263
+ # wait for long enough audio to start
264
+ if segments_len < self.cfg.audio_min_len:
265
+ logger.debug("waiting for next segment")
266
+ return False
267
+ return True
268
+
269
+ def insert_audio(self, segment=None):
270
+ if segment is not None:
271
+ self.segments.append(segment)
272
+
273
+ removed_len = 0
274
+ # if the buffered audio exceeds audio_max_len, remove segments from the beginning
275
+ segments_len = self.segments_len()
276
+ while len(self.segments) > 1 and segments_len > self.cfg.audio_max_len:
277
+ removed_len = self.segments[0].shape[0] / 16000
278
+ segments_len -= removed_len
279
+ self.last_attend_frame -= int(TOKENS_PER_SECOND*removed_len)
280
+ self.segments = self.segments[1:]
281
+ logger.debug(f"remove segments: {len(self.segments)} {len(self.tokens)}")
282
+ if len(self.tokens) > 1:
283
+ self.context.append_token_ids(self.tokens[1][0,:])
284
+ self.tokens = [self.initial_tokens] + self.tokens[2:]
285
+ return removed_len
286
+
287
+ def _clean_cache(self):
288
+ '''clean the cache that stores the attention matrices and kv_cache.
289
+ It must be called every time after generation with the model.'''
290
+ # cleaning cache
291
+ self.dec_attns = []
292
+ self.kv_cache = {}
293
+ if self.decoder_type == "beam":
294
+ self.inference.kv_cache = self.kv_cache
295
+ self.token_decoder.reset()
296
+
297
+ @torch.no_grad()
298
+ def lang_id(self, encoder_features):
299
+ """Language detection from encoder features.
300
+ This code is trimmed and copy-pasted from whisper.decoding.detect_language.
301
+ """
302
+
303
+ # forward pass using a single token, startoftranscript
304
+ n_audio = encoder_features.shape[0]
305
+ x = torch.tensor([[self.tokenizer.sot]] * n_audio).to(self.model.device) # [n_audio, 1]
306
+ logits = self.model.logits(x, encoder_features)[:, 0]
307
+
308
+ # collect detected languages; suppress all non-language tokens
309
+ mask = torch.ones(logits.shape[-1], dtype=torch.bool)
310
+ mask[list(self.tokenizer.all_language_tokens)] = False
311
+ logits[:, mask] = -np.inf
312
+ language_tokens = logits.argmax(dim=-1)
313
+ language_token_probs = logits.softmax(dim=-1).cpu()
314
+ language_probs = [
315
+ {
316
+ c: language_token_probs[i, j].item()
317
+ for j, c in zip(self.tokenizer.all_language_tokens, self.tokenizer.all_language_codes)
318
+ }
319
+ for i in range(n_audio)
320
+ ]
321
+
322
+ single = encoder_features.ndim == 2
323
+ if single:
324
+ language_tokens = language_tokens[0]
325
+ language_probs = language_probs[0]
326
+
327
+ self._clean_cache()
328
+ return language_tokens, language_probs
329
+
330
+ ### transcription / translation
331
+
332
+ @torch.no_grad()
333
+ def infer(self, is_last=False):
334
+ new_segment = True
335
+ if len(self.segments) == 0:
336
+ logger.debug("No segments, nothing to do")
337
+ return [], {}
338
+ if not self._apply_minseglen():
339
+ logger.debug(f"applied minseglen {self.cfg.audio_min_len} > {self.segments_len()}.")
340
+ input_segments = torch.cat(self.segments, dim=0)
341
+ return [], {}
342
+
343
+ # input_segments is concatenation of audio, it's one array
344
+ if len(self.segments) > 1:
345
+ input_segments = torch.cat(self.segments, dim=0)
346
+ else:
347
+ input_segments = self.segments[0]
348
+
349
+
350
+
351
+ # mel + padding to 30s
352
+ mel_padded = log_mel_spectrogram(input_segments, n_mels=self.model.dims.n_mels, padding=N_SAMPLES,
353
+ device=self.model.device).unsqueeze(0)
354
+ # trim to 3000
355
+ mel = pad_or_trim(mel_padded, N_FRAMES)
356
+
357
+ # the len of actual audio
358
+ content_mel_len = int((mel_padded.shape[2] - mel.shape[2])/2)
359
+
360
+ # encode
361
+ encoder_feature = self.model.encoder(mel)
362
+
363
+ # logger.debug(f"Encoder feature shape: {encoder_feature.shape}")
364
+ # if mel.shape[-2:] != (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state):
365
+ # logger.debug("mel ")
366
+ if self.cfg.language == "auto" and self.detected_language is None:
367
+ language_tokens, language_probs = self.lang_id(encoder_feature)
368
+ logger.debug(f"Language tokens: {language_tokens}, probs: {language_probs}")
369
+ top_lan, p = max(language_probs[0].items(), key=lambda x: x[1])
370
+ logger.info(f"Detected language: {top_lan} with p={p:.4f}")
371
+ #self.tokenizer.language = top_lan
372
+ #self.tokenizer.__post_init__()
373
+ self.create_tokenizer(top_lan)
374
+ self.detected_language = top_lan
375
+ self.init_tokens()
376
+ logger.info(f"Tokenizer language: {self.tokenizer.language}, {self.tokenizer.sot_sequence_including_notimestamps}")
377
+
378
+ self.trim_context()
379
+ current_tokens = self._current_tokens()
380
+ #
381
+ fire_detected = self.fire_at_boundary(encoder_feature[:, :content_mel_len, :])
382
+
383
+
384
+ ####################### Decoding loop
385
+ logger.info("Decoding loop starts\n")
386
+
387
+ sum_logprobs = torch.zeros(self.cfg.beam_size, device=mel.device)
388
+ completed = False
389
+
390
+ attn_of_alignment_heads = None
391
+ most_attended_frame = None
392
+
393
+ token_len_before_decoding = current_tokens.shape[1]
394
+
395
+ generation_progress = []
396
+ generation = {
397
+ "starting_tokens": BeamTokens(current_tokens[0,:].clone(), self.cfg.beam_size),
398
+ "token_len_before_decoding": token_len_before_decoding,
399
+ #"fire_detected": fire_detected,
400
+ "frames_len": content_mel_len,
401
+ "frames_threshold": 4 if is_last else self.cfg.frame_threshold,
402
+
403
+ # to be filled later
404
+ "logits_starting": None,
405
+
406
+ # to be filled later
407
+ "no_speech_prob": None,
408
+ "no_speech": False,
409
+
410
+ # to be filled in the loop
411
+ "progress": generation_progress,
412
+ }
413
+ while not completed and current_tokens.shape[1] < self.max_text_len: # bos is 3 tokens
414
+ generation_progress_loop = []
415
+
416
+ if new_segment:
417
+ tokens_for_logits = current_tokens
418
+ else:
419
+ # only need to use the last token except in the first forward pass
420
+ tokens_for_logits = current_tokens[:,-1:]
421
+
422
+ logits = self.logits(tokens_for_logits, encoder_feature) # B, len(tokens), token dict size
423
+ if new_segment:
424
+ generation["logits_starting"] = Logits(logits[:,:,:])
425
+
426
+ if new_segment and self.tokenizer.no_speech is not None:
427
+ probs_at_sot = logits[:, self.sot_index, :].float().softmax(dim=-1)
428
+ no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
429
+ generation["no_speech_prob"] = no_speech_probs[0]
430
+ if no_speech_probs[0] > self.cfg.nonspeech_prob:
431
+ generation["no_speech"] = True
432
+ logger.info("no speech, stop")
433
+ break
434
+
435
+ logits = logits[:, -1, :] # logits for the last token
436
+ generation_progress_loop.append(("logits_before_suppress",Logits(logits)))
437
+
438
+ # suppress blank tokens only at the beginning of the segment
439
+ if new_segment:
440
+ logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf
441
+ new_segment = False
442
+ self.suppress_tokens(logits)
443
+ #generation_progress_loop.append(("logits_after_suppres",BeamLogits(logits[0,:].clone(), self.cfg.beam_size)))
444
+ generation_progress_loop.append(("logits_after_suppress",Logits(logits)))
445
+
446
+ current_tokens, completed = self.token_decoder.update(current_tokens, logits, sum_logprobs)
447
+ generation_progress_loop.append(("beam_tokens",Tokens(current_tokens[:,-1].clone())))
448
+ generation_progress_loop.append(("sum_logprobs",sum_logprobs.tolist()))
449
+ generation_progress_loop.append(("completed",completed))
450
+
451
+ logger.debug(f"Decoding completed: {completed}, sum_logprobs: {sum_logprobs.tolist()}, tokens: ")
452
+ self.debug_print_tokens(current_tokens)
453
+
454
+
455
+ # if self.decoder_type == "beam":
456
+ # logger.debug(f"Finished sequences: {self.token_decoder.finished_sequences}")
457
+
458
+ # logprobs = F.log_softmax(logits.float(), dim=-1)
459
+ # idx = 0
460
+ # logger.debug(f"Beam search topk: {logprobs[idx].topk(self.cfg.beam_size + 1)}")
461
+ # logger.debug(f"Greedy search argmax: {logits.argmax(dim=-1)}")
462
+ # if completed:
463
+ # self.debug_print_tokens(current_tokens)
464
+
465
+ # logger.debug("decode stopped because decoder completed")
466
+
467
+ attn_of_alignment_heads = [[] for _ in range(self.num_align_heads)]
468
+ for i, attn_mat in enumerate(self.dec_attns):
469
+ layer_rank = int(i % len(self.model.decoder.blocks))
470
+ align_heads_in_layer = self.align_source.get(layer_rank, [])
471
+ if len(align_heads_in_layer) == 0:
472
+ continue
473
+ for align_head_rank, head_id in align_heads_in_layer:
474
+ if self.cfg.beam_size == 1:
475
+ a = attn_mat[head_id, :, :]
476
+ a = a.unsqueeze(0)
477
+ else:
478
+ a = attn_mat[:, head_id, :, :]
479
+ attn_of_alignment_heads[align_head_rank].append(a)
480
+ tmp = []
481
+ for mat in attn_of_alignment_heads:
482
+ t = torch.cat(mat, dim=1)
483
+ tmp.append(t)
484
+ attn_of_alignment_heads = torch.stack(tmp, dim=1)
485
+ # logger.debug(str(attn_of_alignment_heads.shape) + " here")
486
+ std, mean = torch.std_mean(attn_of_alignment_heads, dim=-2, keepdim=True, unbiased=False)
487
+ attn_of_alignment_heads = (attn_of_alignment_heads - mean) / std
488
+ attn_of_alignment_heads = median_filter(attn_of_alignment_heads, 7) # from whisper.timing
489
+ attn_of_alignment_heads = attn_of_alignment_heads.mean(dim=1)
490
+ # logger.debug(str(attn_of_alignment_heads.shape) + " after mean")
491
+ attn_of_alignment_heads = attn_of_alignment_heads[:,:, :content_mel_len]
492
+ # logger.debug(str(attn_of_alignment_heads.shape) + " then ")
493
+
494
+ # for each beam, the most attended frame is:
495
+ most_attended_frames = torch.argmax(attn_of_alignment_heads[:,-1,:], dim=-1)
496
+ generation_progress_loop.append(("most_attended_frames",most_attended_frames.clone().tolist()))
497
+ logger.debug(str(most_attended_frames.tolist()) + " most att frames")
498
+
499
+ most_attended_frame = most_attended_frames[0].item()
500
+
501
+
502
+ generation_progress.append(dict(generation_progress_loop))
503
+ logger.debug("current tokens" + str(current_tokens.shape))
504
+ if completed:
505
+ # # stripping the last token, the eot
506
+ current_tokens = current_tokens[:, :-1]
507
+ break
508
+
509
+ # for some rare cases where the attention fails
510
+ if not is_last and self.last_attend_frame - most_attended_frame > self.cfg.rewind_threshold:
511
+ # TODO: check this
512
+ if current_tokens.shape[1] > 1 and current_tokens[0, -2] >= DEC_PAD:
513
+ logger.debug("ommit rewinding from special tokens")
514
+ self.last_attend_frame = most_attended_frame
515
+ else:
516
+ logger.debug(
517
+ f"[rewind detected] current attention pos: {most_attended_frame}, "
518
+ f"last attention pos: {self.last_attend_frame}; omit this segment")
519
+ self.last_attend_frame = -self.cfg.rewind_threshold
520
+ current_tokens = torch.cat(self.tokens, dim=1) if len(self.tokens) > 0 else self.tokens[0]
521
+ break
522
+ else:
523
+ self.last_attend_frame = most_attended_frame
524
+
525
+ if content_mel_len - most_attended_frame <= (4 if is_last else self.cfg.frame_threshold):
526
+ logger.debug(f"attention reaches the end: {most_attended_frame}/{content_mel_len}")
527
+ # stripping the last token, the one that is attended too close to the end
528
+ current_tokens = current_tokens[:, :-1]
529
+ break
530
+
531
+ # debug print
532
+ for i in range(self.cfg.beam_size):
533
+ logger.debug("attn: {}, current pos: {}, current token: {}({})".format(
534
+ attn_of_alignment_heads.shape if attn_of_alignment_heads is not None else None,
535
+ most_attended_frames[i],
536
+ current_tokens[i, -1].item(),
537
+ self.tokenizer.decode([current_tokens[i, -1].item()])
538
+ ))
539
+
540
+ # for k,v in generation.items():
541
+ # print(k,v,file=sys.stderr)
542
+ # for x in generation_progress:
543
+ # for y in x.items():
544
+ # print("\t\t",*y,file=sys.stderr)
545
+ # print("\t","----", file=sys.stderr)
546
+ # print("\t", "end of generation_progress_loop", file=sys.stderr)
547
+ # sys.exit(1)
548
+ ####################### End of decoding loop
549
+
550
+ logger.info("End of decoding loop")
551
+
552
+ # if attn_of_alignment_heads is not None:
553
+ # seg_len = int(segment.shape[0] / 16000 * TOKENS_PER_SECOND)
554
+
555
+ # # Let's now consider only the top hypothesis in the beam search
556
+ # top_beam_attn_of_alignment_heads = attn_of_alignment_heads[0]
557
+
558
+ # # debug print: how is the new token attended?
559
+ # new_token_attn = top_beam_attn_of_alignment_heads[token_len_before_decoding:, -seg_len:]
560
+ # logger.debug(f"New token attention shape: {new_token_attn.shape}")
561
+ # if new_token_attn.shape[0] == 0: # it's not attended in the current audio segment
562
+ # logger.debug("no token generated")
563
+ # else: # it is, and the max attention is:
564
+ # new_token_max_attn, _ = new_token_attn.max(dim=-1)
565
+ # logger.debug(f"segment max attention: {new_token_max_attn.mean().item()/len(self.segments)}")
566
+
567
+
568
+ # let's now operate only with the top beam hypothesis
569
+ tokens_to_split = current_tokens[0, token_len_before_decoding:]
570
+ if fire_detected or is_last:
571
+ new_hypothesis = tokens_to_split.flatten().tolist()
572
+ else:
573
+ # going to truncate the tokens after the last space
574
+ split_words, split_tokens = self.tokenizer.split_to_word_tokens(tokens_to_split.tolist())
575
+ generation["result"] = {"split_words": split_words[:-1], "split_tokens": split_tokens[:-1]}
576
+ generation["result_truncated"] = {"split_words": split_words[-1:], "split_tokens": split_tokens[-1:]}
577
+
578
+ # text_to_split = self.tokenizer.decode(tokens_to_split)
579
+ # logger.debug(f"text_to_split: {text_to_split}")
580
+ # logger.debug("text at current step: {}".format(text_to_split.replace(" ", "<space>")))
581
+ # text_before_space = " ".join(text_to_split.split(" ")[:-1])
582
+ # logger.debug("before the last space: {}".format(text_before_space.replace(" ", "<space>")))
583
+ if len(split_words) > 1:
584
+ new_hypothesis = [i for sublist in split_tokens[:-1] for i in sublist]
585
+ else:
586
+ new_hypothesis = []
587
+
588
+
589
+ ### new hypothesis
590
+ logger.debug(f"new_hypothesis: {new_hypothesis}")
591
+ new_tokens = torch.tensor([new_hypothesis], dtype=torch.long).repeat_interleave(self.cfg.beam_size, dim=0).to(
592
+ device=self.model.device,
593
+ )
594
+ self.tokens.append(new_tokens)
595
+ # TODO: test if this is redundant or not
596
+ # ret = ret[ret<DEC_PAD]
597
+
598
+ logger.info(f"Output: {self.tokenizer.decode(new_hypothesis)}")
599
+
600
+ self._clean_cache()
601
+
602
+ return new_hypothesis, generation
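For orientation, here is a minimal sketch of how this class can be driven from a streaming loop. It is not part of the commit: the AlignAttConfig keyword names are assumptions (its real constructor lives in config.py, not shown in this excerpt), and the chunk source is hypothetical.

import torch
from whisperlivekit.simul_whisper.config import AlignAttConfig
from whisperlivekit.simul_whisper.simul_whisper import PaddedAlignAttWhisper

# Assumed config fields; see config.py for the actual signature.
cfg = AlignAttConfig(model_path="large-v3.pt", language="auto", task="transcribe")
asr = PaddedAlignAttWhisper(cfg)

def on_chunk(pcm_f32: torch.Tensor, is_last: bool = False) -> str:
    """pcm_f32: mono 16 kHz float32 samples of one incoming chunk."""
    asr.insert_audio(pcm_f32)                  # append to the rolling audio buffer
    tokens, _generation = asr.infer(is_last=is_last)
    return asr.tokenizer.decode(tokens) if tokens else ""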
whisperlivekit/simul_whisper/token_buffer.py ADDED
@@ -0,0 +1,73 @@
1
+ import torch
2
+ import sys
3
+ class TokenBuffer:
4
+
5
+ def __init__(self, text="", tokenizer=None, device=None, prefix_token_ids=[]):
6
+ self.text = text
7
+ self.prefix_token_ids = prefix_token_ids
8
+ self.tokenizer = tokenizer
9
+ self.device = device
10
+
11
+ def as_token_ids(self, tokenizer=None):
12
+
13
+ if tokenizer is None:
14
+ tokenizer = self.tokenizer
15
+ if tokenizer is None:
16
+ raise ValueError("Tokenizer is not set.")
17
+ return self.prefix_token_ids + tokenizer.encode(self.text)
18
+
19
+ def as_tensor(self, device=None):
20
+ if device is None:
21
+ device = self.device
22
+ if device is None:
23
+ raise ValueError("Device is not set.")
24
+ tok_ids = self.as_token_ids()
25
+ return torch.tensor(tok_ids,
26
+ dtype=torch.long, device=device).unsqueeze(0)
27
+
28
+ def as_tensor_beam(self, beam, device=None):
29
+ t = self.as_tensor(device=device)
30
+ return t.repeat_interleave(beam, dim=0)
31
+
32
+
33
+ def as_text(self):
34
+ return self.text
35
+
36
+ @staticmethod
37
+ def empty(*a, **kw):
38
+ return TokenBuffer(*a,**kw)
39
+
40
+ @staticmethod
41
+ def from_text(text, *a, **kw):
42
+ return TokenBuffer(*a, text=text, **kw)
43
+
44
+ def is_empty(self):
45
+ return self.text is None or self.text == ""
46
+
47
+ def trim_words(self, num=1, after=0):
48
+ '''
49
+ num: how many words to trim from the beginning
50
+ after: how many characters to skip (length of the static prompt)
51
+ '''
52
+ tokenizer = self.tokenizer
53
+ assert tokenizer is not None, "Tokenizer is not set."
54
+
55
+ ids = tokenizer.encode(self.text[after:])
56
+ words, wids = self.tokenizer.split_to_word_tokens(ids)
57
+ # print(words, file=sys.stderr)
58
+ # print(wids, file=sys.stderr)
59
+ if not words:
60
+ return 0
61
+ self.text = self.text[:after] + "".join(words[num:])
62
+ return sum(len(wi) for wi in wids[:num])
63
+
64
+ def append_token_ids(self, token_ids):
65
+ tokenizer = self.tokenizer
66
+ assert tokenizer is not None, "Tokenizer is not set."
67
+ self.text += self.tokenizer.decode(token_ids)
68
+
69
+ def as_split_word_tokens(self):
70
+ tokenizer = self.tokenizer
71
+ assert tokenizer is not None, "Tokenizer is not set."
72
+ ids = tokenizer.encode(self.text)
73
+ return tokenizer.split_to_word_tokens(ids)
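A brief usage sketch of TokenBuffer (not part of the commit); `tok` stands for a Whisper tokenizer instance such as the one returned by the vendored whisper.tokenizer.get_tokenizer:

import torch
from whisperlivekit.simul_whisper.token_buffer import TokenBuffer

# tok = get_tokenizer(multilingual=True, language="en", task="transcribe")
buf = TokenBuffer.from_text("earlier context", tokenizer=tok,
                            device=torch.device("cpu"),
                            prefix_token_ids=[tok.sot_prev])
buf.append_token_ids(tok.encode(" and more context"))  # grows the stored text
removed = buf.trim_words(num=1)       # drop the first word; returns the number of tokens removed
prompt = buf.as_tensor_beam(beam=5)   # (beam, n_ctx) tensor ready for beam-search decoding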
whisperlivekit/simul_whisper/whisper/__init__.py ADDED
@@ -0,0 +1,160 @@
1
+ import hashlib
2
+ import io
3
+ import os
4
+ import urllib
5
+ import warnings
6
+ from typing import List, Optional, Union
7
+
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+ from .audio import load_audio, log_mel_spectrogram, pad_or_trim
12
+ from .decoding import DecodingOptions, DecodingResult, decode, detect_language
13
+ from .model import ModelDimensions, Whisper
14
+ from .transcribe import transcribe
15
+ from .version import __version__
16
+
17
+ _MODELS = {
18
+ "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
19
+ "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
20
+ "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
21
+ "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
22
+ "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
23
+ "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
24
+ "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
25
+ "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
26
+ "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
27
+ "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
28
+ "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
29
+ "large": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
30
+ "large-v3-turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
31
+ "turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
32
+ }
33
+
34
+ # base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are
35
+ # highly correlated to the word-level timing, i.e. the alignment between audio and text tokens.
36
+ _ALIGNMENT_HEADS = {
37
+ "tiny.en": b"ABzY8J1N>@0{>%R00Bk>$p{7v037`oCl~+#00",
38
+ "tiny": b"ABzY8bu8Lr0{>%RKn9Fp%m@SkK7Kt=7ytkO",
39
+ "base.en": b"ABzY8;40c<0{>%RzzG;p*o+Vo09|#PsxSZm00",
40
+ "base": b"ABzY8KQ!870{>%RzyTQH3`Q^yNP!>##QT-<FaQ7m",
41
+ "small.en": b"ABzY8>?_)10{>%RpeA61k&I|OI3I$65C{;;pbCHh0B{qLQ;+}v00",
42
+ "small": b"ABzY8DmU6=0{>%Rpa?J`kvJ6qF(V^F86#Xh7JUGMK}P<N0000",
43
+ "medium.en": b"ABzY8usPae0{>%R7<zz_OvQ{)4kMa0BMw6u5rT}kRKX;$NfYBv00*Hl@qhsU00",
44
+ "medium": b"ABzY8B0Jh+0{>%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9",
45
+ "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR<kSfC2yj",
46
+ "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
47
+ "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
48
+ "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
49
+ "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
50
+ "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
51
+ }
52
+
53
+
54
+ def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
55
+ os.makedirs(root, exist_ok=True)
56
+
57
+ expected_sha256 = url.split("/")[-2]
58
+ download_target = os.path.join(root, os.path.basename(url))
59
+
60
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
61
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
62
+
63
+ if os.path.isfile(download_target):
64
+ with open(download_target, "rb") as f:
65
+ model_bytes = f.read()
66
+ if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
67
+ return model_bytes if in_memory else download_target
68
+ else:
69
+ warnings.warn(
70
+ f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file"
71
+ )
72
+
73
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
74
+ with tqdm(
75
+ total=int(source.info().get("Content-Length")),
76
+ ncols=80,
77
+ unit="iB",
78
+ unit_scale=True,
79
+ unit_divisor=1024,
80
+ ) as loop:
81
+ while True:
82
+ buffer = source.read(8192)
83
+ if not buffer:
84
+ break
85
+
86
+ output.write(buffer)
87
+ loop.update(len(buffer))
88
+
89
+ model_bytes = open(download_target, "rb").read()
90
+ if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
91
+ raise RuntimeError(
92
+ "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
93
+ )
94
+
95
+ return model_bytes if in_memory else download_target
96
+
97
+
98
+ def available_models() -> List[str]:
99
+ """Returns the names of available models"""
100
+ return list(_MODELS.keys())
101
+
102
+
103
+ def load_model(
104
+ name: str,
105
+ device: Optional[Union[str, torch.device]] = None,
106
+ download_root: str = None,
107
+ in_memory: bool = False,
108
+ ) -> Whisper:
109
+ """
110
+ Load a Whisper ASR model
111
+
112
+ Parameters
113
+ ----------
114
+ name : str
115
+ one of the official model names listed by `whisper.available_models()`, or
116
+ path to a model checkpoint containing the model dimensions and the model state_dict.
117
+ device : Union[str, torch.device]
118
+ the PyTorch device to put the model into
119
+ download_root: str
120
+ path to download the model files; by default, it uses "~/.cache/whisper"
121
+ in_memory: bool
122
+ whether to preload the model weights into host memory
123
+
124
+ Returns
125
+ -------
126
+ model : Whisper
127
+ The Whisper ASR model instance
128
+ """
129
+
130
+ if device is None:
131
+ device = "cuda" if torch.cuda.is_available() else "cpu"
132
+ if download_root is None:
133
+ default = os.path.join(os.path.expanduser("~"), ".cache")
134
+ download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")
135
+
136
+ if name in _MODELS:
137
+ checkpoint_file = _download(_MODELS[name], download_root, in_memory)
138
+ alignment_heads = _ALIGNMENT_HEADS[name]
139
+ elif os.path.isfile(name):
140
+ checkpoint_file = open(name, "rb").read() if in_memory else name
141
+ alignment_heads = None
142
+ else:
143
+ raise RuntimeError(
144
+ f"Model {name} not found; available models = {available_models()}"
145
+ )
146
+
147
+ with (
148
+ io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")
149
+ ) as fp:
150
+ checkpoint = torch.load(fp, map_location=device)
151
+ del checkpoint_file
152
+
153
+ dims = ModelDimensions(**checkpoint["dims"])
154
+ model = Whisper(dims)
155
+ model.load_state_dict(checkpoint["model_state_dict"])
156
+
157
+ if alignment_heads is not None:
158
+ model.set_alignment_heads(alignment_heads)
159
+
160
+ return model.to(device)
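This loader mirrors upstream openai-whisper; a minimal example of using it (model name and device are illustrative):

from whisperlivekit.simul_whisper.whisper import available_models, load_model

print(available_models())                  # ["tiny.en", "tiny", ..., "turbo"]
model = load_model("base", device="cpu")   # or a local checkpoint path, e.g. load_model("/path/to/large-v3.pt")
print(model.dims)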
whisperlivekit/simul_whisper/whisper/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .transcribe import cli
2
+
3
+ cli()
whisperlivekit/simul_whisper/whisper/assets/__init__.py ADDED
File without changes
whisperlivekit/simul_whisper/whisper/assets/gpt2.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
whisperlivekit/simul_whisper/whisper/assets/mel_filters.npz ADDED
Binary file (4.27 kB). View file
 
whisperlivekit/simul_whisper/whisper/assets/multilingual.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
whisperlivekit/simul_whisper/whisper/audio.py ADDED
1
+ import os
2
+ from functools import lru_cache
3
+ from subprocess import CalledProcessError, run
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ from .utils import exact_div
11
+
12
+ # hard-coded audio hyperparameters
13
+ SAMPLE_RATE = 16000
14
+ N_FFT = 400
15
+ HOP_LENGTH = 160
16
+ CHUNK_LENGTH = 30
17
+ N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
18
+ N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input
19
+
20
+ N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions have stride 2
21
+ FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
22
+ TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
23
+
24
+
25
+ def load_audio(file: str, sr: int = SAMPLE_RATE):
26
+ """
27
+ Open an audio file and read as mono waveform, resampling as necessary
28
+
29
+ Parameters
30
+ ----------
31
+ file: str
32
+ The audio file to open
33
+
34
+ sr: int
35
+ The sample rate to resample the audio if necessary
36
+
37
+ Returns
38
+ -------
39
+ A NumPy array containing the audio waveform, in float32 dtype.
40
+ """
41
+
42
+ # This launches a subprocess to decode audio while down-mixing
43
+ # and resampling as necessary. Requires the ffmpeg CLI in PATH.
44
+ # fmt: off
45
+ cmd = [
46
+ "ffmpeg",
47
+ "-nostdin",
48
+ "-threads", "0",
49
+ "-i", file,
50
+ "-f", "s16le",
51
+ "-ac", "1",
52
+ "-acodec", "pcm_s16le",
53
+ "-ar", str(sr),
54
+ "-"
55
+ ]
56
+ # fmt: on
57
+ try:
58
+ out = run(cmd, capture_output=True, check=True).stdout
59
+ except CalledProcessError as e:
60
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
61
+
62
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
63
+
64
+
65
+ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
66
+ """
67
+ Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
68
+ """
69
+ if torch.is_tensor(array):
70
+ if array.shape[axis] > length:
71
+ array = array.index_select(
72
+ dim=axis, index=torch.arange(length, device=array.device)
73
+ )
74
+
75
+ if array.shape[axis] < length:
76
+ pad_widths = [(0, 0)] * array.ndim
77
+ pad_widths[axis] = (0, length - array.shape[axis])
78
+ array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
79
+ else:
80
+ if array.shape[axis] > length:
81
+ array = array.take(indices=range(length), axis=axis)
82
+
83
+ if array.shape[axis] < length:
84
+ pad_widths = [(0, 0)] * array.ndim
85
+ pad_widths[axis] = (0, length - array.shape[axis])
86
+ array = np.pad(array, pad_widths)
87
+
88
+ return array
89
+
90
+
91
+ @lru_cache(maxsize=None)
92
+ def mel_filters(device, n_mels: int) -> torch.Tensor:
93
+ """
94
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
95
+ Allows decoupling librosa dependency; saved using:
96
+
97
+ np.savez_compressed(
98
+ "mel_filters.npz",
99
+ mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
100
+ mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
101
+ )
102
+ """
103
+ assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
104
+
105
+ filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
106
+ with np.load(filters_path, allow_pickle=False) as f:
107
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
108
+
109
+
110
+ def log_mel_spectrogram(
111
+ audio: Union[str, np.ndarray, torch.Tensor],
112
+ n_mels: int = 80,
113
+ padding: int = 0,
114
+ device: Optional[Union[str, torch.device]] = None,
115
+ ):
116
+ """
117
+ Compute the log-Mel spectrogram of the given audio.
118
+
119
+ Parameters
120
+ ----------
121
+ audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
122
+ The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
123
+
124
+ n_mels: int
125
+ The number of Mel-frequency filters, only 80 and 128 are supported
126
+
127
+ padding: int
128
+ Number of zero samples to pad to the right
129
+
130
+ device: Optional[Union[str, torch.device]]
131
+ If given, the audio tensor is moved to this device before STFT
132
+
133
+ Returns
134
+ -------
135
+ torch.Tensor, shape = (n_mels, n_frames)
136
+ A Tensor that contains the Mel spectrogram
137
+ """
138
+ if not torch.is_tensor(audio):
139
+ if isinstance(audio, str):
140
+ audio = load_audio(audio)
141
+ audio = torch.from_numpy(audio)
142
+
143
+ if device is not None:
144
+ audio = audio.to(device)
145
+ if padding > 0:
146
+ audio = F.pad(audio, (0, padding))
147
+ window = torch.hann_window(N_FFT).to(audio.device)
148
+ stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
149
+ magnitudes = stft[..., :-1].abs() ** 2
150
+
151
+ filters = mel_filters(audio.device, n_mels)
152
+ mel_spec = filters @ magnitudes
153
+
154
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
155
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
156
+ log_spec = (log_spec + 4.0) / 4.0
157
+ return log_spec
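These helpers implement the standard Whisper feature pipeline; a short sketch (the file path is illustrative, and load_audio requires the ffmpeg CLI on PATH):

from whisperlivekit.simul_whisper.whisper.audio import (
    N_FRAMES, SAMPLE_RATE, load_audio, log_mel_spectrogram, pad_or_trim,
)

audio = load_audio("speech.wav", sr=SAMPLE_RATE)  # float32 mono waveform at 16 kHz
mel = log_mel_spectrogram(audio, n_mels=80)       # (80, n_frames), 100 frames per second
mel = pad_or_trim(mel, N_FRAMES)                  # (80, 3000) for the 30-second window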
whisperlivekit/simul_whisper/whisper/decoding.py ADDED
@@ -0,0 +1,826 @@
1
+ from dataclasses import dataclass, field, replace
2
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+ from torch.distributions import Categorical
9
+
10
+ from .audio import CHUNK_LENGTH
11
+ from .tokenizer import Tokenizer, get_tokenizer
12
+ from .utils import compression_ratio
13
+
14
+ if TYPE_CHECKING:
15
+ from .model import Whisper
16
+
17
+
18
+ @torch.no_grad()
19
+ def detect_language(
20
+ model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None
21
+ ) -> Tuple[Tensor, List[dict]]:
22
+ """
23
+ Detect the spoken language in the audio, and return the ids
24
+ of the most probable language tokens, along with the probability distribution over all language tokens.
25
+ This is performed outside the main decode loop in order to not interfere with kv-caching.
26
+
27
+ Returns
28
+ -------
29
+ language_tokens : Tensor, shape = (n_audio,)
30
+ ids of the most probable language tokens, which appears after the startoftranscript token.
31
+ language_probs : List[Dict[str, float]], length = n_audio
32
+ list of dictionaries containing the probability distribution over all languages.
33
+ """
34
+ if tokenizer is None:
35
+ tokenizer = get_tokenizer(
36
+ model.is_multilingual, num_languages=model.num_languages
37
+ )
38
+ if (
39
+ tokenizer.language is None
40
+ or tokenizer.language_token not in tokenizer.sot_sequence
41
+ ):
42
+ raise ValueError(
43
+ "This model doesn't have language tokens so it can't perform lang id"
44
+ )
45
+
46
+ single = mel.ndim == 2
47
+ if single:
48
+ mel = mel.unsqueeze(0)
49
+
50
+ # skip encoder forward pass if already-encoded audio features were given
51
+ if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
52
+ mel = model.encoder(mel)
53
+
54
+ # forward pass using a single token, startoftranscript
55
+ n_audio = mel.shape[0]
56
+ x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device) # [n_audio, 1]
57
+ logits = model.logits(x, mel)[:, 0]
58
+
59
+ # collect detected languages; suppress all non-language tokens
60
+ mask = torch.ones(logits.shape[-1], dtype=torch.bool)
61
+ mask[list(tokenizer.all_language_tokens)] = False
62
+ logits[:, mask] = -np.inf
63
+ language_tokens = logits.argmax(dim=-1)
64
+ language_token_probs = logits.softmax(dim=-1).cpu()
65
+ language_probs = [
66
+ {
67
+ c: language_token_probs[i, j].item()
68
+ for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
69
+ }
70
+ for i in range(n_audio)
71
+ ]
72
+
73
+ if single:
74
+ language_tokens = language_tokens[0]
75
+ language_probs = language_probs[0]
76
+
77
+ return language_tokens, language_probs
78
+
79
+
80
+ @dataclass(frozen=True)
81
+ class DecodingOptions:
82
+ # whether to perform X->X "transcribe" or X->English "translate"
83
+ task: str = "transcribe"
84
+
85
+ # language that the audio is in; uses detected language if None
86
+ language: Optional[str] = None
87
+
88
+ # sampling-related options
89
+ temperature: float = 0.0
90
+ sample_len: Optional[int] = None # maximum number of tokens to sample
91
+ best_of: Optional[int] = None # number of independent sample trajectories, if t > 0
92
+ beam_size: Optional[int] = None # number of beams in beam search, if t == 0
93
+ patience: Optional[float] = None # patience in beam search (arxiv:2204.05424)
94
+
95
+ # "alpha" in Google NMT, or None for length norm, when ranking generations
96
+ # to select which to return among the beams or best-of-N samples
97
+ length_penalty: Optional[float] = None
98
+
99
+ # text or tokens to feed as the prompt or the prefix; for more info:
100
+ # https://github.com/openai/whisper/discussions/117#discussioncomment-3727051
101
+ prompt: Optional[Union[str, List[int]]] = None # for the previous context
102
+ prefix: Optional[Union[str, List[int]]] = None # to prefix the current context
103
+
104
+ # list of tokens ids (or comma-separated token ids) to suppress
105
+ # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
106
+ suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"
107
+ suppress_blank: bool = True # this will suppress blank outputs
108
+
109
+ # timestamp sampling options
110
+ without_timestamps: bool = False # use <|notimestamps|> to sample text tokens only
111
+ max_initial_timestamp: Optional[float] = 1.0
112
+
113
+ # implementation details
114
+ fp16: bool = True # use fp16 for most of the calculation
115
+
116
+
117
+ @dataclass(frozen=True)
118
+ class DecodingResult:
119
+ audio_features: Tensor
120
+ language: str
121
+ language_probs: Optional[Dict[str, float]] = None
122
+ tokens: List[int] = field(default_factory=list)
123
+ text: str = ""
124
+ avg_logprob: float = np.nan
125
+ no_speech_prob: float = np.nan
126
+ temperature: float = np.nan
127
+ compression_ratio: float = np.nan
128
+
129
+
130
+ class Inference:
131
+ def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
132
+ """Perform a forward pass on the decoder and return per-token logits"""
133
+ raise NotImplementedError
134
+
135
+ def rearrange_kv_cache(self, source_indices) -> None:
136
+ """Update the key-value cache according to the updated beams"""
137
+ raise NotImplementedError
138
+
139
+ def cleanup_caching(self) -> None:
140
+ """Clean up any resources or hooks after decoding is finished"""
141
+ pass
142
+
143
+
144
+ class PyTorchInference(Inference):
145
+ def __init__(self, model: "Whisper", initial_token_length: int):
146
+ self.model: "Whisper" = model
147
+ self.initial_token_length = initial_token_length
148
+ self.kv_cache = {}
149
+ self.hooks = []
150
+
151
+ key_modules = [block.attn.key for block in self.model.decoder.blocks]
152
+ value_modules = [block.attn.value for block in self.model.decoder.blocks]
153
+ self.kv_modules = key_modules + value_modules
154
+
155
+ def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
156
+ if not self.kv_cache:
157
+ self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
158
+
159
+ if tokens.shape[-1] > self.initial_token_length:
160
+ # only need to use the last token except in the first forward pass
161
+ tokens = tokens[:, -1:]
162
+
163
+ return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
164
+
165
+ def cleanup_caching(self):
166
+ for hook in self.hooks:
167
+ hook.remove()
168
+
169
+ self.kv_cache = {}
170
+ self.hooks = []
171
+
172
+ def rearrange_kv_cache(self, source_indices):
173
+ if source_indices != list(range(len(source_indices))):
174
+ for module in self.kv_modules:
175
+ # update the key/value cache to contain the selected sequences
176
+ self.kv_cache[module] = self.kv_cache[module][source_indices].detach()
177
+
178
+
179
+ class SequenceRanker:
180
+ def rank(
181
+ self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]
182
+ ) -> List[int]:
183
+ """
184
+ Given a list of groups of samples and their cumulative log probabilities,
185
+ return the indices of the samples in each group to select as the final result
186
+ """
187
+ raise NotImplementedError
188
+
189
+
190
+ class MaximumLikelihoodRanker(SequenceRanker):
191
+ """
192
+ Select the sample with the highest log probabilities, penalized using either
193
+ a simple length normalization or Google NMT paper's length penalty
194
+ """
195
+
196
+ def __init__(self, length_penalty: Optional[float]):
197
+ self.length_penalty = length_penalty
198
+
199
+ def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
200
+ def scores(logprobs, lengths):
201
+ result = []
202
+ for logprob, length in zip(logprobs, lengths):
203
+ if self.length_penalty is None:
204
+ penalty = length
205
+ else:
206
+ # from the Google NMT paper
207
+ penalty = ((5 + length) / 6) ** self.length_penalty
208
+ result.append(logprob / penalty)
209
+ return result
210
+
211
+ # get the sequence with the highest score
212
+ lengths = [[len(t) for t in s] for s in tokens]
213
+ return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)]
214
+
215
+
216
+ class TokenDecoder:
217
+ def reset(self):
218
+ """Initialize any stateful variables for decoding a new sequence"""
219
+
220
+ def update(
221
+ self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
222
+ ) -> Tuple[Tensor, bool]:
223
+ """Specify how to select the next token, based on the current trace and logits
224
+
225
+ Parameters
226
+ ----------
227
+ tokens : Tensor, shape = (n_batch, current_sequence_length)
228
+ all tokens in the context so far, including the prefix and sot_sequence tokens
229
+
230
+ logits : Tensor, shape = (n_batch, vocab_size)
231
+ per-token logits of the probability distribution at the current step
232
+
233
+ sum_logprobs : Tensor, shape = (n_batch)
234
+ cumulative log probabilities for each sequence
235
+
236
+ Returns
237
+ -------
238
+ tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
239
+ the tokens, appended with the selected next token
240
+
241
+ completed : bool
242
+ True if all sequences have reached the end of text
243
+
244
+ """
245
+ raise NotImplementedError
246
+
247
+ def finalize(
248
+ self, tokens: Tensor, sum_logprobs: Tensor
249
+ ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
250
+ """Finalize search and return the final candidate sequences
251
+
252
+ Parameters
253
+ ----------
254
+ tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
255
+ all tokens in the context so far, including the prefix and sot_sequence
256
+
257
+ sum_logprobs : Tensor, shape = (n_audio, n_group)
258
+ cumulative log probabilities for each sequence
259
+
260
+ Returns
261
+ -------
262
+ tokens : Sequence[Sequence[Tensor]], length = n_audio
263
+ sequence of Tensors containing candidate token sequences, for each audio input
264
+
265
+ sum_logprobs : List[List[float]], length = n_audio
266
+ sequence of cumulative log probabilities corresponding to the above
267
+
268
+ """
269
+ raise NotImplementedError
270
+
271
+
272
+ class GreedyDecoder(TokenDecoder):
273
+ def __init__(self, temperature: float, eot: int):
274
+ self.temperature = temperature
275
+ self.eot = eot
276
+
277
+ def update(
278
+ self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
279
+ ) -> Tuple[Tensor, bool]:
280
+ if self.temperature == 0:
281
+ next_tokens = logits.argmax(dim=-1)
282
+ else:
283
+ next_tokens = Categorical(logits=logits / self.temperature).sample()
284
+
285
+ logprobs = F.log_softmax(logits.float(), dim=-1)
286
+ current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
287
+ sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)
288
+
289
+ next_tokens[tokens[:, -1] == self.eot] = self.eot
290
+ tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
291
+
292
+ completed = (tokens[:, -1] == self.eot).all()
293
+ return tokens, completed
294
+
295
+ def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
296
+ # make sure each sequence has at least one EOT token at the end
297
+ tokens = F.pad(tokens, (0, 1), value=self.eot)
298
+ return tokens, sum_logprobs.tolist()
299
+
300
+
301
+ class BeamSearchDecoder(TokenDecoder):
302
+ def __init__(
303
+ self,
304
+ beam_size: int,
305
+ eot: int,
306
+ inference: Inference,
307
+ patience: Optional[float] = None,
308
+ ):
309
+ self.beam_size = beam_size
310
+ self.eot = eot
311
+ self.inference = inference
312
+ self.patience = patience or 1.0
313
+ self.max_candidates: int = round(beam_size * self.patience)
314
+ self.finished_sequences = None
315
+
316
+ assert (
317
+ self.max_candidates > 0
318
+ ), f"Invalid beam size ({beam_size}) or patience ({patience})"
319
+
320
+ def reset(self):
321
+ self.finished_sequences = None
322
+
323
+ def update(
324
+ self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
325
+ ) -> Tuple[Tensor, bool]:
326
+ if tokens.shape[0] % self.beam_size != 0:
327
+ raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")
328
+
329
+ n_audio = tokens.shape[0] // self.beam_size
330
+ if self.finished_sequences is None: # for the first update
331
+ self.finished_sequences = [{} for _ in range(n_audio)]
332
+
333
+ logprobs = F.log_softmax(logits.float(), dim=-1)
334
+ next_tokens, source_indices, finished_sequences = [], [], []
335
+ for i in range(n_audio):
336
+ scores, sources, finished = {}, {}, {}
337
+
338
+ # STEP 1: calculate the cumulative log probabilities for possible candidates
339
+ for j in range(self.beam_size):
340
+ idx = i * self.beam_size + j
341
+ prefix = tokens[idx].tolist()
342
+ for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
343
+ new_logprob = (sum_logprobs[idx] + logprob).item()
344
+ sequence = tuple(prefix + [token.item()])
345
+ scores[sequence] = new_logprob
346
+ sources[sequence] = idx
347
+
348
+ # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
349
+ saved = 0
350
+ for sequence in sorted(scores, key=scores.get, reverse=True):
351
+ if sequence[-1] == self.eot:
352
+ finished[sequence] = scores[sequence]
353
+ else:
354
+ sum_logprobs[len(next_tokens)] = scores[sequence]
355
+ next_tokens.append(sequence)
356
+ source_indices.append(sources[sequence])
357
+
358
+ saved += 1
359
+ if saved == self.beam_size:
360
+ break
361
+
362
+ finished_sequences.append(finished)
363
+
364
+ tokens = torch.tensor(next_tokens, device=tokens.device)
365
+ self.inference.rearrange_kv_cache(source_indices)
366
+
367
+ # add newly finished sequences to self.finished_sequences
368
+ assert len(self.finished_sequences) == len(finished_sequences)
369
+ for previously_finished, newly_finished in zip(
370
+ self.finished_sequences, finished_sequences
371
+ ):
372
+ for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
373
+ if len(previously_finished) >= self.max_candidates:
374
+ break # the candidate list is full
375
+ previously_finished[seq] = newly_finished[seq]
376
+
377
+ # mark as completed if all audio has enough number of samples
378
+ completed = all(
379
+ len(sequences) >= self.max_candidates
380
+ for sequences in self.finished_sequences
381
+ )
382
+ return tokens, completed
383
+
384
+ def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
385
+ # collect all finished sequences, including patience, and add unfinished ones if not enough
386
+ sum_logprobs = sum_logprobs.cpu()
387
+ for i, sequences in enumerate(self.finished_sequences):
388
+ if (
389
+ len(sequences) < self.beam_size
390
+ ): # when not enough sequences are finished
391
+ for j in list(np.argsort(sum_logprobs[i]))[::-1]:
392
+ sequence = preceding_tokens[i, j].tolist() + [self.eot]
393
+ sequences[tuple(sequence)] = sum_logprobs[i][j].item()
394
+ if len(sequences) >= self.beam_size:
395
+ break
396
+
397
+ tokens: List[List[Tensor]] = [
398
+ [torch.tensor(seq) for seq in sequences.keys()]
399
+ for sequences in self.finished_sequences
400
+ ]
401
+ sum_logprobs: List[List[float]] = [
402
+ list(sequences.values()) for sequences in self.finished_sequences
403
+ ]
404
+ return tokens, sum_logprobs
405
+
406
+
407
+ class LogitFilter:
408
+ def apply(self, logits: Tensor, tokens: Tensor) -> None:
409
+ """Apply any filtering or masking to logits in-place
410
+
411
+ Parameters
412
+ ----------
413
+ logits : Tensor, shape = (n_batch, vocab_size)
414
+ per-token logits of the probability distribution at the current step
415
+
416
+ tokens : Tensor, shape = (n_batch, current_sequence_length)
417
+ all tokens in the context so far, including the prefix and sot_sequence tokens
418
+
419
+ """
420
+ raise NotImplementedError
421
+
422
+
423
+ class SuppressBlank(LogitFilter):
424
+ def __init__(self, tokenizer: Tokenizer, sample_begin: int):
425
+ self.tokenizer = tokenizer
426
+ self.sample_begin = sample_begin
427
+
428
+ def apply(self, logits: Tensor, tokens: Tensor):
429
+ if tokens.shape[1] == self.sample_begin:
430
+ logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf
431
+
432
+
433
+ class SuppressTokens(LogitFilter):
434
+ def __init__(self, suppress_tokens: Sequence[int]):
435
+ self.suppress_tokens = list(suppress_tokens)
436
+
437
+ def apply(self, logits: Tensor, tokens: Tensor):
438
+ logits[:, self.suppress_tokens] = -np.inf
439
+
440
+
441
+ class ApplyTimestampRules(LogitFilter):
442
+ def __init__(
443
+ self,
444
+ tokenizer: Tokenizer,
445
+ sample_begin: int,
446
+ max_initial_timestamp_index: Optional[int],
447
+ ):
448
+ self.tokenizer = tokenizer
449
+ self.sample_begin = sample_begin
450
+ self.max_initial_timestamp_index = max_initial_timestamp_index
451
+
452
+ def apply(self, logits: Tensor, tokens: Tensor):
453
+ # suppress <|notimestamps|> which is handled by without_timestamps
454
+ if self.tokenizer.no_timestamps is not None:
455
+ logits[:, self.tokenizer.no_timestamps] = -np.inf
456
+
457
+ # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
458
+ for k in range(tokens.shape[0]):
459
+ sampled_tokens = tokens[k, self.sample_begin :]
460
+ seq = [t for t in sampled_tokens.tolist()]
461
+ last_was_timestamp = (
462
+ len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
463
+ )
464
+ penultimate_was_timestamp = (
465
+ len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin
466
+ )
467
+
468
+ if last_was_timestamp:
469
+ if penultimate_was_timestamp: # has to be non-timestamp
470
+ logits[k, self.tokenizer.timestamp_begin :] = -np.inf
471
+ else: # cannot be normal text tokens
472
+ logits[k, : self.tokenizer.eot] = -np.inf
473
+
474
+ timestamps = sampled_tokens[
475
+ sampled_tokens.ge(self.tokenizer.timestamp_begin)
476
+ ]
477
+ if timestamps.numel() > 0:
478
+ # timestamps shouldn't decrease; forbid timestamp tokens smaller than the last
479
+ # also force each segment to have a nonzero length, to prevent infinite looping
480
+ if last_was_timestamp and not penultimate_was_timestamp:
481
+ timestamp_last = timestamps[-1]
482
+ else:
483
+ timestamp_last = timestamps[-1] + 1
484
+ logits[k, self.tokenizer.timestamp_begin : timestamp_last] = -np.inf
485
+
486
+ if tokens.shape[1] == self.sample_begin:
487
+ # suppress generating non-timestamp tokens at the beginning
488
+ logits[:, : self.tokenizer.timestamp_begin] = -np.inf
489
+
490
+ # apply the `max_initial_timestamp` option
491
+ if self.max_initial_timestamp_index is not None:
492
+ last_allowed = (
493
+ self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
494
+ )
495
+ logits[:, last_allowed + 1 :] = -np.inf
496
+
497
+ # if sum of probability over timestamps is above any other token, sample timestamp
498
+ logprobs = F.log_softmax(logits.float(), dim=-1)
499
+ for k in range(tokens.shape[0]):
500
+ timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(
501
+ dim=-1
502
+ )
503
+ max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
504
+ if timestamp_logprob > max_text_token_logprob:
505
+ logits[k, : self.tokenizer.timestamp_begin] = -np.inf
506
+
507
+
508
+ class DecodingTask:
509
+ inference: Inference
510
+ sequence_ranker: SequenceRanker
511
+ decoder: TokenDecoder
512
+ logit_filters: List[LogitFilter]
513
+
514
+ def __init__(self, model: "Whisper", options: DecodingOptions):
515
+ self.model = model
516
+
517
+ language = options.language or "en"
518
+ tokenizer = get_tokenizer(
519
+ model.is_multilingual,
520
+ num_languages=model.num_languages,
521
+ language=language,
522
+ task=options.task,
523
+ )
524
+ self.tokenizer: Tokenizer = tokenizer
525
+ self.options: DecodingOptions = self._verify_options(options)
526
+
527
+ self.n_group: int = options.beam_size or options.best_of or 1
528
+ self.n_ctx: int = model.dims.n_text_ctx
529
+ self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2
530
+
531
+ self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
532
+ if self.options.without_timestamps:
533
+ self.sot_sequence = tokenizer.sot_sequence_including_notimestamps
534
+
535
+ self.initial_tokens: Tuple[int] = self._get_initial_tokens()
536
+ self.sample_begin: int = len(self.initial_tokens)
537
+ self.sot_index: int = self.initial_tokens.index(tokenizer.sot)
538
+
539
+ # inference: implements the forward pass through the decoder, including kv caching
540
+ self.inference = PyTorchInference(model, len(self.initial_tokens))
541
+
542
+ # sequence ranker: implements how to rank a group of sampled sequences
543
+ self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)
544
+
545
+ # decoder: implements how to select the next tokens, given the autoregressive distribution
546
+ if options.beam_size is not None:
547
+ self.decoder = BeamSearchDecoder(
548
+ options.beam_size, tokenizer.eot, self.inference, options.patience
549
+ )
550
+ else:
551
+ self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)
552
+
553
+ # logit filters: applies various rules to suppress or penalize certain tokens
554
+ self.logit_filters = []
555
+ if self.options.suppress_blank:
556
+ self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
557
+ if self.options.suppress_tokens:
558
+ self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
559
+ if not options.without_timestamps:
560
+ precision = CHUNK_LENGTH / model.dims.n_audio_ctx # usually 0.02 seconds
561
+ max_initial_timestamp_index = None
562
+ if options.max_initial_timestamp:
563
+ max_initial_timestamp_index = round(
564
+ self.options.max_initial_timestamp / precision
565
+ )
566
+ self.logit_filters.append(
567
+ ApplyTimestampRules(
568
+ tokenizer, self.sample_begin, max_initial_timestamp_index
569
+ )
570
+ )
571
+
572
+ def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
573
+ if options.beam_size is not None and options.best_of is not None:
574
+ raise ValueError("beam_size and best_of can't be given together")
575
+ if options.temperature == 0:
576
+ if options.best_of is not None:
577
+ raise ValueError("best_of with greedy sampling (T=0) is not compatible")
578
+ if options.patience is not None and options.beam_size is None:
579
+ raise ValueError("patience requires beam_size to be given")
580
+ if options.length_penalty is not None and not (
581
+ 0 <= options.length_penalty <= 1
582
+ ):
583
+ raise ValueError("length_penalty (alpha) should be a value between 0 and 1")
584
+
585
+ return options
586
+
587
+ def _get_initial_tokens(self) -> Tuple[int]:
588
+ tokens = list(self.sot_sequence)
589
+
590
+ if prefix := self.options.prefix:
591
+ prefix_tokens = (
592
+ self.tokenizer.encode(" " + prefix.strip())
593
+ if isinstance(prefix, str)
594
+ else prefix
595
+ )
596
+ if self.sample_len is not None:
597
+ max_prefix_len = self.n_ctx // 2 - self.sample_len
598
+ prefix_tokens = prefix_tokens[-max_prefix_len:]
599
+ tokens = tokens + prefix_tokens
600
+
601
+ if prompt := self.options.prompt:
602
+ prompt_tokens = (
603
+ self.tokenizer.encode(" " + prompt.strip())
604
+ if isinstance(prompt, str)
605
+ else prompt
606
+ )
607
+ tokens = (
608
+ [self.tokenizer.sot_prev]
609
+ + prompt_tokens[-(self.n_ctx // 2 - 1) :]
610
+ + tokens
611
+ )
612
+
613
+ return tuple(tokens)
614
+
615
+ def _get_suppress_tokens(self) -> Tuple[int]:
616
+ suppress_tokens = self.options.suppress_tokens
617
+
618
+ if isinstance(suppress_tokens, str):
619
+ suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
620
+
621
+ if -1 in suppress_tokens:
622
+ suppress_tokens = [t for t in suppress_tokens if t >= 0]
623
+ suppress_tokens.extend(self.tokenizer.non_speech_tokens)
624
+ elif suppress_tokens is None or len(suppress_tokens) == 0:
625
+ suppress_tokens = [] # interpret empty string as an empty list
626
+ else:
627
+ assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
628
+
629
+ suppress_tokens.extend(
630
+ [
631
+ self.tokenizer.transcribe,
632
+ self.tokenizer.translate,
633
+ self.tokenizer.sot,
634
+ self.tokenizer.sot_prev,
635
+ self.tokenizer.sot_lm,
636
+ ]
637
+ )
638
+ if self.tokenizer.no_speech is not None:
639
+ # no-speech probability is collected separately
640
+ suppress_tokens.append(self.tokenizer.no_speech)
641
+
642
+ return tuple(sorted(set(suppress_tokens)))
643
+
644
+ def _get_audio_features(self, mel: Tensor):
645
+ if self.options.fp16:
646
+ mel = mel.half()
647
+
648
+ if mel.shape[-2:] == (
649
+ self.model.dims.n_audio_ctx,
650
+ self.model.dims.n_audio_state,
651
+ ):
652
+ # encoded audio features are given; skip audio encoding
653
+ audio_features = mel
654
+ else:
655
+ audio_features = self.model.encoder(mel)
656
+
657
+ if audio_features.dtype != (
658
+ torch.float16 if self.options.fp16 else torch.float32
659
+ ):
660
+ raise TypeError(
661
+ f"audio_features has an incorrect dtype: {audio_features.dtype}"
662
+ )
663
+
664
+ return audio_features
665
+
666
+ def _detect_language(self, audio_features: Tensor, tokens: Tensor):
667
+ languages = [self.options.language] * audio_features.shape[0]
668
+ lang_probs = None
669
+
670
+ if self.options.language is None or self.options.task == "lang_id":
671
+ lang_tokens, lang_probs = self.model.detect_language(
672
+ audio_features, self.tokenizer
673
+ )
674
+ languages = [max(probs, key=probs.get) for probs in lang_probs]
675
+ if self.options.language is None:
676
+ tokens[:, self.sot_index + 1] = lang_tokens # write language tokens
677
+
678
+ return languages, lang_probs
679
+
680
+ def _main_loop(self, audio_features: Tensor, tokens: Tensor):
681
+ n_batch = tokens.shape[0]
682
+ sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
683
+ no_speech_probs = [np.nan] * n_batch
684
+
685
+ try:
686
+ for i in range(self.sample_len):
687
+ logits = self.inference.logits(tokens, audio_features)
688
+
689
+ if (
690
+ i == 0 and self.tokenizer.no_speech is not None
691
+ ): # save no_speech_probs
692
+ probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
693
+ no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
694
+
695
+ # now we need to consider the logits at the last token only
696
+ logits = logits[:, -1]
697
+
698
+ # apply the logit filters, e.g. for suppressing or penalizing certain tokens
699
+ for logit_filter in self.logit_filters:
700
+ logit_filter.apply(logits, tokens)
701
+
702
+ # expand the tokens tensor with the selected next tokens
703
+ tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)
704
+
705
+ if completed or tokens.shape[-1] > self.n_ctx:
706
+ break
707
+ finally:
708
+ self.inference.cleanup_caching()
709
+
710
+ return tokens, sum_logprobs, no_speech_probs
711
+
712
+ @torch.no_grad()
713
+ def run(self, mel: Tensor) -> List[DecodingResult]:
714
+ self.decoder.reset()
715
+ tokenizer: Tokenizer = self.tokenizer
716
+ n_audio: int = mel.shape[0]
717
+
718
+ audio_features: Tensor = self._get_audio_features(mel) # encoder forward pass
719
+ tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)
720
+
721
+ # detect language if requested, overwriting the language token
722
+ languages, language_probs = self._detect_language(audio_features, tokens)
723
+ if self.options.task == "lang_id":
724
+ return [
725
+ DecodingResult(
726
+ audio_features=features, language=language, language_probs=probs
727
+ )
728
+ for features, language, probs in zip(
729
+ audio_features, languages, language_probs
730
+ )
731
+ ]
732
+
733
+ # repeat text tensors by the group size, for beam search or best-of-n sampling
734
+ tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
735
+
736
+ # call the main sampling loop
737
+ tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)
738
+
739
+ # reshape the tensors to have (n_audio, n_group) as the first two dimensions
740
+ audio_features = audio_features[:: self.n_group]
741
+ no_speech_probs = no_speech_probs[:: self.n_group]
742
+ assert audio_features.shape[0] == len(no_speech_probs) == n_audio
743
+
744
+ tokens = tokens.reshape(n_audio, self.n_group, -1)
745
+ sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)
746
+
747
+ # get the final candidates for each group, and slice between the first sampled token and EOT
748
+ tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
749
+ tokens: List[List[Tensor]] = [
750
+ [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s]
751
+ for s in tokens
752
+ ]
753
+
754
+ # select the top-ranked sample in each group
755
+ selected = self.sequence_ranker.rank(tokens, sum_logprobs)
756
+ tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
757
+ texts: List[str] = [tokenizer.decode(t).strip() for t in tokens]
758
+
759
+ sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)]
760
+ avg_logprobs: List[float] = [
761
+ lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)
762
+ ]
763
+
764
+ fields = (
765
+ texts,
766
+ languages,
767
+ tokens,
768
+ audio_features,
769
+ avg_logprobs,
770
+ no_speech_probs,
771
+ )
772
+ if len(set(map(len, fields))) != 1:
773
+ raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")
774
+
775
+ return [
776
+ DecodingResult(
777
+ audio_features=features,
778
+ language=language,
779
+ tokens=tokens,
780
+ text=text,
781
+ avg_logprob=avg_logprob,
782
+ no_speech_prob=no_speech_prob,
783
+ temperature=self.options.temperature,
784
+ compression_ratio=compression_ratio(text),
785
+ )
786
+ for text, language, tokens, features, avg_logprob, no_speech_prob in zip(
787
+ *fields
788
+ )
789
+ ]
790
+
791
+
792
+ @torch.no_grad()
793
+ def decode(
794
+ model: "Whisper",
795
+ mel: Tensor,
796
+ options: DecodingOptions = DecodingOptions(),
797
+ **kwargs,
798
+ ) -> Union[DecodingResult, List[DecodingResult]]:
799
+ """
800
+ Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).
801
+
802
+ Parameters
803
+ ----------
804
+ model: Whisper
805
+ the Whisper model instance
806
+
807
+ mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
808
+ A tensor containing the Mel spectrogram(s)
809
+
810
+ options: DecodingOptions
811
+ A dataclass that contains all necessary options for decoding 30-second segments
812
+
813
+ Returns
814
+ -------
815
+ result: Union[DecodingResult, List[DecodingResult]]
816
+ The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
817
+ """
818
+ if single := mel.ndim == 2:
819
+ mel = mel.unsqueeze(0)
820
+
821
+ if kwargs:
822
+ options = replace(options, **kwargs)
823
+
824
+ result = DecodingTask(model, options).run(mel)
825
+
826
+ return result[0] if single else result
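A minimal usage sketch for the `decode` entry point above. It assumes this vendored package mirrors openai-whisper's public helpers (`load_model` in `__init__.py`, `log_mel_spectrogram` and `pad_or_trim` in `audio.py`); the checkpoint name is illustrative:

    import torch
    from whisperlivekit.simul_whisper.whisper import load_model                 # assumed to mirror openai-whisper
    from whisperlivekit.simul_whisper.whisper.audio import log_mel_spectrogram, pad_or_trim
    from whisperlivekit.simul_whisper.whisper.decoding import DecodingOptions, decode

    model = load_model("tiny")                       # illustrative checkpoint name
    audio = torch.zeros(16000 * 30)                  # stand-in for 30 s of 16 kHz audio
    mel = log_mel_spectrogram(pad_or_trim(audio)).to(model.device)   # shape (80, 3000)
    options = DecodingOptions(language="en", beam_size=5, fp16=False)
    result = decode(model, mel, options)             # 2-D mel -> a single DecodingResult
    print(result.text, result.avg_logprob, result.no_speech_prob)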
whisperlivekit/simul_whisper/whisper/model.py ADDED
@@ -0,0 +1,348 @@
 
1
+ import base64
2
+ import gzip
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass
5
+ from typing import Dict, Iterable, Optional, Tuple
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch import Tensor, nn
11
+
12
+ from .decoding import decode as decode_function
13
+ from .decoding import detect_language as detect_language_function
14
+ from .transcribe import transcribe as transcribe_function
15
+
16
+ try:
17
+ from torch.nn.functional import scaled_dot_product_attention
18
+
19
+ SDPA_AVAILABLE = True
20
+ except (ImportError, RuntimeError, OSError):
21
+ scaled_dot_product_attention = None
22
+ SDPA_AVAILABLE = False
23
+
24
+
25
+ @dataclass
26
+ class ModelDimensions:
27
+ n_mels: int
28
+ n_audio_ctx: int
29
+ n_audio_state: int
30
+ n_audio_head: int
31
+ n_audio_layer: int
32
+ n_vocab: int
33
+ n_text_ctx: int
34
+ n_text_state: int
35
+ n_text_head: int
36
+ n_text_layer: int
37
+
38
+
39
+ class LayerNorm(nn.LayerNorm):
40
+ def forward(self, x: Tensor) -> Tensor:
41
+ return super().forward(x.float()).type(x.dtype)
42
+
43
+
44
+ class Linear(nn.Linear):
45
+ def forward(self, x: Tensor) -> Tensor:
46
+ return F.linear(
47
+ x,
48
+ self.weight.to(x.dtype),
49
+ None if self.bias is None else self.bias.to(x.dtype),
50
+ )
51
+
52
+
53
+ class Conv1d(nn.Conv1d):
54
+ def _conv_forward(
55
+ self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
56
+ ) -> Tensor:
57
+ return super()._conv_forward(
58
+ x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
59
+ )
60
+
61
+
62
+ def sinusoids(length, channels, max_timescale=10000):
63
+ """Returns sinusoids for positional embedding"""
64
+ assert channels % 2 == 0
65
+ log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
66
+ inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
67
+ scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
68
+ return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
69
+
70
+
71
+ @contextmanager
72
+ def disable_sdpa():
73
+ prev_state = MultiHeadAttention.use_sdpa
74
+ try:
75
+ MultiHeadAttention.use_sdpa = False
76
+ yield
77
+ finally:
78
+ MultiHeadAttention.use_sdpa = prev_state
79
+
80
+
81
+ class MultiHeadAttention(nn.Module):
82
+ use_sdpa = False # Disable SDPA to ensure qk is always computed for hooks
83
+
84
+ def __init__(self, n_state: int, n_head: int, cache_id: str = ""):
85
+ super().__init__()
86
+ self.n_head = n_head
87
+ self.query = Linear(n_state, n_state)
88
+ self.key = Linear(n_state, n_state, bias=False)
89
+ self.value = Linear(n_state, n_state)
90
+ self.out = Linear(n_state, n_state)
91
+ self.cache_id = cache_id
92
+ self.key.cache_id = f"{cache_id}_key"
93
+ self.value.cache_id = f"{cache_id}_value"
94
+
95
+ def forward(
96
+ self,
97
+ x: Tensor,
98
+ xa: Optional[Tensor] = None,
99
+ mask: Optional[Tensor] = None,
100
+ kv_cache: Optional[dict] = None,
101
+ ):
102
+ q = self.query(x)
103
+
104
+ if kv_cache is None or xa is None or self.key not in kv_cache:
105
+ # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
106
+ # otherwise, perform key/value projections for self- or cross-attention as usual.
107
+ k = self.key(x if xa is None else xa)
108
+ v = self.value(x if xa is None else xa)
109
+ else:
110
+ # for cross-attention, calculate keys and values once and reuse in subsequent calls.
111
+ k = kv_cache[self.key]
112
+ v = kv_cache[self.value]
113
+
114
+ wv, qk = self.qkv_attention(q, k, v, mask)
115
+ return self.out(wv), qk
116
+
117
+ def qkv_attention(
118
+ self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
119
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
120
+ n_batch, n_ctx, n_state = q.shape
121
+ scale = (n_state // self.n_head) ** -0.25
122
+ q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
123
+ k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
124
+ v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
125
+
126
+ if SDPA_AVAILABLE and MultiHeadAttention.use_sdpa:
127
+ a = scaled_dot_product_attention(
128
+ q, k, v, is_causal=mask is not None and n_ctx > 1
129
+ )
130
+ out = a.permute(0, 2, 1, 3).flatten(start_dim=2)
131
+ qk = None
132
+ else:
133
+ qk = (q * scale) @ (k * scale).transpose(-1, -2)
134
+ if mask is not None:
135
+ qk = qk + mask[:n_ctx, :n_ctx]
136
+ qk = qk.float()
137
+
138
+ w = F.softmax(qk, dim=-1).to(q.dtype)
139
+ out = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
140
+ qk = qk.detach()
141
+
142
+ return out, qk
143
+
144
+
145
+ class ResidualAttentionBlock(nn.Module):
146
+ def __init__(self, n_state: int, n_head: int, cross_attention: bool = False, cache_id: str = ""):
147
+ super().__init__()
148
+
149
+ self.attn = MultiHeadAttention(n_state, n_head, cache_id=f"{cache_id}_self_attn")
150
+ self.attn_ln = LayerNorm(n_state)
151
+
152
+ self.cross_attn = (
153
+ MultiHeadAttention(n_state, n_head, cache_id=f"{cache_id}_cross_attn") if cross_attention else None
154
+ )
155
+ self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
156
+
157
+ n_mlp = n_state * 4
158
+ self.mlp = nn.Sequential(
159
+ Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
160
+ )
161
+ self.mlp_ln = LayerNorm(n_state)
162
+
163
+ def forward(
164
+ self,
165
+ x: Tensor,
166
+ xa: Optional[Tensor] = None,
167
+ mask: Optional[Tensor] = None,
168
+ kv_cache: Optional[dict] = None,
169
+ ):
170
+ x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0]
171
+ if self.cross_attn:
172
+ x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0]
173
+ x = x + self.mlp(self.mlp_ln(x))
174
+ return x
175
+
176
+
177
+ class AudioEncoder(nn.Module):
178
+ def __init__(
179
+ self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
180
+ ):
181
+ super().__init__()
182
+ self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
183
+ self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
184
+ self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
185
+
186
+ self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
187
+ [ResidualAttentionBlock(n_state, n_head, cache_id=f"enc_layer{i}") for i in range(n_layer)]
188
+ )
189
+ self.ln_post = LayerNorm(n_state)
190
+
191
+ def forward(self, x: Tensor):
192
+ """
193
+ x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
194
+ the mel spectrogram of the audio
195
+ """
196
+ x = F.gelu(self.conv1(x))
197
+ x = F.gelu(self.conv2(x))
198
+ x = x.permute(0, 2, 1)
199
+
200
+ assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
201
+ x = (x + self.positional_embedding).to(x.dtype)
202
+
203
+ for block in self.blocks:
204
+ x = block(x)
205
+
206
+ x = self.ln_post(x)
207
+ return x
208
+
209
+
210
+ class TextDecoder(nn.Module):
211
+ def __init__(
212
+ self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
213
+ ):
214
+ super().__init__()
215
+
216
+ self.token_embedding = nn.Embedding(n_vocab, n_state)
217
+ self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))
218
+
219
+ self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
220
+ [
221
+ ResidualAttentionBlock(n_state, n_head, cross_attention=True, cache_id=f"dec_layer{i}")
222
+ for i in range(n_layer)
223
+ ]
224
+ )
225
+ self.ln = LayerNorm(n_state)
226
+
227
+ mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
228
+ self.register_buffer("mask", mask, persistent=False)
229
+
230
+ def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
231
+ """
232
+ x : torch.LongTensor, shape = (batch_size, <= n_ctx)
233
+ the text tokens
234
+ xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
235
+ the encoded audio features to be attended on
236
+ """
237
+ offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
238
+ x = (
239
+ self.token_embedding(x)
240
+ + self.positional_embedding[offset : offset + x.shape[-1]]
241
+ )
242
+ x = x.to(xa.dtype)
243
+
244
+ for block in self.blocks:
245
+ x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
246
+
247
+ x = self.ln(x)
248
+ logits = (
249
+ x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
250
+ ).float()
251
+
252
+ return logits
253
+
254
+
255
+ class Whisper(nn.Module):
256
+ def __init__(self, dims: ModelDimensions):
257
+ super().__init__()
258
+ self.dims = dims
259
+ self.encoder = AudioEncoder(
260
+ self.dims.n_mels,
261
+ self.dims.n_audio_ctx,
262
+ self.dims.n_audio_state,
263
+ self.dims.n_audio_head,
264
+ self.dims.n_audio_layer,
265
+ )
266
+ self.decoder = TextDecoder(
267
+ self.dims.n_vocab,
268
+ self.dims.n_text_ctx,
269
+ self.dims.n_text_state,
270
+ self.dims.n_text_head,
271
+ self.dims.n_text_layer,
272
+ )
273
+ # use the last half among the decoder layers for time alignment by default;
274
+ # to use a specific set of heads, see `set_alignment_heads()` below.
275
+ all_heads = torch.zeros(
276
+ self.dims.n_text_layer, self.dims.n_text_head, dtype=torch.bool
277
+ )
278
+ all_heads[self.dims.n_text_layer // 2 :] = True
279
+ self.register_buffer("alignment_heads", all_heads.to_sparse(), persistent=False)
280
+
281
+ def set_alignment_heads(self, dump: bytes):
282
+ array = np.frombuffer(
283
+ gzip.decompress(base64.b85decode(dump)), dtype=bool
284
+ ).copy()
285
+ mask = torch.from_numpy(array).reshape(
286
+ self.dims.n_text_layer, self.dims.n_text_head
287
+ )
288
+ self.register_buffer("alignment_heads", mask.to_sparse(), persistent=False)
289
+
290
+ def embed_audio(self, mel: torch.Tensor):
291
+ return self.encoder(mel)
292
+
293
+ def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
294
+ return self.decoder(tokens, audio_features)
295
+
296
+ def forward(
297
+ self, mel: torch.Tensor, tokens: torch.Tensor
298
+ ) -> Dict[str, torch.Tensor]:
299
+ return self.decoder(tokens, self.encoder(mel))
300
+
301
+ @property
302
+ def device(self):
303
+ return next(self.parameters()).device
304
+
305
+ @property
306
+ def is_multilingual(self):
307
+ return self.dims.n_vocab >= 51865
308
+
309
+ @property
310
+ def num_languages(self):
311
+ return self.dims.n_vocab - 51765 - int(self.is_multilingual)
312
+
313
+ def install_kv_cache_hooks(self, cache: Optional[dict] = None):
314
+ """
315
+ The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
316
+ tensors calculated for the previous positions. This method returns a dictionary that stores
317
+ all caches, and the necessary hooks for the key and value projection modules that save the
318
+ intermediate tensors to be reused during later calculations.
319
+
320
+ Returns
321
+ -------
322
+ cache : Dict[nn.Module, torch.Tensor]
323
+ A dictionary object mapping the key/value projection modules to their caches
324
+ hooks : List[RemovableHandle]
325
+ List of PyTorch RemovableHandle objects that can be used to remove the installed hooks
326
+ """
327
+ cache = {**cache} if cache is not None else {}
328
+ hooks = []
329
+
330
+ def save_to_cache(module, _, output):
331
+ if module not in cache or output.shape[1] > self.dims.n_text_ctx:
332
+ # save as-is, for the first token or cross attention
333
+ cache[module] = output
334
+ else:
335
+ cache[module] = torch.cat([cache[module], output], dim=1).detach()
336
+ return cache[module]
337
+
338
+ def install_hooks(layer: nn.Module):
339
+ if isinstance(layer, MultiHeadAttention):
340
+ hooks.append(layer.key.register_forward_hook(save_to_cache))
341
+ hooks.append(layer.value.register_forward_hook(save_to_cache))
342
+
343
+ self.decoder.apply(install_hooks)
344
+ return cache, hooks
345
+
346
+ detect_language = detect_language_function
347
+ transcribe = transcribe_function
348
+ decode = decode_function
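A hedged sketch of how `install_kv_cache_hooks` is meant to drive incremental decoding with the model defined above. The dimensions describe a toy, untrained model (shapes and cache flow only, not a released checkpoint):

    import torch
    from whisperlivekit.simul_whisper.whisper.model import ModelDimensions, Whisper

    dims = ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=384,
                           n_audio_head=6, n_audio_layer=4, n_vocab=51865,
                           n_text_ctx=448, n_text_state=384, n_text_head=6, n_text_layer=4)
    model = Whisper(dims).eval()

    with torch.no_grad():
        mel = torch.zeros(1, dims.n_mels, 3000)             # stand-in 30 s mel segment
        audio_features = model.embed_audio(mel)

        cache, hooks = model.install_kv_cache_hooks()       # hooks append k/v tensors into `cache`
        tokens = torch.tensor([[50258]])                    # start-of-transcript token in the multilingual vocab
        logits = model.decoder(tokens, audio_features, kv_cache=cache)     # full prefix, caches k/v
        next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
        model.decoder(next_token, audio_features, kv_cache=cache)          # only the new token is fed

    for hook in hooks:
        hook.remove()                                       # detach the forward hooks when done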
whisperlivekit/simul_whisper/whisper/normalizers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
1
+ from .basic import BasicTextNormalizer as BasicTextNormalizer
2
+ from .english import EnglishTextNormalizer as EnglishTextNormalizer
whisperlivekit/simul_whisper/whisper/normalizers/basic.py ADDED
@@ -0,0 +1,80 @@
 
1
+ import re
2
+ import unicodedata
3
+
4
+ import regex
5
+
6
+ # non-ASCII letters that are not separated by "NFKD" normalization
7
+ ADDITIONAL_DIACRITICS = {
8
+ "œ": "oe",
9
+ "Œ": "OE",
10
+ "ø": "o",
11
+ "Ø": "O",
12
+ "æ": "ae",
13
+ "Æ": "AE",
14
+ "ß": "ss",
15
+ "ẞ": "SS",
16
+ "đ": "d",
17
+ "Đ": "D",
18
+ "ð": "d",
19
+ "Ð": "D",
20
+ "þ": "th",
21
+ "Þ": "th",
22
+ "ł": "l",
23
+ "Ł": "L",
24
+ }
25
+
26
+
27
+ def remove_symbols_and_diacritics(s: str, keep=""):
28
+ """
29
+ Replace any other markers, symbols, and punctuations with a space,
30
+ and drop any diacritics (category 'Mn' and some manual mappings)
31
+ """
32
+ return "".join(
33
+ (
34
+ c
35
+ if c in keep
36
+ else (
37
+ ADDITIONAL_DIACRITICS[c]
38
+ if c in ADDITIONAL_DIACRITICS
39
+ else (
40
+ ""
41
+ if unicodedata.category(c) == "Mn"
42
+ else " " if unicodedata.category(c)[0] in "MSP" else c
43
+ )
44
+ )
45
+ )
46
+ for c in unicodedata.normalize("NFKD", s)
47
+ )
48
+
49
+
50
+ def remove_symbols(s: str):
51
+ """
52
+ Replace any other markers, symbols, punctuations with a space, keeping diacritics
53
+ """
54
+ return "".join(
55
+ " " if unicodedata.category(c)[0] in "MSP" else c
56
+ for c in unicodedata.normalize("NFKC", s)
57
+ )
58
+
59
+
60
+ class BasicTextNormalizer:
61
+ def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
62
+ self.clean = (
63
+ remove_symbols_and_diacritics if remove_diacritics else remove_symbols
64
+ )
65
+ self.split_letters = split_letters
66
+
67
+ def __call__(self, s: str):
68
+ s = s.lower()
69
+ s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
70
+ s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
71
+ s = self.clean(s).lower()
72
+
73
+ if self.split_letters:
74
+ s = " ".join(regex.findall(r"\X", s, regex.U))
75
+
76
+ s = re.sub(
77
+ r"\s+", " ", s
78
+ ) # replace any successive whitespace characters with a space
79
+
80
+ return s
whisperlivekit/simul_whisper/whisper/normalizers/english.json ADDED
@@ -0,0 +1,1741 @@
 
1
+ {
2
+ "accessorise": "accessorize",
3
+ "accessorised": "accessorized",
4
+ "accessorises": "accessorizes",
5
+ "accessorising": "accessorizing",
6
+ "acclimatisation": "acclimatization",
7
+ "acclimatise": "acclimatize",
8
+ "acclimatised": "acclimatized",
9
+ "acclimatises": "acclimatizes",
10
+ "acclimatising": "acclimatizing",
11
+ "accoutrements": "accouterments",
12
+ "aeon": "eon",
13
+ "aeons": "eons",
14
+ "aerogramme": "aerogram",
15
+ "aerogrammes": "aerograms",
16
+ "aeroplane": "airplane",
17
+ "aeroplanes": "airplanes",
18
+ "aesthete": "esthete",
19
+ "aesthetes": "esthetes",
20
+ "aesthetic": "esthetic",
21
+ "aesthetically": "esthetically",
22
+ "aesthetics": "esthetics",
23
+ "aetiology": "etiology",
24
+ "ageing": "aging",
25
+ "aggrandisement": "aggrandizement",
26
+ "agonise": "agonize",
27
+ "agonised": "agonized",
28
+ "agonises": "agonizes",
29
+ "agonising": "agonizing",
30
+ "agonisingly": "agonizingly",
31
+ "almanack": "almanac",
32
+ "almanacks": "almanacs",
33
+ "aluminium": "aluminum",
34
+ "amortisable": "amortizable",
35
+ "amortisation": "amortization",
36
+ "amortisations": "amortizations",
37
+ "amortise": "amortize",
38
+ "amortised": "amortized",
39
+ "amortises": "amortizes",
40
+ "amortising": "amortizing",
41
+ "amphitheatre": "amphitheater",
42
+ "amphitheatres": "amphitheaters",
43
+ "anaemia": "anemia",
44
+ "anaemic": "anemic",
45
+ "anaesthesia": "anesthesia",
46
+ "anaesthetic": "anesthetic",
47
+ "anaesthetics": "anesthetics",
48
+ "anaesthetise": "anesthetize",
49
+ "anaesthetised": "anesthetized",
50
+ "anaesthetises": "anesthetizes",
51
+ "anaesthetising": "anesthetizing",
52
+ "anaesthetist": "anesthetist",
53
+ "anaesthetists": "anesthetists",
54
+ "anaesthetize": "anesthetize",
55
+ "anaesthetized": "anesthetized",
56
+ "anaesthetizes": "anesthetizes",
57
+ "anaesthetizing": "anesthetizing",
58
+ "analogue": "analog",
59
+ "analogues": "analogs",
60
+ "analyse": "analyze",
61
+ "analysed": "analyzed",
62
+ "analyses": "analyzes",
63
+ "analysing": "analyzing",
64
+ "anglicise": "anglicize",
65
+ "anglicised": "anglicized",
66
+ "anglicises": "anglicizes",
67
+ "anglicising": "anglicizing",
68
+ "annualised": "annualized",
69
+ "antagonise": "antagonize",
70
+ "antagonised": "antagonized",
71
+ "antagonises": "antagonizes",
72
+ "antagonising": "antagonizing",
73
+ "apologise": "apologize",
74
+ "apologised": "apologized",
75
+ "apologises": "apologizes",
76
+ "apologising": "apologizing",
77
+ "appal": "appall",
78
+ "appals": "appalls",
79
+ "appetiser": "appetizer",
80
+ "appetisers": "appetizers",
81
+ "appetising": "appetizing",
82
+ "appetisingly": "appetizingly",
83
+ "arbour": "arbor",
84
+ "arbours": "arbors",
85
+ "archeological": "archaeological",
86
+ "archaeologically": "archeologically",
87
+ "archaeologist": "archeologist",
88
+ "archaeologists": "archeologists",
89
+ "archaeology": "archeology",
90
+ "ardour": "ardor",
91
+ "armour": "armor",
92
+ "armoured": "armored",
93
+ "armourer": "armorer",
94
+ "armourers": "armorers",
95
+ "armouries": "armories",
96
+ "armoury": "armory",
97
+ "artefact": "artifact",
98
+ "artefacts": "artifacts",
99
+ "authorise": "authorize",
100
+ "authorised": "authorized",
101
+ "authorises": "authorizes",
102
+ "authorising": "authorizing",
103
+ "axe": "ax",
104
+ "backpedalled": "backpedaled",
105
+ "backpedalling": "backpedaling",
106
+ "bannister": "banister",
107
+ "bannisters": "banisters",
108
+ "baptise": "baptize",
109
+ "baptised": "baptized",
110
+ "baptises": "baptizes",
111
+ "baptising": "baptizing",
112
+ "bastardise": "bastardize",
113
+ "bastardised": "bastardized",
114
+ "bastardises": "bastardizes",
115
+ "bastardising": "bastardizing",
116
+ "battleax": "battleaxe",
117
+ "baulk": "balk",
118
+ "baulked": "balked",
119
+ "baulking": "balking",
120
+ "baulks": "balks",
121
+ "bedevilled": "bedeviled",
122
+ "bedevilling": "bedeviling",
123
+ "behaviour": "behavior",
124
+ "behavioural": "behavioral",
125
+ "behaviourism": "behaviorism",
126
+ "behaviourist": "behaviorist",
127
+ "behaviourists": "behaviorists",
128
+ "behaviours": "behaviors",
129
+ "behove": "behoove",
130
+ "behoved": "behooved",
131
+ "behoves": "behooves",
132
+ "bejewelled": "bejeweled",
133
+ "belabour": "belabor",
134
+ "belaboured": "belabored",
135
+ "belabouring": "belaboring",
136
+ "belabours": "belabors",
137
+ "bevelled": "beveled",
138
+ "bevvies": "bevies",
139
+ "bevvy": "bevy",
140
+ "biassed": "biased",
141
+ "biassing": "biasing",
142
+ "bingeing": "binging",
143
+ "bougainvillaea": "bougainvillea",
144
+ "bougainvillaeas": "bougainvilleas",
145
+ "bowdlerise": "bowdlerize",
146
+ "bowdlerised": "bowdlerized",
147
+ "bowdlerises": "bowdlerizes",
148
+ "bowdlerising": "bowdlerizing",
149
+ "breathalyse": "breathalyze",
150
+ "breathalysed": "breathalyzed",
151
+ "breathalyser": "breathalyzer",
152
+ "breathalysers": "breathalyzers",
153
+ "breathalyses": "breathalyzes",
154
+ "breathalysing": "breathalyzing",
155
+ "brutalise": "brutalize",
156
+ "brutalised": "brutalized",
157
+ "brutalises": "brutalizes",
158
+ "brutalising": "brutalizing",
159
+ "busses": "buses",
160
+ "bussing": "busing",
161
+ "caesarean": "cesarean",
162
+ "caesareans": "cesareans",
163
+ "calibre": "caliber",
164
+ "calibres": "calibers",
165
+ "calliper": "caliper",
166
+ "callipers": "calipers",
167
+ "callisthenics": "calisthenics",
168
+ "canalise": "canalize",
169
+ "canalised": "canalized",
170
+ "canalises": "canalizes",
171
+ "canalising": "canalizing",
172
+ "cancelation": "cancellation",
173
+ "cancelations": "cancellations",
174
+ "cancelled": "canceled",
175
+ "cancelling": "canceling",
176
+ "candour": "candor",
177
+ "cannibalise": "cannibalize",
178
+ "cannibalised": "cannibalized",
179
+ "cannibalises": "cannibalizes",
180
+ "cannibalising": "cannibalizing",
181
+ "canonise": "canonize",
182
+ "canonised": "canonized",
183
+ "canonises": "canonizes",
184
+ "canonising": "canonizing",
185
+ "capitalise": "capitalize",
186
+ "capitalised": "capitalized",
187
+ "capitalises": "capitalizes",
188
+ "capitalising": "capitalizing",
189
+ "caramelise": "caramelize",
190
+ "caramelised": "caramelized",
191
+ "caramelises": "caramelizes",
192
+ "caramelising": "caramelizing",
193
+ "carbonise": "carbonize",
194
+ "carbonised": "carbonized",
195
+ "carbonises": "carbonizes",
196
+ "carbonising": "carbonizing",
197
+ "carolled": "caroled",
198
+ "carolling": "caroling",
199
+ "catalogue": "catalog",
200
+ "catalogued": "cataloged",
201
+ "catalogues": "catalogs",
202
+ "cataloguing": "cataloging",
203
+ "catalyse": "catalyze",
204
+ "catalysed": "catalyzed",
205
+ "catalyses": "catalyzes",
206
+ "catalysing": "catalyzing",
207
+ "categorise": "categorize",
208
+ "categorised": "categorized",
209
+ "categorises": "categorizes",
210
+ "categorising": "categorizing",
211
+ "cauterise": "cauterize",
212
+ "cauterised": "cauterized",
213
+ "cauterises": "cauterizes",
214
+ "cauterising": "cauterizing",
215
+ "cavilled": "caviled",
216
+ "cavilling": "caviling",
217
+ "centigramme": "centigram",
218
+ "centigrammes": "centigrams",
219
+ "centilitre": "centiliter",
220
+ "centilitres": "centiliters",
221
+ "centimetre": "centimeter",
222
+ "centimetres": "centimeters",
223
+ "centralise": "centralize",
224
+ "centralised": "centralized",
225
+ "centralises": "centralizes",
226
+ "centralising": "centralizing",
227
+ "centre": "center",
228
+ "centred": "centered",
229
+ "centrefold": "centerfold",
230
+ "centrefolds": "centerfolds",
231
+ "centrepiece": "centerpiece",
232
+ "centrepieces": "centerpieces",
233
+ "centres": "centers",
234
+ "channelled": "channeled",
235
+ "channelling": "channeling",
236
+ "characterise": "characterize",
237
+ "characterised": "characterized",
238
+ "characterises": "characterizes",
239
+ "characterising": "characterizing",
240
+ "cheque": "check",
241
+ "chequebook": "checkbook",
242
+ "chequebooks": "checkbooks",
243
+ "chequered": "checkered",
244
+ "cheques": "checks",
245
+ "chilli": "chili",
246
+ "chimaera": "chimera",
247
+ "chimaeras": "chimeras",
248
+ "chiselled": "chiseled",
249
+ "chiselling": "chiseling",
250
+ "circularise": "circularize",
251
+ "circularised": "circularized",
252
+ "circularises": "circularizes",
253
+ "circularising": "circularizing",
254
+ "civilise": "civilize",
255
+ "civilised": "civilized",
256
+ "civilises": "civilizes",
257
+ "civilising": "civilizing",
258
+ "clamour": "clamor",
259
+ "clamoured": "clamored",
260
+ "clamouring": "clamoring",
261
+ "clamours": "clamors",
262
+ "clangour": "clangor",
263
+ "clarinettist": "clarinetist",
264
+ "clarinettists": "clarinetists",
265
+ "collectivise": "collectivize",
266
+ "collectivised": "collectivized",
267
+ "collectivises": "collectivizes",
268
+ "collectivising": "collectivizing",
269
+ "colonisation": "colonization",
270
+ "colonise": "colonize",
271
+ "colonised": "colonized",
272
+ "coloniser": "colonizer",
273
+ "colonisers": "colonizers",
274
+ "colonises": "colonizes",
275
+ "colonising": "colonizing",
276
+ "colour": "color",
277
+ "colourant": "colorant",
278
+ "colourants": "colorants",
279
+ "coloured": "colored",
280
+ "coloureds": "coloreds",
281
+ "colourful": "colorful",
282
+ "colourfully": "colorfully",
283
+ "colouring": "coloring",
284
+ "colourize": "colorize",
285
+ "colourized": "colorized",
286
+ "colourizes": "colorizes",
287
+ "colourizing": "colorizing",
288
+ "colourless": "colorless",
289
+ "colours": "colors",
290
+ "commercialise": "commercialize",
291
+ "commercialised": "commercialized",
292
+ "commercialises": "commercializes",
293
+ "commercialising": "commercializing",
294
+ "compartmentalise": "compartmentalize",
295
+ "compartmentalised": "compartmentalized",
296
+ "compartmentalises": "compartmentalizes",
297
+ "compartmentalising": "compartmentalizing",
298
+ "computerise": "computerize",
299
+ "computerised": "computerized",
300
+ "computerises": "computerizes",
301
+ "computerising": "computerizing",
302
+ "conceptualise": "conceptualize",
303
+ "conceptualised": "conceptualized",
304
+ "conceptualises": "conceptualizes",
305
+ "conceptualising": "conceptualizing",
306
+ "connexion": "connection",
307
+ "connexions": "connections",
308
+ "contextualise": "contextualize",
309
+ "contextualised": "contextualized",
310
+ "contextualises": "contextualizes",
311
+ "contextualising": "contextualizing",
312
+ "cosier": "cozier",
313
+ "cosies": "cozies",
314
+ "cosiest": "coziest",
315
+ "cosily": "cozily",
316
+ "cosiness": "coziness",
317
+ "cosy": "cozy",
318
+ "councillor": "councilor",
319
+ "councillors": "councilors",
320
+ "counselled": "counseled",
321
+ "counselling": "counseling",
322
+ "counsellor": "counselor",
323
+ "counsellors": "counselors",
324
+ "crenelated": "crenellated",
325
+ "criminalise": "criminalize",
326
+ "criminalised": "criminalized",
327
+ "criminalises": "criminalizes",
328
+ "criminalising": "criminalizing",
329
+ "criticise": "criticize",
330
+ "criticised": "criticized",
331
+ "criticises": "criticizes",
332
+ "criticising": "criticizing",
333
+ "crueller": "crueler",
334
+ "cruellest": "cruelest",
335
+ "crystallisation": "crystallization",
336
+ "crystallise": "crystallize",
337
+ "crystallised": "crystallized",
338
+ "crystallises": "crystallizes",
339
+ "crystallising": "crystallizing",
340
+ "cudgelled": "cudgeled",
341
+ "cudgelling": "cudgeling",
342
+ "customise": "customize",
343
+ "customised": "customized",
344
+ "customises": "customizes",
345
+ "customising": "customizing",
346
+ "cypher": "cipher",
347
+ "cyphers": "ciphers",
348
+ "decentralisation": "decentralization",
349
+ "decentralise": "decentralize",
350
+ "decentralised": "decentralized",
351
+ "decentralises": "decentralizes",
352
+ "decentralising": "decentralizing",
353
+ "decriminalisation": "decriminalization",
354
+ "decriminalise": "decriminalize",
355
+ "decriminalised": "decriminalized",
356
+ "decriminalises": "decriminalizes",
357
+ "decriminalising": "decriminalizing",
358
+ "defence": "defense",
359
+ "defenceless": "defenseless",
360
+ "defences": "defenses",
361
+ "dehumanisation": "dehumanization",
362
+ "dehumanise": "dehumanize",
363
+ "dehumanised": "dehumanized",
364
+ "dehumanises": "dehumanizes",
365
+ "dehumanising": "dehumanizing",
366
+ "demeanour": "demeanor",
367
+ "demilitarisation": "demilitarization",
368
+ "demilitarise": "demilitarize",
369
+ "demilitarised": "demilitarized",
370
+ "demilitarises": "demilitarizes",
371
+ "demilitarising": "demilitarizing",
372
+ "demobilisation": "demobilization",
373
+ "demobilise": "demobilize",
374
+ "demobilised": "demobilized",
375
+ "demobilises": "demobilizes",
376
+ "demobilising": "demobilizing",
377
+ "democratisation": "democratization",
378
+ "democratise": "democratize",
379
+ "democratised": "democratized",
380
+ "democratises": "democratizes",
381
+ "democratising": "democratizing",
382
+ "demonise": "demonize",
383
+ "demonised": "demonized",
384
+ "demonises": "demonizes",
385
+ "demonising": "demonizing",
386
+ "demoralisation": "demoralization",
387
+ "demoralise": "demoralize",
388
+ "demoralised": "demoralized",
389
+ "demoralises": "demoralizes",
390
+ "demoralising": "demoralizing",
391
+ "denationalisation": "denationalization",
392
+ "denationalise": "denationalize",
393
+ "denationalised": "denationalized",
394
+ "denationalises": "denationalizes",
395
+ "denationalising": "denationalizing",
396
+ "deodorise": "deodorize",
397
+ "deodorised": "deodorized",
398
+ "deodorises": "deodorizes",
399
+ "deodorising": "deodorizing",
400
+ "depersonalise": "depersonalize",
401
+ "depersonalised": "depersonalized",
402
+ "depersonalises": "depersonalizes",
403
+ "depersonalising": "depersonalizing",
404
+ "deputise": "deputize",
405
+ "deputised": "deputized",
406
+ "deputises": "deputizes",
407
+ "deputising": "deputizing",
408
+ "desensitisation": "desensitization",
409
+ "desensitise": "desensitize",
410
+ "desensitised": "desensitized",
411
+ "desensitises": "desensitizes",
412
+ "desensitising": "desensitizing",
413
+ "destabilisation": "destabilization",
414
+ "destabilise": "destabilize",
415
+ "destabilised": "destabilized",
416
+ "destabilises": "destabilizes",
417
+ "destabilising": "destabilizing",
418
+ "dialled": "dialed",
419
+ "dialling": "dialing",
420
+ "dialogue": "dialog",
421
+ "dialogues": "dialogs",
422
+ "diarrhoea": "diarrhea",
423
+ "digitise": "digitize",
424
+ "digitised": "digitized",
425
+ "digitises": "digitizes",
426
+ "digitising": "digitizing",
427
+ "disc": "disk",
428
+ "discolour": "discolor",
429
+ "discoloured": "discolored",
430
+ "discolouring": "discoloring",
431
+ "discolours": "discolors",
432
+ "discs": "disks",
433
+ "disembowelled": "disemboweled",
434
+ "disembowelling": "disemboweling",
435
+ "disfavour": "disfavor",
436
+ "dishevelled": "disheveled",
437
+ "dishonour": "dishonor",
438
+ "dishonourable": "dishonorable",
439
+ "dishonourably": "dishonorably",
440
+ "dishonoured": "dishonored",
441
+ "dishonouring": "dishonoring",
442
+ "dishonours": "dishonors",
443
+ "disorganisation": "disorganization",
444
+ "disorganised": "disorganized",
445
+ "distil": "distill",
446
+ "distils": "distills",
447
+ "dramatisation": "dramatization",
448
+ "dramatisations": "dramatizations",
449
+ "dramatise": "dramatize",
450
+ "dramatised": "dramatized",
451
+ "dramatises": "dramatizes",
452
+ "dramatising": "dramatizing",
453
+ "draught": "draft",
454
+ "draughtboard": "draftboard",
455
+ "draughtboards": "draftboards",
456
+ "draughtier": "draftier",
457
+ "draughtiest": "draftiest",
458
+ "draughts": "drafts",
459
+ "draughtsman": "draftsman",
460
+ "draughtsmanship": "draftsmanship",
461
+ "draughtsmen": "draftsmen",
462
+ "draughtswoman": "draftswoman",
463
+ "draughtswomen": "draftswomen",
464
+ "draughty": "drafty",
465
+ "drivelled": "driveled",
466
+ "drivelling": "driveling",
467
+ "duelled": "dueled",
468
+ "duelling": "dueling",
469
+ "economise": "economize",
470
+ "economised": "economized",
471
+ "economises": "economizes",
472
+ "economising": "economizing",
473
+ "edoema": "edema",
474
+ "editorialise": "editorialize",
475
+ "editorialised": "editorialized",
476
+ "editorialises": "editorializes",
477
+ "editorialising": "editorializing",
478
+ "empathise": "empathize",
479
+ "empathised": "empathized",
480
+ "empathises": "empathizes",
481
+ "empathising": "empathizing",
482
+ "emphasise": "emphasize",
483
+ "emphasised": "emphasized",
484
+ "emphasises": "emphasizes",
485
+ "emphasising": "emphasizing",
486
+ "enamelled": "enameled",
487
+ "enamelling": "enameling",
488
+ "enamoured": "enamored",
489
+ "encyclopaedia": "encyclopedia",
490
+ "encyclopaedias": "encyclopedias",
491
+ "encyclopaedic": "encyclopedic",
492
+ "endeavour": "endeavor",
493
+ "endeavoured": "endeavored",
494
+ "endeavouring": "endeavoring",
495
+ "endeavours": "endeavors",
496
+ "energise": "energize",
497
+ "energised": "energized",
498
+ "energises": "energizes",
499
+ "energising": "energizing",
500
+ "enrol": "enroll",
501
+ "enrols": "enrolls",
502
+ "enthral": "enthrall",
503
+ "enthrals": "enthralls",
504
+ "epaulette": "epaulet",
505
+ "epaulettes": "epaulets",
506
+ "epicentre": "epicenter",
507
+ "epicentres": "epicenters",
508
+ "epilogue": "epilog",
509
+ "epilogues": "epilogs",
510
+ "epitomise": "epitomize",
511
+ "epitomised": "epitomized",
512
+ "epitomises": "epitomizes",
513
+ "epitomising": "epitomizing",
514
+ "equalisation": "equalization",
515
+ "equalise": "equalize",
516
+ "equalised": "equalized",
517
+ "equaliser": "equalizer",
518
+ "equalisers": "equalizers",
519
+ "equalises": "equalizes",
520
+ "equalising": "equalizing",
521
+ "eulogise": "eulogize",
522
+ "eulogised": "eulogized",
523
+ "eulogises": "eulogizes",
524
+ "eulogising": "eulogizing",
525
+ "evangelise": "evangelize",
526
+ "evangelised": "evangelized",
527
+ "evangelises": "evangelizes",
528
+ "evangelising": "evangelizing",
529
+ "exorcise": "exorcize",
530
+ "exorcised": "exorcized",
531
+ "exorcises": "exorcizes",
532
+ "exorcising": "exorcizing",
533
+ "extemporisation": "extemporization",
534
+ "extemporise": "extemporize",
535
+ "extemporised": "extemporized",
536
+ "extemporises": "extemporizes",
537
+ "extemporising": "extemporizing",
538
+ "externalisation": "externalization",
539
+ "externalisations": "externalizations",
540
+ "externalise": "externalize",
541
+ "externalised": "externalized",
542
+ "externalises": "externalizes",
543
+ "externalising": "externalizing",
544
+ "factorise": "factorize",
545
+ "factorised": "factorized",
546
+ "factorises": "factorizes",
547
+ "factorising": "factorizing",
548
+ "faecal": "fecal",
549
+ "faeces": "feces",
550
+ "familiarisation": "familiarization",
551
+ "familiarise": "familiarize",
552
+ "familiarised": "familiarized",
553
+ "familiarises": "familiarizes",
554
+ "familiarising": "familiarizing",
555
+ "fantasise": "fantasize",
556
+ "fantasised": "fantasized",
557
+ "fantasises": "fantasizes",
558
+ "fantasising": "fantasizing",
559
+ "favour": "favor",
560
+ "favourable": "favorable",
561
+ "favourably": "favorably",
562
+ "favoured": "favored",
563
+ "favouring": "favoring",
564
+ "favourite": "favorite",
565
+ "favourites": "favorites",
566
+ "favouritism": "favoritism",
567
+ "favours": "favors",
568
+ "feminise": "feminize",
569
+ "feminised": "feminized",
570
+ "feminises": "feminizes",
571
+ "feminising": "feminizing",
572
+ "fertilisation": "fertilization",
573
+ "fertilise": "fertilize",
574
+ "fertilised": "fertilized",
575
+ "fertiliser": "fertilizer",
576
+ "fertilisers": "fertilizers",
577
+ "fertilises": "fertilizes",
578
+ "fertilising": "fertilizing",
579
+ "fervour": "fervor",
580
+ "fibre": "fiber",
581
+ "fibreglass": "fiberglass",
582
+ "fibres": "fibers",
583
+ "fictionalisation": "fictionalization",
584
+ "fictionalisations": "fictionalizations",
585
+ "fictionalise": "fictionalize",
586
+ "fictionalised": "fictionalized",
587
+ "fictionalises": "fictionalizes",
588
+ "fictionalising": "fictionalizing",
589
+ "fillet": "filet",
590
+ "filleted": "fileted",
591
+ "filleting": "fileting",
592
+ "fillets": "filets",
593
+ "finalisation": "finalization",
594
+ "finalise": "finalize",
595
+ "finalised": "finalized",
596
+ "finalises": "finalizes",
597
+ "finalising": "finalizing",
598
+ "flautist": "flutist",
599
+ "flautists": "flutists",
600
+ "flavour": "flavor",
601
+ "flavoured": "flavored",
602
+ "flavouring": "flavoring",
603
+ "flavourings": "flavorings",
604
+ "flavourless": "flavorless",
605
+ "flavours": "flavors",
606
+ "flavoursome": "flavorsome",
607
+ "flyer / flier": "flier / flyer",
608
+ "foetal": "fetal",
609
+ "foetid": "fetid",
610
+ "foetus": "fetus",
611
+ "foetuses": "fetuses",
612
+ "formalisation": "formalization",
613
+ "formalise": "formalize",
614
+ "formalised": "formalized",
615
+ "formalises": "formalizes",
616
+ "formalising": "formalizing",
617
+ "fossilisation": "fossilization",
618
+ "fossilise": "fossilize",
619
+ "fossilised": "fossilized",
620
+ "fossilises": "fossilizes",
621
+ "fossilising": "fossilizing",
622
+ "fraternisation": "fraternization",
623
+ "fraternise": "fraternize",
624
+ "fraternised": "fraternized",
625
+ "fraternises": "fraternizes",
626
+ "fraternising": "fraternizing",
627
+ "fulfil": "fulfill",
628
+ "fulfilment": "fulfillment",
629
+ "fulfils": "fulfills",
630
+ "funnelled": "funneled",
631
+ "funnelling": "funneling",
632
+ "galvanise": "galvanize",
633
+ "galvanised": "galvanized",
634
+ "galvanises": "galvanizes",
635
+ "galvanising": "galvanizing",
636
+ "gambolled": "gamboled",
637
+ "gambolling": "gamboling",
638
+ "gaol": "jail",
639
+ "gaolbird": "jailbird",
640
+ "gaolbirds": "jailbirds",
641
+ "gaolbreak": "jailbreak",
642
+ "gaolbreaks": "jailbreaks",
643
+ "gaoled": "jailed",
644
+ "gaoler": "jailer",
645
+ "gaolers": "jailers",
646
+ "gaoling": "jailing",
647
+ "gaols": "jails",
648
+ "gasses": "gases",
649
+ "gage": "gauge",
650
+ "gaged": "gauged",
651
+ "gages": "gauges",
652
+ "gaging": "gauging",
653
+ "generalisation": "generalization",
654
+ "generalisations": "generalizations",
655
+ "generalise": "generalize",
656
+ "generalised": "generalized",
657
+ "generalises": "generalizes",
658
+ "generalising": "generalizing",
659
+ "ghettoise": "ghettoize",
660
+ "ghettoised": "ghettoized",
661
+ "ghettoises": "ghettoizes",
662
+ "ghettoising": "ghettoizing",
663
+ "gipsies": "gypsies",
664
+ "glamorise": "glamorize",
665
+ "glamorised": "glamorized",
666
+ "glamorises": "glamorizes",
667
+ "glamorising": "glamorizing",
668
+ "glamor": "glamour",
669
+ "globalisation": "globalization",
670
+ "globalise": "globalize",
671
+ "globalised": "globalized",
672
+ "globalises": "globalizes",
673
+ "globalising": "globalizing",
674
+ "glueing": "gluing",
675
+ "goitre": "goiter",
676
+ "goitres": "goiters",
677
+ "gonorrhoea": "gonorrhea",
678
+ "gramme": "gram",
679
+ "grammes": "grams",
680
+ "gravelled": "graveled",
681
+ "grey": "gray",
682
+ "greyed": "grayed",
683
+ "greying": "graying",
684
+ "greyish": "grayish",
685
+ "greyness": "grayness",
686
+ "greys": "grays",
687
+ "grovelled": "groveled",
688
+ "grovelling": "groveling",
689
+ "groyne": "groin",
690
+ "groynes": "groins",
691
+ "gruelling": "grueling",
692
+ "gruellingly": "gruelingly",
693
+ "gryphon": "griffin",
694
+ "gryphons": "griffins",
695
+ "gynaecological": "gynecological",
696
+ "gynaecologist": "gynecologist",
697
+ "gynaecologists": "gynecologists",
698
+ "gynaecology": "gynecology",
699
+ "haematological": "hematological",
700
+ "haematologist": "hematologist",
701
+ "haematologists": "hematologists",
702
+ "haematology": "hematology",
703
+ "haemoglobin": "hemoglobin",
704
+ "haemophilia": "hemophilia",
705
+ "haemophiliac": "hemophiliac",
706
+ "haemophiliacs": "hemophiliacs",
707
+ "haemorrhage": "hemorrhage",
708
+ "haemorrhaged": "hemorrhaged",
709
+ "haemorrhages": "hemorrhages",
710
+ "haemorrhaging": "hemorrhaging",
711
+ "haemorrhoids": "hemorrhoids",
712
+ "harbour": "harbor",
713
+ "harboured": "harbored",
714
+ "harbouring": "harboring",
715
+ "harbours": "harbors",
716
+ "harmonisation": "harmonization",
717
+ "harmonise": "harmonize",
718
+ "harmonised": "harmonized",
719
+ "harmonises": "harmonizes",
720
+ "harmonising": "harmonizing",
721
+ "homoeopath": "homeopath",
722
+ "homoeopathic": "homeopathic",
723
+ "homoeopaths": "homeopaths",
724
+ "homoeopathy": "homeopathy",
725
+ "homogenise": "homogenize",
726
+ "homogenised": "homogenized",
727
+ "homogenises": "homogenizes",
728
+ "homogenising": "homogenizing",
729
+ "honour": "honor",
730
+ "honourable": "honorable",
731
+ "honourably": "honorably",
732
+ "honoured": "honored",
733
+ "honouring": "honoring",
734
+ "honours": "honors",
735
+ "hospitalisation": "hospitalization",
736
+ "hospitalise": "hospitalize",
737
+ "hospitalised": "hospitalized",
738
+ "hospitalises": "hospitalizes",
739
+ "hospitalising": "hospitalizing",
740
+ "humanise": "humanize",
741
+ "humanised": "humanized",
742
+ "humanises": "humanizes",
743
+ "humanising": "humanizing",
744
+ "humour": "humor",
745
+ "humoured": "humored",
746
+ "humouring": "humoring",
747
+ "humourless": "humorless",
748
+ "humours": "humors",
749
+ "hybridise": "hybridize",
750
+ "hybridised": "hybridized",
751
+ "hybridises": "hybridizes",
752
+ "hybridising": "hybridizing",
753
+ "hypnotise": "hypnotize",
754
+ "hypnotised": "hypnotized",
755
+ "hypnotises": "hypnotizes",
756
+ "hypnotising": "hypnotizing",
757
+ "hypothesise": "hypothesize",
758
+ "hypothesised": "hypothesized",
759
+ "hypothesises": "hypothesizes",
760
+ "hypothesising": "hypothesizing",
761
+ "idealisation": "idealization",
762
+ "idealise": "idealize",
763
+ "idealised": "idealized",
764
+ "idealises": "idealizes",
765
+ "idealising": "idealizing",
766
+ "idolise": "idolize",
767
+ "idolised": "idolized",
768
+ "idolises": "idolizes",
769
+ "idolising": "idolizing",
770
+ "immobilisation": "immobilization",
771
+ "immobilise": "immobilize",
772
+ "immobilised": "immobilized",
773
+ "immobiliser": "immobilizer",
774
+ "immobilisers": "immobilizers",
775
+ "immobilises": "immobilizes",
776
+ "immobilising": "immobilizing",
777
+ "immortalise": "immortalize",
778
+ "immortalised": "immortalized",
779
+ "immortalises": "immortalizes",
780
+ "immortalising": "immortalizing",
781
+ "immunisation": "immunization",
782
+ "immunise": "immunize",
783
+ "immunised": "immunized",
784
+ "immunises": "immunizes",
785
+ "immunising": "immunizing",
786
+ "impanelled": "impaneled",
787
+ "impanelling": "impaneling",
788
+ "imperilled": "imperiled",
789
+ "imperilling": "imperiling",
790
+ "individualise": "individualize",
791
+ "individualised": "individualized",
792
+ "individualises": "individualizes",
793
+ "individualising": "individualizing",
794
+ "industrialise": "industrialize",
795
+ "industrialised": "industrialized",
796
+ "industrialises": "industrializes",
797
+ "industrialising": "industrializing",
798
+ "inflexion": "inflection",
799
+ "inflexions": "inflections",
800
+ "initialise": "initialize",
801
+ "initialised": "initialized",
802
+ "initialises": "initializes",
803
+ "initialising": "initializing",
804
+ "initialled": "initialed",
805
+ "initialling": "initialing",
806
+ "instal": "install",
807
+ "instalment": "installment",
808
+ "instalments": "installments",
809
+ "instals": "installs",
810
+ "instil": "instill",
811
+ "instils": "instills",
812
+ "institutionalisation": "institutionalization",
813
+ "institutionalise": "institutionalize",
814
+ "institutionalised": "institutionalized",
815
+ "institutionalises": "institutionalizes",
816
+ "institutionalising": "institutionalizing",
817
+ "intellectualise": "intellectualize",
818
+ "intellectualised": "intellectualized",
819
+ "intellectualises": "intellectualizes",
820
+ "intellectualising": "intellectualizing",
821
+ "internalisation": "internalization",
822
+ "internalise": "internalize",
823
+ "internalised": "internalized",
824
+ "internalises": "internalizes",
825
+ "internalising": "internalizing",
826
+ "internationalisation": "internationalization",
827
+ "internationalise": "internationalize",
828
+ "internationalised": "internationalized",
829
+ "internationalises": "internationalizes",
830
+ "internationalising": "internationalizing",
831
+ "ionisation": "ionization",
832
+ "ionise": "ionize",
833
+ "ionised": "ionized",
834
+ "ioniser": "ionizer",
835
+ "ionisers": "ionizers",
836
+ "ionises": "ionizes",
837
+ "ionising": "ionizing",
838
+ "italicise": "italicize",
839
+ "italicised": "italicized",
840
+ "italicises": "italicizes",
841
+ "italicising": "italicizing",
842
+ "itemise": "itemize",
843
+ "itemised": "itemized",
844
+ "itemises": "itemizes",
845
+ "itemising": "itemizing",
846
+ "jeopardise": "jeopardize",
847
+ "jeopardised": "jeopardized",
848
+ "jeopardises": "jeopardizes",
849
+ "jeopardising": "jeopardizing",
850
+ "jewelled": "jeweled",
851
+ "jeweller": "jeweler",
852
+ "jewellers": "jewelers",
853
+ "jewellery": "jewelry",
854
+ "judgement": "judgment",
855
+ "kilogramme": "kilogram",
856
+ "kilogrammes": "kilograms",
857
+ "kilometre": "kilometer",
858
+ "kilometres": "kilometers",
859
+ "labelled": "labeled",
860
+ "labelling": "labeling",
861
+ "labour": "labor",
862
+ "laboured": "labored",
863
+ "labourer": "laborer",
864
+ "labourers": "laborers",
865
+ "labouring": "laboring",
866
+ "labours": "labors",
867
+ "lacklustre": "lackluster",
868
+ "legalisation": "legalization",
869
+ "legalise": "legalize",
870
+ "legalised": "legalized",
871
+ "legalises": "legalizes",
872
+ "legalising": "legalizing",
873
+ "legitimise": "legitimize",
874
+ "legitimised": "legitimized",
875
+ "legitimises": "legitimizes",
876
+ "legitimising": "legitimizing",
877
+ "leukaemia": "leukemia",
878
+ "levelled": "leveled",
879
+ "leveller": "leveler",
880
+ "levellers": "levelers",
881
+ "levelling": "leveling",
882
+ "libelled": "libeled",
883
+ "libelling": "libeling",
884
+ "libellous": "libelous",
885
+ "liberalisation": "liberalization",
886
+ "liberalise": "liberalize",
887
+ "liberalised": "liberalized",
888
+ "liberalises": "liberalizes",
889
+ "liberalising": "liberalizing",
890
+ "licence": "license",
891
+ "licenced": "licensed",
892
+ "licences": "licenses",
893
+ "licencing": "licensing",
894
+ "likeable": "likable",
895
+ "lionisation": "lionization",
896
+ "lionise": "lionize",
897
+ "lionised": "lionized",
898
+ "lionises": "lionizes",
899
+ "lionising": "lionizing",
900
+ "liquidise": "liquidize",
901
+ "liquidised": "liquidized",
902
+ "liquidiser": "liquidizer",
903
+ "liquidisers": "liquidizers",
904
+ "liquidises": "liquidizes",
905
+ "liquidising": "liquidizing",
906
+ "litre": "liter",
907
+ "litres": "liters",
908
+ "localise": "localize",
909
+ "localised": "localized",
910
+ "localises": "localizes",
911
+ "localising": "localizing",
912
+ "louvre": "louver",
913
+ "louvred": "louvered",
914
+ "louvres": "louvers",
915
+ "lustre": "luster",
916
+ "magnetise": "magnetize",
917
+ "magnetised": "magnetized",
918
+ "magnetises": "magnetizes",
919
+ "magnetising": "magnetizing",
920
+ "manoeuvrability": "maneuverability",
921
+ "manoeuvrable": "maneuverable",
922
+ "manoeuvre": "maneuver",
923
+ "manoeuvred": "maneuvered",
924
+ "manoeuvres": "maneuvers",
925
+ "manoeuvring": "maneuvering",
926
+ "manoeuvrings": "maneuverings",
927
+ "marginalisation": "marginalization",
928
+ "marginalise": "marginalize",
929
+ "marginalised": "marginalized",
930
+ "marginalises": "marginalizes",
931
+ "marginalising": "marginalizing",
932
+ "marshalled": "marshaled",
933
+ "marshalling": "marshaling",
934
+ "marvelled": "marveled",
935
+ "marvelling": "marveling",
936
+ "marvellous": "marvelous",
937
+ "marvellously": "marvelously",
938
+ "materialisation": "materialization",
939
+ "materialise": "materialize",
940
+ "materialised": "materialized",
941
+ "materialises": "materializes",
942
+ "materialising": "materializing",
943
+ "maximisation": "maximization",
944
+ "maximise": "maximize",
945
+ "maximised": "maximized",
946
+ "maximises": "maximizes",
947
+ "maximising": "maximizing",
948
+ "meagre": "meager",
949
+ "mechanisation": "mechanization",
950
+ "mechanise": "mechanize",
951
+ "mechanised": "mechanized",
952
+ "mechanises": "mechanizes",
953
+ "mechanising": "mechanizing",
954
+ "mediaeval": "medieval",
955
+ "memorialise": "memorialize",
956
+ "memorialised": "memorialized",
957
+ "memorialises": "memorializes",
958
+ "memorialising": "memorializing",
959
+ "memorise": "memorize",
960
+ "memorised": "memorized",
961
+ "memorises": "memorizes",
962
+ "memorising": "memorizing",
963
+ "mesmerise": "mesmerize",
964
+ "mesmerised": "mesmerized",
965
+ "mesmerises": "mesmerizes",
966
+ "mesmerising": "mesmerizing",
967
+ "metabolise": "metabolize",
968
+ "metabolised": "metabolized",
969
+ "metabolises": "metabolizes",
970
+ "metabolising": "metabolizing",
971
+ "metre": "meter",
972
+ "metres": "meters",
973
+ "micrometre": "micrometer",
974
+ "micrometres": "micrometers",
975
+ "militarise": "militarize",
976
+ "militarised": "militarized",
977
+ "militarises": "militarizes",
978
+ "militarising": "militarizing",
979
+ "milligramme": "milligram",
980
+ "milligrammes": "milligrams",
981
+ "millilitre": "milliliter",
982
+ "millilitres": "milliliters",
983
+ "millimetre": "millimeter",
984
+ "millimetres": "millimeters",
985
+ "miniaturisation": "miniaturization",
986
+ "miniaturise": "miniaturize",
987
+ "miniaturised": "miniaturized",
988
+ "miniaturises": "miniaturizes",
989
+ "miniaturising": "miniaturizing",
990
+ "minibusses": "minibuses",
991
+ "minimise": "minimize",
992
+ "minimised": "minimized",
993
+ "minimises": "minimizes",
994
+ "minimising": "minimizing",
995
+ "misbehaviour": "misbehavior",
996
+ "misdemeanour": "misdemeanor",
997
+ "misdemeanours": "misdemeanors",
998
+ "misspelt": "misspelled",
999
+ "mitre": "miter",
1000
+ "mitres": "miters",
1001
+ "mobilisation": "mobilization",
1002
+ "mobilise": "mobilize",
1003
+ "mobilised": "mobilized",
1004
+ "mobilises": "mobilizes",
1005
+ "mobilising": "mobilizing",
1006
+ "modelled": "modeled",
1007
+ "modeller": "modeler",
1008
+ "modellers": "modelers",
1009
+ "modelling": "modeling",
1010
+ "modernise": "modernize",
1011
+ "modernised": "modernized",
1012
+ "modernises": "modernizes",
1013
+ "modernising": "modernizing",
1014
+ "moisturise": "moisturize",
1015
+ "moisturised": "moisturized",
1016
+ "moisturiser": "moisturizer",
1017
+ "moisturisers": "moisturizers",
1018
+ "moisturises": "moisturizes",
1019
+ "moisturising": "moisturizing",
1020
+ "monologue": "monolog",
1021
+ "monologues": "monologs",
1022
+ "monopolisation": "monopolization",
1023
+ "monopolise": "monopolize",
1024
+ "monopolised": "monopolized",
1025
+ "monopolises": "monopolizes",
1026
+ "monopolising": "monopolizing",
1027
+ "moralise": "moralize",
1028
+ "moralised": "moralized",
1029
+ "moralises": "moralizes",
1030
+ "moralising": "moralizing",
1031
+ "motorised": "motorized",
1032
+ "mould": "mold",
1033
+ "moulded": "molded",
1034
+ "moulder": "molder",
1035
+ "mouldered": "moldered",
1036
+ "mouldering": "moldering",
1037
+ "moulders": "molders",
1038
+ "mouldier": "moldier",
1039
+ "mouldiest": "moldiest",
1040
+ "moulding": "molding",
1041
+ "mouldings": "moldings",
1042
+ "moulds": "molds",
1043
+ "mouldy": "moldy",
1044
+ "moult": "molt",
1045
+ "moulted": "molted",
1046
+ "moulting": "molting",
1047
+ "moults": "molts",
1048
+ "moustache": "mustache",
1049
+ "moustached": "mustached",
1050
+ "moustaches": "mustaches",
1051
+ "moustachioed": "mustachioed",
1052
+ "multicoloured": "multicolored",
1053
+ "nationalisation": "nationalization",
1054
+ "nationalisations": "nationalizations",
1055
+ "nationalise": "nationalize",
1056
+ "nationalised": "nationalized",
1057
+ "nationalises": "nationalizes",
1058
+ "nationalising": "nationalizing",
1059
+ "naturalisation": "naturalization",
1060
+ "naturalise": "naturalize",
1061
+ "naturalised": "naturalized",
1062
+ "naturalises": "naturalizes",
1063
+ "naturalising": "naturalizing",
1064
+ "neighbour": "neighbor",
1065
+ "neighbourhood": "neighborhood",
1066
+ "neighbourhoods": "neighborhoods",
1067
+ "neighbouring": "neighboring",
1068
+ "neighbourliness": "neighborliness",
1069
+ "neighbourly": "neighborly",
1070
+ "neighbours": "neighbors",
1071
+ "neutralisation": "neutralization",
1072
+ "neutralise": "neutralize",
1073
+ "neutralised": "neutralized",
1074
+ "neutralises": "neutralizes",
1075
+ "neutralising": "neutralizing",
1076
+ "normalisation": "normalization",
1077
+ "normalise": "normalize",
1078
+ "normalised": "normalized",
1079
+ "normalises": "normalizes",
1080
+ "normalising": "normalizing",
1081
+ "odour": "odor",
1082
+ "odourless": "odorless",
1083
+ "odours": "odors",
1084
+ "oesophagus": "esophagus",
1085
+ "oesophaguses": "esophaguses",
1086
+ "oestrogen": "estrogen",
1087
+ "offence": "offense",
1088
+ "offences": "offenses",
1089
+ "omelette": "omelet",
1090
+ "omelettes": "omelets",
1091
+ "optimise": "optimize",
1092
+ "optimised": "optimized",
1093
+ "optimises": "optimizes",
1094
+ "optimising": "optimizing",
1095
+ "organisation": "organization",
1096
+ "organisational": "organizational",
1097
+ "organisations": "organizations",
1098
+ "organise": "organize",
1099
+ "organised": "organized",
1100
+ "organiser": "organizer",
1101
+ "organisers": "organizers",
1102
+ "organises": "organizes",
1103
+ "organising": "organizing",
1104
+ "orthopaedic": "orthopedic",
1105
+ "orthopaedics": "orthopedics",
1106
+ "ostracise": "ostracize",
1107
+ "ostracised": "ostracized",
1108
+ "ostracises": "ostracizes",
1109
+ "ostracising": "ostracizing",
1110
+ "outmanoeuvre": "outmaneuver",
1111
+ "outmanoeuvred": "outmaneuvered",
1112
+ "outmanoeuvres": "outmaneuvers",
1113
+ "outmanoeuvring": "outmaneuvering",
1114
+ "overemphasise": "overemphasize",
1115
+ "overemphasised": "overemphasized",
1116
+ "overemphasises": "overemphasizes",
1117
+ "overemphasising": "overemphasizing",
1118
+ "oxidisation": "oxidization",
1119
+ "oxidise": "oxidize",
1120
+ "oxidised": "oxidized",
1121
+ "oxidises": "oxidizes",
1122
+ "oxidising": "oxidizing",
1123
+ "paederast": "pederast",
1124
+ "paederasts": "pederasts",
1125
+ "paediatric": "pediatric",
1126
+ "paediatrician": "pediatrician",
1127
+ "paediatricians": "pediatricians",
1128
+ "paediatrics": "pediatrics",
1129
+ "paedophile": "pedophile",
1130
+ "paedophiles": "pedophiles",
1131
+ "paedophilia": "pedophilia",
1132
+ "palaeolithic": "paleolithic",
1133
+ "palaeontologist": "paleontologist",
1134
+ "palaeontologists": "paleontologists",
1135
+ "palaeontology": "paleontology",
1136
+ "panelled": "paneled",
1137
+ "panelling": "paneling",
1138
+ "panellist": "panelist",
1139
+ "panellists": "panelists",
1140
+ "paralyse": "paralyze",
1141
+ "paralysed": "paralyzed",
1142
+ "paralyses": "paralyzes",
1143
+ "paralysing": "paralyzing",
1144
+ "parcelled": "parceled",
1145
+ "parcelling": "parceling",
1146
+ "parlour": "parlor",
1147
+ "parlours": "parlors",
1148
+ "particularise": "particularize",
1149
+ "particularised": "particularized",
1150
+ "particularises": "particularizes",
1151
+ "particularising": "particularizing",
1152
+ "passivisation": "passivization",
1153
+ "passivise": "passivize",
1154
+ "passivised": "passivized",
1155
+ "passivises": "passivizes",
1156
+ "passivising": "passivizing",
1157
+ "pasteurisation": "pasteurization",
1158
+ "pasteurise": "pasteurize",
1159
+ "pasteurised": "pasteurized",
1160
+ "pasteurises": "pasteurizes",
1161
+ "pasteurising": "pasteurizing",
1162
+ "patronise": "patronize",
1163
+ "patronised": "patronized",
1164
+ "patronises": "patronizes",
1165
+ "patronising": "patronizing",
1166
+ "patronisingly": "patronizingly",
1167
+ "pedalled": "pedaled",
1168
+ "pedalling": "pedaling",
1169
+ "pedestrianisation": "pedestrianization",
1170
+ "pedestrianise": "pedestrianize",
1171
+ "pedestrianised": "pedestrianized",
1172
+ "pedestrianises": "pedestrianizes",
1173
+ "pedestrianising": "pedestrianizing",
1174
+ "penalise": "penalize",
1175
+ "penalised": "penalized",
1176
+ "penalises": "penalizes",
1177
+ "penalising": "penalizing",
1178
+ "pencilled": "penciled",
1179
+ "pencilling": "penciling",
1180
+ "personalise": "personalize",
1181
+ "personalised": "personalized",
1182
+ "personalises": "personalizes",
1183
+ "personalising": "personalizing",
1184
+ "pharmacopoeia": "pharmacopeia",
1185
+ "pharmacopoeias": "pharmacopeias",
1186
+ "philosophise": "philosophize",
1187
+ "philosophised": "philosophized",
1188
+ "philosophises": "philosophizes",
1189
+ "philosophising": "philosophizing",
1190
+ "philtre": "filter",
1191
+ "philtres": "filters",
1192
+ "phoney": "phony",
1193
+ "plagiarise": "plagiarize",
1194
+ "plagiarised": "plagiarized",
1195
+ "plagiarises": "plagiarizes",
1196
+ "plagiarising": "plagiarizing",
1197
+ "plough": "plow",
1198
+ "ploughed": "plowed",
1199
+ "ploughing": "plowing",
1200
+ "ploughman": "plowman",
1201
+ "ploughmen": "plowmen",
1202
+ "ploughs": "plows",
1203
+ "ploughshare": "plowshare",
1204
+ "ploughshares": "plowshares",
1205
+ "polarisation": "polarization",
1206
+ "polarise": "polarize",
1207
+ "polarised": "polarized",
1208
+ "polarises": "polarizes",
1209
+ "polarising": "polarizing",
1210
+ "politicisation": "politicization",
1211
+ "politicise": "politicize",
1212
+ "politicised": "politicized",
1213
+ "politicises": "politicizes",
1214
+ "politicising": "politicizing",
1215
+ "popularisation": "popularization",
1216
+ "popularise": "popularize",
1217
+ "popularised": "popularized",
1218
+ "popularises": "popularizes",
1219
+ "popularising": "popularizing",
1220
+ "pouffe": "pouf",
1221
+ "pouffes": "poufs",
1222
+ "practise": "practice",
1223
+ "practised": "practiced",
1224
+ "practises": "practices",
1225
+ "practising": "practicing",
1226
+ "praesidium": "presidium",
1227
+ "praesidiums": "presidiums",
1228
+ "pressurisation": "pressurization",
1229
+ "pressurise": "pressurize",
1230
+ "pressurised": "pressurized",
1231
+ "pressurises": "pressurizes",
1232
+ "pressurising": "pressurizing",
1233
+ "pretence": "pretense",
1234
+ "pretences": "pretenses",
1235
+ "primaeval": "primeval",
1236
+ "prioritisation": "prioritization",
1237
+ "prioritise": "prioritize",
1238
+ "prioritised": "prioritized",
1239
+ "prioritises": "prioritizes",
1240
+ "prioritising": "prioritizing",
1241
+ "privatisation": "privatization",
1242
+ "privatisations": "privatizations",
1243
+ "privatise": "privatize",
1244
+ "privatised": "privatized",
1245
+ "privatises": "privatizes",
1246
+ "privatising": "privatizing",
1247
+ "professionalisation": "professionalization",
1248
+ "professionalise": "professionalize",
1249
+ "professionalised": "professionalized",
1250
+ "professionalises": "professionalizes",
1251
+ "professionalising": "professionalizing",
1252
+ "programme": "program",
1253
+ "programmes": "programs",
1254
+ "prologue": "prolog",
1255
+ "prologues": "prologs",
1256
+ "propagandise": "propagandize",
1257
+ "propagandised": "propagandized",
1258
+ "propagandises": "propagandizes",
1259
+ "propagandising": "propagandizing",
1260
+ "proselytise": "proselytize",
1261
+ "proselytised": "proselytized",
1262
+ "proselytiser": "proselytizer",
1263
+ "proselytisers": "proselytizers",
1264
+ "proselytises": "proselytizes",
1265
+ "proselytising": "proselytizing",
1266
+ "psychoanalyse": "psychoanalyze",
1267
+ "psychoanalysed": "psychoanalyzed",
1268
+ "psychoanalyses": "psychoanalyzes",
1269
+ "psychoanalysing": "psychoanalyzing",
1270
+ "publicise": "publicize",
1271
+ "publicised": "publicized",
1272
+ "publicises": "publicizes",
1273
+ "publicising": "publicizing",
1274
+ "pulverisation": "pulverization",
1275
+ "pulverise": "pulverize",
1276
+ "pulverised": "pulverized",
1277
+ "pulverises": "pulverizes",
1278
+ "pulverising": "pulverizing",
1279
+ "pummelled": "pummel",
1280
+ "pummelling": "pummeled",
1281
+ "pyjama": "pajama",
1282
+ "pyjamas": "pajamas",
1283
+ "pzazz": "pizzazz",
1284
+ "quarrelled": "quarreled",
1285
+ "quarrelling": "quarreling",
1286
+ "radicalise": "radicalize",
1287
+ "radicalised": "radicalized",
1288
+ "radicalises": "radicalizes",
1289
+ "radicalising": "radicalizing",
1290
+ "rancour": "rancor",
1291
+ "randomise": "randomize",
1292
+ "randomised": "randomized",
1293
+ "randomises": "randomizes",
1294
+ "randomising": "randomizing",
1295
+ "rationalisation": "rationalization",
1296
+ "rationalisations": "rationalizations",
1297
+ "rationalise": "rationalize",
1298
+ "rationalised": "rationalized",
1299
+ "rationalises": "rationalizes",
1300
+ "rationalising": "rationalizing",
1301
+ "ravelled": "raveled",
1302
+ "ravelling": "raveling",
1303
+ "realisable": "realizable",
1304
+ "realisation": "realization",
1305
+ "realisations": "realizations",
1306
+ "realise": "realize",
1307
+ "realised": "realized",
1308
+ "realises": "realizes",
1309
+ "realising": "realizing",
1310
+ "recognisable": "recognizable",
1311
+ "recognisably": "recognizably",
1312
+ "recognisance": "recognizance",
1313
+ "recognise": "recognize",
1314
+ "recognised": "recognized",
1315
+ "recognises": "recognizes",
1316
+ "recognising": "recognizing",
1317
+ "reconnoitre": "reconnoiter",
1318
+ "reconnoitred": "reconnoitered",
1319
+ "reconnoitres": "reconnoiters",
1320
+ "reconnoitring": "reconnoitering",
1321
+ "refuelled": "refueled",
1322
+ "refuelling": "refueling",
1323
+ "regularisation": "regularization",
1324
+ "regularise": "regularize",
1325
+ "regularised": "regularized",
1326
+ "regularises": "regularizes",
1327
+ "regularising": "regularizing",
1328
+ "remodelled": "remodeled",
1329
+ "remodelling": "remodeling",
1330
+ "remould": "remold",
1331
+ "remoulded": "remolded",
1332
+ "remoulding": "remolding",
1333
+ "remoulds": "remolds",
1334
+ "reorganisation": "reorganization",
1335
+ "reorganisations": "reorganizations",
1336
+ "reorganise": "reorganize",
1337
+ "reorganised": "reorganized",
1338
+ "reorganises": "reorganizes",
1339
+ "reorganising": "reorganizing",
1340
+ "revelled": "reveled",
1341
+ "reveller": "reveler",
1342
+ "revellers": "revelers",
1343
+ "revelling": "reveling",
1344
+ "revitalise": "revitalize",
1345
+ "revitalised": "revitalized",
1346
+ "revitalises": "revitalizes",
1347
+ "revitalising": "revitalizing",
1348
+ "revolutionise": "revolutionize",
1349
+ "revolutionised": "revolutionized",
1350
+ "revolutionises": "revolutionizes",
1351
+ "revolutionising": "revolutionizing",
1352
+ "rhapsodise": "rhapsodize",
1353
+ "rhapsodised": "rhapsodized",
1354
+ "rhapsodises": "rhapsodizes",
1355
+ "rhapsodising": "rhapsodizing",
1356
+ "rigour": "rigor",
1357
+ "rigours": "rigors",
1358
+ "ritualised": "ritualized",
1359
+ "rivalled": "rivaled",
1360
+ "rivalling": "rivaling",
1361
+ "romanticise": "romanticize",
1362
+ "romanticised": "romanticized",
1363
+ "romanticises": "romanticizes",
1364
+ "romanticising": "romanticizing",
1365
+ "rumour": "rumor",
1366
+ "rumoured": "rumored",
1367
+ "rumours": "rumors",
1368
+ "sabre": "saber",
1369
+ "sabres": "sabers",
1370
+ "saltpetre": "saltpeter",
1371
+ "sanitise": "sanitize",
1372
+ "sanitised": "sanitized",
1373
+ "sanitises": "sanitizes",
1374
+ "sanitising": "sanitizing",
1375
+ "satirise": "satirize",
1376
+ "satirised": "satirized",
1377
+ "satirises": "satirizes",
1378
+ "satirising": "satirizing",
1379
+ "saviour": "savior",
1380
+ "saviours": "saviors",
1381
+ "savour": "savor",
1382
+ "savoured": "savored",
1383
+ "savouries": "savories",
1384
+ "savouring": "savoring",
1385
+ "savours": "savors",
1386
+ "savoury": "savory",
1387
+ "scandalise": "scandalize",
1388
+ "scandalised": "scandalized",
1389
+ "scandalises": "scandalizes",
1390
+ "scandalising": "scandalizing",
1391
+ "sceptic": "skeptic",
1392
+ "sceptical": "skeptical",
1393
+ "sceptically": "skeptically",
1394
+ "scepticism": "skepticism",
1395
+ "sceptics": "skeptics",
1396
+ "sceptre": "scepter",
1397
+ "sceptres": "scepters",
1398
+ "scrutinise": "scrutinize",
1399
+ "scrutinised": "scrutinized",
1400
+ "scrutinises": "scrutinizes",
1401
+ "scrutinising": "scrutinizing",
1402
+ "secularisation": "secularization",
1403
+ "secularise": "secularize",
1404
+ "secularised": "secularized",
1405
+ "secularises": "secularizes",
1406
+ "secularising": "secularizing",
1407
+ "sensationalise": "sensationalize",
1408
+ "sensationalised": "sensationalized",
1409
+ "sensationalises": "sensationalizes",
1410
+ "sensationalising": "sensationalizing",
1411
+ "sensitise": "sensitize",
1412
+ "sensitised": "sensitized",
1413
+ "sensitises": "sensitizes",
1414
+ "sensitising": "sensitizing",
1415
+ "sentimentalise": "sentimentalize",
1416
+ "sentimentalised": "sentimentalized",
1417
+ "sentimentalises": "sentimentalizes",
1418
+ "sentimentalising": "sentimentalizing",
1419
+ "sepulchre": "sepulcher",
1420
+ "sepulchres": "sepulchers",
1421
+ "serialisation": "serialization",
1422
+ "serialisations": "serializations",
1423
+ "serialise": "serialize",
1424
+ "serialised": "serialized",
1425
+ "serialises": "serializes",
1426
+ "serialising": "serializing",
1427
+ "sermonise": "sermonize",
1428
+ "sermonised": "sermonized",
1429
+ "sermonises": "sermonizes",
1430
+ "sermonising": "sermonizing",
1431
+ "sheikh": "sheik",
1432
+ "shovelled": "shoveled",
1433
+ "shovelling": "shoveling",
1434
+ "shrivelled": "shriveled",
1435
+ "shrivelling": "shriveling",
1436
+ "signalise": "signalize",
1437
+ "signalised": "signalized",
1438
+ "signalises": "signalizes",
1439
+ "signalising": "signalizing",
1440
+ "signalled": "signaled",
1441
+ "signalling": "signaling",
1442
+ "smoulder": "smolder",
1443
+ "smouldered": "smoldered",
1444
+ "smouldering": "smoldering",
1445
+ "smoulders": "smolders",
1446
+ "snivelled": "sniveled",
1447
+ "snivelling": "sniveling",
1448
+ "snorkelled": "snorkeled",
1449
+ "snorkelling": "snorkeling",
1450
+ "snowplough": "snowplow",
1451
+ "snowploughs": "snowplow",
1452
+ "socialisation": "socialization",
1453
+ "socialise": "socialize",
1454
+ "socialised": "socialized",
1455
+ "socialises": "socializes",
1456
+ "socialising": "socializing",
1457
+ "sodomise": "sodomize",
1458
+ "sodomised": "sodomized",
1459
+ "sodomises": "sodomizes",
1460
+ "sodomising": "sodomizing",
1461
+ "solemnise": "solemnize",
1462
+ "solemnised": "solemnized",
1463
+ "solemnises": "solemnizes",
1464
+ "solemnising": "solemnizing",
1465
+ "sombre": "somber",
1466
+ "specialisation": "specialization",
1467
+ "specialisations": "specializations",
1468
+ "specialise": "specialize",
1469
+ "specialised": "specialized",
1470
+ "specialises": "specializes",
1471
+ "specialising": "specializing",
1472
+ "spectre": "specter",
1473
+ "spectres": "specters",
1474
+ "spiralled": "spiraled",
1475
+ "spiralling": "spiraling",
1476
+ "splendour": "splendor",
1477
+ "splendours": "splendors",
1478
+ "squirrelled": "squirreled",
1479
+ "squirrelling": "squirreling",
1480
+ "stabilisation": "stabilization",
1481
+ "stabilise": "stabilize",
1482
+ "stabilised": "stabilized",
1483
+ "stabiliser": "stabilizer",
1484
+ "stabilisers": "stabilizers",
1485
+ "stabilises": "stabilizes",
1486
+ "stabilising": "stabilizing",
1487
+ "standardisation": "standardization",
1488
+ "standardise": "standardize",
1489
+ "standardised": "standardized",
1490
+ "standardises": "standardizes",
1491
+ "standardising": "standardizing",
1492
+ "stencilled": "stenciled",
1493
+ "stencilling": "stenciling",
1494
+ "sterilisation": "sterilization",
1495
+ "sterilisations": "sterilizations",
1496
+ "sterilise": "sterilize",
1497
+ "sterilised": "sterilized",
1498
+ "steriliser": "sterilizer",
1499
+ "sterilisers": "sterilizers",
1500
+ "sterilises": "sterilizes",
1501
+ "sterilising": "sterilizing",
1502
+ "stigmatisation": "stigmatization",
1503
+ "stigmatise": "stigmatize",
1504
+ "stigmatised": "stigmatized",
1505
+ "stigmatises": "stigmatizes",
1506
+ "stigmatising": "stigmatizing",
1507
+ "storey": "story",
1508
+ "storeys": "stories",
1509
+ "subsidisation": "subsidization",
1510
+ "subsidise": "subsidize",
1511
+ "subsidised": "subsidized",
1512
+ "subsidiser": "subsidizer",
1513
+ "subsidisers": "subsidizers",
1514
+ "subsidises": "subsidizes",
1515
+ "subsidising": "subsidizing",
1516
+ "succour": "succor",
1517
+ "succoured": "succored",
1518
+ "succouring": "succoring",
1519
+ "succours": "succors",
1520
+ "sulphate": "sulfate",
1521
+ "sulphates": "sulfates",
1522
+ "sulphide": "sulfide",
1523
+ "sulphides": "sulfides",
1524
+ "sulphur": "sulfur",
1525
+ "sulphurous": "sulfurous",
1526
+ "summarise": "summarize",
1527
+ "summarised": "summarized",
1528
+ "summarises": "summarizes",
1529
+ "summarising": "summarizing",
1530
+ "swivelled": "swiveled",
1531
+ "swivelling": "swiveling",
1532
+ "symbolise": "symbolize",
1533
+ "symbolised": "symbolized",
1534
+ "symbolises": "symbolizes",
1535
+ "symbolising": "symbolizing",
1536
+ "sympathise": "sympathize",
1537
+ "sympathised": "sympathized",
1538
+ "sympathiser": "sympathizer",
1539
+ "sympathisers": "sympathizers",
1540
+ "sympathises": "sympathizes",
1541
+ "sympathising": "sympathizing",
1542
+ "synchronisation": "synchronization",
1543
+ "synchronise": "synchronize",
1544
+ "synchronised": "synchronized",
1545
+ "synchronises": "synchronizes",
1546
+ "synchronising": "synchronizing",
1547
+ "synthesise": "synthesize",
1548
+ "synthesised": "synthesized",
1549
+ "synthesiser": "synthesizer",
1550
+ "synthesisers": "synthesizers",
1551
+ "synthesises": "synthesizes",
1552
+ "synthesising": "synthesizing",
1553
+ "syphon": "siphon",
1554
+ "syphoned": "siphoned",
1555
+ "syphoning": "siphoning",
1556
+ "syphons": "siphons",
1557
+ "systematisation": "systematization",
1558
+ "systematise": "systematize",
1559
+ "systematised": "systematized",
1560
+ "systematises": "systematizes",
1561
+ "systematising": "systematizing",
1562
+ "tantalise": "tantalize",
1563
+ "tantalised": "tantalized",
1564
+ "tantalises": "tantalizes",
1565
+ "tantalising": "tantalizing",
1566
+ "tantalisingly": "tantalizingly",
1567
+ "tasselled": "tasseled",
1568
+ "technicolour": "technicolor",
1569
+ "temporise": "temporize",
1570
+ "temporised": "temporized",
1571
+ "temporises": "temporizes",
1572
+ "temporising": "temporizing",
1573
+ "tenderise": "tenderize",
1574
+ "tenderised": "tenderized",
1575
+ "tenderises": "tenderizes",
1576
+ "tenderising": "tenderizing",
1577
+ "terrorise": "terrorize",
1578
+ "terrorised": "terrorized",
1579
+ "terrorises": "terrorizes",
1580
+ "terrorising": "terrorizing",
1581
+ "theatre": "theater",
1582
+ "theatregoer": "theatergoer",
1583
+ "theatregoers": "theatergoers",
1584
+ "theatres": "theaters",
1585
+ "theorise": "theorize",
1586
+ "theorised": "theorized",
1587
+ "theorises": "theorizes",
1588
+ "theorising": "theorizing",
1589
+ "tonne": "ton",
1590
+ "tonnes": "tons",
1591
+ "towelled": "toweled",
1592
+ "towelling": "toweling",
1593
+ "toxaemia": "toxemia",
1594
+ "tranquillise": "tranquilize",
1595
+ "tranquillised": "tranquilized",
1596
+ "tranquilliser": "tranquilizer",
1597
+ "tranquillisers": "tranquilizers",
1598
+ "tranquillises": "tranquilizes",
1599
+ "tranquillising": "tranquilizing",
1600
+ "tranquillity": "tranquility",
1601
+ "tranquillize": "tranquilize",
1602
+ "tranquillized": "tranquilized",
1603
+ "tranquillizer": "tranquilizer",
1604
+ "tranquillizers": "tranquilizers",
1605
+ "tranquillizes": "tranquilizes",
1606
+ "tranquillizing": "tranquilizing",
1607
+ "tranquilly": "tranquility",
1608
+ "transistorised": "transistorized",
1609
+ "traumatise": "traumatize",
1610
+ "traumatised": "traumatized",
1611
+ "traumatises": "traumatizes",
1612
+ "traumatising": "traumatizing",
1613
+ "travelled": "traveled",
1614
+ "traveller": "traveler",
1615
+ "travellers": "travelers",
1616
+ "travelling": "traveling",
1617
+ "travelog": "travelogue",
1618
+ "travelogs": "travelogues",
1619
+ "trialled": "trialed",
1620
+ "trialling": "trialing",
1621
+ "tricolour": "tricolor",
1622
+ "tricolours": "tricolors",
1623
+ "trivialise": "trivialize",
1624
+ "trivialised": "trivialized",
1625
+ "trivialises": "trivializes",
1626
+ "trivialising": "trivializing",
1627
+ "tumour": "tumor",
1628
+ "tumours": "tumors",
1629
+ "tunnelled": "tunneled",
1630
+ "tunnelling": "tunneling",
1631
+ "tyrannise": "tyrannize",
1632
+ "tyrannised": "tyrannized",
1633
+ "tyrannises": "tyrannizes",
1634
+ "tyrannising": "tyrannizing",
1635
+ "tyre": "tire",
1636
+ "tyres": "tires",
1637
+ "unauthorised": "unauthorized",
1638
+ "uncivilised": "uncivilized",
1639
+ "underutilised": "underutilized",
1640
+ "unequalled": "unequaled",
1641
+ "unfavourable": "unfavorable",
1642
+ "unfavourably": "unfavorably",
1643
+ "unionisation": "unionization",
1644
+ "unionise": "unionize",
1645
+ "unionised": "unionized",
1646
+ "unionises": "unionizes",
1647
+ "unionising": "unionizing",
1648
+ "unorganised": "unorganized",
1649
+ "unravelled": "unraveled",
1650
+ "unravelling": "unraveling",
1651
+ "unrecognisable": "unrecognizable",
1652
+ "unrecognised": "unrecognized",
1653
+ "unrivalled": "unrivaled",
1654
+ "unsavoury": "unsavory",
1655
+ "untrammelled": "untrammeled",
1656
+ "urbanisation": "urbanization",
1657
+ "urbanise": "urbanize",
1658
+ "urbanised": "urbanized",
1659
+ "urbanises": "urbanizes",
1660
+ "urbanising": "urbanizing",
1661
+ "utilisable": "utilizable",
1662
+ "utilisation": "utilization",
1663
+ "utilise": "utilize",
1664
+ "utilised": "utilized",
1665
+ "utilises": "utilizes",
1666
+ "utilising": "utilizing",
1667
+ "valour": "valor",
1668
+ "vandalise": "vandalize",
1669
+ "vandalised": "vandalized",
1670
+ "vandalises": "vandalizes",
1671
+ "vandalising": "vandalizing",
1672
+ "vaporisation": "vaporization",
1673
+ "vaporise": "vaporize",
1674
+ "vaporised": "vaporized",
1675
+ "vaporises": "vaporizes",
1676
+ "vaporising": "vaporizing",
1677
+ "vapour": "vapor",
1678
+ "vapours": "vapors",
1679
+ "verbalise": "verbalize",
1680
+ "verbalised": "verbalized",
1681
+ "verbalises": "verbalizes",
1682
+ "verbalising": "verbalizing",
1683
+ "victimisation": "victimization",
1684
+ "victimise": "victimize",
1685
+ "victimised": "victimized",
1686
+ "victimises": "victimizes",
1687
+ "victimising": "victimizing",
1688
+ "videodisc": "videodisk",
1689
+ "videodiscs": "videodisks",
1690
+ "vigour": "vigor",
1691
+ "visualisation": "visualization",
1692
+ "visualisations": "visualizations",
1693
+ "visualise": "visualize",
1694
+ "visualised": "visualized",
1695
+ "visualises": "visualizes",
1696
+ "visualising": "visualizing",
1697
+ "vocalisation": "vocalization",
1698
+ "vocalisations": "vocalizations",
1699
+ "vocalise": "vocalize",
1700
+ "vocalised": "vocalized",
1701
+ "vocalises": "vocalizes",
1702
+ "vocalising": "vocalizing",
1703
+ "vulcanised": "vulcanized",
1704
+ "vulgarisation": "vulgarization",
1705
+ "vulgarise": "vulgarize",
1706
+ "vulgarised": "vulgarized",
1707
+ "vulgarises": "vulgarizes",
1708
+ "vulgarising": "vulgarizing",
1709
+ "waggon": "wagon",
1710
+ "waggons": "wagons",
1711
+ "watercolour": "watercolor",
1712
+ "watercolours": "watercolors",
1713
+ "weaselled": "weaseled",
1714
+ "weaselling": "weaseling",
1715
+ "westernisation": "westernization",
1716
+ "westernise": "westernize",
1717
+ "westernised": "westernized",
1718
+ "westernises": "westernizes",
1719
+ "westernising": "westernizing",
1720
+ "womanise": "womanize",
1721
+ "womanised": "womanized",
1722
+ "womaniser": "womanizer",
1723
+ "womanisers": "womanizers",
1724
+ "womanises": "womanizes",
1725
+ "womanising": "womanizing",
1726
+ "woollen": "woolen",
1727
+ "woollens": "woolens",
1728
+ "woollies": "woolies",
1729
+ "woolly": "wooly",
1730
+ "worshipped": "worshiped",
1731
+ "worshipping": "worshiping",
1732
+ "worshipper": "worshiper",
1733
+ "yodelled": "yodeled",
1734
+ "yodelling": "yodeling",
1735
+ "yoghourt": "yogurt",
1736
+ "yoghourts": "yogurts",
1737
+ "yoghurt": "yogurt",
1738
+ "yoghurts": "yogurts",
1739
+ "mhm": "hmm",
1740
+ "mmm": "hmm"
1741
+ }
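The mapping above is consumed as a plain word-for-word lookup: text is split on whitespace and each token is replaced only on an exact match (the keys are all lowercase, so the caller is expected to lowercase first), and anything absent from the mapping passes through unchanged. A minimal sketch of that lookup, not part of the commit; the file path and function name are illustrative only:

import json

with open("english.json") as f:  # path is an assumption; the asset ships next to the normalizer module
    uk_to_us = json.load(f)

def americanize(text: str) -> str:
    # exact whole-word lookup; unknown words pass through unchanged
    return " ".join(uk_to_us.get(word, word) for word in text.split())

print(americanize("the colour of the theatre programme"))
# -> "the color of the theater program"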
whisperlivekit/simul_whisper/whisper/normalizers/english.py ADDED
@@ -0,0 +1,550 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from fractions import Fraction
5
+ from typing import Iterator, List, Match, Optional, Union
6
+
7
+ from more_itertools import windowed
8
+
9
+ from .basic import remove_symbols_and_diacritics
10
+
11
+
12
+ class EnglishNumberNormalizer:
13
+ """
14
+ Convert any spelled-out numbers into arabic numbers, while handling:
15
+
16
+ - remove any commas
17
+ - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
18
+ - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
19
+ - spell out `one` and `ones`
20
+ - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
21
+ """
22
+
23
+ def __init__(self):
24
+ super().__init__()
25
+
26
+ self.zeros = {"o", "oh", "zero"}
27
+ self.ones = {
28
+ name: i
29
+ for i, name in enumerate(
30
+ [
31
+ "one",
32
+ "two",
33
+ "three",
34
+ "four",
35
+ "five",
36
+ "six",
37
+ "seven",
38
+ "eight",
39
+ "nine",
40
+ "ten",
41
+ "eleven",
42
+ "twelve",
43
+ "thirteen",
44
+ "fourteen",
45
+ "fifteen",
46
+ "sixteen",
47
+ "seventeen",
48
+ "eighteen",
49
+ "nineteen",
50
+ ],
51
+ start=1,
52
+ )
53
+ }
54
+ self.ones_plural = {
55
+ "sixes" if name == "six" else name + "s": (value, "s")
56
+ for name, value in self.ones.items()
57
+ }
58
+ self.ones_ordinal = {
59
+ "zeroth": (0, "th"),
60
+ "first": (1, "st"),
61
+ "second": (2, "nd"),
62
+ "third": (3, "rd"),
63
+ "fifth": (5, "th"),
64
+ "twelfth": (12, "th"),
65
+ **{
66
+ name + ("h" if name.endswith("t") else "th"): (value, "th")
67
+ for name, value in self.ones.items()
68
+ if value > 3 and value != 5 and value != 12
69
+ },
70
+ }
71
+ self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
72
+
73
+ self.tens = {
74
+ "twenty": 20,
75
+ "thirty": 30,
76
+ "forty": 40,
77
+ "fifty": 50,
78
+ "sixty": 60,
79
+ "seventy": 70,
80
+ "eighty": 80,
81
+ "ninety": 90,
82
+ }
83
+ self.tens_plural = {
84
+ name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
85
+ }
86
+ self.tens_ordinal = {
87
+ name.replace("y", "ieth"): (value, "th")
88
+ for name, value in self.tens.items()
89
+ }
90
+ self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
91
+
92
+ self.multipliers = {
93
+ "hundred": 100,
94
+ "thousand": 1_000,
95
+ "million": 1_000_000,
96
+ "billion": 1_000_000_000,
97
+ "trillion": 1_000_000_000_000,
98
+ "quadrillion": 1_000_000_000_000_000,
99
+ "quintillion": 1_000_000_000_000_000_000,
100
+ "sextillion": 1_000_000_000_000_000_000_000,
101
+ "septillion": 1_000_000_000_000_000_000_000_000,
102
+ "octillion": 1_000_000_000_000_000_000_000_000_000,
103
+ "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
104
+ "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
105
+ }
106
+ self.multipliers_plural = {
107
+ name + "s": (value, "s") for name, value in self.multipliers.items()
108
+ }
109
+ self.multipliers_ordinal = {
110
+ name + "th": (value, "th") for name, value in self.multipliers.items()
111
+ }
112
+ self.multipliers_suffixed = {
113
+ **self.multipliers_plural,
114
+ **self.multipliers_ordinal,
115
+ }
116
+ self.decimals = {*self.ones, *self.tens, *self.zeros}
117
+
118
+ self.preceding_prefixers = {
119
+ "minus": "-",
120
+ "negative": "-",
121
+ "plus": "+",
122
+ "positive": "+",
123
+ }
124
+ self.following_prefixers = {
125
+ "pound": "£",
126
+ "pounds": "£",
127
+ "euro": "€",
128
+ "euros": "€",
129
+ "dollar": "$",
130
+ "dollars": "$",
131
+ "cent": "¢",
132
+ "cents": "¢",
133
+ }
134
+ self.prefixes = set(
135
+ list(self.preceding_prefixers.values())
136
+ + list(self.following_prefixers.values())
137
+ )
138
+ self.suffixers = {
139
+ "per": {"cent": "%"},
140
+ "percent": "%",
141
+ }
142
+ self.specials = {"and", "double", "triple", "point"}
143
+
144
+ self.words = set(
145
+ [
146
+ key
147
+ for mapping in [
148
+ self.zeros,
149
+ self.ones,
150
+ self.ones_suffixed,
151
+ self.tens,
152
+ self.tens_suffixed,
153
+ self.multipliers,
154
+ self.multipliers_suffixed,
155
+ self.preceding_prefixers,
156
+ self.following_prefixers,
157
+ self.suffixers,
158
+ self.specials,
159
+ ]
160
+ for key in mapping
161
+ ]
162
+ )
163
+ self.literal_words = {"one", "ones"}
164
+
165
+ def process_words(self, words: List[str]) -> Iterator[str]:
166
+ prefix: Optional[str] = None
167
+ value: Optional[Union[str, int]] = None
168
+ skip = False
169
+
170
+ def to_fraction(s: str):
171
+ try:
172
+ return Fraction(s)
173
+ except ValueError:
174
+ return None
175
+
176
+ def output(result: Union[str, int]):
177
+ nonlocal prefix, value
178
+ result = str(result)
179
+ if prefix is not None:
180
+ result = prefix + result
181
+ value = None
182
+ prefix = None
183
+ return result
184
+
185
+ if len(words) == 0:
186
+ return
187
+
188
+ for prev, current, next in windowed([None] + words + [None], 3):
189
+ if skip:
190
+ skip = False
191
+ continue
192
+
193
+ next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
194
+ has_prefix = current[0] in self.prefixes
195
+ current_without_prefix = current[1:] if has_prefix else current
196
+ if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
197
+ # arabic numbers (potentially with signs and fractions)
198
+ f = to_fraction(current_without_prefix)
199
+ assert f is not None
200
+ if value is not None:
201
+ if isinstance(value, str) and value.endswith("."):
202
+ # concatenate decimals / ip address components
203
+ value = str(value) + str(current)
204
+ continue
205
+ else:
206
+ yield output(value)
207
+
208
+ prefix = current[0] if has_prefix else prefix
209
+ if f.denominator == 1:
210
+ value = f.numerator # store integers as int
211
+ else:
212
+ value = current_without_prefix
213
+ elif current not in self.words:
214
+ # non-numeric words
215
+ if value is not None:
216
+ yield output(value)
217
+ yield output(current)
218
+ elif current in self.zeros:
219
+ value = str(value or "") + "0"
220
+ elif current in self.ones:
221
+ ones = self.ones[current]
222
+
223
+ if value is None:
224
+ value = ones
225
+ elif isinstance(value, str) or prev in self.ones:
226
+ if (
227
+ prev in self.tens and ones < 10
228
+ ): # replace the last zero with the digit
229
+ assert value[-1] == "0"
230
+ value = value[:-1] + str(ones)
231
+ else:
232
+ value = str(value) + str(ones)
233
+ elif ones < 10:
234
+ if value % 10 == 0:
235
+ value += ones
236
+ else:
237
+ value = str(value) + str(ones)
238
+ else: # eleven to nineteen
239
+ if value % 100 == 0:
240
+ value += ones
241
+ else:
242
+ value = str(value) + str(ones)
243
+ elif current in self.ones_suffixed:
244
+ # ordinal or cardinal; yield the number right away
245
+ ones, suffix = self.ones_suffixed[current]
246
+ if value is None:
247
+ yield output(str(ones) + suffix)
248
+ elif isinstance(value, str) or prev in self.ones:
249
+ if prev in self.tens and ones < 10:
250
+ assert value[-1] == "0"
251
+ yield output(value[:-1] + str(ones) + suffix)
252
+ else:
253
+ yield output(str(value) + str(ones) + suffix)
254
+ elif ones < 10:
255
+ if value % 10 == 0:
256
+ yield output(str(value + ones) + suffix)
257
+ else:
258
+ yield output(str(value) + str(ones) + suffix)
259
+ else: # eleven to nineteen
260
+ if value % 100 == 0:
261
+ yield output(str(value + ones) + suffix)
262
+ else:
263
+ yield output(str(value) + str(ones) + suffix)
264
+ value = None
265
+ elif current in self.tens:
266
+ tens = self.tens[current]
267
+ if value is None:
268
+ value = tens
269
+ elif isinstance(value, str):
270
+ value = str(value) + str(tens)
271
+ else:
272
+ if value % 100 == 0:
273
+ value += tens
274
+ else:
275
+ value = str(value) + str(tens)
276
+ elif current in self.tens_suffixed:
277
+ # ordinal or cardinal; yield the number right away
278
+ tens, suffix = self.tens_suffixed[current]
279
+ if value is None:
280
+ yield output(str(tens) + suffix)
281
+ elif isinstance(value, str):
282
+ yield output(str(value) + str(tens) + suffix)
283
+ else:
284
+ if value % 100 == 0:
285
+ yield output(str(value + tens) + suffix)
286
+ else:
287
+ yield output(str(value) + str(tens) + suffix)
288
+ elif current in self.multipliers:
289
+ multiplier = self.multipliers[current]
290
+ if value is None:
291
+ value = multiplier
292
+ elif isinstance(value, str) or value == 0:
293
+ f = to_fraction(value)
294
+ p = f * multiplier if f is not None else None
295
+ if f is not None and p.denominator == 1:
296
+ value = p.numerator
297
+ else:
298
+ yield output(value)
299
+ value = multiplier
300
+ else:
301
+ before = value // 1000 * 1000
302
+ residual = value % 1000
303
+ value = before + residual * multiplier
304
+ elif current in self.multipliers_suffixed:
305
+ multiplier, suffix = self.multipliers_suffixed[current]
306
+ if value is None:
307
+ yield output(str(multiplier) + suffix)
308
+ elif isinstance(value, str):
309
+ f = to_fraction(value)
310
+ p = f * multiplier if f is not None else None
311
+ if f is not None and p.denominator == 1:
312
+ yield output(str(p.numerator) + suffix)
313
+ else:
314
+ yield output(value)
315
+ yield output(str(multiplier) + suffix)
316
+ else: # int
317
+ before = value // 1000 * 1000
318
+ residual = value % 1000
319
+ value = before + residual * multiplier
320
+ yield output(str(value) + suffix)
321
+ value = None
322
+ elif current in self.preceding_prefixers:
323
+ # apply prefix (positive, minus, etc.) if it precedes a number
324
+ if value is not None:
325
+ yield output(value)
326
+
327
+ if next in self.words or next_is_numeric:
328
+ prefix = self.preceding_prefixers[current]
329
+ else:
330
+ yield output(current)
331
+ elif current in self.following_prefixers:
332
+ # apply prefix (dollars, cents, etc.) only after a number
333
+ if value is not None:
334
+ prefix = self.following_prefixers[current]
335
+ yield output(value)
336
+ else:
337
+ yield output(current)
338
+ elif current in self.suffixers:
339
+ # apply suffix symbols (percent -> '%')
340
+ if value is not None:
341
+ suffix = self.suffixers[current]
342
+ if isinstance(suffix, dict):
343
+ if next in suffix:
344
+ yield output(str(value) + suffix[next])
345
+ skip = True
346
+ else:
347
+ yield output(value)
348
+ yield output(current)
349
+ else:
350
+ yield output(str(value) + suffix)
351
+ else:
352
+ yield output(current)
353
+ elif current in self.specials:
354
+ if next not in self.words and not next_is_numeric:
355
+ # apply special handling only if the next word can be numeric
356
+ if value is not None:
357
+ yield output(value)
358
+ yield output(current)
359
+ elif current == "and":
360
+ # ignore "and" after hundreds, thousands, etc.
361
+ if prev not in self.multipliers:
362
+ if value is not None:
363
+ yield output(value)
364
+ yield output(current)
365
+ elif current == "double" or current == "triple":
366
+ if next in self.ones or next in self.zeros:
367
+ repeats = 2 if current == "double" else 3
368
+ ones = self.ones.get(next, 0)
369
+ value = str(value or "") + str(ones) * repeats
370
+ skip = True
371
+ else:
372
+ if value is not None:
373
+ yield output(value)
374
+ yield output(current)
375
+ elif current == "point":
376
+ if next in self.decimals or next_is_numeric:
377
+ value = str(value or "") + "."
378
+ else:
379
+ # should all have been covered at this point
380
+ raise ValueError(f"Unexpected token: {current}")
381
+ else:
382
+ # all should have been covered at this point
383
+ raise ValueError(f"Unexpected token: {current}")
384
+
385
+ if value is not None:
386
+ yield output(value)
387
+
388
+ def preprocess(self, s: str):
389
+ # replace "<number> and a half" with "<number> point five"
390
+ results = []
391
+
392
+ segments = re.split(r"\band\s+a\s+half\b", s)
393
+ for i, segment in enumerate(segments):
394
+ if len(segment.strip()) == 0:
395
+ continue
396
+ if i == len(segments) - 1:
397
+ results.append(segment)
398
+ else:
399
+ results.append(segment)
400
+ last_word = segment.rsplit(maxsplit=2)[-1]
401
+ if last_word in self.decimals or last_word in self.multipliers:
402
+ results.append("point five")
403
+ else:
404
+ results.append("and a half")
405
+
406
+ s = " ".join(results)
407
+
408
+ # put a space at number/letter boundary
409
+ s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
410
+ s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
411
+
412
+ # but remove spaces which could be a suffix
413
+ s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
414
+
415
+ return s
416
+
417
+ def postprocess(self, s: str):
418
+ def combine_cents(m: Match):
419
+ try:
420
+ currency = m.group(1)
421
+ integer = m.group(2)
422
+ cents = int(m.group(3))
423
+ return f"{currency}{integer}.{cents:02d}"
424
+ except ValueError:
425
+ return m.string
426
+
427
+ def extract_cents(m: Match):
428
+ try:
429
+ return f"¢{int(m.group(1))}"
430
+ except ValueError:
431
+ return m.string
432
+
433
+ # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
434
+ s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
435
+ s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
436
+
437
+ # write "one(s)" instead of "1(s)", just for the readability
438
+ s = re.sub(r"\b1(s?)\b", r"one\1", s)
439
+
440
+ return s
441
+
442
+ def __call__(self, s: str):
443
+ s = self.preprocess(s)
444
+ s = " ".join(word for word in self.process_words(s.split()) if word is not None)
445
+ s = self.postprocess(s)
446
+
447
+ return s
448
+
449
+
450
+ class EnglishSpellingNormalizer:
451
+ """
452
+ Applies British-American spelling mappings as listed in [1].
453
+
454
+ [1] https://www.tysto.com/uk-us-spelling-list.html
455
+ """
456
+
457
+ def __init__(self):
458
+ mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
459
+ self.mapping = json.load(open(mapping_path))
460
+
461
+ def __call__(self, s: str):
462
+ return " ".join(self.mapping.get(word, word) for word in s.split())
463
+
464
+
465
+ class EnglishTextNormalizer:
466
+ def __init__(self):
467
+ self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
468
+ self.replacers = {
469
+ # common contractions
470
+ r"\bwon't\b": "will not",
471
+ r"\bcan't\b": "can not",
472
+ r"\blet's\b": "let us",
473
+ r"\bain't\b": "aint",
474
+ r"\by'all\b": "you all",
475
+ r"\bwanna\b": "want to",
476
+ r"\bgotta\b": "got to",
477
+ r"\bgonna\b": "going to",
478
+ r"\bi'ma\b": "i am going to",
479
+ r"\bimma\b": "i am going to",
480
+ r"\bwoulda\b": "would have",
481
+ r"\bcoulda\b": "could have",
482
+ r"\bshoulda\b": "should have",
483
+ r"\bma'am\b": "madam",
484
+ # contractions in titles/prefixes
485
+ r"\bmr\b": "mister ",
486
+ r"\bmrs\b": "missus ",
487
+ r"\bst\b": "saint ",
488
+ r"\bdr\b": "doctor ",
489
+ r"\bprof\b": "professor ",
490
+ r"\bcapt\b": "captain ",
491
+ r"\bgov\b": "governor ",
492
+ r"\bald\b": "alderman ",
493
+ r"\bgen\b": "general ",
494
+ r"\bsen\b": "senator ",
495
+ r"\brep\b": "representative ",
496
+ r"\bpres\b": "president ",
497
+ r"\brev\b": "reverend ",
498
+ r"\bhon\b": "honorable ",
499
+ r"\basst\b": "assistant ",
500
+ r"\bassoc\b": "associate ",
501
+ r"\blt\b": "lieutenant ",
502
+ r"\bcol\b": "colonel ",
503
+ r"\bjr\b": "junior ",
504
+ r"\bsr\b": "senior ",
505
+ r"\besq\b": "esquire ",
506
+ # perfect tenses, ideally it should be any past participles, but it's harder.
507
+ r"'d been\b": " had been",
508
+ r"'s been\b": " has been",
509
+ r"'d gone\b": " had gone",
510
+ r"'s gone\b": " has gone",
511
+ r"'d done\b": " had done", # "'s done" is ambiguous
512
+ r"'s got\b": " has got",
513
+ # general contractions
514
+ r"n't\b": " not",
515
+ r"'re\b": " are",
516
+ r"'s\b": " is",
517
+ r"'d\b": " would",
518
+ r"'ll\b": " will",
519
+ r"'t\b": " not",
520
+ r"'ve\b": " have",
521
+ r"'m\b": " am",
522
+ }
523
+ self.standardize_numbers = EnglishNumberNormalizer()
524
+ self.standardize_spellings = EnglishSpellingNormalizer()
525
+
526
+ def __call__(self, s: str):
527
+ s = s.lower()
528
+
529
+ s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
530
+ s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
531
+ s = re.sub(self.ignore_patterns, "", s)
532
+ s = re.sub(r"\s+'", "'", s) # when there's a space before an apostrophe
533
+
534
+ for pattern, replacement in self.replacers.items():
535
+ s = re.sub(pattern, replacement, s)
536
+
537
+ s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits
538
+ s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers
539
+ s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep numeric symbols
540
+
541
+ s = self.standardize_numbers(s)
542
+ s = self.standardize_spellings(s)
543
+
544
+ # now remove prefix/suffix symbols that are not preceded/followed by numbers
545
+ s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
546
+ s = re.sub(r"([^0-9])%", r"\1 ", s)
547
+
548
+ s = re.sub(r"\s+", " ", s) # replace any successive whitespaces with a space
549
+
550
+ return s
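Putting the pieces together, EnglishTextNormalizer lowercases the input, strips bracketed and filler tokens, expands contractions and abbreviated titles, converts spelled-out numbers via EnglishNumberNormalizer, and finally applies the British-to-American mapping from english.json above. A rough usage sketch, assuming the import path implied by this file's location in the commit:

from whisperlivekit.simul_whisper.whisper.normalizers.english import EnglishTextNormalizer

normalize = EnglishTextNormalizer()
print(normalize("Mr. Smith paid twenty dollars for two litres of colour."))
# expected output (roughly): "mister smith paid $20 for 2 liters of color"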
whisperlivekit/simul_whisper/whisper/timing.py ADDED
@@ -0,0 +1,388 @@
1
+ import itertools
2
+ import subprocess
3
+ import warnings
4
+ from dataclasses import dataclass
5
+ from typing import TYPE_CHECKING, List
6
+
7
+ import numba
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+
12
+ from .audio import HOP_LENGTH, SAMPLE_RATE, TOKENS_PER_SECOND
13
+ from .tokenizer import Tokenizer
14
+
15
+ if TYPE_CHECKING:
16
+ from .model import Whisper
17
+
18
+
19
+ def median_filter(x: torch.Tensor, filter_width: int):
20
+ """Apply a median filter of width `filter_width` along the last dimension of `x`"""
21
+ pad_width = filter_width // 2
22
+ if x.shape[-1] <= pad_width:
23
+ # F.pad requires the padding width to be smaller than the input dimension
24
+ return x
25
+
26
+ if (ndim := x.ndim) <= 2:
27
+ # `F.pad` does not support 1D or 2D inputs for reflect padding but supports 3D and 4D
28
+ x = x[None, None, :]
29
+
30
+ assert (
31
+ filter_width > 0 and filter_width % 2 == 1
32
+ ), "`filter_width` should be an odd number"
33
+
34
+ result = None
35
+ x = F.pad(x, (filter_width // 2, filter_width // 2, 0, 0), mode="reflect")
36
+ if x.is_cuda:
37
+ try:
38
+ from .triton_ops import median_filter_cuda
39
+
40
+ result = median_filter_cuda(x, filter_width)
41
+ except (RuntimeError, subprocess.CalledProcessError):
42
+ warnings.warn(
43
+ "Failed to launch Triton kernels, likely due to missing CUDA toolkit; "
44
+ "falling back to a slower median kernel implementation..."
45
+ )
46
+
47
+ if result is None:
48
+ # sort() is faster than torch.median (https://github.com/pytorch/pytorch/issues/51450)
49
+ result = x.unfold(-1, filter_width, 1).sort()[0][..., filter_width // 2]
50
+
51
+ if ndim <= 2:
52
+ result = result[0, 0]
53
+
54
+ return result
55
+
56
+
57
+ @numba.jit(nopython=True)
58
+ def backtrace(trace: np.ndarray):
59
+ i = trace.shape[0] - 1
60
+ j = trace.shape[1] - 1
61
+ trace[0, :] = 2
62
+ trace[:, 0] = 1
63
+
64
+ result = []
65
+ while i > 0 or j > 0:
66
+ result.append((i - 1, j - 1))
67
+
68
+ if trace[i, j] == 0:
69
+ i -= 1
70
+ j -= 1
71
+ elif trace[i, j] == 1:
72
+ i -= 1
73
+ elif trace[i, j] == 2:
74
+ j -= 1
75
+ else:
76
+ raise ValueError("Unexpected trace[i, j]")
77
+
78
+ result = np.array(result)
79
+ return result[::-1, :].T
80
+
81
+
82
+ @numba.jit(nopython=True, parallel=True)
83
+ def dtw_cpu(x: np.ndarray):
84
+ N, M = x.shape
85
+ cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
86
+ trace = -np.ones((N + 1, M + 1), dtype=np.float32)
87
+
88
+ cost[0, 0] = 0
89
+ for j in range(1, M + 1):
90
+ for i in range(1, N + 1):
91
+ c0 = cost[i - 1, j - 1]
92
+ c1 = cost[i - 1, j]
93
+ c2 = cost[i, j - 1]
94
+
95
+ if c0 < c1 and c0 < c2:
96
+ c, t = c0, 0
97
+ elif c1 < c0 and c1 < c2:
98
+ c, t = c1, 1
99
+ else:
100
+ c, t = c2, 2
101
+
102
+ cost[i, j] = x[i - 1, j - 1] + c
103
+ trace[i, j] = t
104
+
105
+ return backtrace(trace)
106
+
107
+
108
+ def dtw_cuda(x, BLOCK_SIZE=1024):
109
+ from .triton_ops import dtw_kernel
110
+
111
+ M, N = x.shape
112
+ assert M < BLOCK_SIZE, f"M should be smaller than {BLOCK_SIZE=}"
113
+
114
+ x_skew = (
115
+ F.pad(x, (0, M + 1), value=np.inf).flatten()[: M * (N + M)].reshape(M, N + M)
116
+ )
117
+ x_skew = x_skew.T.contiguous()
118
+ cost = torch.ones(N + M + 2, M + 2) * np.inf
119
+ cost[0, 0] = 0
120
+ cost = cost.to(x.device)
121
+ trace = torch.zeros_like(cost, dtype=torch.int32)
122
+
123
+ dtw_kernel[(1,)](
124
+ cost,
125
+ trace,
126
+ x_skew,
127
+ x_skew.stride(0),
128
+ cost.stride(0),
129
+ trace.stride(0),
130
+ N,
131
+ M,
132
+ BLOCK_SIZE=BLOCK_SIZE,
133
+ )
134
+
135
+ trace = trace.T.flatten()[: (M + 1) * (M + N + 3)].reshape(M + 1, M + N + 3)[
136
+ :, : N + 1
137
+ ]
138
+ return backtrace(trace.cpu().numpy())
139
+
140
+
141
+ def dtw(x: torch.Tensor) -> np.ndarray:
142
+ if x.is_cuda:
143
+ try:
144
+ return dtw_cuda(x)
145
+ except (RuntimeError, subprocess.CalledProcessError):
146
+ warnings.warn(
147
+ "Failed to launch Triton kernels, likely due to missing CUDA toolkit; "
148
+ "falling back to a slower DTW implementation..."
149
+ )
150
+
151
+ return dtw_cpu(x.double().cpu().numpy())
152
+
153
+
154
+ @dataclass
155
+ class WordTiming:
156
+ word: str
157
+ tokens: List[int]
158
+ start: float
159
+ end: float
160
+ probability: float
161
+
162
+
163
+ def find_alignment(
164
+ model: "Whisper",
165
+ tokenizer: Tokenizer,
166
+ text_tokens: List[int],
167
+ mel: torch.Tensor,
168
+ num_frames: int,
169
+ *,
170
+ medfilt_width: int = 7,
171
+ qk_scale: float = 1.0,
172
+ ) -> List[WordTiming]:
173
+ if len(text_tokens) == 0:
174
+ return []
175
+
176
+ tokens = torch.tensor(
177
+ [
178
+ *tokenizer.sot_sequence,
179
+ tokenizer.no_timestamps,
180
+ *text_tokens,
181
+ tokenizer.eot,
182
+ ]
183
+ ).to(model.device)
184
+
185
+ # install hooks on the cross attention layers to retrieve the attention weights
186
+ QKs = [None] * model.dims.n_text_layer
187
+ hooks = [
188
+ block.cross_attn.register_forward_hook(
189
+ lambda _, ins, outs, index=i: QKs.__setitem__(index, outs[-1][0])
190
+ )
191
+ for i, block in enumerate(model.decoder.blocks)
192
+ ]
193
+
194
+ from .model import disable_sdpa
195
+
196
+ with torch.no_grad(), disable_sdpa():
197
+ logits = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0]
198
+ sampled_logits = logits[len(tokenizer.sot_sequence) :, : tokenizer.eot]
199
+ token_probs = sampled_logits.softmax(dim=-1)
200
+ text_token_probs = token_probs[np.arange(len(text_tokens)), text_tokens]
201
+ text_token_probs = text_token_probs.tolist()
202
+
203
+ for hook in hooks:
204
+ hook.remove()
205
+
206
+ # heads * tokens * frames
207
+ weights = torch.stack([QKs[_l][_h] for _l, _h in model.alignment_heads.indices().T])
208
+ weights = weights[:, :, : num_frames // 2]
209
+ weights = (weights * qk_scale).softmax(dim=-1)
210
+ std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
211
+ weights = (weights - mean) / std
212
+ weights = median_filter(weights, medfilt_width)
213
+
214
+ matrix = weights.mean(axis=0)
215
+ matrix = matrix[len(tokenizer.sot_sequence) : -1]
216
+ text_indices, time_indices = dtw(-matrix)
217
+
218
+ words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
219
+ if len(word_tokens) <= 1:
220
+ # return on eot only
221
+ # >>> np.pad([], (1, 0))
222
+ # array([0.])
223
+ # This results in crashes when we lookup jump_times with float, like
224
+ # IndexError: arrays used as indices must be of integer (or boolean) type
225
+ return []
226
+ word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
227
+
228
+ jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
229
+ jump_times = time_indices[jumps] / TOKENS_PER_SECOND
230
+ start_times = jump_times[word_boundaries[:-1]]
231
+ end_times = jump_times[word_boundaries[1:]]
232
+ word_probabilities = [
233
+ np.mean(text_token_probs[i:j])
234
+ for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
235
+ ]
236
+
237
+ return [
238
+ WordTiming(word, tokens, start, end, probability)
239
+ for word, tokens, start, end, probability in zip(
240
+ words, word_tokens, start_times, end_times, word_probabilities
241
+ )
242
+ ]
243
+
244
+
245
+ def merge_punctuations(alignment: List[WordTiming], prepended: str, appended: str):
246
+ # merge prepended punctuations
247
+ i = len(alignment) - 2
248
+ j = len(alignment) - 1
249
+ while i >= 0:
250
+ previous = alignment[i]
251
+ following = alignment[j]
252
+ if previous.word.startswith(" ") and previous.word.strip() in prepended:
253
+ # prepend it to the following word
254
+ following.word = previous.word + following.word
255
+ following.tokens = previous.tokens + following.tokens
256
+ previous.word = ""
257
+ previous.tokens = []
258
+ else:
259
+ j = i
260
+ i -= 1
261
+
262
+ # merge appended punctuations
263
+ i = 0
264
+ j = 1
265
+ while j < len(alignment):
266
+ previous = alignment[i]
267
+ following = alignment[j]
268
+ if not previous.word.endswith(" ") and following.word in appended:
269
+ # append it to the previous word
270
+ previous.word = previous.word + following.word
271
+ previous.tokens = previous.tokens + following.tokens
272
+ following.word = ""
273
+ following.tokens = []
274
+ else:
275
+ i = j
276
+ j += 1
277
+
278
+
279
+ def add_word_timestamps(
280
+ *,
281
+ segments: List[dict],
282
+ model: "Whisper",
283
+ tokenizer: Tokenizer,
284
+ mel: torch.Tensor,
285
+ num_frames: int,
286
+ prepend_punctuations: str = "\"'“¿([{-",
287
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
288
+ last_speech_timestamp: float,
289
+ **kwargs,
290
+ ):
291
+ if len(segments) == 0:
292
+ return
293
+
294
+ text_tokens_per_segment = [
295
+ [token for token in segment["tokens"] if token < tokenizer.eot]
296
+ for segment in segments
297
+ ]
298
+
299
+ text_tokens = list(itertools.chain.from_iterable(text_tokens_per_segment))
300
+ alignment = find_alignment(model, tokenizer, text_tokens, mel, num_frames, **kwargs)
301
+ word_durations = np.array([t.end - t.start for t in alignment])
302
+ word_durations = word_durations[word_durations.nonzero()]
303
+ median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
304
+ median_duration = min(0.7, float(median_duration))
305
+ max_duration = median_duration * 2
306
+
307
+ # hack: truncate long words at sentence boundaries.
308
+ # a better segmentation algorithm based on VAD should be able to replace this.
309
+ if len(word_durations) > 0:
310
+ sentence_end_marks = ".。!!??"
311
+ # ensure words at sentence boundaries are not longer than twice the median word duration.
312
+ for i in range(1, len(alignment)):
313
+ if alignment[i].end - alignment[i].start > max_duration:
314
+ if alignment[i].word in sentence_end_marks:
315
+ alignment[i].end = alignment[i].start + max_duration
316
+ elif alignment[i - 1].word in sentence_end_marks:
317
+ alignment[i].start = alignment[i].end - max_duration
318
+
319
+ merge_punctuations(alignment, prepend_punctuations, append_punctuations)
320
+
321
+ time_offset = segments[0]["seek"] * HOP_LENGTH / SAMPLE_RATE
322
+ word_index = 0
323
+
324
+ for segment, text_tokens in zip(segments, text_tokens_per_segment):
325
+ saved_tokens = 0
326
+ words = []
327
+
328
+ while word_index < len(alignment) and saved_tokens < len(text_tokens):
329
+ timing = alignment[word_index]
330
+
331
+ if timing.word:
332
+ words.append(
333
+ dict(
334
+ word=timing.word,
335
+ start=round(time_offset + timing.start, 2),
336
+ end=round(time_offset + timing.end, 2),
337
+ probability=timing.probability,
338
+ )
339
+ )
340
+
341
+ saved_tokens += len(timing.tokens)
342
+ word_index += 1
343
+
344
+ # hack: truncate long words at segment boundaries.
345
+ # a better segmentation algorithm based on VAD should be able to replace this.
346
+ if len(words) > 0:
347
+ # ensure the first and second word after a pause is not longer than
348
+ # twice the median word duration.
349
+ if words[0]["end"] - last_speech_timestamp > median_duration * 4 and (
350
+ words[0]["end"] - words[0]["start"] > max_duration
351
+ or (
352
+ len(words) > 1
353
+ and words[1]["end"] - words[0]["start"] > max_duration * 2
354
+ )
355
+ ):
356
+ if (
357
+ len(words) > 1
358
+ and words[1]["end"] - words[1]["start"] > max_duration
359
+ ):
360
+ boundary = max(words[1]["end"] / 2, words[1]["end"] - max_duration)
361
+ words[0]["end"] = words[1]["start"] = boundary
362
+ words[0]["start"] = max(0, words[0]["end"] - max_duration)
363
+
364
+ # prefer the segment-level start timestamp if the first word is too long.
365
+ if (
366
+ segment["start"] < words[0]["end"]
367
+ and segment["start"] - 0.5 > words[0]["start"]
368
+ ):
369
+ words[0]["start"] = max(
370
+ 0, min(words[0]["end"] - median_duration, segment["start"])
371
+ )
372
+ else:
373
+ segment["start"] = words[0]["start"]
374
+
375
+ # prefer the segment-level end timestamp if the last word is too long.
376
+ if (
377
+ segment["end"] > words[-1]["start"]
378
+ and segment["end"] + 0.5 < words[-1]["end"]
379
+ ):
380
+ words[-1]["end"] = max(
381
+ words[-1]["start"] + median_duration, segment["end"]
382
+ )
383
+ else:
384
+ segment["end"] = words[-1]["end"]
385
+
386
+ last_speech_timestamp = segment["end"]
387
+
388
+ segment["words"] = words
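The word timing above boils down to a dynamic-time-warping pass over a (tokens x frames) cross-attention matrix. A minimal sketch of the low-level helper with a made-up matrix (illustrative, not part of the commit; the import path is an assumption):

    import torch
    from whisperlivekit.simul_whisper.whisper.timing import dtw  # assumed import path

    attn = torch.rand(12, 80)            # hypothetical token-by-frame attention weights
    text_idx, time_idx = dtw(-attn)      # negated: higher attention means lower alignment cost
    # jumps in text_idx mark token boundaries; time_idx / TOKENS_PER_SECOND converts frames to seconds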
whisperlivekit/simul_whisper/whisper/tokenizer.py ADDED
@@ -0,0 +1,395 @@
1
+ import base64
2
+ import os
3
+ import string
4
+ from dataclasses import dataclass, field
5
+ from functools import cached_property, lru_cache
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ import tiktoken
9
+
10
+ LANGUAGES = {
11
+ "en": "english",
12
+ "zh": "chinese",
13
+ "de": "german",
14
+ "es": "spanish",
15
+ "ru": "russian",
16
+ "ko": "korean",
17
+ "fr": "french",
18
+ "ja": "japanese",
19
+ "pt": "portuguese",
20
+ "tr": "turkish",
21
+ "pl": "polish",
22
+ "ca": "catalan",
23
+ "nl": "dutch",
24
+ "ar": "arabic",
25
+ "sv": "swedish",
26
+ "it": "italian",
27
+ "id": "indonesian",
28
+ "hi": "hindi",
29
+ "fi": "finnish",
30
+ "vi": "vietnamese",
31
+ "he": "hebrew",
32
+ "uk": "ukrainian",
33
+ "el": "greek",
34
+ "ms": "malay",
35
+ "cs": "czech",
36
+ "ro": "romanian",
37
+ "da": "danish",
38
+ "hu": "hungarian",
39
+ "ta": "tamil",
40
+ "no": "norwegian",
41
+ "th": "thai",
42
+ "ur": "urdu",
43
+ "hr": "croatian",
44
+ "bg": "bulgarian",
45
+ "lt": "lithuanian",
46
+ "la": "latin",
47
+ "mi": "maori",
48
+ "ml": "malayalam",
49
+ "cy": "welsh",
50
+ "sk": "slovak",
51
+ "te": "telugu",
52
+ "fa": "persian",
53
+ "lv": "latvian",
54
+ "bn": "bengali",
55
+ "sr": "serbian",
56
+ "az": "azerbaijani",
57
+ "sl": "slovenian",
58
+ "kn": "kannada",
59
+ "et": "estonian",
60
+ "mk": "macedonian",
61
+ "br": "breton",
62
+ "eu": "basque",
63
+ "is": "icelandic",
64
+ "hy": "armenian",
65
+ "ne": "nepali",
66
+ "mn": "mongolian",
67
+ "bs": "bosnian",
68
+ "kk": "kazakh",
69
+ "sq": "albanian",
70
+ "sw": "swahili",
71
+ "gl": "galician",
72
+ "mr": "marathi",
73
+ "pa": "punjabi",
74
+ "si": "sinhala",
75
+ "km": "khmer",
76
+ "sn": "shona",
77
+ "yo": "yoruba",
78
+ "so": "somali",
79
+ "af": "afrikaans",
80
+ "oc": "occitan",
81
+ "ka": "georgian",
82
+ "be": "belarusian",
83
+ "tg": "tajik",
84
+ "sd": "sindhi",
85
+ "gu": "gujarati",
86
+ "am": "amharic",
87
+ "yi": "yiddish",
88
+ "lo": "lao",
89
+ "uz": "uzbek",
90
+ "fo": "faroese",
91
+ "ht": "haitian creole",
92
+ "ps": "pashto",
93
+ "tk": "turkmen",
94
+ "nn": "nynorsk",
95
+ "mt": "maltese",
96
+ "sa": "sanskrit",
97
+ "lb": "luxembourgish",
98
+ "my": "myanmar",
99
+ "bo": "tibetan",
100
+ "tl": "tagalog",
101
+ "mg": "malagasy",
102
+ "as": "assamese",
103
+ "tt": "tatar",
104
+ "haw": "hawaiian",
105
+ "ln": "lingala",
106
+ "ha": "hausa",
107
+ "ba": "bashkir",
108
+ "jw": "javanese",
109
+ "su": "sundanese",
110
+ "yue": "cantonese",
111
+ }
112
+
113
+ # language code lookup by name, with a few language aliases
114
+ TO_LANGUAGE_CODE = {
115
+ **{language: code for code, language in LANGUAGES.items()},
116
+ "burmese": "my",
117
+ "valencian": "ca",
118
+ "flemish": "nl",
119
+ "haitian": "ht",
120
+ "letzeburgesch": "lb",
121
+ "pushto": "ps",
122
+ "panjabi": "pa",
123
+ "moldavian": "ro",
124
+ "moldovan": "ro",
125
+ "sinhalese": "si",
126
+ "castilian": "es",
127
+ "mandarin": "zh",
128
+ }
129
+
130
+
131
+ @dataclass
132
+ class Tokenizer:
133
+ """A thin wrapper around `tiktoken` providing quick access to special tokens"""
134
+
135
+ encoding: tiktoken.Encoding
136
+ num_languages: int
137
+ language: Optional[str] = None
138
+ task: Optional[str] = None
139
+ sot_sequence: Tuple[int] = ()
140
+ special_tokens: Dict[str, int] = field(default_factory=dict)
141
+
142
+ def __post_init__(self):
143
+ for special in self.encoding.special_tokens_set:
144
+ special_token = self.encoding.encode_single_token(special)
145
+ self.special_tokens[special] = special_token
146
+
147
+ sot: int = self.special_tokens["<|startoftranscript|>"]
148
+ translate: int = self.special_tokens["<|translate|>"]
149
+ transcribe: int = self.special_tokens["<|transcribe|>"]
150
+
151
+ langs = tuple(LANGUAGES.keys())[: self.num_languages]
152
+ sot_sequence = [sot]
153
+ if self.language is not None:
154
+ sot_sequence.append(sot + 1 + langs.index(self.language))
155
+ if self.task is not None:
156
+ task_token: int = transcribe if self.task == "transcribe" else translate
157
+ sot_sequence.append(task_token)
158
+
159
+ self.sot_sequence = tuple(sot_sequence)
160
+
161
+ def encode(self, text, **kwargs):
162
+ return self.encoding.encode(text, **kwargs)
163
+
164
+ def decode(self, token_ids: List[int], **kwargs) -> str:
165
+ token_ids = [t for t in token_ids if t < self.timestamp_begin]
166
+ return self.encoding.decode(token_ids, **kwargs)
167
+
168
+ def decode_with_timestamps(self, token_ids: List[int], **kwargs) -> str:
169
+ """
170
+ Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
171
+ This method decodes given tokens with timestamp tokens annotated, e.g. "<|1.08|>".
172
+ """
173
+ return self.encoding.decode(token_ids, **kwargs)
174
+
175
+ @cached_property
176
+ def eot(self) -> int:
177
+ return self.encoding.eot_token
178
+
179
+ @cached_property
180
+ def transcribe(self) -> int:
181
+ return self.special_tokens["<|transcribe|>"]
182
+
183
+ @cached_property
184
+ def translate(self) -> int:
185
+ return self.special_tokens["<|translate|>"]
186
+
187
+ @cached_property
188
+ def sot(self) -> int:
189
+ return self.special_tokens["<|startoftranscript|>"]
190
+
191
+ @cached_property
192
+ def sot_lm(self) -> int:
193
+ return self.special_tokens["<|startoflm|>"]
194
+
195
+ @cached_property
196
+ def sot_prev(self) -> int:
197
+ return self.special_tokens["<|startofprev|>"]
198
+
199
+ @cached_property
200
+ def no_speech(self) -> int:
201
+ return self.special_tokens["<|nospeech|>"]
202
+
203
+ @cached_property
204
+ def no_timestamps(self) -> int:
205
+ return self.special_tokens["<|notimestamps|>"]
206
+
207
+ @cached_property
208
+ def timestamp_begin(self) -> int:
209
+ return self.special_tokens["<|0.00|>"]
210
+
211
+ @cached_property
212
+ def language_token(self) -> int:
213
+ """Returns the token id corresponding to the value of the `language` field"""
214
+ if self.language is None:
215
+ raise ValueError("This tokenizer does not have language token configured")
216
+
217
+ return self.to_language_token(self.language)
218
+
219
+ def to_language_token(self, language):
220
+ if token := self.special_tokens.get(f"<|{language}|>", None):
221
+ return token
222
+
223
+ raise KeyError(f"Language {language} not found in tokenizer.")
224
+
225
+ @cached_property
226
+ def all_language_tokens(self) -> Tuple[int]:
227
+ result = []
228
+ for token, token_id in self.special_tokens.items():
229
+ if token.strip("<|>") in LANGUAGES:
230
+ result.append(token_id)
231
+ return tuple(result)[: self.num_languages]
232
+
233
+ @cached_property
234
+ def all_language_codes(self) -> Tuple[str]:
235
+ return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)
236
+
237
+ @cached_property
238
+ def sot_sequence_including_notimestamps(self) -> Tuple[int]:
239
+ return tuple(list(self.sot_sequence) + [self.no_timestamps])
240
+
241
+ @cached_property
242
+ def non_speech_tokens(self) -> Tuple[int]:
243
+ """
244
+ Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
245
+ annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
246
+
247
+ - ♪♪♪
248
+ - ( SPEAKING FOREIGN LANGUAGE )
249
+ - [DAVID] Hey there,
250
+
251
+ keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
252
+ """
253
+ symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
254
+ symbols += (
255
+ "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
256
+ )
257
+
258
+ # symbols that may be a single token or multiple tokens depending on the tokenizer.
259
+ # In case they're multiple tokens, suppress the first token, which is safe because:
260
+ # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
261
+ # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
262
+ miscellaneous = set("♩♪♫♬♭♮♯")
263
+ assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
264
+
265
+ # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
266
+ result = {self.encoding.encode(" -")[0], self.encoding.encode(" '")[0]}
267
+ for symbol in symbols + list(miscellaneous):
268
+ for tokens in [
269
+ self.encoding.encode(symbol),
270
+ self.encoding.encode(" " + symbol),
271
+ ]:
272
+ if len(tokens) == 1 or symbol in miscellaneous:
273
+ result.add(tokens[0])
274
+
275
+ return tuple(sorted(result))
276
+
277
+ def split_to_word_tokens(self, tokens: List[int]):
278
+ if self.language in {"zh", "ja", "th", "lo", "my", "yue"}:
279
+ # These languages don't typically use spaces, so it is difficult to split words
280
+ # without morpheme analysis. Here, we instead split words at any
281
+ # position where the tokens are decoded as valid unicode points
282
+ return self.split_tokens_on_unicode(tokens)
283
+
284
+ return self.split_tokens_on_spaces(tokens)
285
+
286
+ def split_tokens_on_unicode(self, tokens: List[int]):
287
+ decoded_full = self.decode_with_timestamps(tokens)
288
+ replacement_char = "\ufffd"
289
+
290
+ words = []
291
+ word_tokens = []
292
+ current_tokens = []
293
+ unicode_offset = 0
294
+
295
+ for token in tokens:
296
+ current_tokens.append(token)
297
+ decoded = self.decode_with_timestamps(current_tokens)
298
+
299
+ if (
300
+ replacement_char not in decoded
301
+ or decoded_full[unicode_offset + decoded.index(replacement_char)]
302
+ == replacement_char
303
+ ):
304
+ words.append(decoded)
305
+ word_tokens.append(current_tokens)
306
+ current_tokens = []
307
+ unicode_offset += len(decoded)
308
+
309
+ return words, word_tokens
310
+
311
+ def split_tokens_on_spaces(self, tokens: List[int]):
312
+ subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
313
+ words = []
314
+ word_tokens = []
315
+
316
+ for subword, subword_tokens in zip(subwords, subword_tokens_list):
317
+ special = subword_tokens[0] >= self.eot
318
+ with_space = subword.startswith(" ")
319
+ punctuation = subword.strip() in string.punctuation
320
+ if special or with_space or punctuation or len(words) == 0:
321
+ words.append(subword)
322
+ word_tokens.append(subword_tokens)
323
+ else:
324
+ words[-1] = words[-1] + subword
325
+ word_tokens[-1].extend(subword_tokens)
326
+
327
+ return words, word_tokens
328
+
329
+
330
+ @lru_cache(maxsize=None)
331
+ def get_encoding(name: str = "gpt2", num_languages: int = 99):
332
+ vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
333
+ ranks = {
334
+ base64.b64decode(token): int(rank)
335
+ for token, rank in (line.split() for line in open(vocab_path) if line)
336
+ }
337
+ n_vocab = len(ranks)
338
+ special_tokens = {}
339
+
340
+ specials = [
341
+ "<|endoftext|>",
342
+ "<|startoftranscript|>",
343
+ *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
344
+ "<|translate|>",
345
+ "<|transcribe|>",
346
+ "<|startoflm|>",
347
+ "<|startofprev|>",
348
+ "<|nospeech|>",
349
+ "<|notimestamps|>",
350
+ *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
351
+ ]
352
+
353
+ for token in specials:
354
+ special_tokens[token] = n_vocab
355
+ n_vocab += 1
356
+
357
+ return tiktoken.Encoding(
358
+ name=os.path.basename(vocab_path),
359
+ explicit_n_vocab=n_vocab,
360
+ pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
361
+ mergeable_ranks=ranks,
362
+ special_tokens=special_tokens,
363
+ )
364
+
365
+
366
+ @lru_cache(maxsize=None)
367
+ def get_tokenizer(
368
+ multilingual: bool,
369
+ *,
370
+ num_languages: int = 99,
371
+ language: Optional[str] = None,
372
+ task: Optional[str] = None, # Literal["transcribe", "translate", None]
373
+ ) -> Tokenizer:
374
+ if language is not None:
375
+ language = language.lower()
376
+ if language not in LANGUAGES:
377
+ if language in TO_LANGUAGE_CODE:
378
+ language = TO_LANGUAGE_CODE[language]
379
+ else:
380
+ raise ValueError(f"Unsupported language: {language}")
381
+
382
+ if multilingual:
383
+ encoding_name = "multilingual"
384
+ language = language or "en"
385
+ task = task or "transcribe"
386
+ else:
387
+ encoding_name = "gpt2"
388
+ language = None
389
+ task = None
390
+
391
+ encoding = get_encoding(name=encoding_name, num_languages=num_languages)
392
+
393
+ return Tokenizer(
394
+ encoding=encoding, num_languages=num_languages, language=language, task=task
395
+ )
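A usage sketch for the tokenizer factory above (illustrative, not part of the commit; the import path is assumed):

    from whisperlivekit.simul_whisper.whisper.tokenizer import get_tokenizer

    tokenizer = get_tokenizer(multilingual=True, language="en", task="transcribe")
    tokens = tokenizer.encode(" Hello world")
    tokenizer.decode(tokens)                                       # " Hello world"
    tokenizer.sot_sequence                                         # ids for <|startoftranscript|><|en|><|transcribe|>
    tokenizer.decode_with_timestamps([tokenizer.timestamp_begin])  # "<|0.00|>"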
whisperlivekit/simul_whisper/whisper/transcribe.py ADDED
@@ -0,0 +1,623 @@
1
+ import argparse
2
+ import os
3
+ import traceback
4
+ import warnings
5
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import tqdm
10
+
11
+ from .audio import (
12
+ FRAMES_PER_SECOND,
13
+ HOP_LENGTH,
14
+ N_FRAMES,
15
+ N_SAMPLES,
16
+ SAMPLE_RATE,
17
+ log_mel_spectrogram,
18
+ pad_or_trim,
19
+ )
20
+ from .decoding import DecodingOptions, DecodingResult
21
+ from .timing import add_word_timestamps
22
+ from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
23
+ from .utils import (
24
+ exact_div,
25
+ format_timestamp,
26
+ get_end,
27
+ get_writer,
28
+ make_safe,
29
+ optional_float,
30
+ optional_int,
31
+ str2bool,
32
+ )
33
+
34
+ if TYPE_CHECKING:
35
+ from .model import Whisper
36
+
37
+
38
+ def transcribe(
39
+ model: "Whisper",
40
+ audio: Union[str, np.ndarray, torch.Tensor],
41
+ *,
42
+ verbose: Optional[bool] = None,
43
+ temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
44
+ compression_ratio_threshold: Optional[float] = 2.4,
45
+ logprob_threshold: Optional[float] = -1.0,
46
+ no_speech_threshold: Optional[float] = 0.6,
47
+ condition_on_previous_text: bool = True,
48
+ initial_prompt: Optional[str] = None,
49
+ carry_initial_prompt: bool = False,
50
+ word_timestamps: bool = False,
51
+ prepend_punctuations: str = "\"'“¿([{-",
52
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
53
+ clip_timestamps: Union[str, List[float]] = "0",
54
+ hallucination_silence_threshold: Optional[float] = None,
55
+ **decode_options,
56
+ ):
57
+ """
58
+ Transcribe an audio file using Whisper
59
+
60
+ Parameters
61
+ ----------
62
+ model: Whisper
63
+ The Whisper model instance
64
+
65
+ audio: Union[str, np.ndarray, torch.Tensor]
66
+ The path to the audio file to open, or the audio waveform
67
+
68
+ verbose: bool
69
+ Whether to display the text being decoded to the console. If True, displays all the details;
70
+ if False, displays minimal details. If None, does not display anything.
71
+
72
+ temperature: Union[float, Tuple[float, ...]]
73
+ Temperature for sampling. It can be a tuple of temperatures, which will be successively used
74
+ upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
75
+
76
+ compression_ratio_threshold: float
77
+ If the gzip compression ratio is above this value, treat as failed
78
+
79
+ logprob_threshold: float
80
+ If the average log probability over sampled tokens is below this value, treat as failed
81
+
82
+ no_speech_threshold: float
83
+ If the no_speech probability is higher than this value AND the average log probability
84
+ over sampled tokens is below `logprob_threshold`, consider the segment as silent
85
+
86
+ condition_on_previous_text: bool
87
+ if True, the previous output of the model is provided as a prompt for the next window;
88
+ disabling may make the text inconsistent across windows, but the model becomes less prone to
89
+ getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
90
+
91
+ word_timestamps: bool
92
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
93
+ and include the timestamps for each word in each segment.
94
+
95
+ prepend_punctuations: str
96
+ If word_timestamps is True, merge these punctuation symbols with the next word
97
+
98
+ append_punctuations: str
99
+ If word_timestamps is True, merge these punctuation symbols with the previous word
100
+
101
+ initial_prompt: Optional[str]
102
+ Optional text to provide as a prompt for the first window. This can be used to provide, or
103
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
104
+ to make it more likely to predict those words correctly.
105
+
106
+ carry_initial_prompt: bool
107
+ If carry_initial_prompt is True, `initial_prompt` is prepended to the prompt of each internal
108
+ `decode()` call. If there is not enough context space at the start of the prompt, it is
109
+ left-sliced to make space.
110
+
111
+ decode_options: dict
112
+ Keyword arguments to construct `DecodingOptions` instances
113
+
114
+ clip_timestamps: Union[str, List[float]]
115
+ Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
116
+ The last end timestamp defaults to the end of the file.
117
+
118
+ hallucination_silence_threshold: Optional[float]
119
+ When word_timestamps is True, skip silent periods longer than this threshold (in seconds)
120
+ when a possible hallucination is detected
121
+
122
+ Returns
123
+ -------
124
+ A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
125
+ the spoken language ("language"), which is detected when `decode_options["language"]` is None.
126
+ """
127
+ dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
128
+ if model.device == torch.device("cpu"):
129
+ if torch.cuda.is_available():
130
+ warnings.warn("Performing inference on CPU when CUDA is available")
131
+ if dtype == torch.float16:
132
+ warnings.warn("FP16 is not supported on CPU; using FP32 instead")
133
+ dtype = torch.float32
134
+
135
+ if dtype == torch.float32:
136
+ decode_options["fp16"] = False
137
+
138
+ # Pad 30-seconds of silence to the input audio, for slicing
139
+ mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES)
140
+ content_frames = mel.shape[-1] - N_FRAMES
141
+ content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
142
+
143
+ if decode_options.get("language", None) is None:
144
+ if not model.is_multilingual:
145
+ decode_options["language"] = "en"
146
+ else:
147
+ if verbose:
148
+ print(
149
+ "Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
150
+ )
151
+ mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
152
+ _, probs = model.detect_language(mel_segment)
153
+ decode_options["language"] = max(probs, key=probs.get)
154
+ if verbose is not None:
155
+ print(
156
+ f"Detected language: {LANGUAGES[decode_options['language']].title()}"
157
+ )
158
+
159
+ language: str = decode_options["language"]
160
+ task: str = decode_options.get("task", "transcribe")
161
+ tokenizer = get_tokenizer(
162
+ model.is_multilingual,
163
+ num_languages=model.num_languages,
164
+ language=language,
165
+ task=task,
166
+ )
167
+
168
+ if isinstance(clip_timestamps, str):
169
+ clip_timestamps = [
170
+ float(ts) for ts in (clip_timestamps.split(",") if clip_timestamps else [])
171
+ ]
172
+ seek_points: List[int] = [round(ts * FRAMES_PER_SECOND) for ts in clip_timestamps]
173
+ if len(seek_points) == 0:
174
+ seek_points.append(0)
175
+ if len(seek_points) % 2 == 1:
176
+ seek_points.append(content_frames)
177
+ seek_clips: List[Tuple[int, int]] = list(zip(seek_points[::2], seek_points[1::2]))
178
+
179
+ punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、"
180
+
181
+ if word_timestamps and task == "translate":
182
+ warnings.warn("Word-level timestamps on translations may not be reliable.")
183
+
184
+ def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
185
+ temperatures = (
186
+ [temperature] if isinstance(temperature, (int, float)) else temperature
187
+ )
188
+ decode_result = None
189
+
190
+ for t in temperatures:
191
+ kwargs = {**decode_options}
192
+ if t > 0:
193
+ # disable beam_size and patience when t > 0
194
+ kwargs.pop("beam_size", None)
195
+ kwargs.pop("patience", None)
196
+ else:
197
+ # disable best_of when t == 0
198
+ kwargs.pop("best_of", None)
199
+
200
+ options = DecodingOptions(**kwargs, temperature=t)
201
+ decode_result = model.decode(segment, options)
202
+
203
+ needs_fallback = False
204
+ if (
205
+ compression_ratio_threshold is not None
206
+ and decode_result.compression_ratio > compression_ratio_threshold
207
+ ):
208
+ needs_fallback = True # too repetitive
209
+ if (
210
+ logprob_threshold is not None
211
+ and decode_result.avg_logprob < logprob_threshold
212
+ ):
213
+ needs_fallback = True # average log probability is too low
214
+ if (
215
+ no_speech_threshold is not None
216
+ and decode_result.no_speech_prob > no_speech_threshold
217
+ and logprob_threshold is not None
218
+ and decode_result.avg_logprob < logprob_threshold
219
+ ):
220
+ needs_fallback = False # silence
221
+ if not needs_fallback:
222
+ break
223
+
224
+ return decode_result
225
+
226
+ clip_idx = 0
227
+ seek = seek_clips[clip_idx][0]
228
+ input_stride = exact_div(
229
+ N_FRAMES, model.dims.n_audio_ctx
230
+ ) # mel frames per output token: 2
231
+ time_precision = (
232
+ input_stride * HOP_LENGTH / SAMPLE_RATE
233
+ ) # time per output token: 0.02 (seconds)
234
+ all_tokens = []
235
+ all_segments = []
236
+ prompt_reset_since = 0
237
+
238
+ remaining_prompt_length = model.dims.n_text_ctx // 2 - 1
239
+ if initial_prompt is not None:
240
+ initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
241
+ all_tokens.extend(initial_prompt_tokens)
242
+ remaining_prompt_length -= len(initial_prompt_tokens)
243
+ else:
244
+ initial_prompt_tokens = []
245
+
246
+ def new_segment(
247
+ *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
248
+ ):
249
+ tokens = tokens.tolist()
250
+ text_tokens = [token for token in tokens if token < tokenizer.eot]
251
+ return {
252
+ "seek": seek,
253
+ "start": start,
254
+ "end": end,
255
+ "text": tokenizer.decode(text_tokens),
256
+ "tokens": tokens,
257
+ "temperature": result.temperature,
258
+ "avg_logprob": result.avg_logprob,
259
+ "compression_ratio": result.compression_ratio,
260
+ "no_speech_prob": result.no_speech_prob,
261
+ }
262
+
263
+ # show the progress bar when verbose is False (if True, transcribed text will be printed)
264
+ with tqdm.tqdm(
265
+ total=content_frames, unit="frames", disable=verbose is not False
266
+ ) as pbar:
267
+ last_speech_timestamp = 0.0
268
+ # NOTE: This loop is obscurely flattened to make the diff readable.
269
+ # A later commit should turn this into a simpler nested loop.
270
+ # for seek_clip_start, seek_clip_end in seek_clips:
271
+ # while seek < seek_clip_end
272
+ while clip_idx < len(seek_clips):
273
+ seek_clip_start, seek_clip_end = seek_clips[clip_idx]
274
+ if seek < seek_clip_start:
275
+ seek = seek_clip_start
276
+ if seek >= seek_clip_end:
277
+ clip_idx += 1
278
+ if clip_idx < len(seek_clips):
279
+ seek = seek_clips[clip_idx][0]
280
+ continue
281
+ time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
282
+ window_end_time = float((seek + N_FRAMES) * HOP_LENGTH / SAMPLE_RATE)
283
+ segment_size = min(N_FRAMES, content_frames - seek, seek_clip_end - seek)
284
+ mel_segment = mel[:, seek : seek + segment_size]
285
+ segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
286
+ mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
287
+
288
+ if carry_initial_prompt:
289
+ nignored = max(len(initial_prompt_tokens), prompt_reset_since)
290
+ remaining_prompt = all_tokens[nignored:][-remaining_prompt_length:]
291
+ decode_options["prompt"] = initial_prompt_tokens + remaining_prompt
292
+ else:
293
+ decode_options["prompt"] = all_tokens[prompt_reset_since:]
294
+
295
+ result: DecodingResult = decode_with_fallback(mel_segment)
296
+ tokens = torch.tensor(result.tokens)
297
+
298
+ if no_speech_threshold is not None:
299
+ # no voice activity check
300
+ should_skip = result.no_speech_prob > no_speech_threshold
301
+ if (
302
+ logprob_threshold is not None
303
+ and result.avg_logprob > logprob_threshold
304
+ ):
305
+ # don't skip if the logprob is high enough, despite the no_speech_prob
306
+ should_skip = False
307
+
308
+ if should_skip:
309
+ seek += segment_size # fast-forward to the next segment boundary
310
+ continue
311
+
312
+ previous_seek = seek
313
+ current_segments = []
314
+
315
+ # anomalous words are very long/short/improbable
316
+ def word_anomaly_score(word: dict) -> float:
317
+ probability = word.get("probability", 0.0)
318
+ duration = word["end"] - word["start"]
319
+ score = 0.0
320
+ if probability < 0.15:
321
+ score += 1.0
322
+ if duration < 0.133:
323
+ score += (0.133 - duration) * 15
324
+ if duration > 2.0:
325
+ score += duration - 2.0
326
+ return score
327
+
328
+ def is_segment_anomaly(segment: Optional[dict]) -> bool:
329
+ if segment is None or not segment["words"]:
330
+ return False
331
+ words = [w for w in segment["words"] if w["word"] not in punctuation]
332
+ words = words[:8]
333
+ score = sum(word_anomaly_score(w) for w in words)
334
+ return score >= 3 or score + 0.01 >= len(words)
335
+
336
+ def next_words_segment(segments: List[dict]) -> Optional[dict]:
337
+ return next((s for s in segments if s["words"]), None)
338
+
339
+ timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
340
+ single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
341
+
342
+ consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
343
+ consecutive.add_(1)
344
+ if len(consecutive) > 0:
345
+ # if the output contains two consecutive timestamp tokens
346
+ slices = consecutive.tolist()
347
+ if single_timestamp_ending:
348
+ slices.append(len(tokens))
349
+
350
+ last_slice = 0
351
+ for current_slice in slices:
352
+ sliced_tokens = tokens[last_slice:current_slice]
353
+ start_timestamp_pos = (
354
+ sliced_tokens[0].item() - tokenizer.timestamp_begin
355
+ )
356
+ end_timestamp_pos = (
357
+ sliced_tokens[-1].item() - tokenizer.timestamp_begin
358
+ )
359
+ current_segments.append(
360
+ new_segment(
361
+ start=time_offset + start_timestamp_pos * time_precision,
362
+ end=time_offset + end_timestamp_pos * time_precision,
363
+ tokens=sliced_tokens,
364
+ result=result,
365
+ )
366
+ )
367
+ last_slice = current_slice
368
+
369
+ if single_timestamp_ending:
370
+ # single timestamp at the end means no speech after the last timestamp.
371
+ seek += segment_size
372
+ else:
373
+ # otherwise, ignore the unfinished segment and seek to the last timestamp
374
+ last_timestamp_pos = (
375
+ tokens[last_slice - 1].item() - tokenizer.timestamp_begin
376
+ )
377
+ seek += last_timestamp_pos * input_stride
378
+ else:
379
+ duration = segment_duration
380
+ timestamps = tokens[timestamp_tokens.nonzero().flatten()]
381
+ if (
382
+ len(timestamps) > 0
383
+ and timestamps[-1].item() != tokenizer.timestamp_begin
384
+ ):
385
+ # no consecutive timestamps but it has a timestamp; use the last one.
386
+ last_timestamp_pos = (
387
+ timestamps[-1].item() - tokenizer.timestamp_begin
388
+ )
389
+ duration = last_timestamp_pos * time_precision
390
+
391
+ current_segments.append(
392
+ new_segment(
393
+ start=time_offset,
394
+ end=time_offset + duration,
395
+ tokens=tokens,
396
+ result=result,
397
+ )
398
+ )
399
+ seek += segment_size
400
+
401
+ if word_timestamps:
402
+ add_word_timestamps(
403
+ segments=current_segments,
404
+ model=model,
405
+ tokenizer=tokenizer,
406
+ mel=mel_segment,
407
+ num_frames=segment_size,
408
+ prepend_punctuations=prepend_punctuations,
409
+ append_punctuations=append_punctuations,
410
+ last_speech_timestamp=last_speech_timestamp,
411
+ )
412
+
413
+ if not single_timestamp_ending:
414
+ last_word_end = get_end(current_segments)
415
+ if last_word_end is not None and last_word_end > time_offset:
416
+ seek = round(last_word_end * FRAMES_PER_SECOND)
417
+
418
+ # skip silence before possible hallucinations
419
+ if hallucination_silence_threshold is not None:
420
+ threshold = hallucination_silence_threshold
421
+ if not single_timestamp_ending:
422
+ last_word_end = get_end(current_segments)
423
+ if last_word_end is not None and last_word_end > time_offset:
424
+ remaining_duration = window_end_time - last_word_end
425
+ if remaining_duration > threshold:
426
+ seek = round(last_word_end * FRAMES_PER_SECOND)
427
+ else:
428
+ seek = previous_seek + segment_size
429
+
430
+ # if first segment might be a hallucination, skip leading silence
431
+ first_segment = next_words_segment(current_segments)
432
+ if first_segment is not None and is_segment_anomaly(first_segment):
433
+ gap = first_segment["start"] - time_offset
434
+ if gap > threshold:
435
+ seek = previous_seek + round(gap * FRAMES_PER_SECOND)
436
+ continue
437
+
438
+ # skip silence before any possible hallucination that is surrounded
439
+ # by silence or more hallucinations
440
+ hal_last_end = last_speech_timestamp
441
+ for si in range(len(current_segments)):
442
+ segment = current_segments[si]
443
+ if not segment["words"]:
444
+ continue
445
+ if is_segment_anomaly(segment):
446
+ next_segment = next_words_segment(
447
+ current_segments[si + 1 :]
448
+ )
449
+ if next_segment is not None:
450
+ hal_next_start = next_segment["words"][0]["start"]
451
+ else:
452
+ hal_next_start = time_offset + segment_duration
453
+ silence_before = (
454
+ segment["start"] - hal_last_end > threshold
455
+ or segment["start"] < threshold
456
+ or segment["start"] - time_offset < 2.0
457
+ )
458
+ silence_after = (
459
+ hal_next_start - segment["end"] > threshold
460
+ or is_segment_anomaly(next_segment)
461
+ or window_end_time - segment["end"] < 2.0
462
+ )
463
+ if silence_before and silence_after:
464
+ seek = round(
465
+ max(time_offset + 1, segment["start"])
466
+ * FRAMES_PER_SECOND
467
+ )
468
+ if content_duration - segment["end"] < threshold:
469
+ seek = content_frames
470
+ current_segments[si:] = []
471
+ break
472
+ hal_last_end = segment["end"]
473
+
474
+ last_word_end = get_end(current_segments)
475
+ if last_word_end is not None:
476
+ last_speech_timestamp = last_word_end
477
+
478
+ if verbose:
479
+ for segment in current_segments:
480
+ start, end, text = segment["start"], segment["end"], segment["text"]
481
+ line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"
482
+ print(make_safe(line))
483
+
484
+ # if a segment is instantaneous or does not contain text, clear it
485
+ for i, segment in enumerate(current_segments):
486
+ if segment["start"] == segment["end"] or segment["text"].strip() == "":
487
+ segment["text"] = ""
488
+ segment["tokens"] = []
489
+ segment["words"] = []
490
+
491
+ all_segments.extend(
492
+ [
493
+ {"id": i, **segment}
494
+ for i, segment in enumerate(
495
+ current_segments, start=len(all_segments)
496
+ )
497
+ ]
498
+ )
499
+ all_tokens.extend(
500
+ [token for segment in current_segments for token in segment["tokens"]]
501
+ )
502
+
503
+ if not condition_on_previous_text or result.temperature > 0.5:
504
+ # do not feed the prompt tokens if a high temperature was used
505
+ prompt_reset_since = len(all_tokens)
506
+
507
+ # update progress bar
508
+ pbar.update(min(content_frames, seek) - previous_seek)
509
+
510
+ return dict(
511
+ text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
512
+ segments=all_segments,
513
+ language=language,
514
+ )
515
+
516
+
517
+ def cli():
518
+ from . import available_models
519
+
520
+ def valid_model_name(name):
521
+ if name in available_models() or os.path.exists(name):
522
+ return name
523
+ raise ValueError(
524
+ f"model should be one of {available_models()} or path to a model checkpoint"
525
+ )
526
+
527
+ # fmt: off
528
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
529
+ parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
530
+ parser.add_argument("--model", default="turbo", type=valid_model_name, help="name of the Whisper model to use")
531
+ parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
532
+ parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
533
+ parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
534
+ parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
535
+ parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
536
+
537
+ parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
538
+ parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
539
+
540
+ parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
541
+ parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
542
+ parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
543
+ parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
544
+ parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
545
+
546
+ parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
547
+ parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
548
+ parser.add_argument("--carry_initial_prompt", type=str2bool, default=False, help="if True, prepend initial_prompt to every internal decode() call. May reduce the effectiveness of condition_on_previous_text")
549
+
550
+ parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
551
+ parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
552
+
553
+ parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
554
+ parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
555
+ parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
556
+ parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
557
+ parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
558
+ parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
559
+ parser.add_argument("--append_punctuations", type=str, default="\"\'.。,,!!??::”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
560
+ parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
561
+ parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
562
+ parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
563
+ parser.add_argument("--max_words_per_line", type=optional_int, default=None, help="(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segment")
564
+ parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
565
+ parser.add_argument("--clip_timestamps", type=str, default="0", help="comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process, where the last end timestamp defaults to the end of the file")
566
+ parser.add_argument("--hallucination_silence_threshold", type=optional_float, help="(requires --word_timestamps True) skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected")
567
+ # fmt: on
568
+
569
+ args = parser.parse_args().__dict__
570
+ model_name: str = args.pop("model")
571
+ model_dir: str = args.pop("model_dir")
572
+ output_dir: str = args.pop("output_dir")
573
+ output_format: str = args.pop("output_format")
574
+ device: str = args.pop("device")
575
+ os.makedirs(output_dir, exist_ok=True)
576
+
577
+ if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
578
+ if args["language"] is not None:
579
+ warnings.warn(
580
+ f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead."
581
+ )
582
+ args["language"] = "en"
583
+
584
+ temperature = args.pop("temperature")
585
+ if (increment := args.pop("temperature_increment_on_fallback")) is not None:
586
+ temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
587
+ else:
588
+ temperature = [temperature]
589
+
590
+ if (threads := args.pop("threads")) > 0:
591
+ torch.set_num_threads(threads)
592
+
593
+ from . import load_model
594
+
595
+ model = load_model(model_name, device=device, download_root=model_dir)
596
+
597
+ writer = get_writer(output_format, output_dir)
598
+ word_options = [
599
+ "highlight_words",
600
+ "max_line_count",
601
+ "max_line_width",
602
+ "max_words_per_line",
603
+ ]
604
+ if not args["word_timestamps"]:
605
+ for option in word_options:
606
+ if args[option]:
607
+ parser.error(f"--{option} requires --word_timestamps True")
608
+ if args["max_line_count"] and not args["max_line_width"]:
609
+ warnings.warn("--max_line_count has no effect without --max_line_width")
610
+ if args["max_words_per_line"] and args["max_line_width"]:
611
+ warnings.warn("--max_words_per_line has no effect with --max_line_width")
612
+ writer_args = {arg: args.pop(arg) for arg in word_options}
613
+ for audio_path in args.pop("audio"):
614
+ try:
615
+ result = transcribe(model, audio_path, temperature=temperature, **args)
616
+ writer(result, audio_path, **writer_args)
617
+ except Exception as e:
618
+ traceback.print_exc()
619
+ print(f"Skipping {audio_path} due to {type(e).__name__}: {str(e)}")
620
+
621
+
622
+ if __name__ == "__main__":
623
+ cli()
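An illustrative call of the offline API defined above (not part of the commit; the vendored import path is an assumption, and `load_model` comes from the package `__init__` as used in `cli()`):

    from whisperlivekit.simul_whisper.whisper import load_model
    from whisperlivekit.simul_whisper.whisper.transcribe import transcribe

    model = load_model("turbo")                    # same default model name as the CLI above
    result = transcribe(model, "audio.wav", word_timestamps=True)
    print(result["language"], result["text"])
    for seg in result["segments"]:
        print(seg["start"], seg["end"], seg["text"])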
whisperlivekit/simul_whisper/whisper/triton_ops.py ADDED
@@ -0,0 +1,117 @@
1
+ from functools import lru_cache
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ try:
7
+ import triton
8
+ import triton.language as tl
9
+ except ImportError:
10
+ raise RuntimeError("triton import failed; try `pip install --pre triton`")
11
+
12
+
13
+ @triton.jit
14
+ def dtw_kernel(
15
+ cost, trace, x, x_stride, cost_stride, trace_stride, N, M, BLOCK_SIZE: tl.constexpr
16
+ ):
17
+ offsets = tl.arange(0, BLOCK_SIZE)
18
+ mask = offsets < M
19
+
20
+ for k in range(1, N + M + 1): # k = i + j
21
+ tl.debug_barrier()
22
+
23
+ p0 = cost + (k - 1) * cost_stride
24
+ p1 = cost + k * cost_stride
25
+ p2 = cost + k * cost_stride + 1
26
+
27
+ c0 = tl.load(p0 + offsets, mask=mask)
28
+ c1 = tl.load(p1 + offsets, mask=mask)
29
+ c2 = tl.load(p2 + offsets, mask=mask)
30
+
31
+ x_row = tl.load(x + (k - 1) * x_stride + offsets, mask=mask, other=0)
32
+ cost_row = x_row + tl.minimum(tl.minimum(c0, c1), c2)
33
+
34
+ cost_ptr = cost + (k + 1) * cost_stride + 1
35
+ tl.store(cost_ptr + offsets, cost_row, mask=mask)
36
+
37
+ trace_ptr = trace + (k + 1) * trace_stride + 1
38
+ tl.store(trace_ptr + offsets, 2, mask=mask & (c2 <= c0) & (c2 <= c1))
39
+ tl.store(trace_ptr + offsets, 1, mask=mask & (c1 <= c0) & (c1 <= c2))
40
+ tl.store(trace_ptr + offsets, 0, mask=mask & (c0 <= c1) & (c0 <= c2))
41
+
42
+
43
+ @lru_cache(maxsize=None)
44
+ def median_kernel(filter_width: int):
45
+ @triton.jit
46
+ def kernel(
47
+ y, x, x_stride, y_stride, BLOCK_SIZE: tl.constexpr
48
+ ): # x.shape[-1] == filter_width
49
+ row_idx = tl.program_id(0)
50
+ offsets = tl.arange(0, BLOCK_SIZE)
51
+ mask = offsets < y_stride
52
+
53
+ x_ptr = x + row_idx * x_stride # noqa: F841
54
+ y_ptr = y + row_idx * y_stride
55
+
56
+ LOAD_ALL_ROWS_HERE # noqa: F821
57
+
58
+ BUBBLESORT_HERE # noqa: F821
59
+
60
+ tl.store(y_ptr + offsets, MIDDLE_ROW_HERE, mask=mask) # noqa: F821
61
+
62
+ kernel = triton.JITFunction(kernel.fn)
63
+ new_kernel = kernel.src.replace(
64
+ " LOAD_ALL_ROWS_HERE",
65
+ "\n".join(
66
+ [
67
+ f" row{i} = tl.load(x_ptr + offsets + {i}, mask=mask)"
68
+ for i in range(filter_width)
69
+ ]
70
+ ),
71
+ )
72
+
73
+ new_kernel = new_kernel.replace(
74
+ " BUBBLESORT_HERE",
75
+ "\n\n".join(
76
+ [
77
+ "\n\n".join(
78
+ [
79
+ "\n".join(
80
+ [
81
+ f" smaller = tl.where(row{j} < row{j + 1}, row{j}, row{j + 1})",
82
+ f" larger = tl.where(row{j} > row{j + 1}, row{j}, row{j + 1})",
83
+ f" row{j} = smaller",
84
+ f" row{j + 1} = larger",
85
+ ]
86
+ )
87
+ for j in range(filter_width - i - 1)
88
+ ]
89
+ )
90
+ for i in range(filter_width // 2 + 1)
91
+ ]
92
+ ),
93
+ )
94
+
95
+ new_kernel = new_kernel.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")
96
+
97
+ if hasattr(kernel, "_unsafe_update_src") is True:
98
+ kernel._unsafe_update_src(new_kernel)
99
+ kernel.hash = None
100
+ else:
101
+ kernel.src = new_kernel
102
+
103
+ return kernel
104
+
105
+
106
+ def median_filter_cuda(x: torch.Tensor, filter_width: int):
107
+ """Apply a median filter of given width along the last dimension of x"""
108
+ slices = x.contiguous().unfold(-1, filter_width, 1)
109
+ grid = np.prod(slices.shape[:-2])
110
+
111
+ kernel = median_kernel(filter_width)
112
+ y = torch.empty_like(slices[..., 0])
113
+
114
+ BLOCK_SIZE = 1 << (y.stride(-2) - 1).bit_length()
115
+ kernel[(grid,)](y, x, x.stride(-2), y.stride(-2), BLOCK_SIZE=BLOCK_SIZE)
116
+
117
+ return y
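A minimal smoke test for median_filter_cuda above — a sketch only, assuming a CUDA device and a working triton install; the tensor shape is illustrative and not taken from the callers in timing.py:

import torch
from whisperlivekit.simul_whisper.whisper.triton_ops import median_filter_cuda

x = torch.randn(4, 80, 1500, device="cuda")   # median is taken over the last dimension
y = median_filter_cuda(x, filter_width=7)
# unfold(-1, 7, 1) produces 1500 - 7 + 1 windows, so the last dimension shrinks to 1494
assert y.shape == (4, 80, 1494)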
whisperlivekit/simul_whisper/whisper/utils.py ADDED
@@ -0,0 +1,318 @@
1
+ import json
2
+ import os
3
+ import re
4
+ import sys
5
+ import zlib
6
+ from typing import Callable, List, Optional, TextIO
7
+
8
+ system_encoding = sys.getdefaultencoding()
9
+
10
+ if system_encoding != "utf-8":
11
+
12
+ def make_safe(string):
13
+ # replaces any character not representable using the system default encoding with an '?',
14
+ # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729).
15
+ return string.encode(system_encoding, errors="replace").decode(system_encoding)
16
+
17
+ else:
18
+
19
+ def make_safe(string):
20
+ # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding
21
+ return string
22
+
23
+
24
+ def exact_div(x, y):
25
+ assert x % y == 0
26
+ return x // y
27
+
28
+
29
+ def str2bool(string):
30
+ str2val = {"True": True, "False": False}
31
+ if string in str2val:
32
+ return str2val[string]
33
+ else:
34
+ raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
35
+
36
+
37
+ def optional_int(string):
38
+ return None if string == "None" else int(string)
39
+
40
+
41
+ def optional_float(string):
42
+ return None if string == "None" else float(string)
43
+
44
+
45
+ def compression_ratio(text) -> float:
46
+ text_bytes = text.encode("utf-8")
47
+ return len(text_bytes) / len(zlib.compress(text_bytes))
48
+
49
+
50
+ def format_timestamp(
51
+ seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
52
+ ):
53
+ assert seconds >= 0, "non-negative timestamp expected"
54
+ milliseconds = round(seconds * 1000.0)
55
+
56
+ hours = milliseconds // 3_600_000
57
+ milliseconds -= hours * 3_600_000
58
+
59
+ minutes = milliseconds // 60_000
60
+ milliseconds -= minutes * 60_000
61
+
62
+ seconds = milliseconds // 1_000
63
+ milliseconds -= seconds * 1_000
64
+
65
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
66
+ return (
67
+ f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
68
+ )
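For reference, two illustrative outputs of format_timestamp (values chosen here, not taken from the source):

format_timestamp(3.5)                                                        # "00:03.500"
format_timestamp(3661.007, always_include_hours=True, decimal_marker=",")   # "01:01:01,007"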
69
+
70
+
71
+ def get_start(segments: List[dict]) -> Optional[float]:
72
+ return next(
73
+ (w["start"] for s in segments for w in s["words"]),
74
+ segments[0]["start"] if segments else None,
75
+ )
76
+
77
+
78
+ def get_end(segments: List[dict]) -> Optional[float]:
79
+ return next(
80
+ (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
81
+ segments[-1]["end"] if segments else None,
82
+ )
83
+
84
+
85
+ class ResultWriter:
86
+ extension: str
87
+
88
+ def __init__(self, output_dir: str):
89
+ self.output_dir = output_dir
90
+
91
+ def __call__(
92
+ self, result: dict, audio_path: str, options: Optional[dict] = None, **kwargs
93
+ ):
94
+ audio_basename = os.path.basename(audio_path)
95
+ audio_basename = os.path.splitext(audio_basename)[0]
96
+ output_path = os.path.join(
97
+ self.output_dir, audio_basename + "." + self.extension
98
+ )
99
+
100
+ with open(output_path, "w", encoding="utf-8") as f:
101
+ self.write_result(result, file=f, options=options, **kwargs)
102
+
103
+ def write_result(
104
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
105
+ ):
106
+ raise NotImplementedError
107
+
108
+
109
+ class WriteTXT(ResultWriter):
110
+ extension: str = "txt"
111
+
112
+ def write_result(
113
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
114
+ ):
115
+ for segment in result["segments"]:
116
+ print(segment["text"].strip(), file=file, flush=True)
117
+
118
+
119
+ class SubtitlesWriter(ResultWriter):
120
+ always_include_hours: bool
121
+ decimal_marker: str
122
+
123
+ def iterate_result(
124
+ self,
125
+ result: dict,
126
+ options: Optional[dict] = None,
127
+ *,
128
+ max_line_width: Optional[int] = None,
129
+ max_line_count: Optional[int] = None,
130
+ highlight_words: bool = False,
131
+ max_words_per_line: Optional[int] = None,
132
+ ):
133
+ options = options or {}
134
+ max_line_width = max_line_width or options.get("max_line_width")
135
+ max_line_count = max_line_count or options.get("max_line_count")
136
+ highlight_words = highlight_words or options.get("highlight_words", False)
137
+ max_words_per_line = max_words_per_line or options.get("max_words_per_line")
138
+ preserve_segments = max_line_count is None or max_line_width is None
139
+ max_line_width = max_line_width or 1000
140
+ max_words_per_line = max_words_per_line or 1000
141
+
142
+ def iterate_subtitles():
143
+ line_len = 0
144
+ line_count = 1
145
+ # the next subtitle to yield (a list of word timings with whitespace)
146
+ subtitle: List[dict] = []
147
+ last: float = get_start(result["segments"]) or 0.0
148
+ for segment in result["segments"]:
149
+ chunk_index = 0
150
+ words_count = max_words_per_line
151
+ while chunk_index < len(segment["words"]):
152
+ remaining_words = len(segment["words"]) - chunk_index
153
+ if max_words_per_line > len(segment["words"]) - chunk_index:
154
+ words_count = remaining_words
155
+ for i, original_timing in enumerate(
156
+ segment["words"][chunk_index : chunk_index + words_count]
157
+ ):
158
+ timing = original_timing.copy()
159
+ long_pause = (
160
+ not preserve_segments and timing["start"] - last > 3.0
161
+ )
162
+ has_room = line_len + len(timing["word"]) <= max_line_width
163
+ seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
164
+ if (
165
+ line_len > 0
166
+ and has_room
167
+ and not long_pause
168
+ and not seg_break
169
+ ):
170
+ # line continuation
171
+ line_len += len(timing["word"])
172
+ else:
173
+ # new line
174
+ timing["word"] = timing["word"].strip()
175
+ if (
176
+ len(subtitle) > 0
177
+ and max_line_count is not None
178
+ and (long_pause or line_count >= max_line_count)
179
+ or seg_break
180
+ ):
181
+ # subtitle break
182
+ yield subtitle
183
+ subtitle = []
184
+ line_count = 1
185
+ elif line_len > 0:
186
+ # line break
187
+ line_count += 1
188
+ timing["word"] = "\n" + timing["word"]
189
+ line_len = len(timing["word"].strip())
190
+ subtitle.append(timing)
191
+ last = timing["start"]
192
+ chunk_index += max_words_per_line
193
+ if len(subtitle) > 0:
194
+ yield subtitle
195
+
196
+ if len(result["segments"]) > 0 and "words" in result["segments"][0]:
197
+ for subtitle in iterate_subtitles():
198
+ subtitle_start = self.format_timestamp(subtitle[0]["start"])
199
+ subtitle_end = self.format_timestamp(subtitle[-1]["end"])
200
+ subtitle_text = "".join([word["word"] for word in subtitle])
201
+ if highlight_words:
202
+ last = subtitle_start
203
+ all_words = [timing["word"] for timing in subtitle]
204
+ for i, this_word in enumerate(subtitle):
205
+ start = self.format_timestamp(this_word["start"])
206
+ end = self.format_timestamp(this_word["end"])
207
+ if last != start:
208
+ yield last, start, subtitle_text
209
+
210
+ yield start, end, "".join(
211
+ [
212
+ (
213
+ re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
214
+ if j == i
215
+ else word
216
+ )
217
+ for j, word in enumerate(all_words)
218
+ ]
219
+ )
220
+ last = end
221
+ else:
222
+ yield subtitle_start, subtitle_end, subtitle_text
223
+ else:
224
+ for segment in result["segments"]:
225
+ segment_start = self.format_timestamp(segment["start"])
226
+ segment_end = self.format_timestamp(segment["end"])
227
+ segment_text = segment["text"].strip().replace("-->", "->")
228
+ yield segment_start, segment_end, segment_text
229
+
230
+ def format_timestamp(self, seconds: float):
231
+ return format_timestamp(
232
+ seconds=seconds,
233
+ always_include_hours=self.always_include_hours,
234
+ decimal_marker=self.decimal_marker,
235
+ )
236
+
237
+
238
+ class WriteVTT(SubtitlesWriter):
239
+ extension: str = "vtt"
240
+ always_include_hours: bool = False
241
+ decimal_marker: str = "."
242
+
243
+ def write_result(
244
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
245
+ ):
246
+ print("WEBVTT\n", file=file)
247
+ for start, end, text in self.iterate_result(result, options, **kwargs):
248
+ print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
249
+
250
+
251
+ class WriteSRT(SubtitlesWriter):
252
+ extension: str = "srt"
253
+ always_include_hours: bool = True
254
+ decimal_marker: str = ","
255
+
256
+ def write_result(
257
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
258
+ ):
259
+ for i, (start, end, text) in enumerate(
260
+ self.iterate_result(result, options, **kwargs), start=1
261
+ ):
262
+ print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)
263
+
264
+
265
+ class WriteTSV(ResultWriter):
266
+ """
267
+ Write a transcript to a file in TSV (tab-separated values) format containing lines like:
268
+ <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
269
+
270
+ Using integer milliseconds as start and end times means there's no chance of interference from
271
+ an environment setting a language encoding that causes the decimal in a floating point number
272
+ to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
273
+ """
274
+
275
+ extension: str = "tsv"
276
+
277
+ def write_result(
278
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
279
+ ):
280
+ print("start", "end", "text", sep="\t", file=file)
281
+ for segment in result["segments"]:
282
+ print(round(1000 * segment["start"]), file=file, end="\t")
283
+ print(round(1000 * segment["end"]), file=file, end="\t")
284
+ print(segment["text"].strip().replace("\t", " "), file=file, flush=True)
285
+
286
+
287
+ class WriteJSON(ResultWriter):
288
+ extension: str = "json"
289
+
290
+ def write_result(
291
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
292
+ ):
293
+ json.dump(result, file)
294
+
295
+
296
+ def get_writer(
297
+ output_format: str, output_dir: str
298
+ ) -> Callable[[dict, TextIO, dict], None]:
299
+ writers = {
300
+ "txt": WriteTXT,
301
+ "vtt": WriteVTT,
302
+ "srt": WriteSRT,
303
+ "tsv": WriteTSV,
304
+ "json": WriteJSON,
305
+ }
306
+
307
+ if output_format == "all":
308
+ all_writers = [writer(output_dir) for writer in writers.values()]
309
+
310
+ def write_all(
311
+ result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
312
+ ):
313
+ for writer in all_writers:
314
+ writer(result, file, options, **kwargs)
315
+
316
+ return write_all
317
+
318
+ return writers[output_format](output_dir)
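A minimal sketch of how these writers are used together; the result dict and file name below are made up for illustration:

result = {"segments": [{"start": 0.0, "end": 1.2, "text": " Hello world."}]}
writer = get_writer("srt", output_dir=".")   # or "txt", "vtt", "tsv", "json", "all"
writer(result, "sample.wav")                 # writes ./sample.srt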
whisperlivekit/simul_whisper/whisper/version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "20250625"
whisperlivekit/timed_objects.py ADDED
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ @dataclass
5
+ class TimedText:
6
+ start: Optional[float]
7
+ end: Optional[float]
8
+ text: Optional[str] = ''
9
+ speaker: Optional[int] = -1
10
+ probability: Optional[float] = None
11
+ is_dummy: Optional[bool] = False
12
+
13
+ @dataclass
14
+ class ASRToken(TimedText):
15
+ def with_offset(self, offset: float) -> "ASRToken":
16
+ """Return a new token with the time offset added."""
17
+ return ASRToken(self.start + offset, self.end + offset, self.text, self.speaker, self.probability)
18
+
19
+ @dataclass
20
+ class Sentence(TimedText):
21
+ pass
22
+
23
+ @dataclass
24
+ class Transcript(TimedText):
25
+ pass
26
+
27
+ @dataclass
28
+ class SpeakerSegment(TimedText):
29
+ """Represents a segment of audio attributed to a specific speaker.
30
+ No text nor probability is associated with this segment.
31
+ """
32
+ pass
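These dataclasses are plain timed containers. An illustrative use of the offset helper (values invented):

token = ASRToken(start=0.4, end=0.9, text=" hello", probability=0.93)
shifted = token.with_offset(12.0)   # ASRToken(start=12.4, end=12.9, text=' hello', speaker=-1, probability=0.93)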
whisperlivekit/warmup.py ADDED
@@ -0,0 +1,62 @@
1
+
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def load_file(warmup_file=None, timeout=5):
7
+ import os
8
+ import tempfile
9
+ import librosa
10
+
11
+ if warmup_file is None:
12
+ # Download JFK sample if not already present
13
+ jfk_url = "https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav"
14
+ temp_dir = tempfile.gettempdir()
15
+ warmup_file = os.path.join(temp_dir, "whisper_warmup_jfk.wav")
16
+
17
+ if not os.path.exists(warmup_file):
18
+ logger.debug(f"Downloading warmup file from {jfk_url}")
19
+ print(f"Downloading warmup file from {jfk_url}")
20
+ import time
21
+ import urllib.request
22
+ import urllib.error
23
+ import socket
24
+
25
+ original_timeout = socket.getdefaulttimeout()
26
+ socket.setdefaulttimeout(timeout)
27
+
28
+ start_time = time.time()
29
+ try:
30
+ urllib.request.urlretrieve(jfk_url, warmup_file)
31
+ logger.debug(f"Download successful in {time.time() - start_time:.2f}s")
32
+ except (urllib.error.URLError, socket.timeout) as e:
33
+ logger.warning(f"Download failed: {e}. Proceeding without warmup.")
34
+ return False
35
+ finally:
36
+ socket.setdefaulttimeout(original_timeout)
37
+ elif not warmup_file:
38
+ return False
39
+
40
+ if not warmup_file or not os.path.exists(warmup_file) or os.path.getsize(warmup_file) == 0:
41
+ logger.warning(f"Warmup file {warmup_file} invalid or missing.")
42
+ return False
43
+
44
+ try:
45
+ audio, sr = librosa.load(warmup_file, sr=16000)
46
+ except Exception as e:
47
+ logger.warning(f"Failed to load audio file: {e}")
48
+ return False
49
+ return audio
50
+
51
+ def warmup_asr(asr, warmup_file=None, timeout=5):
52
+ """
53
+ Warmup the ASR model by transcribing a short audio file.
54
+ """
55
+ audio = load_file(warmup_file=warmup_file, timeout=timeout)
56
+ if audio is not False: asr.transcribe(audio)
57
+ logger.info("ASR model is warmed up")
58
+
59
+ def warmup_online(online, warmup_file=None, timeout=5):
60
+ audio = load_file(warmup_file=warmup_file, timeout=timeout)
61
+ if audio is not False: online.warmup(audio)
62
+ logger.info("Online ASR model is warmed up")
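A typical call site, assuming the corrected signatures above — a sketch only; the backend object just needs a transcribe(audio) method:

from whisperlivekit.warmup import warmup_asr

warmup_asr(asr)                                 # downloads the JFK sample once, then transcribes it
warmup_asr(asr, warmup_file="short_clip.wav")   # or warm up with a local file (hypothetical path)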
whisperlivekit/web/__init__.py ADDED
File without changes
whisperlivekit/web/live_transcription.html ADDED
@@ -0,0 +1,861 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>WhisperLiveKit</title>
8
+ <style>
9
+ :root {
10
+ --bg: #ffffff;
11
+ --text: #111111;
12
+ --muted: #666666;
13
+ --border: #e5e5e5;
14
+ --chip-bg: rgba(0, 0, 0, 0.04);
15
+ --chip-text: #000000;
16
+ --spinner-border: #8d8d8d5c;
17
+ --spinner-top: #b0b0b0;
18
+ --silence-bg: #f3f3f3;
19
+ --loading-bg: rgba(255, 77, 77, 0.06);
20
+ --button-bg: #ffffff;
21
+ --button-border: #e9e9e9;
22
+ --wave-stroke: #000000;
23
+ --label-dia-text: #868686;
24
+ --label-trans-text: #111111;
25
+ }
26
+
27
+ @media (prefers-color-scheme: dark) {
28
+ :root:not([data-theme="light"]) {
29
+ --bg: #0b0b0b;
30
+ --text: #e6e6e6;
31
+ --muted: #9aa0a6;
32
+ --border: #333333;
33
+ --chip-bg: rgba(255, 255, 255, 0.08);
34
+ --chip-text: #e6e6e6;
35
+ --spinner-border: #555555;
36
+ --spinner-top: #dddddd;
37
+ --silence-bg: #1a1a1a;
38
+ --loading-bg: rgba(255, 77, 77, 0.12);
39
+ --button-bg: #111111;
40
+ --button-border: #333333;
41
+ --wave-stroke: #e6e6e6;
42
+ --label-dia-text: #b3b3b3;
43
+ --label-trans-text: #ffffff;
44
+ }
45
+ }
46
+
47
+ :root[data-theme="dark"] {
48
+ --bg: #0b0b0b;
49
+ --text: #e6e6e6;
50
+ --muted: #9aa0a6;
51
+ --border: #333333;
52
+ --chip-bg: rgba(255, 255, 255, 0.08);
53
+ --chip-text: #e6e6e6;
54
+ --spinner-border: #555555;
55
+ --spinner-top: #dddddd;
56
+ --silence-bg: #1a1a1a;
57
+ --loading-bg: rgba(255, 77, 77, 0.12);
58
+ --button-bg: #111111;
59
+ --button-border: #333333;
60
+ --wave-stroke: #e6e6e6;
61
+ --label-dia-text: #b3b3b3;
62
+ --label-trans-text: #ffffff;
63
+ }
64
+
65
+ :root[data-theme="light"] {
66
+ --bg: #ffffff;
67
+ --text: #111111;
68
+ --muted: #666666;
69
+ --border: #e5e5e5;
70
+ --chip-bg: rgba(0, 0, 0, 0.04);
71
+ --chip-text: #000000;
72
+ --spinner-border: #8d8d8d5c;
73
+ --spinner-top: #b0b0b0;
74
+ --silence-bg: #f3f3f3;
75
+ --loading-bg: rgba(255, 77, 77, 0.06);
76
+ --button-bg: #ffffff;
77
+ --button-border: #e9e9e9;
78
+ --wave-stroke: #000000;
79
+ --label-dia-text: #868686;
80
+ --label-trans-text: #111111;
81
+ }
82
+ body {
83
+ font-family: ui-sans-serif, system-ui, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
84
+ margin: 20px;
85
+ text-align: center;
86
+ background-color: var(--bg);
87
+ color: var(--text);
88
+ }
89
+
90
+ #recordButton {
91
+ width: 50px;
92
+ height: 50px;
93
+ border: none;
94
+ border-radius: 50%;
95
+ background-color: var(--button-bg);
96
+ cursor: pointer;
97
+ transition: all 0.3s ease;
98
+ border: 1px solid var(--button-border);
99
+ display: flex;
100
+ align-items: center;
101
+ justify-content: center;
102
+ position: relative;
103
+ }
104
+
105
+ #recordButton.recording {
106
+ width: 180px;
107
+ border-radius: 40px;
108
+ justify-content: flex-start;
109
+ padding-left: 20px;
110
+ }
111
+
112
+ #recordButton:active {
113
+ transform: scale(0.95);
114
+ }
115
+
116
+ .shape-container {
117
+ width: 25px;
118
+ height: 25px;
119
+ display: flex;
120
+ align-items: center;
121
+ justify-content: center;
122
+ flex-shrink: 0;
123
+ }
124
+
125
+ .shape {
126
+ width: 25px;
127
+ height: 25px;
128
+ background-color: rgb(209, 61, 53);
129
+ border-radius: 50%;
130
+ transition: all 0.3s ease;
131
+ }
132
+
133
+ #recordButton:disabled .shape {
134
+ background-color: #6e6d6d;
135
+ }
136
+
137
+ #recordButton.recording .shape {
138
+ border-radius: 5px;
139
+ width: 25px;
140
+ height: 25px;
141
+ }
142
+
143
+ /* Recording elements */
144
+ .recording-info {
145
+ display: none;
146
+ align-items: center;
147
+ margin-left: 15px;
148
+ flex-grow: 1;
149
+ }
150
+
151
+ #recordButton.recording .recording-info {
152
+ display: flex;
153
+ }
154
+
155
+ .wave-container {
156
+ width: 60px;
157
+ height: 30px;
158
+ position: relative;
159
+ display: flex;
160
+ align-items: center;
161
+ justify-content: center;
162
+ }
163
+
164
+ #waveCanvas {
165
+ width: 100%;
166
+ height: 100%;
167
+ }
168
+
169
+ .timer {
170
+ font-size: 14px;
171
+ font-weight: 500;
172
+ color: var(--text);
173
+ margin-left: 10px;
174
+ }
175
+
176
+ #status {
177
+ margin-top: 20px;
178
+ font-size: 16px;
179
+ color: var(--text);
180
+ }
181
+
182
+ .settings-container {
183
+ display: flex;
184
+ justify-content: center;
185
+ align-items: center;
186
+ gap: 15px;
187
+ margin-top: 20px;
188
+ }
189
+
190
+ .settings {
191
+ display: flex;
192
+ flex-direction: column;
193
+ align-items: flex-start;
194
+ gap: 5px;
195
+ }
196
+
197
+ #chunkSelector,
198
+ #websocketInput,
199
+ #themeSelector {
200
+ font-size: 16px;
201
+ padding: 5px;
202
+ border-radius: 5px;
203
+ border: 1px solid var(--border);
204
+ background-color: var(--button-bg);
205
+ color: var(--text);
206
+ max-height: 30px;
207
+ }
208
+
209
+ #websocketInput {
210
+ width: 200px;
211
+ }
212
+
213
+ #chunkSelector:focus,
214
+ #websocketInput:focus,
215
+ #themeSelector:focus {
216
+ outline: none;
217
+ border-color: #007bff;
218
+ }
219
+
220
+ label {
221
+ font-size: 14px;
222
+ }
223
+
224
+ /* Speaker-labeled transcript area */
225
+ #linesTranscript {
226
+ margin: 20px auto;
227
+ max-width: 700px;
228
+ text-align: left;
229
+ font-size: 16px;
230
+ }
231
+
232
+ #linesTranscript p {
233
+ margin: 0px 0;
234
+ }
235
+
236
+ #linesTranscript strong {
237
+ color: var(--text);
238
+ }
239
+
240
+ #speaker {
241
+ border: 1px solid var(--border);
242
+ border-radius: 100px;
243
+ padding: 2px 10px;
244
+ font-size: 14px;
245
+ margin-bottom: 0px;
246
+ }
247
+ .label_diarization {
248
+ background-color: var(--chip-bg);
249
+ border-radius: 8px 8px 8px 8px;
250
+ padding: 2px 10px;
251
+ margin-left: 10px;
252
+ display: inline-block;
253
+ white-space: nowrap;
254
+ font-size: 14px;
255
+ margin-bottom: 0px;
256
+ color: var(--label-dia-text)
257
+ }
258
+
259
+ .label_transcription {
260
+ background-color: var(--chip-bg);
261
+ border-radius: 8px 8px 8px 8px;
262
+ padding: 2px 10px;
263
+ display: inline-block;
264
+ white-space: nowrap;
265
+ margin-left: 10px;
266
+ font-size: 14px;
267
+ margin-bottom: 0px;
268
+ color: var(--label-trans-text)
269
+ }
270
+
271
+ #timeInfo {
272
+ color: var(--muted);
273
+ margin-left: 10px;
274
+ }
275
+
276
+ .textcontent {
277
+ font-size: 16px;
278
+ /* margin-left: 10px; */
279
+ padding-left: 10px;
280
+ margin-bottom: 10px;
281
+ margin-top: 1px;
282
+ padding-top: 5px;
283
+ border-radius: 0px 0px 0px 10px;
284
+ }
285
+
286
+ .buffer_diarization {
287
+ color: var(--label-dia-text);
288
+ margin-left: 4px;
289
+ }
290
+
291
+ .buffer_transcription {
292
+ color: #7474748c;
293
+ margin-left: 4px;
294
+ }
295
+
296
+
297
+ .spinner {
298
+ display: inline-block;
299
+ width: 8px;
300
+ height: 8px;
301
+ border: 2px solid var(--spinner-border);
302
+ border-top: 2px solid var(--spinner-top);
303
+ border-radius: 50%;
304
+ animation: spin 0.7s linear infinite;
305
+ vertical-align: middle;
306
+ margin-bottom: 2px;
307
+ margin-right: 5px;
308
+ }
309
+
310
+ @keyframes spin {
311
+ to {
312
+ transform: rotate(360deg);
313
+ }
314
+ }
315
+
316
+ .silence {
317
+ color: var(--muted);
318
+ background-color: var(--silence-bg);
319
+ font-size: 13px;
320
+ border-radius: 30px;
321
+ padding: 2px 10px;
322
+ }
323
+
324
+ .loading {
325
+ color: var(--muted);
326
+ background-color: var(--loading-bg);
327
+ border-radius: 8px 8px 8px 0px;
328
+ padding: 2px 10px;
329
+ font-size: 14px;
330
+ margin-bottom: 0px;
331
+ }
332
+ </style>
333
+ </head>
334
+
335
+ <body>
336
+
337
+ <div class="settings-container">
338
+ <button id="recordButton">
339
+ <div class="shape-container">
340
+ <div class="shape"></div>
341
+ </div>
342
+ <div class="recording-info">
343
+ <div class="wave-container">
344
+ <canvas id="waveCanvas"></canvas>
345
+ </div>
346
+ <div class="timer">00:00</div>
347
+ </div>
348
+ </button>
349
+ <div class="settings">
350
+ <div>
351
+ <label for="chunkSelector">Chunk size (ms):</label>
352
+ <select id="chunkSelector">
353
+ <option value="500">500 ms</option>
354
+ <option value="1000" selected>1000 ms</option>
355
+ <option value="2000">2000 ms</option>
356
+ <option value="3000">3000 ms</option>
357
+ <option value="4000">4000 ms</option>
358
+ <option value="5000">5000 ms</option>
359
+ </select>
360
+ </div>
361
+ <div>
362
+ <label for="websocketInput">WebSocket URL:</label>
363
+ <input id="websocketInput" type="text" />
364
+ </div>
365
+ <div>
366
+ <label for="themeSelector">Theme:</label>
367
+ <select id="themeSelector">
368
+ <option value="system" selected>System</option>
369
+ <option value="light">Light</option>
370
+ <option value="dark">Dark</option>
371
+ </select>
372
+ </div>
373
+ </div>
374
+ </div>
375
+
376
+ <p id="status"></p>
377
+
378
+ <!-- Speaker-labeled transcript -->
379
+ <div id="linesTranscript"></div>
380
+
381
+ <script>
382
+ let isRecording = false;
383
+ let websocket = null;
384
+ let recorder = null;
385
+ let chunkDuration = 1000;
386
+ let websocketUrl = "ws://localhost:8000/asr";
387
+ let userClosing = false;
388
+ let wakeLock = null;
389
+ let startTime = null;
390
+ let timerInterval = null;
391
+ let audioContext = null;
392
+ let analyser = null;
393
+ let microphone = null;
394
+ let waveCanvas = document.getElementById("waveCanvas");
395
+ let waveCtx = waveCanvas.getContext("2d");
396
+ let animationFrame = null;
397
+ let waitingForStop = false;
398
+ let lastReceivedData = null;
399
+ let lastSignature = null;
400
+ waveCanvas.width = 60 * (window.devicePixelRatio || 1);
401
+ waveCanvas.height = 30 * (window.devicePixelRatio || 1);
402
+ waveCtx.scale(window.devicePixelRatio || 1, window.devicePixelRatio || 1);
403
+
404
+ const statusText = document.getElementById("status");
405
+ const recordButton = document.getElementById("recordButton");
406
+ const chunkSelector = document.getElementById("chunkSelector");
407
+ const websocketInput = document.getElementById("websocketInput");
408
+ const linesTranscriptDiv = document.getElementById("linesTranscript");
409
+ const timerElement = document.querySelector(".timer");
410
+ const themeSelector = document.getElementById("themeSelector");
411
+
412
+ function getWaveStroke() {
413
+ const styles = getComputedStyle(document.documentElement);
414
+ const v = styles.getPropertyValue("--wave-stroke").trim();
415
+ return v || "#000";
416
+ }
417
+
418
+ let waveStroke = getWaveStroke();
419
+
420
+ function updateWaveStroke() {
421
+ waveStroke = getWaveStroke();
422
+ }
423
+
424
+ function applyTheme(pref) {
425
+ if (pref === "light") {
426
+ document.documentElement.setAttribute("data-theme", "light");
427
+ } else if (pref === "dark") {
428
+ document.documentElement.setAttribute("data-theme", "dark");
429
+ } else {
430
+ document.documentElement.removeAttribute("data-theme");
431
+ }
432
+ updateWaveStroke();
433
+ }
434
+
435
+ const savedThemePref = localStorage.getItem("themePreference") || "system";
436
+ applyTheme(savedThemePref);
437
+ if (themeSelector) {
438
+ themeSelector.value = savedThemePref;
439
+ themeSelector.addEventListener("change", () => {
440
+ const val = themeSelector.value;
441
+ localStorage.setItem("themePreference", val);
442
+ applyTheme(val);
443
+ });
444
+ }
445
+
446
+ const darkMq = window.matchMedia && window.matchMedia("(prefers-color-scheme: dark)");
447
+ const handleOsThemeChange = () => {
448
+ const pref = localStorage.getItem("themePreference") || "system";
449
+ if (pref === "system") updateWaveStroke();
450
+ };
451
+ if (darkMq && darkMq.addEventListener) {
452
+ darkMq.addEventListener("change", handleOsThemeChange);
453
+ } else if (darkMq && darkMq.addListener) {
454
+ darkMq.addListener(handleOsThemeChange);
455
+ }
456
+
457
+ function fmt1(x) {
458
+ const n = Number(x);
459
+ return Number.isFinite(n) ? n.toFixed(1) : x;
460
+ }
461
+
462
+ const host = window.location.hostname || "localhost";
463
+ const port = window.location.port;
464
+ const protocol = window.location.protocol === "https:" ? "wss" : "ws";
465
+ const defaultWebSocketUrl = `${protocol}://${host}:${port}/asr`;
466
+ websocketInput.value = defaultWebSocketUrl;
467
+ websocketUrl = defaultWebSocketUrl;
468
+
469
+ chunkSelector.addEventListener("change", () => {
470
+ chunkDuration = parseInt(chunkSelector.value);
471
+ });
472
+
473
+ websocketInput.addEventListener("change", () => {
474
+ const urlValue = websocketInput.value.trim();
475
+ if (!urlValue.startsWith("ws://") && !urlValue.startsWith("wss://")) {
476
+ statusText.textContent = "Invalid WebSocket URL (must start with ws:// or wss://)";
477
+ return;
478
+ }
479
+ websocketUrl = urlValue;
480
+ statusText.textContent = "WebSocket URL updated. Ready to connect.";
481
+ });
482
+
483
+ function setupWebSocket() {
484
+ return new Promise((resolve, reject) => {
485
+ try {
486
+ websocket = new WebSocket(websocketUrl);
487
+ } catch (error) {
488
+ statusText.textContent = "Invalid WebSocket URL. Please check and try again.";
489
+ reject(error);
490
+ return;
491
+ }
492
+
493
+ websocket.onopen = () => {
494
+ statusText.textContent = "Connected to server.";
495
+ resolve();
496
+ };
497
+
498
+ websocket.onclose = () => {
499
+ if (userClosing) {
500
+ if (waitingForStop) {
501
+ statusText.textContent = "Processing finalized or connection closed.";
502
+ if (lastReceivedData) {
503
+ renderLinesWithBuffer(
504
+ lastReceivedData.lines || [],
505
+ lastReceivedData.buffer_diarization || "",
506
+ lastReceivedData.buffer_transcription || "",
507
+ 0, 0, true // isFinalizing = true
508
+ );
509
+ }
510
+ }
511
+ // If ready_to_stop was received, statusText is already "Finished processing..."
512
+ // and waitingForStop is false.
513
+ } else {
514
+ statusText.textContent = "Disconnected from the WebSocket server. (Check logs if model is loading.)";
515
+ if (isRecording) {
516
+ stopRecording();
517
+ }
518
+ }
519
+ isRecording = false;
520
+ waitingForStop = false;
521
+ userClosing = false;
522
+ lastReceivedData = null;
523
+ websocket = null;
524
+ updateUI();
525
+ };
526
+
527
+ websocket.onerror = () => {
528
+ statusText.textContent = "Error connecting to WebSocket.";
529
+ reject(new Error("Error connecting to WebSocket"));
530
+ };
531
+
532
+ // Handle messages from server
533
+ websocket.onmessage = (event) => {
534
+ const data = JSON.parse(event.data);
535
+
536
+ // Check for status messages
537
+ if (data.type === "ready_to_stop") {
538
+ console.log("Ready to stop received, finalizing display and closing WebSocket.");
539
+ waitingForStop = false;
540
+
541
+ if (lastReceivedData) {
542
+ renderLinesWithBuffer(
543
+ lastReceivedData.lines || [],
544
+ lastReceivedData.buffer_diarization || "",
545
+ lastReceivedData.buffer_transcription || "",
546
+ 0, // No more lag
547
+ 0, // No more lag
548
+ true // isFinalizing = true
549
+ );
550
+ }
551
+ statusText.textContent = "Finished processing audio! Ready to record again.";
552
+ recordButton.disabled = false;
553
+
554
+ if (websocket) {
555
+ websocket.close(); // will trigger onclose
556
+ // websocket = null; // onclose handle setting websocket to null
557
+ }
558
+ return;
559
+ }
560
+
561
+ lastReceivedData = data;
562
+
563
+ // Handle normal transcription updates
564
+ const {
565
+ lines = [],
566
+ buffer_transcription = "",
567
+ buffer_diarization = "",
568
+ remaining_time_transcription = 0,
569
+ remaining_time_diarization = 0,
570
+ status = "active_transcription"
571
+ } = data;
572
+
573
+ renderLinesWithBuffer(
574
+ lines,
575
+ buffer_diarization,
576
+ buffer_transcription,
577
+ remaining_time_diarization,
578
+ remaining_time_transcription,
579
+ false,
580
+ status
581
+ );
582
+ };
583
+ });
584
+ }
585
+
586
+ function renderLinesWithBuffer(lines, buffer_diarization, buffer_transcription, remaining_time_diarization, remaining_time_transcription, isFinalizing = false, current_status = "active_transcription") {
587
+ if (current_status === "no_audio_detected") {
588
+ linesTranscriptDiv.innerHTML = "<p style='text-align: center; color: var(--muted); margin-top: 20px;'><em>No audio detected...</em></p>";
589
+ return;
590
+ }
591
+
592
+ // try to keep stable DOM despite having updates every 0.1s. only update numeric lag values if structure hasn't changed
593
+ const showLoading = (!isFinalizing) && (lines || []).some(it => it.speaker == 0);
594
+ const showTransLag = !isFinalizing && remaining_time_transcription > 0;
595
+ const showDiaLag = !isFinalizing && !!buffer_diarization && remaining_time_diarization > 0;
596
+ const signature = JSON.stringify({
597
+ lines: (lines || []).map(it => ({ speaker: it.speaker, text: it.text, beg: it.beg, end: it.end })),
598
+ buffer_transcription: buffer_transcription || "",
599
+ buffer_diarization: buffer_diarization || "",
600
+ status: current_status,
601
+ showLoading,
602
+ showTransLag,
603
+ showDiaLag,
604
+ isFinalizing: !!isFinalizing
605
+ });
606
+ if (lastSignature === signature) {
607
+ const t = document.querySelector(".lag-transcription-value");
608
+ if (t) t.textContent = fmt1(remaining_time_transcription);
609
+ const d = document.querySelector(".lag-diarization-value");
610
+ if (d) d.textContent = fmt1(remaining_time_diarization);
611
+ const ld = document.querySelector(".loading-diarization-value");
612
+ if (ld) ld.textContent = fmt1(remaining_time_diarization);
613
+ return;
614
+ }
615
+ lastSignature = signature;
616
+
617
+ const linesHtml = lines.map((item, idx) => {
618
+ let timeInfo = "";
619
+ if (item.beg !== undefined && item.end !== undefined) {
620
+ timeInfo = ` ${item.beg} - ${item.end}`;
621
+ }
622
+
623
+ let speakerLabel = "";
624
+ if (item.speaker === -2) {
625
+ speakerLabel = `<span class="silence">Silence<span id='timeInfo'>${timeInfo}</span></span>`;
626
+ } else if (item.speaker == 0 && !isFinalizing) {
627
+ speakerLabel = `<span class='loading'><span class="spinner"></span><span id='timeInfo'><span class="loading-diarization-value">${fmt1(remaining_time_diarization)}</span> second(s) of audio are undergoing diarization</span></span>`;
628
+ } else if (item.speaker == -1) {
629
+ speakerLabel = `<span id="speaker">Speaker 1<span id='timeInfo'>${timeInfo}</span></span>`;
630
+ } else if (item.speaker !== -1 && item.speaker !== 0) {
631
+ speakerLabel = `<span id="speaker">Speaker ${item.speaker}<span id='timeInfo'>${timeInfo}</span></span>`;
632
+ }
633
+
634
+
635
+ let currentLineText = item.text || "";
636
+
637
+ if (idx === lines.length - 1) {
638
+ if (!isFinalizing && item.speaker !== -2) {
639
+ if (remaining_time_transcription > 0) {
640
+ speakerLabel += `<span class="label_transcription"><span class="spinner"></span>Transcription lag <span id='timeInfo'><span class="lag-transcription-value">${fmt1(remaining_time_transcription)}</span>s</span></span>`;
641
+ }
642
+ if (buffer_diarization && remaining_time_diarization > 0) {
643
+ speakerLabel += `<span class="label_diarization"><span class="spinner"></span>Diarization lag<span id='timeInfo'><span class="lag-diarization-value">${fmt1(remaining_time_diarization)}</span>s</span></span>`;
644
+ }
645
+ }
646
+
647
+ if (buffer_diarization) {
648
+ if (isFinalizing) {
649
+ currentLineText += (currentLineText.length > 0 && buffer_diarization.trim().length > 0 ? " " : "") + buffer_diarization.trim();
650
+ } else {
651
+ currentLineText += `<span class="buffer_diarization">${buffer_diarization}</span>`;
652
+ }
653
+ }
654
+ if (buffer_transcription) {
655
+ if (isFinalizing) {
656
+ currentLineText += (currentLineText.length > 0 && buffer_transcription.trim().length > 0 ? " " : "") + buffer_transcription.trim();
657
+ } else {
658
+ currentLineText += `<span class="buffer_transcription">${buffer_transcription}</span>`;
659
+ }
660
+ }
661
+ }
662
+
663
+ return currentLineText.trim().length > 0 || speakerLabel.length > 0
664
+ ? `<p>${speakerLabel}<br/><div class='textcontent'>${currentLineText}</div></p>`
665
+ : `<p>${speakerLabel}<br/></p>`;
666
+ }).join("");
667
+
668
+ linesTranscriptDiv.innerHTML = linesHtml;
669
+ window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
670
+ }
671
+
672
+ function updateTimer() {
673
+ if (!startTime) return;
674
+
675
+ const elapsed = Math.floor((Date.now() - startTime) / 1000);
676
+ const minutes = Math.floor(elapsed / 60).toString().padStart(2, "0");
677
+ const seconds = (elapsed % 60).toString().padStart(2, "0");
678
+ timerElement.textContent = `${minutes}:${seconds}`;
679
+ }
680
+
681
+ function drawWaveform() {
682
+ if (!analyser) return;
683
+
684
+ const bufferLength = analyser.frequencyBinCount;
685
+ const dataArray = new Uint8Array(bufferLength);
686
+ analyser.getByteTimeDomainData(dataArray);
687
+
688
+ waveCtx.clearRect(0, 0, waveCanvas.width / (window.devicePixelRatio || 1), waveCanvas.height / (window.devicePixelRatio || 1));
689
+ waveCtx.lineWidth = 1;
690
+ waveCtx.strokeStyle = waveStroke;
691
+ waveCtx.beginPath();
692
+
693
+ const sliceWidth = (waveCanvas.width / (window.devicePixelRatio || 1)) / bufferLength;
694
+ let x = 0;
695
+
696
+ for (let i = 0; i < bufferLength; i++) {
697
+ const v = dataArray[i] / 128.0;
698
+ const y = v * (waveCanvas.height / (window.devicePixelRatio || 1)) / 2;
699
+
700
+ if (i === 0) {
701
+ waveCtx.moveTo(x, y);
702
+ } else {
703
+ waveCtx.lineTo(x, y);
704
+ }
705
+
706
+ x += sliceWidth;
707
+ }
708
+
709
+ waveCtx.lineTo(waveCanvas.width / (window.devicePixelRatio || 1), waveCanvas.height / (window.devicePixelRatio || 1) / 2);
710
+ waveCtx.stroke();
711
+
712
+ animationFrame = requestAnimationFrame(drawWaveform);
713
+ }
714
+
715
+ async function startRecording() {
716
+ try {
717
+
718
+ // https://developer.mozilla.org/en-US/docs/Web/API/Screen_Wake_Lock_API
719
+ // create an async function to request a wake lock
720
+ try {
721
+ wakeLock = await navigator.wakeLock.request("screen");
722
+ } catch (err) {
723
+ // The Wake Lock request has failed - usually system related, such as battery.
724
+ console.log("Error acquiring wake lock.")
725
+ }
726
+
727
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
728
+
729
+ audioContext = new (window.AudioContext || window.webkitAudioContext)();
730
+ analyser = audioContext.createAnalyser();
731
+ analyser.fftSize = 256;
732
+ microphone = audioContext.createMediaStreamSource(stream);
733
+ microphone.connect(analyser);
734
+
735
+ recorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
736
+ recorder.ondataavailable = (e) => {
737
+ if (websocket && websocket.readyState === WebSocket.OPEN) {
738
+ websocket.send(e.data);
739
+ }
740
+ };
741
+ recorder.start(chunkDuration);
742
+
743
+ startTime = Date.now();
744
+ timerInterval = setInterval(updateTimer, 1000);
745
+ drawWaveform();
746
+
747
+ isRecording = true;
748
+ updateUI();
749
+ } catch (err) {
750
+ statusText.textContent = "Error accessing microphone. Please allow microphone access.";
751
+ console.error(err);
752
+ }
753
+ }
754
+
755
+ async function stopRecording() {
756
+ if (wakeLock) wakeLock.release().then(() => {
757
+ wakeLock = null;
758
+ });
759
+
760
+ userClosing = true;
761
+ waitingForStop = true;
762
+
763
+ if (websocket && websocket.readyState === WebSocket.OPEN) {
764
+ // Send empty audio buffer as stop signal
765
+ const emptyBlob = new Blob([], { type: 'audio/webm' });
766
+ websocket.send(emptyBlob);
767
+ statusText.textContent = "Recording stopped. Processing final audio...";
768
+ }
769
+
770
+ if (recorder) {
771
+ recorder.stop();
772
+ recorder = null;
773
+ }
774
+
775
+ if (microphone) {
776
+ microphone.disconnect();
777
+ microphone = null;
778
+ }
779
+
780
+ if (analyser) {
781
+ analyser = null;
782
+ }
783
+
784
+ if (audioContext && audioContext.state !== 'closed') {
785
+ try {
786
+ audioContext.close();
787
+ } catch (e) {
788
+ console.warn("Could not close audio context:", e);
789
+ }
790
+ audioContext = null;
791
+ }
792
+
793
+ if (animationFrame) {
794
+ cancelAnimationFrame(animationFrame);
795
+ animationFrame = null;
796
+ }
797
+
798
+ if (timerInterval) {
799
+ clearInterval(timerInterval);
800
+ timerInterval = null;
801
+ }
802
+ timerElement.textContent = "00:00";
803
+ startTime = null;
804
+
805
+
806
+ isRecording = false;
807
+ updateUI();
808
+ }
809
+
810
+ async function toggleRecording() {
811
+ if (!isRecording) {
812
+ if (waitingForStop) {
813
+ console.log("Waiting for stop, early return");
814
+ return; // Early return, UI is already updated
815
+ }
816
+ console.log("Connecting to WebSocket");
817
+ try {
818
+ // If we have an active WebSocket that's still processing, just restart audio capture
819
+ if (websocket && websocket.readyState === WebSocket.OPEN) {
820
+ await startRecording();
821
+ } else {
822
+ // If no active WebSocket or it's closed, create new one
823
+ await setupWebSocket();
824
+ await startRecording();
825
+ }
826
+ } catch (err) {
827
+ statusText.textContent = "Could not connect to WebSocket or access mic. Aborted.";
828
+ console.error(err);
829
+ }
830
+ } else {
831
+ console.log("Stopping recording");
832
+ stopRecording();
833
+ }
834
+ }
835
+
836
+ function updateUI() {
837
+ recordButton.classList.toggle("recording", isRecording);
838
+ recordButton.disabled = waitingForStop;
839
+
840
+ if (waitingForStop) {
841
+ if (statusText.textContent !== "Recording stopped. Processing final audio...") {
842
+ statusText.textContent = "Please wait for processing to complete...";
843
+ }
844
+ } else if (isRecording) {
845
+ statusText.textContent = "Recording...";
846
+ } else {
847
+ if (statusText.textContent !== "Finished processing audio! Ready to record again." &&
848
+ statusText.textContent !== "Processing finalized or connection closed.") {
849
+ statusText.textContent = "Click to start transcription";
850
+ }
851
+ }
852
+ if (!waitingForStop) {
853
+ recordButton.disabled = false;
854
+ }
855
+ }
856
+
857
+ recordButton.addEventListener("click", toggleRecording);
858
+ </script>
859
+ </body>
860
+
861
+ </html>
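For reference, the JSON shape this page expects from the /asr WebSocket, inferred from the fields the onmessage handler reads (all values below are illustrative):

update = {
    "status": "active_transcription",          # or "no_audio_detected"
    "lines": [{"speaker": 1, "text": "Hello everyone.", "beg": "0:00", "end": "0:02"}],
    "buffer_transcription": " and welc",
    "buffer_diarization": "",
    "remaining_time_transcription": 0.4,
    "remaining_time_diarization": 1.2,
}
# A final {"type": "ready_to_stop"} message tells the page that processing is finished.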